From 53143f58645614ac657f36c8a5109f2f1b621529 Mon Sep 17 00:00:00 2001
From: Shenggan
Date: Mon, 28 Feb 2022 05:50:20 +0000
Subject: [PATCH] init commit

---
 README.md                        | 29 +++++++++++++++++++++++++++--
 benchmark/perf.py                | 24 ++++++++++++------------
 fastfold/__init__.py             |  1 +
 fastfold/distributed/__init__.py |  4 ++--
 fastfold/distributed/core.py     |  4 ++--
 fastfold/model/evoformer.py      |  2 +-
 setup.py                         |  2 +-
 7 files changed, 46 insertions(+), 20 deletions(-)

diff --git a/README.md b/README.md
index b0d80282..c362fac9 100644
--- a/README.md
+++ b/README.md
@@ -13,9 +13,9 @@ FastFold provides a **high-performance implementation of Evoformer** with the fo
 1. Excellent kernel performance on GPU platform
 2. Supporting Dynamic Axial Parallelism(DAP)
     * Break the memory limit of single GPU and reduce the overall training time
-    * Distributed inference can significantly speed up inference and make extremely long sequence inference possible
+    * DAP can significantly speed up inference and make ultra-long sequence inference possible
 3. Ease of use
-    * Replace a few lines and you can use FastFold in your project
+    * Huge performance gains with only a few lines of code changed
     * You don't need to care about how the parallel part is implemented
 
 ## Installation
@@ -38,6 +38,24 @@ cd FastFold
 python setup.py install --cuda_ext
 ```
 
+## Usage
+
+You can use `Evoformer` as an `nn.Module` in your project after `from fastfold.model import Evoformer`:
+
+```python
+from fastfold.model import Evoformer
+evoformer_layer = Evoformer()
+```
+
+If you want to use Dynamic Axial Parallelism, add one line to initialize it with `fastfold.distributed.init_dap` after `torch.distributed.init_process_group`:
+
+```python
+from fastfold.distributed import init_dap
+
+torch.distributed.init_process_group(backend='nccl', init_method='env://')
+init_dap(args.dap_size)
+```
+
 ## Performance Benchmark
 
 We have included a performance benchmark script in `./benchmark`. You can benchmark the performance of Evoformer using different settings.
@@ -47,6 +65,13 @@ cd ./benchmark
 torchrun --nproc_per_node=1 perf.py --msa-length 128 --res-length 256
 ```
 
+Benchmark Dynamic Axial Parallelism with 2 GPUs:
+
+```shell
+cd ./benchmark
+torchrun --nproc_per_node=2 perf.py --msa-length 128 --res-length 256 --dap-size 2
+```
+
 If you want to benchmark with [OpenFold](https://github.com/aqlaboratory/openfold), you need to install OpenFold first and benchmark with option `--openfold`:
 
 ```shell
diff --git a/benchmark/perf.py b/benchmark/perf.py
index e8444ce0..c68a9f8c 100644
--- a/benchmark/perf.py
+++ b/benchmark/perf.py
@@ -4,34 +4,34 @@
 import torch
 import torch.nn as nn
 
-from fastfold.distributed import init_shadowcore
+from fastfold.distributed import init_dap
 from fastfold.model import Evoformer
 
 
 def main():
-    parser = argparse.ArgumentParser(description='MSA Attention Standalone Perf Benchmark')
-    parser.add_argument("--dap-size", default=1, type=int)
+    parser = argparse.ArgumentParser(description='Evoformer Standalone Perf Benchmark')
+    parser.add_argument("--dap-size", default=1, type=int, help='Dynamic Axial Parallelism size')
     parser.add_argument('--batch-size', default=1, type=int, help='batch size')
-    parser.add_argument('--msa-length', default=132, type=int, help='Sequence Length of Input')
+    parser.add_argument('--msa-length', default=132, type=int, help='Number of Sequences in MSA')
     parser.add_argument('--res-length', default=256, type=int,
-                        help='Start Range of Number of Sequences')
+                        help='Sequence Length of Residues')
     parser.add_argument('--trials', default=50, type=int, help='Number of Trials to Execute')
     parser.add_argument('--warmup-trials', default=5, type=int, help='Warmup Trials to discard')
     parser.add_argument('--layers', default=12, type=int,
-                        help='Attention Layers to Execute to Gain CPU/GPU Time Overlap')
+                        help='Evoformer Layers to Execute')
     parser.add_argument('--cm', default=256, type=int, help='MSA hidden dimension')
     parser.add_argument('--cz', default=128, type=int, help='Pair hidden dimension')
     parser.add_argument('--heads', default=8, type=int, help='Number of Multihead Attention heads')
     parser.add_argument('--openfold', action='store_true',
-                        help='torch.nn.MultitheadAttention Version.')
+                        help='Benchmark with Evoformer Implementation from OpenFold.')
     parser.add_argument('--fwd', action='store_true', help='Only execute Fwd Pass.')
-    parser.add_argument('--prof', action='store_true', help='Only execute Fwd Pass.')
+    parser.add_argument('--prof', action='store_true', help='Run with profiler.')
 
     args = parser.parse_args()
 
@@ -48,10 +48,10 @@ def main():
         print(
             'Training in distributed mode with multiple processes, 1 GPU per process. Process %d, total %d.'
            % (args.global_rank, args.world_size))
-    init_shadowcore(args.tensor_model_parallel_size)
+    init_dap(args.dap_size)
 
     precision = torch.bfloat16
-    if args.tensor_model_parallel_size > 1:
+    if args.dap_size > 1:
         # (PyTorch issue) Currently All2All communication does not support the Bfloat16 datatype in PyTorch
         precision = torch.float16
 
@@ -111,13 +111,13 @@ def forward(self, node, pair, node_mask, pair_mask):
         stop_evt_bwd.append(torch.cuda.Event(enable_timing=True))
 
     inputs_node = torch.randn(args.batch_size,
-                              args.msa_length // args.tensor_model_parallel_size,
+                              args.msa_length // args.dap_size,
                               args.res_length,
                               args.cm,
                               dtype=precision,
                               device=torch.device("cuda")).requires_grad_(True)
     inputs_pair = torch.randn(args.batch_size,
-                              args.res_length // args.tensor_model_parallel_size,
+                              args.res_length // args.dap_size,
                               args.res_length,
                               args.cz,
                               dtype=precision,
diff --git a/fastfold/__init__.py b/fastfold/__init__.py
index e69de29b..1798ea26 100644
--- a/fastfold/__init__.py
+++ b/fastfold/__init__.py
@@ -0,0 +1 @@
+VERSION = "0.1.0-beta"
\ No newline at end of file
diff --git a/fastfold/distributed/__init__.py b/fastfold/distributed/__init__.py
index ca45829a..cda9f434 100644
--- a/fastfold/distributed/__init__.py
+++ b/fastfold/distributed/__init__.py
@@ -1,11 +1,11 @@
-from .core import (init_shadowcore, shadowcore_is_initialized, get_tensor_model_parallel_group,
+from .core import (init_dap, dap_is_initialized, get_tensor_model_parallel_group,
                    get_data_parallel_group, get_tensor_model_parallel_world_size,
                    get_tensor_model_parallel_rank, get_data_parallel_world_size,
                    get_data_parallel_rank, get_tensor_model_parallel_src_rank)
 from .comm import (_reduce, _split, _gather, copy, scatter, reduce, gather, col_to_row, row_to_col)
 
 __all__ = [
-    'init_shadowcore', 'shadowcore_is_initialized', 'get_tensor_model_parallel_group',
+    'init_dap', 'dap_is_initialized', 'get_tensor_model_parallel_group',
     'get_data_parallel_group', 'get_tensor_model_parallel_world_size',
     'get_tensor_model_parallel_rank', 'get_data_parallel_world_size', 'get_data_parallel_rank',
     'get_tensor_model_parallel_src_rank', '_reduce', '_split', '_gather', 'copy', 'scatter',
diff --git a/fastfold/distributed/core.py b/fastfold/distributed/core.py
index b8c36295..afebefa3 100644
--- a/fastfold/distributed/core.py
+++ b/fastfold/distributed/core.py
@@ -15,7 +15,7 @@ def ensure_divisibility(numerator, denominator):
     assert numerator % denominator == 0, '{} is not divisible by {}'.format(numerator, denominator)
 
 
-def init_shadowcore(tensor_model_parallel_size_=1):
+def init_dap(tensor_model_parallel_size_=1):
 
     assert dist.is_initialized()
 
@@ -51,7 +51,7 @@ def init_shadowcore(tensor_model_parallel_size_=1):
     print('> initialize data parallel with size {}'.format(data_parallel_size_))
 
 
-def shadowcore_is_initialized():
+def dap_is_initialized():
     """Check if model and data parallel groups are initialized."""
     if _TENSOR_MODEL_PARALLEL_GROUP is None or \
         _DATA_PARALLEL_GROUP is None:
diff --git a/fastfold/model/evoformer.py b/fastfold/model/evoformer.py
index f94a76fb..6da5253b 100644
--- a/fastfold/model/evoformer.py
+++ b/fastfold/model/evoformer.py
@@ -6,7 +6,7 @@
 
 class Evoformer(nn.Module):
 
-    def __init__(self, d_node, d_pair):
+    def __init__(self, d_node=256, d_pair=128):
         super(Evoformer, self).__init__()
 
         self.msa_stack = MSAStack(d_node, d_pair, p_drop=0.15)
diff --git a/setup.py b/setup.py
index e78011ba..836b5a1a 100644
--- a/setup.py
+++ b/setup.py
@@ -141,7 +141,7 @@ def cuda_ext_helper(name, sources, extra_cuda_flags):
 
 setup(
     name='fastfold',
-    version='0.0.1-beta',
+    version='0.1.0-beta',
     packages=find_packages(exclude=(
         'assets',
         'benchmark',
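
For reference, the renamed pieces above fit together as follows. The sketch below is a hypothetical, minimal end-to-end example and is not part of the patch: it combines `init_dap` from `fastfold.distributed` with the `Evoformer` module, using the same tensor shapes that `benchmark/perf.py` builds. The forward signature `(node, pair, node_mask, pair_mask)` and the mask shapes are assumptions taken from the benchmark's wrapper module, not a definitive FastFold API.

```python
# Hypothetical sketch based on this patch's README and benchmark/perf.py.
# Assumptions: the layer accepts (node, pair, node_mask, pair_mask) like the
# benchmark wrapper, and the default hidden sizes are d_node=256, d_pair=128.
import torch
import torch.distributed as dist

from fastfold.distributed import init_dap
from fastfold.model import Evoformer


def run(dap_size: int = 2, msa_length: int = 128, res_length: int = 256):
    # DAP is initialized after the default process group, as the README shows.
    dist.init_process_group(backend='nccl', init_method='env://')
    init_dap(dap_size)

    device = torch.device("cuda")
    layer = Evoformer().to(device)  # defaults from this patch: d_node=256, d_pair=128

    # DAP shards the MSA rows of the node tensor and the residue rows of the
    # pair tensor across dap_size GPUs, mirroring the slicing in perf.py.
    node = torch.randn(1, msa_length // dap_size, res_length, 256, device=device)
    pair = torch.randn(1, res_length // dap_size, res_length, 128, device=device)
    node_mask = torch.ones(1, msa_length // dap_size, res_length, device=device)
    pair_mask = torch.ones(1, res_length // dap_size, res_length, device=device)

    return layer(node, pair, node_mask, pair_mask)


if __name__ == "__main__":
    run()
```

Launch it the same way as the benchmark, e.g. `torchrun --nproc_per_node=2 this_script.py`, so that the NCCL process group can initialize.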