[WIP] Adding support for FP8 training #218

Open
Wants to merge 27 commits into base: main from feature/fp8

Changes from all commits (27 commits)
7e4dc10  Adding support for FP8 training  (romilshahtri, Feb 21, 2024)
e8cad2a  Linter changes  (shahromil16, Feb 21, 2024)
0514087  Converting all Linears to TE Linears except output Linear  (shahromil16, Feb 29, 2024)
ff8e8c8  Fix linter errors  (shahromil16, Mar 1, 2024)
298471f  Merge remote-tracking branch 'origin/main' into feature/fp8  (shahromil16, Apr 12, 2024)
9224b0e  Rebase from main and update FP8 changes  (shahromil16, Apr 12, 2024)
4563be2  Linter changes  (shahromil16, Apr 12, 2024)
937927a  Adding asserts for FP8  (shahromil16, Apr 13, 2024)
1594b9f  Asserts for FP8  (shahromil16, Apr 13, 2024)
740f2b1  Predefine all_gpus for TE  (shahromil16, Apr 13, 2024)
ccc7eef  Merge remote-tracking branch 'origin/main' into feature/fp8  (shahromil16, Apr 17, 2024)
3713f61  Remove if/else for fp8 checks  (shahromil16, Apr 17, 2024)
14e8278  Remove extra asserts  (shahromil16, Apr 17, 2024)
e572510  Removing unused deps  (shahromil16, Apr 17, 2024)
8350cb9  Update routine for converting NN layers to TE equivalents  (shahromil16, Apr 17, 2024)
a907b3c  Merge remote-tracking branch 'origin/main' into feature/fp8  (shahromil16, Apr 24, 2024)
4e582a0  Update FP8 flags and checks for layers  (shahromil16, Apr 24, 2024)
cdb0cf7  Linter checks  (shahromil16, Apr 24, 2024)
afb46cb  Add checks for autocast function  (shahromil16, Apr 24, 2024)
00c9e5b  Minor edit to model  (shahromil16, Apr 24, 2024)
40c7a6d  Adding default args as Params to SwiGLUTorch  (shahromil16, Apr 24, 2024)
8dbd1d8  Linter fixes  (shahromil16, Apr 24, 2024)
ec91746  Adding Torch Attention TE  (shahromil16, Apr 30, 2024)
afb7a66  Merge remote-tracking branch 'origin/main' into feature/fp8  (shahromil16, May 6, 2024)
29000f3  Fixing FP8+FSDP memory issues by removing FP8 from all activations un…  (shahromil16, May 6, 2024)
936cd9a  Merge remote-tracking branch 'origin/main' into feature/fp8  (shahromil16, Jun 12, 2024)
aca1b75  Updating deps and config  (shahromil16, Jun 19, 2024)
67 changes: 61 additions & 6 deletions open_lm/attention.py
@@ -4,6 +4,15 @@
from torch.nn import functional as F
import xformers.ops as xops

# Flag set when Transformer Engine (TE) is importable; enables the FP8 attention path below.
using_te = False
try:
    import transformer_engine.pytorch as te

    using_te = True
except ImportError:
    using_te = False


def get_rectangular_causal_mask(shape, q_seq_len, k_seq_len, device, dtype):
"""Create a rectangular causal mask.
@@ -137,6 +146,55 @@ def torch_attn(queries, keys, values, is_causal, attention_mask=None):
)


def torch_attn_te(queries, keys, values, is_causal, attention_mask=None):
    # queries/keys/values are (batch, seq_len, n_heads, head_dim), matching torch_attn above.
    num_q_heads = queries.shape[2]
    head_dim = values.shape[-1]
    scaleddotproductattn_module = te.DotProductAttention(num_attention_heads=num_q_heads, kv_channels=head_dim)
    if is_causal and keys.shape[1] > queries.shape[1] > 1:
        q_seq_len = queries.shape[1]
        k_seq_len = keys.shape[1]
        # Same as above, we would like to use:
        # mask = xops.fmha.attn_bias.LowerTriangularFromBottomRightMask().materialize((1, 1, q_seq_len, k_seq_len), queries.dtype, queries.device)
        mask = get_rectangular_causal_mask((1, 1), q_seq_len, k_seq_len, queries.device, queries.dtype)
        if attention_mask is not None:
            apply_attention_mask_(mask, attention_mask, queries_dtype=queries.dtype)
        return (
            scaleddotproductattn_module(
                queries.transpose(1, 2), keys.transpose(1, 2), values.transpose(1, 2), attention_mask=mask
            )
            .transpose(1, 2)
            .contiguous()
        )
    else:
        if attention_mask is None:
            bias = None
            # If we only have one query, assume we don't need to be in causal mode (can attend to all keys).
            if queries.shape[1] == 1:
                is_causal = False
        else:
            if not is_causal:
                raise NotImplementedError("attention_mask with is_causal=False is not yet implemented.")
            # Build causal mask that assumes queries are at the end of the sequence.
            batch, q_seq_len, heads, _ = queries.shape
            k_seq_len = keys.shape[1]
            bias = get_rectangular_causal_mask((batch, heads), q_seq_len, k_seq_len, queries.device, queries.dtype)
            if attention_mask is not None:
                apply_attention_mask_(bias, attention_mask, queries_dtype=queries.dtype)
            # We apply the causal mask in attention instead of using is_causal=True.
            is_causal = False
        return (
            scaleddotproductattn_module(
                queries.transpose(1, 2),
                keys.transpose(1, 2),
                values.transpose(1, 2),
                attention_mask=bias,
                attn_mask_type="causal" if is_causal else None,
            )
            .transpose(1, 2)
            .contiguous()
        )


ATTN_ACTIVATIONS = {
"relu": F.relu,
"relu_squared": lambda x: torch.pow(F.relu(x), 2),
@@ -189,12 +247,7 @@ def custom_attn(
return torch.einsum("bhqk,bkhd->bqhd", attn_weight, values)


def get_attn_func(
attn_name,
attn_activation=None,
attn_seq_scalar=None,
alpha=None,
):
def get_attn_func(attn_name, attn_activation=None, attn_seq_scalar=None, alpha=None, use_fp8=False):
if attn_name == "auto":
return xformers_attn if torch.cuda.is_available() else torch_attn
elif attn_name == "xformers_attn":
@@ -206,6 +259,8 @@ def get_attn_func(
# call .contiguous() on the output tensor. [#188]
return lambda *args, **kwargs: xformers_attn(*args, **kwargs).contiguous()
elif attn_name == "torch_attn":
# if using_te and use_fp8:
# return torch_attn_te
return torch_attn
elif attn_name == "custom_attn":
assert (
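For context on how this is used: get_attn_func now accepts a use_fp8 flag, but the TE dispatch above is still commented out, so "torch_attn" currently resolves to torch_attn either way. Below is a minimal usage sketch, not part of the PR, with illustrative tensor sizes and the (batch, seq_len, n_heads, head_dim) layout that torch_attn expects.

import torch

from open_lm.attention import get_attn_func

# With the commented-out branch enabled, this would return torch_attn_te when
# transformer_engine imports successfully and use_fp8=True; for now it returns torch_attn.
attn_fn = get_attn_func("torch_attn", use_fp8=True)

# Illustrative shapes: batch=2, seq_len=128, n_heads=8, head_dim=64 (requires a CUDA device).
q = torch.randn(2, 128, 8, 64, device="cuda", dtype=torch.bfloat16)
k = torch.randn(2, 128, 8, 64, device="cuda", dtype=torch.bfloat16)
v = torch.randn(2, 128, 8, 64, device="cuda", dtype=torch.bfloat16)

out = attn_fn(q, k, v, is_causal=True)  # output keeps the (batch, seq, heads, head_dim) layout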
7 changes: 5 additions & 2 deletions open_lm/distributed.py
@@ -57,6 +57,7 @@ def init_distributed_device(args):
args.world_size = 1
args.rank = 0 # global rank
args.local_rank = 0
args.world_group = None
# For testing, allow forcing distributed mode to test distributed code path even on one gpu.
if is_using_distributed() or args.force_distributed:
if "SLURM_PROCID" in os.environ:
@@ -74,7 +75,7 @@
os.environ["LOCAL_RANK"] = str(args.local_rank)
os.environ["RANK"] = str(args.rank)
os.environ["WORLD_SIZE"] = str(args.world_size)
torch.distributed.init_process_group(
args.world_group = torch.distributed.init_process_group(
backend=args.dist_backend,
init_method=args.dist_url,
world_size=args.world_size,
@@ -85,7 +86,9 @@
# Note that this currently assumes that the world size is all gpus in a node.
assert args.preset_world_size is None, "--preset_world_size with torchrun is not currently supported."
args.local_rank, _, _ = world_info_from_env()
torch.distributed.init_process_group(backend=args.dist_backend, init_method=args.dist_url)
args.world_group = torch.distributed.init_process_group(
backend=args.dist_backend, init_method=args.dist_url
)
args.world_size = torch.distributed.get_world_size()
args.rank = torch.distributed.get_rank()
args.distributed = True
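A note on the API being captured here: in current PyTorch releases, torch.distributed.init_process_group returns None, so args.world_group may still end up None, in which case FSDP simply falls back to the default (world) group. If a concrete ProcessGroup handle is wanted, it is usually fetched explicitly after initialization; a small sketch of that pattern (not from this PR):

import torch.distributed as dist

def init_world_group(backend="nccl", init_method="env://"):
    # init_process_group returns None; the default group is exposed separately.
    dist.init_process_group(backend=backend, init_method=init_method)
    return dist.group.WORLD  # a ProcessGroup handle usable as FSDP's process_group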
11 changes: 9 additions & 2 deletions open_lm/main.py
@@ -65,7 +65,6 @@
terminate_sync_process,
)


LATEST_CHECKPOINT_NAME = "epoch_latest.pt"


@@ -466,13 +465,18 @@ def main(args):

random_seed(args.seed, 0)

tensor_parallel_group = None
if args.use_fp8:
tensor_parallel_group = torch.distributed.new_group(ranks=[0], backend="nccl")
logging.info("Using FP8 to run training.")

model = None
if args.hf_model is not None:
model = create_wrapped_hf_model(args)
else:
# Optional: Use meta device
with torch.device("meta" if args.experimental_meta_device and args.fsdp else args.device):
model = create_model(args)
model = create_model(args, tensor_parallel_group)

args.vocab_size = model.vocab_size
args.seq_len = model.seq_len
@@ -548,8 +552,10 @@ def main(args):

# Initialize FSDP. Use the same seed across workers to ensure reset_parameters is the same across workers.
random_seed(args.seed, rank=0)

model = FSDP(
model,
process_group=args.world_group,
auto_wrap_policy=transformer_auto_wrapper_policy,
device_id=device,
mixed_precision=mp_policy,
@@ -832,6 +838,7 @@ def main(args):
total_steps=total_steps,
args=args,
tb_writer=writer,
data_parallel_group=args.world_group,
)

if args.distributed:
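The changes above set up the process groups and pass data_parallel_group into the train loop; the FP8 execution itself comes from wrapping forward passes in Transformer Engine's fp8_autocast so that the TE Linear layers run their GEMMs in FP8. Below is a minimal sketch of such a training step, assuming TE's delayed-scaling recipe; the recipe values, the model's return convention, and the loss computation are illustrative rather than taken from this PR.

import torch
import torch.nn.functional as F
import transformer_engine.pytorch as te
from transformer_engine.common.recipe import DelayedScaling, Format

# Illustrative recipe; the PR does not pin these values.
fp8_recipe = DelayedScaling(fp8_format=Format.HYBRID, amax_history_len=16, amax_compute_algo="max")

def fp8_train_step(model, tokens, optimizer, data_parallel_group):
    optimizer.zero_grad(set_to_none=True)
    # Amax/scale reductions for delayed scaling are performed over fp8_group.
    with te.fp8_autocast(enabled=True, fp8_recipe=fp8_recipe, fp8_group=data_parallel_group):
        logits, _, _ = model(tokens)  # open_lm models return a (logits, ..., ...) tuple
        loss = F.cross_entropy(logits[:, :-1].reshape(-1, logits.size(-1)), tokens[:, 1:].reshape(-1))
    loss.backward()  # the backward pass runs outside the autocast context
    optimizer.step()
    return loss.item()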
77 changes: 60 additions & 17 deletions open_lm/model.py
@@ -35,6 +35,24 @@
except ImportError:
MambaLMHeadModel = None

# Flag set when Transformer Engine (TE) is importable; enables the FP8 code paths below.
using_te = False
LinearTE = nn.Linear
try:
    import transformer_engine.pytorch as te

    using_te = True

    class LinearTE(te.Linear):
        def __init__(self, *args, **kwargs):
            super().__init__(*args, **kwargs)

        def forward(self, inp: torch.Tensor, is_first_microbatch: bool = True):
            # Always treat the call as the first microbatch so TE re-casts the weights to FP8
            # on every forward instead of reusing a cached FP8 copy.
            return super().forward(inp, is_first_microbatch=True)

except ImportError:
    using_te = False

# from openclip
_MODEL_CONFIG_PATHS = [Path(__file__).parent / f"model_configs/"]
_MODEL_CONFIGS = {} # directory (model_name: config) of model architecture configs
@@ -86,7 +104,9 @@ class Params:
seq_len: int = 2048
post_embed_norm: bool = False
weight_tying: bool = False
norm_type: nn.Module = nn.LayerNorm
norm_type: nn.Module = te.LayerNorm if using_te else nn.LayerNorm
linear_type: nn.Module = LinearTE if using_te else nn.Linear
te_device: str = "cuda" if using_te else None
attn_func: Callable = xformers_attn if torch.cuda.is_available() else torch_attn
apply_qk_norm: bool = False
moe_loss_weight: float = 0.1
@@ -119,8 +139,8 @@ def __init__(self, layer_id, args: Params):
super().__init__()
self.n_heads = args.n_heads
self.head_dim = args.dim // args.n_heads
self.in_proj = nn.Linear(args.dim, 3 * args.n_heads * self.head_dim, bias=False)
self.out_proj = nn.Linear(args.n_heads * self.head_dim, args.dim, bias=False)
self.in_proj = args.linear_type(args.dim, 3 * args.n_heads * self.head_dim, bias=False, device=args.te_device)
self.out_proj = args.linear_type(args.n_heads * self.head_dim, args.dim, bias=False, device=args.te_device)
self.pos_embed = get_pos_embed(args)
self.attn_fn = args.attn_func
self.apply_qk_norm = args.apply_qk_norm
@@ -130,6 +150,7 @@ def __init__(self, layer_id, args: Params):
args.norm_type(
args.n_heads * self.head_dim,
eps=args.norm_eps,
device=args.te_device,
)
if self.apply_qk_norm
else nn.Identity()
@@ -138,6 +159,7 @@ def __init__(self, layer_id, args: Params):
args.norm_type(
args.n_heads * self.head_dim,
eps=args.norm_eps,
device=args.te_device,
)
if self.apply_qk_norm
else nn.Identity()
@@ -195,13 +217,13 @@ class GemmaMLP(nn.Module):
Modified from https://github.com/google/gemma_pytorch/blob/01062c9ef4cf89ac0c985b25a734164ede017d0b/gemma/model.py#L182-L201
"""

def __init__(self, dim: int, hidden_dim: int, layer_id: int):
def __init__(self, dim: int, hidden_dim: int, layer_id: int, args: Params):
super().__init__()
self.dim = dim
self.hidden_dim = hidden_dim
self.gate_proj = nn.Linear(dim, hidden_dim)
self.up_proj = nn.Linear(dim, hidden_dim)
self.down_proj = nn.Linear(hidden_dim, dim)
self.gate_proj = nn.Linear(dim, hidden_dim, device=args.te_device)
self.up_proj = nn.Linear(dim, hidden_dim, device=args.te_device)
self.down_proj = nn.Linear(hidden_dim, dim, device=args.te_device)
self._layer_id = layer_id

def forward(self, x):
@@ -225,10 +247,10 @@ def reset_parameters(self):
# Same as pseudocode provided from xformers SwiGLU
# https://github.com/facebookresearch/xformers
class SwiGLUTorch(nn.Module):
def __init__(self, in_dim, hidden_dim, out_dim, bias=True):
def __init__(self, in_dim, hidden_dim, out_dim, args: Params = Params, bias=True):
super().__init__()
self.w12 = nn.Linear(in_dim, 2 * hidden_dim, bias=bias)
self.w3 = nn.Linear(hidden_dim, out_dim, bias=bias)
self.w12 = nn.Linear(in_dim, 2 * hidden_dim, bias=bias, device=args.te_device)
self.w3 = nn.Linear(hidden_dim, out_dim, bias=bias, device=args.te_device)

def forward(self, x):
gate, x = self.w12(x).chunk(2, dim=-1)
@@ -252,17 +274,17 @@ def __init__(self, layer_id, args: Params):
elif args.ffn_type == "swiglu_torch":
# this follows llama / lit llama -- go to multiple of 256
self.hidden_dim = 256 * ((int(2 * 4 * args.dim / 3) + 256 - 1) // 256)
self.feed_forward = SwiGLUTorch(args.dim, self.hidden_dim, args.dim, bias=False)
self.feed_forward = SwiGLUTorch(args.dim, self.hidden_dim, args.dim, args, bias=False)
elif args.ffn_type == "gelu":
Collaborator:
Could we also support fp8 for swiglu above? We can make a copy of the Swiglu class in this file. Here's the source for Swiglu https://github.com/facebookresearch/xformers/blob/7f8c290183344343771f4e1d945a8ce10a9500ff/xformers/ops/swiglu_op.py#L430

Collaborator:
@rams16592 seems like the recursive replace-linear pattern should take care of this automatically. A function like this seems like it would be great, and we can exclude certain linears that need to be higher precision for stability. This function has an include field instead of exclude, but hopefully that's easy to flip:
https://github.com/mlfoundations/open_clip/blob/73fa7f03a33da53653f61841eb6d69aef161e521/src/open_clip/utils.py#L65

Collaborator (author):
Applied this change in the latest commit. Excluding the last output Linear layer from the conversion to TE Linear, as it's running into errors. (A sketch of this replacement pattern appears after the model.py diff below.)

# Follows mosaic mpt7b, but without a bias.
self.hidden_dim = args.dim * 4
self._ff_w1 = nn.Linear(args.dim, self.hidden_dim, bias=False)
self._ff_w2 = nn.Linear(self.hidden_dim, args.dim, bias=False)
self._ff_w1 = nn.Linear(args.dim, self.hidden_dim, bias=False, device=args.te_device)
self._ff_w2 = nn.Linear(self.hidden_dim, args.dim, bias=False, device=args.te_device)
self.feed_forward = nn.Sequential(self._ff_w1, nn.GELU(approximate="none"), self._ff_w2)
elif args.ffn_type == "gemma_geglu":
# this follows llama / lit llama -- go to multiple of 256
self.hidden_dim = 256 * ((int(2 * 4 * args.dim / 3) + 256 - 1) // 256)
self.feed_forward = GemmaMLP(args.dim, self.hidden_dim, layer_id)
self.feed_forward = GemmaMLP(args.dim, self.hidden_dim, layer_id, args)
elif args.ffn_type == "moe":
moe_args = MoEArgs(
hidden_size=args.dim,
@@ -283,10 +305,12 @@ def __init__(self, layer_id, args: Params):
self.attention_norm = args.norm_type(
args.dim,
eps=args.norm_eps,
device=args.te_device,
)
self.ffn_norm = args.norm_type(
args.dim,
eps=args.norm_eps,
device=args.te_device,
)
self.attention.seq_len = args.seq_len
self.reset_parameters()
@@ -455,9 +479,15 @@ def create_params(args):
vocab_size=cfg["vocab_size"],
post_embed_norm=cfg["post_embed_norm"],
weight_tying=cfg["weight_tying"],
norm_type=get_norm_class(cfg.get("model_norm", args.model_norm)),
norm_type=get_norm_class(cfg.get("model_norm", args.model_norm), args.use_fp8),
linear_type=LinearTE if (using_te and args.use_fp8) else nn.Linear,
te_device="cuda" if (using_te and args.use_fp8) else None,
attn_func=get_attn_func(
args.attn_name, args.attn_activation, args.attn_seq_scalar, args.attn_seq_scalar_alpha
args.attn_name,
args.attn_activation,
args.attn_seq_scalar,
args.attn_seq_scalar_alpha,
use_fp8=args.use_fp8,
),
apply_qk_norm=cfg.get("qk_norm", args.qk_norm),
positional_embedding_type=cfg.get("positional_embedding_type", args.positional_embedding_type),
@@ -495,10 +525,23 @@ def forward(self, x):
return out, None, None


def create_model(args):
def te_linear_ops(model, exclude_modules=["output"], tensor_parallel_group=None):
    # Attach the tensor-parallel group to every te.Linear in the model.
    # NOTE: exclude_modules is not used in this routine.
    for name, module in model.named_children():
        if len(list(module.children())) > 0:
            te_linear_ops(module, exclude_modules, tensor_parallel_group)
        if isinstance(module, te.Linear):
            model._modules[name].set_tensor_parallel_group(tensor_parallel_group)
    return model


def create_model(args, tensor_parallel_group=None):
    if "mamba" in args.model:
        model = Mamba(create_params(args))
        if tensor_parallel_group is not None and using_te:
            model = te_linear_ops(model.to(torch.bfloat16).cuda(), tensor_parallel_group=tensor_parallel_group)
        return model
    else:
        model = Transformer(create_params(args))
        if tensor_parallel_group is not None and using_te:
            model = te_linear_ops(model.to(torch.bfloat16).cuda(), tensor_parallel_group=tensor_parallel_group)
        return model
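For reference, here is a minimal sketch of the recursive nn.Linear to te.Linear replacement discussed in the review thread above, with an exclude list so that modules such as the output head stay in higher precision. The helper name and the weight-copy details are illustrative, adapted from the open_clip utility linked by the reviewer; the PR itself builds models with te.Linear directly via Params.linear_type and uses te_linear_ops only to attach the tensor-parallel group.

import torch
import torch.nn as nn
import transformer_engine.pytorch as te

def replace_linear_with_te(model: nn.Module, exclude_names=("output",)):
    """Recursively swap nn.Linear modules for te.Linear, skipping excluded module names."""
    for name, module in model.named_children():
        if isinstance(module, nn.Linear) and name not in exclude_names:
            te_linear = te.Linear(module.in_features, module.out_features, bias=module.bias is not None)
            with torch.no_grad():
                te_linear.weight.copy_(module.weight)
                if module.bias is not None:
                    te_linear.bias.copy_(module.bias)
            setattr(model, name, te_linear)
        else:
            # Recurse into container modules (blocks, attention, feed-forward, ...).
            replace_linear_with_te(module, exclude_names)
    return model

Excluding by module name keeps the final projection in higher precision, which matches the author's note above about the output Linear hitting errors under TE.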