Commit 8023931
just copy paste and do a version with plain transformer for RL purposes
lucidrains committed Oct 17, 2024
1 parent a79389b commit 8023931
Showing 4 changed files with 442 additions and 4 deletions.
nGPT_pytorch/__init__.py (2 additions, 0 deletions)
@@ -7,3 +7,5 @@
     Attention,
     nGPT
 )
+
+from nGPT_pytorch.nTransformer import nTransformer
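With this export, nTransformer becomes importable from the package root. Below is a minimal usage sketch, assuming (based on the commit message describing a copy-pasted plain-transformer version of nGPT) that the constructor mirrors nGPT's non-token keyword arguments and that the forward pass maps a (batch, seq, dim) feature tensor to a tensor of the same shape; the argument names and shapes are assumptions, not taken from the shown diff:

```python
import torch
from nGPT_pytorch import nTransformer  # exposed by the new __init__.py line above

# hypothetical constructor arguments, assumed to mirror nGPT's non-token kwargs
model = nTransformer(
    dim = 512,
    depth = 8,
    dim_head = 64,
    heads = 8
)

# e.g. a batch of RL state embeddings rather than token ids
features = torch.randn(2, 256, 512)

out = model(features)  # assumed to return features of shape (2, 256, 512)
```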
nGPT_pytorch/nGPT.py (8 additions, 3 deletions)
@@ -306,9 +306,9 @@ class nGPT(Module):
     def __init__(
         self,
         *,
-        num_tokens,
         dim,
         depth,
+        num_tokens = None,
         dim_head = 64,
         heads = 8,
         attn_norm_qk = True, # they say the query/key normalization is optional
@@ -347,7 +347,12 @@ def __init__(
         self.causal = causal
         alpha_init = default(alpha_init, 1. / depth)

-        self.token_embed = NormLinear_(dim, num_tokens)
+        # allow for plain stack of attention and feedforward, for trying to use in a different setting
+
+        only_transformer = not exists(num_tokens)
+        self.only_transformer = only_transformer
+
+        self.token_embed = NormLinear_(dim, num_tokens) if not only_transformer else None

         self.layers = ModuleList([])

@@ -421,7 +426,7 @@ def __init__(

             self.layers.append(ModuleList([attn_with_residual, ff_with_residual]))

-        self.to_logits = NormLinear_(dim, num_tokens) if not tied_embedding else None
+        self.to_logits = NormLinear_(dim, num_tokens) if not (tied_embedding or only_transformer) else None

         self.logit_scale = Scale(num_tokens, s_logit_init, default(s_logit_scale, dim ** -0.5))

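Taken together, these edits let nGPT be constructed without a vocabulary: when num_tokens is omitted, only_transformer is set and the token embedding (and, with the gating above, the logits projection) is skipped, leaving a plain stack of normalized attention and feedforward blocks. A minimal sketch of both modes follows, assuming the forward pass (in the part of the diff not shown here) accepts a float feature tensor of shape (batch, seq, dim) when no vocabulary is configured; the num_tokens-free call is illustrative and not confirmed by the shown hunks:

```python
import torch
from nGPT_pytorch import nGPT

# standard language-model mode: vocabulary given, token embedding and logits head are built
lm = nGPT(num_tokens = 256, dim = 512, depth = 8)
token_ids = torch.randint(0, 256, (2, 1024))
logits = lm(token_ids)        # expected shape (2, 1024, 256)

# plain-transformer mode (assumed): no vocabulary, token_embed / to_logits are None,
# so the stack would operate directly on dim-sized features, e.g. for RL
encoder = nGPT(dim = 512, depth = 8)
features = torch.randn(2, 64, 512)
hidden = encoder(features)    # assumed shape (2, 64, 512)
```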
