 import torch.nn as nn
 import torch.nn.functional as F
 from einops import rearrange
-from rotary_embedding_torch import RotaryEmbedding
 
 
 class GEGLU(nn.Module):
@@ -25,7 +24,7 @@ def FeedForward(dim, mult=4, dropout=0.0):
 
 
 class Attention(nn.Module):
-    def __init__(self, dim, heads=8, dim_head=64, dropout=0.0, rotary=False):
+    def __init__(self, dim, heads=8, dim_head=64, dropout=0.0):
         super().__init__()
         inner_dim = dim_head * heads
         self.heads = heads
@@ -34,18 +33,13 @@ def __init__(self, dim, heads=8, dim_head=64, dropout=0.0, rotary=False):
         self.to_qkv = nn.Linear(dim, inner_dim * 3, bias=False)
         self.to_out = nn.Linear(inner_dim, dim, bias=False)
         self.dropout = nn.Dropout(dropout)
-        self.rotary = rotary
         dim = np.int64(dim / 2)
-        self.rotary_embedding = RotaryEmbedding(dim=dim)
 
     def forward(self, x):
         h = self.heads
         x = self.norm(x)
         q, k, v = self.to_qkv(x).chunk(3, dim=-1)
         q, k, v = map(lambda t: rearrange(t, "b n (h d) -> b h n d", h=h), (q, k, v))  # type: ignore
-        if self.rotary:
-            q = self.rotary_embedding.rotate_queries_or_keys(q)
-            k = self.rotary_embedding.rotate_queries_or_keys(k)
         q = q * self.scale
 
         sim = torch.einsum("b h i d, b h j d -> b h i j", q, k)
@@ -61,7 +55,7 @@ def forward(self, x):
 
 
 class Transformer(nn.Module):
-    def __init__(self, dim, depth, heads, dim_head, attn_dropout, ff_dropout, rotary=False):
+    def __init__(self, dim, depth, heads, dim_head, attn_dropout, ff_dropout):
         super().__init__()
         self.layers = nn.ModuleList([])
 
@@ -74,7 +68,6 @@ def __init__(self, dim, depth, heads, dim_head, attn_dropout, ff_dropout, rotary
                         heads=heads,
                         dim_head=dim_head,
                         dropout=attn_dropout,
-                        rotary=rotary,
                     ),
                     FeedForward(dim, dropout=ff_dropout),
                 ]
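
For reference, here is a minimal self-contained sketch of the `Attention` block as it stands after this change. The definitions of `self.norm` and `self.scale`, and the tail of `forward` after the `einsum`, fall outside the hunks above, so the `nn.LayerNorm(dim)`, `dim_head ** -0.5`, and standard softmax-attention tail used here are assumptions, not taken from this diff; the now-unused `dim = np.int64(dim / 2)` reassignment is omitted for brevity.

```python
import torch
import torch.nn as nn
from einops import rearrange

class Attention(nn.Module):
    def __init__(self, dim, heads=8, dim_head=64, dropout=0.0):
        super().__init__()
        inner_dim = dim_head * heads
        self.heads = heads
        self.scale = dim_head ** -0.5  # assumed; defined outside the hunks above
        self.norm = nn.LayerNorm(dim)  # assumed; defined outside the hunks above
        self.to_qkv = nn.Linear(dim, inner_dim * 3, bias=False)
        self.to_out = nn.Linear(inner_dim, dim, bias=False)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        h = self.heads
        x = self.norm(x)
        # Project to queries/keys/values and split the head dimension out.
        q, k, v = self.to_qkv(x).chunk(3, dim=-1)
        q, k, v = map(lambda t: rearrange(t, "b n (h d) -> b h n d", h=h), (q, k, v))
        q = q * self.scale
        sim = torch.einsum("b h i d, b h j d -> b h i j", q, k)
        # Assumed standard attention tail (elided by the @@ -61,7 +55,7 hunk).
        attn = sim.softmax(dim=-1)
        attn = self.dropout(attn)
        out = torch.einsum("b h i j, b h j d -> b h i d", attn, v)
        out = rearrange(out, "b h n d -> b n (h d)")
        return self.to_out(out)

x = torch.randn(2, 16, 128)         # (batch, tokens, dim)
print(Attention(dim=128)(x).shape)  # torch.Size([2, 16, 128])
```

With `rotary=False` having been the default, removing the flag leaves the forward pass behaviorally identical to the previous default path: q/k go straight from the head split to scaling, with no positional rotation applied.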