@@ -145,7 +145,7 @@ def __init__(
             rope_type=rope_type,
             flash_block_sizes=flash_block_sizes,
             flash_min_seq_length=flash_min_seq_length,
-            gated_attn=gated_attn,
+            gated_attn=False,
         )

         self.audio_norm1 = nnx.RMSNorm(
@@ -172,7 +172,7 @@ def __init__(
             rope_type=rope_type,
             flash_block_sizes=flash_block_sizes,
             flash_min_seq_length=flash_min_seq_length,
-            gated_attn=gated_attn,
+            gated_attn=False,
         )

         # 2. Prompt Cross-Attention
@@ -200,7 +200,7 @@ def __init__(
             attention_kernel=self.attention_kernel,
             rope_type=rope_type,
             flash_block_sizes=flash_block_sizes,
-            gated_attn=gated_attn,
+            gated_attn=False,
         )

         self.audio_norm2 = nnx.RMSNorm(
@@ -228,7 +228,7 @@ def __init__(
             rope_type=rope_type,
             flash_block_sizes=flash_block_sizes,
             flash_min_seq_length=flash_min_seq_length,
-            gated_attn=gated_attn,
+            gated_attn=False,
         )

         # 3. Audio-to-Video (a2v) and Video-to-Audio (v2a) Cross-Attention
@@ -257,7 +257,7 @@ def __init__(
             rope_type=rope_type,
             flash_block_sizes=flash_block_sizes,
             flash_min_seq_length=0,
-            gated_attn=gated_attn,
+            gated_attn=self.cross_attn_mod,
         )

         self.video_to_audio_norm = nnx.RMSNorm(
@@ -285,7 +285,7 @@ def __init__(
             rope_type=rope_type,
             flash_block_sizes=flash_block_sizes,
             flash_min_seq_length=flash_min_seq_length,
-            gated_attn=gated_attn,
+            gated_attn=self.cross_attn_mod,
         )

         # 4. Feed Forward
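Taken together, the `__init__` hunks above hard-wire `gated_attn` off for the self-attention and prompt cross-attention blocks and tie it to `self.cross_attn_mod` for the a2v/v2a cross-attention blocks only. As a rough illustration of what that flag controls (an assumption, since the `Attention` class itself is not part of this diff), `gated_attn` typically multiplies the attention output by a learned sigmoid gate:

```python
# Hedged sketch of a gated_attn flag; the real Attention implementation is
# not shown in this diff, so names and shapes here are illustrative only.
import jax
import jax.numpy as jnp

def apply_output_gate(attn_out: jax.Array, gate_logits: jax.Array, gated_attn: bool) -> jax.Array:
    """attn_out, gate_logits: (batch, seq, dim).

    gate_logits would come from a learned projection of the block input
    (assumed); with gated_attn=False the attention output passes through
    unchanged, which is what this change enforces for the self- and
    prompt-attention blocks.
    """
    if not gated_attn:
        return attn_out
    return attn_out * jax.nn.sigmoid(gate_logits)  # per-channel sigmoid gate
```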
@@ -1145,14 +1145,17 @@ def __call__(
         )
         audio_cross_attn_v2a_gate = audio_cross_attn_v2a_gate.reshape(batch_size, -1, audio_cross_attn_v2a_gate.shape[-1])

-        # 4. Prepare prompt embeddings
         if self.use_prompt_embeddings:
-            encoder_hidden_states = self.caption_projection(encoder_hidden_states, timestep)
-            encoder_hidden_states = encoder_hidden_states.reshape(batch_size, -1, hidden_states.shape[-1])
+            if self.cross_attn_mod:
+                encoder_hidden_states = self.caption_projection(encoder_hidden_states, timestep)
+                audio_encoder_hidden_states = self.audio_caption_projection(
+                    audio_encoder_hidden_states, audio_timestep if audio_timestep is not None else timestep
+                )
+            else:
+                encoder_hidden_states = self.caption_projection(encoder_hidden_states)
+                audio_encoder_hidden_states = self.audio_caption_projection(audio_encoder_hidden_states)

-            audio_encoder_hidden_states = self.audio_caption_projection(
-                audio_encoder_hidden_states, audio_timestep if audio_timestep is not None else timestep
-            )
+            encoder_hidden_states = encoder_hidden_states.reshape(batch_size, -1, hidden_states.shape[-1])
             audio_encoder_hidden_states = audio_encoder_hidden_states.reshape(batch_size, -1, audio_hidden_states.shape[-1])

         # Construct perturbation_mask_per_layer for STG
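The `__call__` hunk makes the prompt projections conditional: when `cross_attn_mod` is set, both caption projections receive a timestep embedding (the audio branch falling back to the video `timestep` when `audio_timestep` is `None`); otherwise they are called on the embeddings alone. A minimal sketch of the resulting control flow, with the projection modules passed in as plain callables (a stand-in; the real `caption_projection` modules are defined elsewhere in the file):

```python
# Hedged sketch of the new prompt-embedding path; only the branching
# mirrors the diff, the projection callables themselves are stand-ins.
from typing import Callable, Optional
import jax

def project_prompts(
    caption_projection: Callable,
    audio_caption_projection: Callable,
    encoder_hidden_states: jax.Array,
    audio_encoder_hidden_states: jax.Array,
    timestep: jax.Array,
    audio_timestep: Optional[jax.Array],
    cross_attn_mod: bool,
):
    if cross_attn_mod:
        # Timestep-conditioned projections; the audio branch reuses the
        # video timestep when no separate audio timestep is provided.
        enc = caption_projection(encoder_hidden_states, timestep)
        aud = audio_caption_projection(
            audio_encoder_hidden_states,
            audio_timestep if audio_timestep is not None else timestep,
        )
    else:
        # Unconditioned projections.
        enc = caption_projection(encoder_hidden_states)
        aud = audio_caption_projection(audio_encoder_hidden_states)
    return enc, aud
```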