Use LTX2AdaLayerNormSingle for prompt_adaln and audio_prompt_adaln

prishajain1 · prishajain1 · commit 06e8de88db57 · 2026-04-12T18:07:03.000+05:30
diff --git a/src/maxdiffusion/models/ltx2/ltx2_3_utils.py b/src/maxdiffusion/models/ltx2/ltx2_3_utils.py
@@ -63,10 +63,6 @@ def rename_for_ltx2_3_transformer(key):
   # key = key.replace("audio_prompt_scale_shift_table", "audio_scale_shift_table")
   # key = key.replace("prompt_scale_shift_table", "scale_shift_table")
 
-  if "prompt_adaln" in key:
-    key = key.replace("prompt_adaln", "caption_projection")
-  if "audio_prompt_adaln" in key:
-    key = key.replace("audio_prompt_adaln", "audio_caption_projection")
   if "video_text_proj_in" in key:
     key = key.replace("video_text_proj_in", "feature_extractor.video_linear")
   if "audio_text_proj_in" in key:
diff --git a/src/maxdiffusion/models/ltx2/transformer_ltx2.py b/src/maxdiffusion/models/ltx2/transformer_ltx2.py
@@ -737,17 +737,23 @@ def __init__(
 
     # 2. Prompt embeddings
     if self.use_prompt_embeddings:
-      self.caption_projection = NNXPixArtAlphaTextProjection(
+      self.caption_projection = None
+      self.audio_caption_projection = None
+      self.cross_attn_mod = True  # Force True for LTX-2.3 prompt modulation
+      
+      self.prompt_adaln = LTX2AdaLayerNormSingle(
           rngs=rngs,
-          in_features=self.caption_channels,
-          hidden_size=inner_dim,
+          embedding_dim=inner_dim,
+          num_mod_params=2,
+          use_additional_conditions=False,
           dtype=self.dtype,
           weights_dtype=self.weights_dtype,
       )
-      self.audio_caption_projection = NNXPixArtAlphaTextProjection(
+      self.audio_prompt_adaln = LTX2AdaLayerNormSingle(
           rngs=rngs,
-          in_features=self.audio_caption_channels,
-          hidden_size=audio_inner_dim,
+          embedding_dim=audio_inner_dim,
+          num_mod_params=2,
+          use_additional_conditions=False,
           dtype=self.dtype,
           weights_dtype=self.weights_dtype,
       )
@@ -1077,7 +1083,7 @@ def __call__(
       temb_audio = temb_audio.reshape(batch_size, -1, temb_audio.shape[-1])
       audio_embedded_timestep = audio_embedded_timestep.reshape(batch_size, -1, audio_embedded_timestep.shape[-1])
 
-      if self.cross_attn_mod and sigma is not None:
+      if self.use_prompt_embeddings and sigma is not None:
         audio_sigma = audio_sigma if audio_sigma is not None else sigma
         temb_prompt, _ = self.prompt_adaln(
             sigma.flatten(),