Delete unused NNXCombinedTimestepTextProjEmbeddings and use NNXPixArtAlphaTextProjection instead

prishajain1 · prishajain1 · commit 53cfaae876d9 · 2026-04-12T17:50:10.000+05:30
diff --git a/src/maxdiffusion/models/embeddings_flax.py b/src/maxdiffusion/models/embeddings_flax.py
@@ -473,56 +473,6 @@ def __call__(self, timestep, pooled_projection):
     conditioning = timestep_emb + pooled_projections
     return conditioning
 
-class NNXCombinedTimestepTextProjEmbeddings(nnx.Module):
-  def __init__(
-      self,
-      rngs: nnx.Rngs,
-      in_features: int,
-      hidden_size: int,
-      embedding_dim: int,
-      out_features: int = None,
-      act_fn: str = "gelu_tanh",
-      dtype: jnp.dtype = jnp.float32,
-      weights_dtype: jnp.dtype = jnp.float32,
-      precision: jax.lax.Precision = None,
-  ):
-    if out_features is None:
-      out_features = hidden_size
-
-    self.linear = nnx.Linear(
-        rngs=rngs,
-        in_features=in_features,
-        out_features=out_features,
-        use_bias=True,
-        dtype=jnp.float32,
-        param_dtype=weights_dtype,
-        precision=precision,
-        kernel_init=nnx.with_partitioning(nnx.initializers.xavier_uniform(), ("mlp", "embed")),
-        bias_init=nnx.with_partitioning(nnx.initializers.zeros, ("embed",)),
-    )
-
-    self.time_proj = NNXTimesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=0)
-
-    class EmbWrapper(nnx.Module):
-      def __init__(self, rngs: nnx.Rngs, embedding_dim: int, weights_dtype: jnp.dtype):
-        self.timestep_embedder = NNXTimestepEmbedding(
-            rngs=rngs,
-            in_channels=256,
-            time_embed_dim=embedding_dim,
-            dtype=jnp.float32,
-            weights_dtype=weights_dtype,
-        )
-
-    self.emb = EmbWrapper(rngs, embedding_dim, weights_dtype)
-
-  def __call__(self, caption, timestep):
-    hidden_states = self.linear(caption)
-
-    timesteps_proj = self.time_proj(timestep)
-    timesteps_emb = self.emb.timestep_embedder(timesteps_proj)
-
-    return hidden_states + timesteps_emb[:, None, :]
-
 
 class CombinedTimestepGuidanceTextProjEmbeddings(nn.Module):
   embedding_dim: int
diff --git a/src/maxdiffusion/models/ltx2/ltx2_3_utils.py b/src/maxdiffusion/models/ltx2/ltx2_3_utils.py
@@ -63,10 +63,10 @@ def rename_for_ltx2_3_transformer(key):
   # key = key.replace("audio_prompt_scale_shift_table", "audio_scale_shift_table")
   # key = key.replace("prompt_scale_shift_table", "scale_shift_table")
 
-  # if "prompt_adaln" in key:
-  #   key = key.replace("prompt_adaln", "caption_projection")
-  # if "audio_prompt_adaln" in key:
-  #   key = key.replace("audio_prompt_adaln", "audio_caption_projection")
+  if "prompt_adaln" in key:
+    key = key.replace("prompt_adaln", "caption_projection")
+  if "audio_prompt_adaln" in key:
+    key = key.replace("audio_prompt_adaln", "audio_caption_projection")
   if "video_text_proj_in" in key:
     key = key.replace("video_text_proj_in", "feature_extractor.video_linear")
   if "audio_text_proj_in" in key:
diff --git a/src/maxdiffusion/models/ltx2/transformer_ltx2.py b/src/maxdiffusion/models/ltx2/transformer_ltx2.py
@@ -737,38 +737,20 @@ def __init__(
 
     # 2. Prompt embeddings
     if self.use_prompt_embeddings:
-      if self.cross_attn_mod:
-        self.caption_projection = NNXCombinedTimestepTextProjEmbeddings(
-            rngs=rngs,
-            in_features=self.caption_channels,
-            hidden_size=self.cross_attention_dim,
-            embedding_dim=self.cross_attention_dim,
-            dtype=self.dtype,
-            weights_dtype=self.weights_dtype,
-        )
-        self.audio_caption_projection = NNXCombinedTimestepTextProjEmbeddings(
-            rngs=rngs,
-            in_features=self.audio_caption_channels,
-            hidden_size=self.audio_cross_attention_dim,
-            embedding_dim=self.audio_cross_attention_dim,
-            dtype=self.dtype,
-            weights_dtype=self.weights_dtype,
-        )
-      else:
-        self.caption_projection = NNXPixArtAlphaTextProjection(
-            rngs=rngs,
-            in_features=self.caption_channels,
-            hidden_size=inner_dim,
-            dtype=self.dtype,
-            weights_dtype=self.weights_dtype,
-        )
-        self.audio_caption_projection = NNXPixArtAlphaTextProjection(
-            rngs=rngs,
-            in_features=self.audio_caption_channels,
-            hidden_size=audio_inner_dim,
-            dtype=self.dtype,
-            weights_dtype=self.weights_dtype,
-        )
+      self.caption_projection = NNXPixArtAlphaTextProjection(
+          rngs=rngs,
+          in_features=self.caption_channels,
+          hidden_size=inner_dim,
+          dtype=self.dtype,
+          weights_dtype=self.weights_dtype,
+      )
+      self.audio_caption_projection = NNXPixArtAlphaTextProjection(
+          rngs=rngs,
+          in_features=self.audio_caption_channels,
+          hidden_size=audio_inner_dim,
+          dtype=self.dtype,
+          weights_dtype=self.weights_dtype,
+      )
     else:
       self.caption_projection = None
       self.audio_caption_projection = None
@@ -1146,14 +1128,8 @@ def __call__(
       audio_cross_attn_v2a_gate = audio_cross_attn_v2a_gate.reshape(batch_size, -1, audio_cross_attn_v2a_gate.shape[-1])
 
       if self.use_prompt_embeddings:
-        if self.cross_attn_mod:
-          encoder_hidden_states = self.caption_projection(encoder_hidden_states, timestep)
-          audio_encoder_hidden_states = self.audio_caption_projection(
-              audio_encoder_hidden_states, audio_timestep if audio_timestep is not None else timestep
-          )
-        else:
-          encoder_hidden_states = self.caption_projection(encoder_hidden_states)
-          audio_encoder_hidden_states = self.audio_caption_projection(audio_encoder_hidden_states)
+        encoder_hidden_states = self.caption_projection(encoder_hidden_states)
+        audio_encoder_hidden_states = self.audio_caption_projection(audio_encoder_hidden_states)
 
         encoder_hidden_states = encoder_hidden_states.reshape(batch_size, -1, hidden_states.shape[-1])
         audio_encoder_hidden_states = audio_encoder_hidden_states.reshape(batch_size, -1, audio_hidden_states.shape[-1])