Skip to content

Commit c3cafb7

Browse files
committed
Text Encoder Layer Stacking
1 parent 5778ffc commit c3cafb7

2 files changed

Lines changed: 9 additions & 11 deletions

File tree

src/maxdiffusion/configs/ltx2_3_video.yml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -27,12 +27,12 @@ sampler: "from_checkpoint"
 # Generation parameters
 global_batch_size_to_train_on: 1
 num_inference_steps: 40
-guidance_scale: 3.0
-audio_guidance_scale: 7.0
+guidance_scale: 4.0
+audio_guidance_scale: 4.0
 stg_scale: 1.0
 audio_stg_scale: 1.0
-modality_scale: 3.0
-audio_modality_scale: 3.0
+modality_scale: 1.0
+audio_modality_scale: 1.0
 spatio_temporal_guidance_blocks: [28]
 fps: 24
 pipeline_type: multi-scale

src/maxdiffusion/pipelines/ltx2/ltx2_pipeline.py

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -366,6 +366,7 @@ def create_model(rngs: nnx.Rngs, config: HyperParameters):
     connector_kwargs = {
         "dtype": jnp.float32,
         "weights_dtype": config.weights_dtype if hasattr(config, "weights_dtype") else jnp.float32,
+        "attention_kernel": config.attention if hasattr(config, "attention") else "flash",
     }
     if getattr(config, "model_name", "") == "ltx2.3":
         connector_kwargs.update(
@@ -884,13 +885,10 @@ def _get_gemma_prompt_embeds(
     text_encoder_hidden_states = text_encoder_outputs.hidden_states
     del text_encoder_outputs  # Free memory

-    prompt_embeds_list = []
-    # Iterate instead of stacking eagerly to avoid 5.7+ GB HBM allocations outside JIT
-    for state in text_encoder_hidden_states:
-        state_np = state.cpu().to(torch.float32).numpy()
-        prompt_embeds_list.append(jnp.array(state_np, dtype=jnp.bfloat16))
-
-    prompt_embeds = prompt_embeds_list
+    states_np = [state.cpu().to(torch.float32).numpy() for state in text_encoder_hidden_states]
+    stacked_np = np.stack(states_np, axis=-1)
+    flattened_np = stacked_np.reshape(batch_size, text_input_ids.shape[1], -1)
+    prompt_embeds = jnp.array(flattened_np, dtype=jnp.bfloat16)
     del text_encoder_hidden_states  # Free PyTorch tensor memory

     prompt_attention_mask = jnp.array(prompt_attention_mask.cpu().to(torch.float32).numpy(), dtype=jnp.bool_)

0 commit comments

Comments (0)