Commit f0efb5c
test for multihost without scan.
1 parent: deb686d

4 files changed
Lines changed: 54 additions & 52 deletions

src/maxdiffusion/models/attention_flax.py

Lines changed: 12 additions & 8 deletions
@@ -665,7 +665,7 @@ def __init__(
     # None axes corresponds to the stacked weights across all blocks
     # because of the use of nnx.vmap and nnx.scan.
     # Dims are [num_blocks, embed, heads]
-    kernel_axes = (None, "embed", "heads")
+    kernel_axes = ("embed", "heads")
     qkv_init_kernel = nnx.with_partitioning(nnx.initializers.lecun_normal(), kernel_axes)

     self.query = nnx.Linear(
@@ -679,7 +679,6 @@ def __init__(
       bias_init=nnx.with_partitioning(
         nnx.initializers.zeros,
         (
-          None,
           "embed",
         ),
       ),
@@ -696,7 +695,6 @@ def __init__(
       bias_init=nnx.with_partitioning(
         nnx.initializers.zeros,
         (
-          None,
           "embed",
         ),
       ),
@@ -713,7 +711,6 @@ def __init__(
       bias_init=nnx.with_partitioning(
         nnx.initializers.zeros,
         (
-          None,
           "embed",
         ),
       ),
@@ -723,10 +720,16 @@ def __init__(
       rngs=rngs,
       in_features=self.inner_dim,
       out_features=self.inner_dim,
-      kernel_init=nnx.with_partitioning(nnx.initializers.lecun_normal(), (None, "heads", "embed")),
+      kernel_init=nnx.with_partitioning(nnx.initializers.lecun_normal(), ("heads", "embed")),
       dtype=dtype,
       param_dtype=weights_dtype,
       precision=precision,
+      bias_init=nnx.with_partitioning(
+        nnx.initializers.zeros,
+        (
+          "heads",
+        ),
+      ),
     )

     self.norm_q = None
@@ -740,7 +743,6 @@ def __init__(
       scale_init=nnx.with_partitioning(
         nnx.initializers.ones,
         (
-          None,
           "norm",
         ),
       ),
@@ -754,7 +756,6 @@ def __init__(
       scale_init=nnx.with_partitioning(
         nnx.initializers.ones,
         (
-          None,
           "norm",
         ),
       ),
@@ -780,6 +781,7 @@ def _apply_rope(self, xq: jax.Array, xk: jax.Array, freqs_cis: jax.Array) -> Tup
   def __call__(
       self, hidden_states: jax.Array, encoder_hidden_states: jax.Array = None, rotary_emb: Optional[jax.Array] = None
   ) -> jax.Array:
+    #breakpoint()
     hidden_states = jax.lax.with_sharding_constraint(hidden_states, PartitionSpec("data", "fsdp", "tensor"))
     encoder_hidden_states = jax.lax.with_sharding_constraint(encoder_hidden_states, PartitionSpec("data", "fsdp", "tensor"))
     dtype = hidden_states.dtype
@@ -799,10 +801,12 @@ def __call__(
     value_proj = _unflatten_heads(value_proj, self.heads)
     # output of _unflatten_heads Batch, heads, seq_len, head_dim
     query_proj, key_proj = self._apply_rope(query_proj, key_proj, rotary_emb)
+    #query_proj = query_proj

     attn_output = self.attention_op.apply_attention(query_proj, key_proj, value_proj)
-
+    attn_output = jax.lax.with_sharding_constraint(attn_output, PartitionSpec("data", "fsdp", "tensor"))
     attn_output = attn_output.astype(dtype=dtype)
+

     hidden_states = self.proj_attn(attn_output)
     return hidden_states
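Note: the `None` entries deleted above were the partition-spec slots for the leading num_blocks axis, which only existed while block weights were stacked by nnx.vmap/nnx.scan. A minimal sketch of the per-block layout this commit moves to, with invented dimensions (not code from this commit):

import jax
from flax import nnx

embed, inner_dim = 64, 64

# Before: kernels for all blocks were stacked into [num_blocks, embed, heads],
# so specs carried a leading None (replicated) entry for the block axis,
# e.g. (None, "embed", "heads").
# After: each block owns a plain [embed, heads]-shaped kernel, so the spec
# names exactly one mesh axis per kernel dimension.
layer = nnx.Linear(
    in_features=embed,
    out_features=inner_dim,
    kernel_init=nnx.with_partitioning(nnx.initializers.lecun_normal(), ("embed", "heads")),
    rngs=nnx.Rngs(0),
)
print(layer.kernel.value.shape)  # (64, 64), no leading block axis
# The ("embed", "heads") spec travels with the param as metadata, which
# nnx.get_partition_spec / nnx.get_named_sharding consume at sharding time.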

src/maxdiffusion/models/wan/transformers/transformer_wan.py

Lines changed: 36 additions & 23 deletions
@@ -171,6 +171,17 @@ def __init__(
       dtype=dtype,
       param_dtype=weights_dtype,
       precision=precision,
+      # kernel_init=nnx.with_partitioning(
+      #   nnx.initializers.xavier_uniform(),
+      #   ("blockwise", None, None),
+      # ),
+      # bias_init=nnx.with_partitioning(
+      #   nnx.initializers.zeros,
+      #   (
+      #     "blockwise",
+      #     None,
+      #   ),
+      # ),
     )

   def __call__(self, x: jax.Array) -> jax.Array:
@@ -218,10 +229,18 @@ def __init__(
       kernel_init=nnx.with_partitioning(
         nnx.initializers.xavier_uniform(),
         (
+          # "blockwise",
           "mlp",
           "embed",
         ),
       ),
+      # bias_init=nnx.with_partitioning(
+      #   nnx.initializers.zeros,
+      #   (
+      #     "blockwise",
+      #     "mlp",
+      #   ),
+      # ),
     )

   def __call__(self, hidden_states: jax.Array) -> jax.Array:
@@ -389,10 +408,9 @@ def __init__(
     )

     # 3. Transformer blocks
-    @nnx.split_rngs(splits=num_layers)
-    @nnx.vmap(in_axes=0, out_axes=0)
-    def init_block(rngs):
-      return WanTransformerBlock(
+    blocks = []
+    for _ in range(num_layers):
+      block = WanTransformerBlock(
         rngs=rngs,
         dim=inner_dim,
         ffn_dim=ffn_dim,
@@ -408,10 +426,15 @@ def init_block(rngs):
         precision=precision,
         attention=attention,
       )
+      blocks.append(block)
+    self.blocks = blocks

-    self.gradient_checkpoint = GradientCheckpointType.from_str(remat_policy)
+    # 2. Use a predicate to create a "state-free" version.
+    # The lambda function `lambda _: False` simply tells nnx.state_if
+    # to filter out ALL state components (params, variables, etc.).
+    # self.block_template = nnx.state_if(lambda _: False, template_block_with_state)

-    self.blocks = init_block(rngs)
+    self.gradient_checkpoint = GradientCheckpointType.from_str(remat_policy)

     self.norm_out = FP32LayerNorm(rngs=rngs, dim=inner_dim, eps=eps, elementwise_affine=False)
     self.proj_out = nnx.Linear(
@@ -426,7 +449,7 @@ def init_block(rngs):
     key = rngs.params()
     self.scale_shift_table = nnx.Param(
       jax.random.normal(key, (1, 2, inner_dim)) / inner_dim**0.5,
-      kernel_init=nnx.with_partitioning(nnx.initializers.xavier_uniform(), (None, None, "embed")),
+      kernel_init=nnx.with_partitioning(nnx.initializers.xavier_uniform(), (None, "embed")),
     )

   def __call__(
@@ -456,22 +479,12 @@ def __call__(

     if encoder_hidden_states_image is not None:
       raise NotImplementedError("img2vid is not yet implemented.")
-
-    def scan_fn(carry, block):
-      hidden_states, encoder_hidden_states, timestep_proj, rotary_emb = carry
-      hidden_states = block(hidden_states, encoder_hidden_states, timestep_proj, rotary_emb)
-      return (hidden_states, encoder_hidden_states, timestep_proj, rotary_emb)
-
-    initial_carry = (hidden_states, encoder_hidden_states, timestep_proj, rotary_emb)
-    rematted_block_forward = self.gradient_checkpoint.apply(scan_fn)
-    final_carry = nnx.scan(
-      rematted_block_forward,
-      length=self.num_layers,
-      in_axes=(nnx.Carry, 0),
-      out_axes=nnx.Carry,
-    )(initial_carry, self.blocks)
-
-    hidden_states = final_carry[0]
+
+    for block in self.blocks:
+      def block_forward(hidden_states, encoder_hidden_states, timestep_proj, rotary_emb):
+        return block(hidden_states, encoder_hidden_states, timestep_proj, rotary_emb)
+      rematted_block_forward = self.gradient_checkpoint.apply(block_forward)
+      hidden_states = rematted_block_forward(hidden_states, encoder_hidden_states, timestep_proj, rotary_emb)

     shift, scale = jnp.split(self.scale_shift_table + jnp.expand_dims(temb, axis=1), 2, axis=1)

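Note: for reference, a self-contained sketch of the two iteration styles this commit trades between, with a toy `Block` standing in for `WanTransformerBlock` (names and sizes invented). Under nnx.scan, all layers share one traced step over weights stacked along a leading axis; the plain Python loop instead unrolls each block at trace time, which is what lets every block keep unstacked, individually partitioned parameters:

import jax.numpy as jnp
from flax import nnx

dim, num_layers = 16, 4

class Block(nnx.Module):
  def __init__(self, rngs: nnx.Rngs, dim: int):
    self.linear = nnx.Linear(dim, dim, rngs=rngs)

  def __call__(self, x):
    return x + self.linear(x)

# Removed style: one stacked Block pytree built with vmap, stepped by scan.
@nnx.split_rngs(splits=num_layers)
@nnx.vmap(in_axes=0, out_axes=0)
def init_block(rngs):
  return Block(rngs, dim)

stacked_blocks = init_block(nnx.Rngs(0))

def scan_fn(carry, block):
  return block(carry)

y_scan = nnx.scan(scan_fn, length=num_layers, in_axes=(nnx.Carry, 0), out_axes=nnx.Carry)(
    jnp.ones((2, dim)), stacked_blocks
)

# Added style: independent blocks in a Python list, unrolled in a loop.
blocks = [Block(nnx.Rngs(i), dim) for i in range(num_layers)]
y_loop = jnp.ones((2, dim))
for block in blocks:
  y_loop = block(y_loop)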
src/maxdiffusion/models/wan/wan_utils.py

Lines changed: 2 additions & 17 deletions
@@ -171,9 +171,7 @@ def load_wan_transformer(
   return load_base_wan_transformer(pretrained_model_name_or_path, eval_shapes, device, hf_download, num_layers)


-def load_base_wan_transformer(
-    pretrained_model_name_or_path: str, eval_shapes: dict, device: str, hf_download: bool = True, num_layers: int = 40
-):
+def load_base_wan_transformer(pretrained_model_name_or_path: str, eval_shapes: dict, device: str, hf_download: bool = True, num_layers: int = 40):
   device = jax.local_devices(backend=device)[0]
   subfolder = "transformer"
   filename = "diffusion_pytorch_model.safetensors.index.json"
@@ -231,22 +229,9 @@ def load_base_wan_transformer(
       renamed_pt_key = renamed_pt_key.replace("norm2", "norm2.layer_norm")
       pt_tuple_key = tuple(renamed_pt_key.split("."))

-      if "blocks" in pt_tuple_key:
-        new_key = ("blocks",) + pt_tuple_key[2:]
-        block_index = int(pt_tuple_key[1])
-        pt_tuple_key = new_key
-      flax_key, flax_tensor = rename_key_and_reshape_tensor(
-          pt_tuple_key, tensor, random_flax_state_dict, model_type=WAN_MODEL
-      )
+      flax_key, flax_tensor = rename_key_and_reshape_tensor(pt_tuple_key, tensor, random_flax_state_dict)
       flax_key = rename_for_nnx(flax_key)
       flax_key = _tuple_str_to_int(flax_key)
-
-      if "blocks" in flax_key:
-        if flax_key in flax_state_dict:
-          new_tensor = flax_state_dict[flax_key]
-        else:
-          new_tensor = jnp.zeros((num_layers,) + flax_tensor.shape)
-        flax_tensor = new_tensor.at[block_index].set(flax_tensor)
       flax_state_dict[flax_key] = jax.device_put(jnp.asarray(flax_tensor), device=cpu)
     validate_flax_state_dict(eval_shapes, flax_state_dict)
     flax_state_dict = unflatten_dict(flax_state_dict)
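Note: the deleted branches above existed to fold per-block PyTorch tensors into single stacked arrays for the scanned layout; with the loop layout each block's index simply stays in the parameter key (and `_tuple_str_to_int` later turns the index string into an int). A schematic sketch with invented keys and shapes:

import jax.numpy as jnp

num_layers = 3
# Invented per-block tensors standing in for the safetensors contents.
pt_tensors = {f"blocks.{i}.ffn.weight": jnp.full((4, 4), i, jnp.float32) for i in range(num_layers)}

# Old layout (deleted code): one stacked entry per parameter, shaped
# [num_layers, ...], filled block by block with .at[block_index].set(...).
stacked = {}
for key, tensor in pt_tensors.items():
  block_index = int(key.split(".")[1])
  flax_key = ("blocks", "ffn", "weight")
  buf = stacked.get(flax_key, jnp.zeros((num_layers,) + tensor.shape, tensor.dtype))
  stacked[flax_key] = buf.at[block_index].set(tensor)

# New layout (this commit): the block index stays in the key, one entry each.
per_block = {tuple(k.split(".")): v for k, v in pt_tensors.items()}

print(stacked[("blocks", "ffn", "weight")].shape)         # (3, 4, 4)
print(per_block[("blocks", "1", "ffn", "weight")].shape)  # (4, 4)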

src/maxdiffusion/trainers/wan_trainer.py

Lines changed: 4 additions & 4 deletions
@@ -79,8 +79,7 @@ def __init__(self, config):
     if config.train_text_encoder:
       raise ValueError("this script currently doesn't support training text_encoders")

-    #self.global_batch_size = self.config.per_device_batch_size * jax.device_count()
-    self.global_batch_size = config.global_batch_size if config.global_batch_size > 0 else config.per_device_batch_size * jax.device_count()
+    self.global_batch_size = config.per_device_batch_size * jax.device_count()

   def post_training_steps(self, pipeline, params, train_states, msg=""):
     pass
@@ -97,7 +96,8 @@ def calculate_tflops(self, pipeline):
     return 0

   def get_data_shardings(self, mesh):
-    data_sharding = jax.sharding.NamedSharding(mesh, P(*self.config.data_sharding[0]))
+    p_spec = P(*self.config.data_sharding)
+    data_sharding = jax.sharding.NamedSharding(mesh, p_spec)
     data_sharding = {
       "latents" : data_sharding,
       "encoder_hidden_states" : data_sharding
@@ -143,7 +143,6 @@ def prepare_sample(features):
   def start_training(self):

     pipeline = self.load_checkpoint()
-    # del pipeline.vae

     # Generate a sample before training to compare against generated sample after training.
     #pretrained_video_path = generate_sample(self.config, pipeline, filename_prefix="pre-training-")
@@ -178,6 +177,7 @@ def training_loop(self, pipeline, optimizer, learning_rate_scheduler, data_itera
     state = jax.lax.with_sharding_constraint(state, state_spec)
     state_shardings = nnx.get_named_sharding(state, mesh)
     data_shardings = self.get_data_shardings(mesh)
+    #breakpoint()

     writer = max_utils.initialize_summary_writer(self.config)
     writer_thread = threading.Thread(target=_tensorboard_writer_worker, args=(writer, self.config), daemon=True)
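Note: the `get_data_shardings` change reads `config.data_sharding` as a flat list of mesh-axis names rather than a nested one. A minimal sketch of what the rewritten call builds, assuming a single-host mesh and the axis names used elsewhere in this commit:

import jax
import numpy as np
from jax.sharding import Mesh, NamedSharding, PartitionSpec as P

# Stand-in for config.data_sharding after this commit (assumed flat list).
data_sharding_cfg = ("data", "fsdp", "tensor")

# Assumed mesh: all local devices on "data", trivial "fsdp"/"tensor" axes.
devices = np.array(jax.devices()).reshape(-1, 1, 1)
mesh = Mesh(devices, axis_names=("data", "fsdp", "tensor"))

p_spec = P(*data_sharding_cfg)               # PartitionSpec('data', 'fsdp', 'tensor')
data_sharding = NamedSharding(mesh, p_spec)  # binds the spec to a concrete mesh

# The trainer hands this sharding to both "latents" and "encoder_hidden_states";
# here a dummy batch is placed with it.
batch = jax.device_put(np.zeros((len(jax.devices()), 8, 8), np.float32), data_sharding)
print(batch.sharding)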
