
Commit fb12602

wip - add dropout change sharding
1 parent 955bd86 commit fb12602

8 files changed: 121 additions & 75 deletions

File tree

src/maxdiffusion/configs/base_wan_14b.yml

Lines changed: 2 additions & 1 deletion
@@ -56,8 +56,9 @@ jit_initializers: True
 # Set true to load weights from pytorch
 from_pt: True
 split_head_dim: True
-attention: 'flash' # Supported attention: dot_product, flash, cudnn_flash_te
+attention: 'flash' # Supported attention: dot_product, flash, cudnn_flash_te, ring
 flash_min_seq_length: 4096
+dropout: 0.1
 
 flash_block_sizes: {}
 # Use on v6e
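
For context, a minimal sketch of how a config value such as `dropout: 0.1` typically reaches a Flax NNX module; the `ToyBlock` class and the plain dict standing in for the parsed YAML are illustrative only, not part of this commit.

```python
import jax.numpy as jnp
from flax import nnx

# Stand-in for the values parsed from base_wan_14b.yml.
config = {"dropout": 0.1}

class ToyBlock(nnx.Module):
  """Toy module wiring a config-driven dropout rate into nnx.Dropout."""

  def __init__(self, features: int, dropout: float, rngs: nnx.Rngs):
    self.linear = nnx.Linear(features, features, rngs=rngs)
    self.drop_out = nnx.Dropout(dropout)

  def __call__(self, x, deterministic: bool = True, rngs: nnx.Rngs = None):
    # deterministic=True (evaluation/inference) turns dropout into a no-op.
    return self.drop_out(self.linear(x), deterministic=deterministic, rngs=rngs)

block = ToyBlock(16, dropout=config["dropout"], rngs=nnx.Rngs(0))
x = jnp.ones((2, 16))
y_train = block(x, deterministic=False, rngs=nnx.Rngs(dropout=1))  # dropout active
y_eval = block(x)                                                   # dropout disabled
```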

src/maxdiffusion/input_pipeline/_tfds_data_processing.py

Lines changed: 49 additions & 25 deletions
@@ -78,9 +78,18 @@ def make_tf_iterator(
   train_iter = multihost_dataloading.MultiHostDataLoadIterator(train_ds, mesh)
   return train_iter
 
+
 # TODO - https://github.com/google/array_record/blob/main/beam/examples/example_gcs_conversion.py
 def _make_tfrecord_iterator(
-    config, dataloading_host_index, dataloading_host_count, mesh, global_batch_size, feature_description_fn, prepare_sample_fn, dataset_path, is_training: bool
+    config,
+    dataloading_host_index,
+    dataloading_host_count,
+    mesh,
+    global_batch_size,
+    feature_description_fn,
+    prepare_sample_fn,
+    dataset_path,
+    is_training: bool,
 ):
   # set load_tfrecord_cached to True in config to use pre-processed tfrecord dataset.
   # pedagogical_examples/dataset_tf_cache_to_tfrecord.py to convert tf preprocessed dataset to tfrecord.
@@ -93,10 +102,10 @@ def _make_tfrecord_iterator(
   # Determine whether to use the "cached" dataset, which requires externally
   # provided parsing functions, or the default one with its internal parsing logic.
   make_cached_tfrecord_iterator = (
-    config.cache_latents_text_encoder_outputs
-    and is_dataset_dir_valid
-    and "load_tfrecord_cached" in config.get_keys()
-    and config.load_tfrecord_cached
+      config.cache_latents_text_encoder_outputs
+      and is_dataset_dir_valid
+      and "load_tfrecord_cached" in config.get_keys()
+      and config.load_tfrecord_cached
   )
 
   feature_description = {
@@ -121,42 +130,47 @@ def prepare_sample(features):
   if not is_training:
     num_eval_samples = 0
     for _ in ds:
-        num_eval_samples += 1
+      num_eval_samples += 1
 
     remainder = num_eval_samples % global_batch_size
     if remainder != 0:
-        num_to_pad = global_batch_size - remainder
-        # Create a dataset of padding samples from the beginning
-        padding_ds = ds.take(num_to_pad)
-        # Add the padding samples to the end
-        ds = ds.concatenate(padding_ds)
-        max_logging.log(f"Padded evaluation dataset with {num_to_pad} samples.")
+      num_to_pad = global_batch_size - remainder
+      # Create a dataset of padding samples from the beginning
+      padding_ds = ds.take(num_to_pad)
+      # Add the padding samples to the end
+      ds = ds.concatenate(padding_ds)
+      max_logging.log(f"Padded evaluation dataset with {num_to_pad} samples.")
 
   used_prepare_sample = prepare_sample_fn if make_cached_tfrecord_iterator else prepare_sample
   ds = (
-    ds.shard(num_shards=dataloading_host_count, index=dataloading_host_index)
-    .map(_parse_tfrecord_fn, num_parallel_calls=AUTOTUNE)
-    .map(used_prepare_sample, num_parallel_calls=AUTOTUNE)
+      ds.shard(num_shards=dataloading_host_count, index=dataloading_host_index)
+      .map(_parse_tfrecord_fn, num_parallel_calls=AUTOTUNE)
+      .map(used_prepare_sample, num_parallel_calls=AUTOTUNE)
   )
   if is_training:
     ds = (
-      ds.shuffle(global_batch_size * 10)
-      .batch(global_batch_size // dataloading_host_count, drop_remainder=True)
-      .repeat(-1)
-      .prefetch(AUTOTUNE)
+        ds.shuffle(global_batch_size * 10)
+        .batch(global_batch_size // dataloading_host_count, drop_remainder=True)
+        .repeat(-1)
+        .prefetch(AUTOTUNE)
     )
   # For Evaluation
   else:
-    ds = (
-      ds.batch(global_batch_size // dataloading_host_count, drop_remainder=False)
-      .prefetch(AUTOTUNE)
-    )
+    ds = ds.batch(global_batch_size // dataloading_host_count, drop_remainder=False).prefetch(AUTOTUNE)
 
   iter = multihost_dataloading.MultiHostDataLoadIterator(ds, mesh)
   return iter
 
+
 def make_tfrecord_iterator(
-    config, dataloading_host_index, dataloading_host_count, mesh, global_batch_size, feature_description, prepare_sample_fn, is_training
+    config,
+    dataloading_host_index,
+    dataloading_host_count,
+    mesh,
+    global_batch_size,
+    feature_description,
+    prepare_sample_fn,
+    is_training,
 ):
   """Iterator for TFRecord format. For Laion dataset,
   check out preparation script
@@ -165,4 +179,14 @@ def make_tfrecord_iterator(
   # Currently only support evaluation on tfrecord. To avoid influencing previous reference, judge whether is training dataset.
   # TODO: refactor to support evaluation on all dataset format.
   dataset_path = config.train_data_dir if is_training else config.eval_data_dir
-  return _make_tfrecord_iterator(config, dataloading_host_index, dataloading_host_count, mesh, global_batch_size, feature_description, prepare_sample_fn, dataset_path, is_training)
+  return _make_tfrecord_iterator(
+      config,
+      dataloading_host_index,
+      dataloading_host_count,
+      mesh,
+      global_batch_size,
+      feature_description,
+      prepare_sample_fn,
+      dataset_path,
+      is_training,
+  )
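
As a sanity check on the evaluation-padding logic above, here is a small self-contained tf.data sketch (toy dataset and batch size, not taken from the repo) showing how concatenating the first `global_batch_size - remainder` samples makes the eval set divide evenly into full batches:

```python
import tensorflow as tf

global_batch_size = 4
ds = tf.data.Dataset.range(10)  # 10 eval samples, not a multiple of 4

num_eval_samples = sum(1 for _ in ds)              # 10
remainder = num_eval_samples % global_batch_size   # 2
if remainder != 0:
  num_to_pad = global_batch_size - remainder       # 2
  padding_ds = ds.take(num_to_pad)                 # reuse the first samples as padding
  ds = ds.concatenate(padding_ds)                  # now 12 samples -> 3 full batches

batches = list(ds.batch(global_batch_size, drop_remainder=False).as_numpy_iterator())
assert all(len(b) == global_batch_size for b in batches)
```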

src/maxdiffusion/input_pipeline/input_pipeline_interface.py

Lines changed: 1 addition & 1 deletion
@@ -107,7 +107,7 @@ def make_data_iterator(
         global_batch_size,
         feature_description,
         prepare_sample_fn,
-        is_training
+        is_training,
     )
   else:
     assert False, f"Unknown dataset_type {config.dataset_type}, dataset_type must be in (tf, tfrecord, hf, grain)"

src/maxdiffusion/models/attention_flax.py

Lines changed: 17 additions & 6 deletions
@@ -734,7 +734,7 @@ def __init__(
     # None axes corresponds to the stacked weights across all blocks
     # because of the use of nnx.vmap and nnx.scan.
     # Dims are [num_blocks, embed, heads]
-    kernel_axes = (None, "embed", "heads")
+    kernel_axes = ("embed", None, "heads")
     qkv_init_kernel = nnx.with_partitioning(nnx.initializers.lecun_normal(), kernel_axes)
 
     self.query = nnx.Linear(
@@ -748,8 +748,8 @@ def __init__(
         bias_init=nnx.with_partitioning(
            nnx.initializers.zeros,
            (
-               None,
                "embed",
+               "heads",
            ),
        ),
     )
@@ -765,8 +765,8 @@ def __init__(
        bias_init=nnx.with_partitioning(
            nnx.initializers.zeros,
            (
-               None,
                "embed",
+               "heads",
            ),
        ),
     )
@@ -782,8 +782,8 @@ def __init__(
        bias_init=nnx.with_partitioning(
            nnx.initializers.zeros,
            (
-               None,
                "embed",
+               "heads"
            ),
        ),
     )
@@ -792,12 +792,21 @@ def __init__(
        rngs=rngs,
        in_features=self.inner_dim,
        out_features=self.inner_dim,
-       kernel_init=nnx.with_partitioning(nnx.initializers.lecun_normal(), (None, "heads", "embed")),
+       kernel_init=nnx.with_partitioning(nnx.initializers.lecun_normal(), ("embed", "heads", None)),
        dtype=dtype,
        param_dtype=weights_dtype,
        precision=precision,
+       bias_init=nnx.with_partitioning(
+           nnx.initializers.zeros,
+           (
+               "embed",
+               None
+           ),
+       ),
     )
 
+    self.drop_out = nnx.Dropout(dropout)
+
     self.norm_q = None
     self.norm_k = None
     if qk_norm is not None:
@@ -847,7 +856,8 @@ def _apply_rope(self, xq: jax.Array, xk: jax.Array, freqs_cis: jax.Array) -> Tup
     return xq_out, xk_out
 
   def __call__(
-      self, hidden_states: jax.Array, encoder_hidden_states: jax.Array = None, rotary_emb: Optional[jax.Array] = None
+      self, hidden_states: jax.Array, encoder_hidden_states: jax.Array = None, rotary_emb: Optional[jax.Array] = None,
+      deterministic: bool = True, rngs: nnx.Rngs = None,
   ) -> jax.Array:
     hidden_states = jax.lax.with_sharding_constraint(hidden_states, PartitionSpec("data", "fsdp", "tensor"))
     encoder_hidden_states = jax.lax.with_sharding_constraint(encoder_hidden_states, PartitionSpec("data", "fsdp", "tensor"))
@@ -877,6 +887,7 @@ def __call__(
     attn_output = attn_output.astype(dtype=dtype)
     attn_output = checkpoint_name(attn_output, "attn_output")
     hidden_states = self.proj_attn(attn_output)
+    hidden_states = self.drop_out(hidden_states, deterministic=deterministic, rngs=rngs)
     return hidden_states
 
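
To see what the reordered partitioning tuples above actually do, here is a minimal NNX sketch (toy feature sizes, logical axis names only) of how `nnx.with_partitioning` attaches sharding metadata to a layer's parameters; mapping those logical names ("embed", "heads", ...) onto physical mesh axes happens separately via the mesh and logical-axis rules.

```python
from flax import nnx

# Hypothetical 2D kernel with logical axes [embed, heads]; in the real model an
# extra leading num_blocks dimension is added by the nnx.vmap/nnx.scan stacking.
layer = nnx.Linear(
    in_features=8,
    out_features=16,
    kernel_init=nnx.with_partitioning(nnx.initializers.lecun_normal(), ("embed", "heads")),
    bias_init=nnx.with_partitioning(nnx.initializers.zeros, ("heads",)),
    rngs=nnx.Rngs(0),
)

# The annotations live on the params and can be turned into PartitionSpecs.
state = nnx.state(layer)
print(nnx.get_partition_spec(state))  # kernel: ('embed', 'heads'), bias: ('heads',)
```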

src/maxdiffusion/models/wan/transformers/transformer_wan.py

Lines changed: 29 additions & 16 deletions
@@ -175,12 +175,11 @@ def __init__(
         kernel_init=nnx.with_partitioning(
             nnx.initializers.xavier_uniform(),
             (
+                "embed",
                 None,
                 "mlp",
-                "embed",
             ),
         ),
-        bias_init=nnx.with_partitioning(nnx.initializers.zeros, (None, "embed")),
     )
 
   def __call__(self, x: jax.Array) -> jax.Array:
@@ -217,6 +216,8 @@ def __init__(
     else:
       raise NotImplementedError(f"{activation_fn} is not implemented.")
 
+    self.drop_out = nnx.Dropout(dropout)
+
     self.proj_out = nnx.Linear(
         rngs=rngs,
         in_features=inner_dim,
@@ -228,15 +229,16 @@
         kernel_init=nnx.with_partitioning(
             nnx.initializers.xavier_uniform(),
             (
-                None,
                 "embed",
                 "mlp",
+                None,
             ),
         ),
     )
 
-  def __call__(self, hidden_states: jax.Array) -> jax.Array:
+  def __call__(self, hidden_states: jax.Array, deterministic: bool = True, rngs: nnx.Rngs = None) -> jax.Array:
     hidden_states = self.act_fn(hidden_states)
+    hidden_states = self.drop_out(hidden_states, deterministic=deterministic, rngs=rngs)
     return self.proj_out(hidden_states)
 
 
@@ -260,6 +262,7 @@ def __init__(
       weights_dtype: jnp.dtype = jnp.float32,
       precision: jax.lax.Precision = None,
       attention: str = "dot_product",
+      dropout: float = 0.0,
   ):
 
     # 1. Self-attention
@@ -278,6 +281,7 @@ def __init__(
         weights_dtype=weights_dtype,
        precision=precision,
        attention_kernel=attention,
+       dropout=dropout
     )
 
     # 1. Cross-attention
@@ -295,6 +299,7 @@ def __init__(
        weights_dtype=weights_dtype,
        precision=precision,
        attention_kernel=attention,
+       dropout=dropout
     )
     assert cross_attn_norm is True
     self.norm2 = FP32LayerNorm(rngs=rngs, dim=dim, eps=eps, elementwise_affine=True)
@@ -308,13 +313,16 @@ def __init__(
        dtype=dtype,
        weights_dtype=weights_dtype,
        precision=precision,
+       dropout=dropout
     )
     self.norm3 = FP32LayerNorm(rngs=rngs, dim=dim, eps=eps, elementwise_affine=False)
 
     key = rngs.params()
-    self.adaln_scale_shift_table = nnx.Param(jax.random.normal(key, (1, 6, dim)) / dim**0.5)
+    self.adaln_scale_shift_table = nnx.Param(
+        jax.random.normal(key, (1, 6, dim)) / dim**0.5,
+        sharding=("embed",))
 
-  def __call__(self, hidden_states: jax.Array, encoder_hidden_states: jax.Array, temb: jax.Array, rotary_emb: jax.Array):
+  def __call__(self, hidden_states: jax.Array, encoder_hidden_states: jax.Array, temb: jax.Array, rotary_emb: jax.Array, deterministic: bool = True, rngs: nnx.Rngs = None,):
     shift_msa, scale_msa, gate_msa, c_shift_msa, c_scale_msa, c_gate_msa = jnp.split(
         (self.adaln_scale_shift_table + temb), 6, axis=1
     )
@@ -324,18 +332,18 @@ def __call__(self, hidden_states: jax.Array, encoder_hidden_states: jax.Array, t
     # 1. Self-attention
     norm_hidden_states = (self.norm1(hidden_states) * (1 + scale_msa) + shift_msa).astype(hidden_states.dtype)
     attn_output = self.attn1(
-        hidden_states=norm_hidden_states, encoder_hidden_states=norm_hidden_states, rotary_emb=rotary_emb
+        hidden_states=norm_hidden_states, encoder_hidden_states=norm_hidden_states, rotary_emb=rotary_emb, deterministic=deterministic, rngs=rngs
     )
     hidden_states = (hidden_states + attn_output * gate_msa).astype(hidden_states.dtype)
 
     # 2. Cross-attention
     norm_hidden_states = self.norm2(hidden_states)
-    attn_output = self.attn2(hidden_states=norm_hidden_states, encoder_hidden_states=encoder_hidden_states)
+    attn_output = self.attn2(hidden_states=norm_hidden_states, encoder_hidden_states=encoder_hidden_states, deterministic=deterministic, rngs=rngs)
     hidden_states = hidden_states + attn_output
 
     # 3. Feed-forward
     norm_hidden_states = (self.norm3(hidden_states) * (1 + c_scale_msa) + c_shift_msa).astype(hidden_states.dtype)
-    ff_output = self.ffn(norm_hidden_states)
+    ff_output = self.ffn(norm_hidden_states, deterministic=deterministic, rngs=rngs)
     hidden_states = (hidden_states + ff_output * c_gate_msa).astype(hidden_states.dtype)
     return hidden_states
 
@@ -356,6 +364,7 @@ def __init__(
      freq_dim: int = 256,
      ffn_dim: int = 13824,
      num_layers: int = 40,
+     dropout: float = 0.0,
      cross_attn_norm: bool = True,
      qk_norm: Optional[str] = "rms_norm_across_heads",
      eps: float = 1e-6,
@@ -424,6 +433,7 @@ def init_block(rngs):
          weights_dtype=weights_dtype,
          precision=precision,
          attention=attention,
+         dropout=dropout,
       )
 
     self.gradient_checkpoint = GradientCheckpointType.from_str(remat_policy)
@@ -454,6 +464,8 @@
      encoder_hidden_states_image: Optional[jax.Array] = None,
      return_dict: bool = True,
      attention_kwargs: Optional[Dict[str, Any]] = None,
+     deterministic: bool = True,
+     rngs: nnx.Rngs = None,
   ) -> Union[jax.Array, Dict[str, jax.Array]]:
     batch_size, _, num_frames, height, width = hidden_states.shape
     p_t, p_h, p_w = self.config.patch_size
@@ -476,20 +488,21 @@
      raise NotImplementedError("img2vid is not yet implemented.")
 
     def scan_fn(carry, block):
-      hidden_states, encoder_hidden_states, timestep_proj, rotary_emb = carry
-      hidden_states = block(hidden_states, encoder_hidden_states, timestep_proj, rotary_emb)
-      return (hidden_states, encoder_hidden_states, timestep_proj, rotary_emb)
+      hidden_states_carry, rngs_carry = carry
+      hidden_states = block(hidden_states_carry, encoder_hidden_states, timestep_proj, rotary_emb, deterministic, rngs_carry)
+      new_carry = (hidden_states, rngs_carry)
+      return new_carry, None
 
-    initial_carry = (hidden_states, encoder_hidden_states, timestep_proj, rotary_emb)
     rematted_block_forward = self.gradient_checkpoint.apply(scan_fn)
-    final_carry = nnx.scan(
+    initial_carry = (hidden_states, rngs)
+    final_carry, _ = nnx.scan(
        rematted_block_forward,
        length=self.num_layers,
        in_axes=(nnx.Carry, 0),
-       out_axes=nnx.Carry,
+       out_axes=(nnx.Carry, 0),
     )(initial_carry, self.blocks)
 
-    hidden_states = final_carry[0]
+    hidden_states, _ = final_carry
 
     shift, scale = jnp.split(self.scale_shift_table + jnp.expand_dims(temb, axis=1), 2, axis=1)
 
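
For the scan change above, a minimal runnable sketch of the same carry pattern with `nnx.scan` over stacked blocks; the blocks here are plain Linear layers, and the carry holds a raw PRNG key rather than the `nnx.Rngs` object the commit threads through, purely to keep the example small.

```python
import jax
import jax.numpy as jnp
from flax import nnx

num_layers, dim = 4, 8

@nnx.split_rngs(splits=num_layers)
@nnx.vmap(in_axes=(0,), out_axes=0)
def create_blocks(rngs: nnx.Rngs):
  # Parameters come out stacked along a leading [num_layers, ...] axis.
  return nnx.Linear(dim, dim, rngs=rngs)

blocks = create_blocks(nnx.Rngs(0))

def scan_fn(carry, block):
  hidden_states, rng_key = carry
  rng_key, step_key = jax.random.split(rng_key)  # e.g. a fresh dropout key per layer
  hidden_states = block(hidden_states)
  return (hidden_states, rng_key), None          # (new carry, per-step output)

x = jnp.ones((2, dim))
(final_hidden, _), _ = nnx.scan(
    scan_fn,
    length=num_layers,
    in_axes=(nnx.Carry, 0),
    out_axes=(nnx.Carry, 0),
)((x, jax.random.key(0)), blocks)
```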

src/maxdiffusion/pipelines/wan/wan_pipeline.py

Lines changed: 1 addition & 0 deletions
@@ -82,6 +82,7 @@ def create_model(rngs: nnx.Rngs, wan_config: dict):
   wan_config["flash_block_sizes"] = get_flash_block_sizes(config)
   wan_config["remat_policy"] = config.remat_policy
   wan_config["flash_min_seq_length"] = config.flash_min_seq_length
+  wan_config["dropout"] = config.dropout
 
   # 2. eval_shape - will not use flops or create weights on device
   # thus not using HBM memory.
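
A short illustrative sketch (hypothetical `ToyTransformer`, not the real WAN model) of why this plumbing works: every key added to `wan_config` becomes a constructor keyword argument, so the new `"dropout"` entry relies on the matching `dropout` parameter added to the transformer's `__init__` above.

```python
from flax import nnx

class ToyTransformer(nnx.Module):
  """Hypothetical stand-in for the WAN transformer constructor."""

  def __init__(self, *, num_layers: int, dropout: float = 0.0, rngs: nnx.Rngs):
    self.num_layers = num_layers
    self.drop_out = nnx.Dropout(dropout)

wan_config = {"num_layers": 2, "dropout": 0.1}
model = ToyTransformer(rngs=nnx.Rngs(0), **wan_config)
```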
