Commit 00bf1cd

add support for < 1 batch item per device.
1 parent 128ef01 commit 00bf1cd

7 files changed: 41 additions & 17 deletions


src/maxdiffusion/configs/base_wan_14b.yml

Lines changed: 2 additions & 0 deletions
@@ -54,6 +54,7 @@ jit_initializers: True
 from_pt: True
 split_head_dim: True
 attention: 'flash' # Supported attention: dot_product, flash, cudnn_flash_te
+flash_min_seq_length: 4096
 
 flash_block_sizes: {}
 # Use on v6e
@@ -131,6 +132,7 @@ logical_axis_rules: [
   ['activation_batch', 'data'],
   ['mlp','tensor'],
   ['embed','fsdp'],
+  ['heads', 'tensor'],
   ['norm', 'tensor'],
   ['conv_batch', ['data','fsdp']],
   ['out_channels', 'tensor'],
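
For context on the second hunk: logical-axis rules map named array dimensions onto physical mesh axes, so the new ['heads', 'tensor'] entry lets a logical `heads` dimension resolve to the `tensor` mesh axis. A minimal sketch of the lookup, with an illustrative rule subset and dimension names (not taken from this commit):

import flax.linen as nn

# Subset of the rules above; the new entry maps `heads` -> `tensor`.
rules = (("activation_batch", "data"), ("heads", "tensor"), ("embed", "fsdp"))

spec = nn.logical_to_mesh_axes(("activation_batch", "heads", "length", "head_dim"), rules)
print(spec)  # PartitionSpec('data', 'tensor', None, None)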

src/maxdiffusion/input_pipeline/_tfds_data_processing.py

Lines changed: 1 addition & 1 deletion
@@ -105,7 +105,7 @@ def _parse_tfrecord_fn(example):
   )
 
   # This wraps the tf.data.Dataset for use in the multi-host JAX environment.
-  train_iter = multihost_dataloading.MultiHostDataLoadIterator(train_ds, mesh)
+  train_iter = multihost_dataloading.MultiHostDataLoadIterator(train_ds, mesh, config.global_batch_size)
   return train_iter
 
 
src/maxdiffusion/models/attention_flax.py

Lines changed: 4 additions & 5 deletions
@@ -187,11 +187,6 @@ def _tpu_flash_attention(
   value, _, _ = _reshape_data_for_flash(value, heads, block_sizes.block_kv_compute, num_fsdp_shards)
   q_axis_names = nn.logical_to_mesh_axes(axis_names_q)
   kv_axis_names = nn.logical_to_mesh_axes(axis_names_kv)
-  flash_axis_names_splash_kernel: AxisNames = (HEAD, KV_LENGTH)
-  axis_names_splash_kernel = nn.logical_to_mesh_axes(flash_axis_names_splash_kernel)
-  named_sharding = jax.sharding.NamedSharding(mesh, axis_names_splash_kernel)
-
-  shard_head_size = mesh.shape["tensor"]
 
   @functools.partial(
       shard_map.shard_map,
@@ -215,6 +210,9 @@ def wrap_flash_attention(query, key, value):
         q_seq_shards=1,  # number of shards over the seq_len axis
         block_sizes=block_sizes,
     )
+    # jax.debug.print("query.shape: {x}", x=query.shape)
+    # jax.debug.print("key.shape: {x}", x=key.shape)
+    # jax.debug.print("value.shape: {x}", x=value.shape)
     attention_output = jax.vmap(splash_kernel)(query, key, value)
     return attention_output
 
@@ -799,6 +797,7 @@ def __call__(
     query_proj = _unflatten_heads(query_proj, self.heads)
     key_proj = _unflatten_heads(key_proj, self.heads)
     value_proj = _unflatten_heads(value_proj, self.heads)
+    # output of _unflatten_heads is (batch, heads, seq_len, head_dim)
     query_proj, key_proj = self._apply_rope(query_proj, key_proj, rotary_emb)
 
     attn_output = self.attention_op.apply_attention(query_proj, key_proj, value_proj)
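
The new inline comment pins down the layout that _apply_rope and apply_attention receive. A sketch of that head-unflattening reshape, assuming the conventional flattened-heads input (illustrative, not the module's code):

import jax.numpy as jnp

def unflatten_heads(x: jnp.ndarray, heads: int) -> jnp.ndarray:
  # (batch, seq_len, heads * head_dim) -> (batch, heads, seq_len, head_dim)
  batch, seq_len, _ = x.shape
  x = x.reshape(batch, seq_len, heads, -1)
  return jnp.transpose(x, (0, 2, 1, 3))

x = jnp.ones((2, 16, 8 * 64))
print(unflatten_heads(x, heads=8).shape)  # (2, 8, 16, 64)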

src/maxdiffusion/multihost_dataloading.py

Lines changed: 29 additions & 10 deletions
@@ -37,20 +37,23 @@
 
 
 def _build_global_shape_and_sharding(
-    local_shape: tuple[int, ...], global_mesh: Mesh
+    local_shape: tuple[int, ...], global_mesh: Mesh, global_batch_size: int = 0
 ) -> tuple[tuple[int, ...], NamedSharding]:
-  sharding = NamedSharding(global_mesh, PartitionSpec(global_mesh.axis_names))
+  # Handle sharding when setting a gbs < jax.device_count().
+  if global_batch_size > 0:
+    sharding = NamedSharding(global_mesh, PartitionSpec(*global_mesh.axis_names))
+  else:
+    sharding = NamedSharding(global_mesh, PartitionSpec(global_mesh.axis_names))
 
   global_shape = (jax.process_count() * local_shape[0],) + local_shape[1:]
-
   return global_shape, sharding
 
 
-def _form_global_array(path, array: np.ndarray, global_mesh: Mesh) -> jax.Array:
+def _form_global_array(path, array: np.ndarray, global_mesh: Mesh, global_batch_size: int = 0, split_axis_index: int = 0) -> jax.Array:
   """Put local sharded array into local devices"""
-  global_shape, sharding = _build_global_shape_and_sharding(np.shape(array), global_mesh)
+  global_shape, sharding = _build_global_shape_and_sharding(np.shape(array), global_mesh, global_batch_size)
   try:
-    local_device_arrays = np.split(array, len(global_mesh.local_devices), axis=0)
+    local_device_arrays = np.split(array, len(global_mesh.local_devices), axis=split_axis_index)
   except ValueError as array_split_error:
     raise ValueError(
         f"Unable to put to devices shape {array.shape} with "
@@ -62,7 +65,7 @@ def _form_global_array(path, array: np.ndarray, global_mesh: Mesh) -> jax.Array:
   return jax.make_array_from_single_device_arrays(global_shape, sharding, local_device_buffers)
 
 
-def get_next_batch_sharded(local_dataset: Iterator, global_mesh: Mesh) -> jax.Array:
+def get_next_batch_sharded(local_dataset: Iterator, global_mesh: Mesh, global_batch_size: int = 0, split_axis_index: int = 0) -> jax.Array:
   """Splits the host loaded data equally over all devices."""
 
   SLEEP_TIME = 10
@@ -83,17 +86,33 @@ def get_next_batch_sharded(local_dataset: Iterator, global_mesh: Mesh) -> jax.Array:
   if not loaded_data_success:
     local_data = local_dataset.next()
 
-  input_gdas = jtu.tree_map_with_path(partial(_form_global_array, global_mesh=global_mesh), local_data)
+  input_gdas = jtu.tree_map_with_path(partial(_form_global_array, global_mesh=global_mesh, global_batch_size=global_batch_size, split_axis_index=split_axis_index), local_data)
 
   return input_gdas
 
 
 class MultiHostDataLoadIterator:
   """fold get_next_batch_sharded into an iterator class"""
 
-  def __init__(self, dataloader: Union[tf.data.Dataset, Iterable], global_mesh: Mesh):
+  def __init__(self, dataloader: Union[tf.data.Dataset, Iterable], global_mesh: Mesh, global_batch_size: int = 0):
     self.global_mesh = global_mesh
     self.dataloader = dataloader
+    # Handles sharding for when gbs < number of devices.
+    self.global_batch_size = global_batch_size
+    # Pick the correct axis to split the data across when global_batch_size is set.
+    split_axis_name = max(global_mesh.shape, key=global_mesh.shape.get)
+    split_axis_index = 0
+    if global_batch_size > 0:
+      max_logging.log(f"global_batch_size was set to {global_batch_size}, splitting data across {split_axis_name}.")
+      if split_axis_name == "data":
+        split_axis_index = 0
+      elif split_axis_name == "fsdp":
+        split_axis_index = 1
+      elif split_axis_name == "tensor":
+        split_axis_index = 2
+      else:
+        raise ValueError(f"Could not find {split_axis_name} to split data over.")
+    self.split_axis_index = split_axis_index
     if isinstance(self.dataloader, tf.data.Dataset):
       self.local_iterator = self.dataloader.as_numpy_iterator()
     elif isinstance(self.dataloader, Iterable):
@@ -114,4 +133,4 @@ def __iter__(self):
     return self
 
   def __next__(self):
-    return get_next_batch_sharded(self.local_iterator, self.global_mesh)
+    return get_next_batch_sharded(self.local_iterator, self.global_mesh, self.global_batch_size, self.split_axis_index)
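
The behavioral core of this change is the PartitionSpec construction: splatting the axis names shards each leading array dimension over one mesh axis, instead of sharding dimension 0 over every axis at once, which is what allows a global batch smaller than the device count. A minimal sketch of the difference (axis names as in the mesh used here):

from jax.sharding import PartitionSpec

axis_names = ("data", "fsdp", "tensor")

# Old path (global_batch_size == 0): dim 0 is sharded jointly over all mesh
# axes, so the batch must be divisible by the total device count.
joint = PartitionSpec(axis_names)     # PartitionSpec((('data', 'fsdp', 'tensor'),))

# New path (global_batch_size > 0): one mesh axis per leading dim; only the
# dim matching the largest mesh axis carries the per-host np.split, which is
# what split_axis_index selects.
per_dim = PartitionSpec(*axis_names)  # PartitionSpec('data', 'fsdp', 'tensor')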

src/maxdiffusion/pipelines/wan/wan_pipeline.py

Lines changed: 1 addition & 0 deletions
@@ -79,6 +79,7 @@ def create_model(rngs: nnx.Rngs, wan_config: dict):
   wan_config["precision"] = get_precision(config)
   wan_config["flash_block_sizes"] = get_flash_block_sizes(config)
   wan_config["remat_policy"] = config.remat_policy
+  wan_config["flash_min_seq_length"] = config.flash_min_seq_length
 
   # 2. eval_shape - will not use flops or create weights on device
   # thus not using HBM memory.
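
flash_min_seq_length is now threaded from the config into the WAN model. The typical role of this kind of threshold is to skip the flash kernel for sequences too short to amortize it; a sketch of that selection pattern (an assumption about intent, not the library's exact code):

def choose_attention(seq_len: int, flash_min_seq_length: int) -> str:
  # Flash kernels pay a block-setup cost that short sequences cannot
  # amortize, so fall back to plain dot-product attention below the cutoff.
  return "flash" if seq_len >= flash_min_seq_length else "dot_product"

assert choose_attention(4096, flash_min_seq_length=4096) == "flash"
assert choose_attention(512, flash_min_seq_length=4096) == "dot_product"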

src/maxdiffusion/pyconfig.py

Lines changed: 2 additions & 0 deletions
@@ -181,6 +181,8 @@ def user_init(raw_keys):
   raw_keys["total_train_batch_size"] = max_utils.get_global_batch_size(raw_keys["per_device_batch_size"])
   raw_keys["num_slices"] = get_num_slices(raw_keys)
   raw_keys["quantization_local_shard_count"] = get_quantization_local_shard_count(raw_keys)
+  if "global_batch_size" not in raw_keys.keys():
+    raw_keys["global_batch_size"] = 0
 
 
 def get_num_slices(raw_keys):
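
The backfill matters because config.global_batch_size is read unconditionally downstream (trainer and input pipeline), while existing config files never define the key. A minimal sketch of the same guard, treating raw_keys as a plain dict:

raw_keys = {"per_device_batch_size": 1}  # a config without the new key

if "global_batch_size" not in raw_keys:
  raw_keys["global_batch_size"] = 0  # 0 means "derive the batch size from per-device settings"

assert raw_keys["global_batch_size"] == 0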

src/maxdiffusion/trainers/wan_trainer.py

Lines changed: 2 additions & 1 deletion
@@ -69,7 +69,8 @@ def __init__(self, config):
     if config.train_text_encoder:
       raise ValueError("this script currently doesn't support training text_encoders")
 
-    self.global_batch_size = self.config.per_device_batch_size * jax.device_count()
+    # self.global_batch_size = self.config.per_device_batch_size * jax.device_count()
+    self.global_batch_size = config.global_batch_size if config.global_batch_size > 0 else config.per_device_batch_size * jax.device_count()
 
   def post_training_steps(self, pipeline, params, train_states, msg=""):
     pass
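
The fallback keeps old configs working: with global_batch_size left at its 0 default, the trainer computes the batch size exactly as before, while any positive value overrides it, even one below jax.device_count(). A small illustration of the selection logic (the user setting is hypothetical):

import jax

per_device_batch_size = 1
global_batch_size = 2  # hypothetical user setting; 0 preserves the old behavior

effective = global_batch_size if global_batch_size > 0 else per_device_batch_size * jax.device_count()
print(effective)  # 2, even on a host with 8 devices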
