
Commit 28dbe57

Add remat policy. Remove sharding for shard_map splash kernel to lower memory footprint
1 parent 3f2a800 commit 28dbe57

5 files changed

Lines changed: 99 additions & 25 deletions


src/maxdiffusion/configs/base_wan_14b.yml

Lines changed: 8 additions & 0 deletions

@@ -182,6 +182,14 @@ transform_images_num_proc: 4
 reuse_example_batch: False
 enable_data_shuffling: True
 
+# Defines the type of gradient checkpointing (rematerialization) to enable.
+# NONE - no gradient checkpointing
+# FULL - full gradient checkpointing wherever possible (minimum memory usage)
+# MATMUL_WITHOUT_BATCH - checkpoint every linear/matmul operation except those
+# that involve the batch dimension; all attention and projection layers are
+# checkpointed, but not the backward pass with respect to the parameters
+remat_policy: "NONE"
+
 # checkpoint every number of samples, -1 means don't checkpoint.
 checkpoint_every: -1
 # enables one replica to read the ckpt then broadcast to the rest
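
For background, gradient checkpointing (rematerialization) drops selected intermediate activations in the forward pass and recomputes them during the backward pass, trading compute for memory. A minimal sketch outside maxdiffusion, using plain jax.checkpoint with the same JAX policy that MATMUL_WITHOUT_BATCH maps to (the toy function and shapes are illustrative assumptions, not part of the commit):

import jax
import jax.numpy as jnp

def toy_mlp(w1, w2, x):
  h = jnp.tanh(x @ w1)  # intermediate activation that remat may recompute
  return jnp.sum(jnp.tanh(h @ w2))

# The policy keeps (does not recompute) the outputs of dot operations with no
# batch dimensions and rematerializes everything else during the backward pass.
remat_mlp = jax.checkpoint(
    toy_mlp, policy=jax.checkpoint_policies.checkpoint_dots_with_no_batch_dims
)

w1, w2, x = jnp.ones((8, 16)), jnp.ones((16, 4)), jnp.ones((2, 8))
grads = jax.grad(remat_mlp, argnums=(0, 1))(w1, w2, x)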

src/maxdiffusion/models/attention_flax.py

Lines changed: 14 additions & 24 deletions

@@ -187,44 +187,34 @@ def _tpu_flash_attention(
   value, _, _ = _reshape_data_for_flash(value, heads, block_sizes.block_kv_compute, num_fsdp_shards)
   q_axis_names = nn.logical_to_mesh_axes(axis_names_q)
   kv_axis_names = nn.logical_to_mesh_axes(axis_names_kv)
-  flash_axis_names_splash_kernel: AxisNames = (HEAD, LENGTH)
+  flash_axis_names_splash_kernel: AxisNames = (HEAD, KV_LENGTH)
   axis_names_splash_kernel = nn.logical_to_mesh_axes(flash_axis_names_splash_kernel)
   named_sharding = jax.sharding.NamedSharding(mesh, axis_names_splash_kernel)
 
   shard_head_size = mesh.shape["tensor"]
 
-  @functools.partial(
-      jax.jit,
-      static_argnames=["multi_head_mask", "shard_head_size"],
-  )
-  def wrap_splash_kernel(multi_head_mask, shard_head_size=1):
-    splash_kernel = splash_attention_kernel.make_splash_mha(
-        mask=multi_head_mask,
-        head_shards=shard_head_size,  # size of the axis sharded over heads
-        q_seq_shards=1,  # size of the axis sharded over seq_len
-        block_sizes=block_sizes,
-    )
-    return splash_kernel
-
-  mask = splash_attention_mask.FullMask(_shape=(query.shape[2], key.shape[2]))
-
-  multi_head_mask = splash_attention_mask.MultiHeadMask(masks=(mask,) * query.shape[1])
-  splash_kernel = wrap_splash_kernel(multi_head_mask, int(shard_head_size))
-  segment_axis_names_splash_kernel = splash_kernel.manual_sharding_spec(named_sharding)
-
   @functools.partial(
       shard_map.shard_map,
       mesh=mesh,
       in_specs=(
          q_axis_names,
          kv_axis_names,
          kv_axis_names,
-         segment_axis_names_splash_kernel,
      ),
      out_specs=q_axis_names,
      check_rep=False,
   )
-  def wrap_flash_attention(query, key, value, splash_kernel):
+  def wrap_flash_attention(query, key, value):
+    mask = splash_attention_mask.FullMask(_shape=(query.shape[2], key.shape[2]))
+    multi_head_mask = splash_attention_mask.MultiHeadMask(masks=(mask,) * query.shape[1])
+    # make_splash_mha is called inside shard_map, where the head and sequence axes
+    # are already sharded according to in_specs, so head_shards=1 and q_seq_shards=1.
+    splash_kernel = splash_attention_kernel.make_splash_mha(
+        mask=multi_head_mask,
+        head_shards=1,  # size of the axis sharded over heads
+        q_seq_shards=1,  # size of the axis sharded over seq_len
+        block_sizes=block_sizes,
+    )
     attention_output = jax.vmap(splash_kernel)(query, key, value)
     return attention_output

@@ -236,7 +226,7 @@ def wrap_flash_attention(query, key, value, splash_kernel):
       "Warning, batch dimension should be shardable among the devices in data and fsdp"
       f" axis, batch dimension: {query.shape[0]}, devices_in_data_fsdp: {devices_in_data_fsdp}"
   )
-  x = wrap_flash_attention(query, key, value, splash_kernel)
+  x = wrap_flash_attention(query, key, value)
   x = x[:, :, :query_seq_len, :kv_size]
   x = _reshape_heads_to_head_dim(x)
 
@@ -632,7 +622,7 @@ def __init__(
       use_memory_efficient_attention: bool = False,
       split_head_dim: bool = False,
       attention_kernel: str = "flash",
-      flash_min_seq_length: int = 4096,
+      flash_min_seq_length: int = 0,
       flash_block_sizes: BlockSizes = None,
       mesh: jax.sharding.Mesh = None,
      dtype: jnp.dtype = jnp.float32,
src/maxdiffusion/models/gradient_checkpoint.py

Lines changed: 70 additions & 0 deletions

@@ -0,0 +1,70 @@
+from enum import Enum, auto
+from typing import Optional
+
+import jax
+from flax import nnx
+
+SKIP_GRADIENT_CHECKPOINT_KEY = "skip"
+
+# This class only works with NNX modules.
+class GradientCheckpointType(Enum):
+  """
+  Defines the type of gradient checkpointing to use.
+
+  NONE - no gradient checkpointing
+  FULL - full gradient checkpointing wherever possible (minimum memory usage)
+  MATMUL_WITHOUT_BATCH - checkpoint every linear/matmul operation except those
+  that involve the batch dimension; all attention and projection layers are
+  checkpointed, but not the backward pass with respect to the parameters
+  """
+
+  NONE = auto()
+  FULL = auto()
+  MATMUL_WITHOUT_BATCH = auto()
+
+  @classmethod
+  def from_str(cls, s: Optional[str] = None) -> "GradientCheckpointType":
+    """
+    Constructs the gradient checkpoint type from a string.
+
+    Args:
+      s (Optional[str], optional): The name of the gradient checkpointing policy. Defaults to None.
+
+    Returns:
+      GradientCheckpointType: The policy that corresponds to the string.
+    """
+    if s is None:
+      s = "none"
+    return GradientCheckpointType[s.upper()]
+
+  def to_jax_policy(self):
+    """
+    Converts the gradient checkpoint type to a JAX rematerialization policy.
+    """
+    match self:
+      case GradientCheckpointType.NONE:
+        return SKIP_GRADIENT_CHECKPOINT_KEY
+      case GradientCheckpointType.FULL:
+        return None
+      case GradientCheckpointType.MATMUL_WITHOUT_BATCH:
+        return jax.checkpoint_policies.checkpoint_dots_with_no_batch_dims
+
+  def apply(self, module: nnx.Module) -> nnx.Module:
+    """
+    Applies the gradient checkpointing policy to a module;
+    if no policy is needed, returns the module as is.
+
+    Args:
+      module (nnx.Module): the module to apply the policy to
+
+    Returns:
+      nnx.Module: the module with the policy applied
+    """
+    policy = self.to_jax_policy()
+    if policy == SKIP_GRADIENT_CHECKPOINT_KEY:
+      return module
+    return nnx.remat(  # pylint: disable=invalid-name
+        module,
+        prevent_cse=False,
+        policy=policy,
+    )
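
A hedged usage sketch of the new class (the toy block, loss function, and shapes are illustrative assumptions; the import path assumes the file lives under maxdiffusion/models/, and in this commit the policy is actually applied to the scan body of the WAN transformer, shown in the next file):

import jax.numpy as jnp
from flax import nnx
# from maxdiffusion.models.gradient_checkpoint import GradientCheckpointType  # assumed path

class ToyBlock(nnx.Module):  # stand-in for a real transformer block
  def __init__(self, dim: int, rngs: nnx.Rngs):
    self.fc1 = nnx.Linear(dim, 4 * dim, rngs=rngs)
    self.fc2 = nnx.Linear(4 * dim, dim, rngs=rngs)

  def __call__(self, x):
    return self.fc2(nnx.gelu(self.fc1(x)))

def loss_fn(block, x):
  return jnp.sum(block(x) ** 2)

block = ToyBlock(16, rngs=nnx.Rngs(0))
ckpt = GradientCheckpointType.from_str("MATMUL_WITHOUT_BATCH")

# apply() wraps the callable in nnx.remat under the selected policy;
# with remat_policy "NONE" it returns the callable unchanged.
rematted_loss = ckpt.apply(loss_fn)
grads = nnx.grad(rematted_loss)(block, jnp.ones((2, 16)))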

src/maxdiffusion/models/wan/transformers/transformer_wan.py

Lines changed: 6 additions & 1 deletion

@@ -31,6 +31,7 @@
 )
 from ...normalization_flax import FP32LayerNorm
 from ...attention_flax import FlaxWanAttention
+from ...gradient_checkpoint import GradientCheckpointType
 
 BlockSizes = common_types.BlockSizes
 
@@ -356,6 +357,7 @@ def __init__(
       weights_dtype: jnp.dtype = jnp.float32,
       precision: jax.lax.Precision = None,
       attention: str = "dot_product",
+      remat_policy: str = "None"
   ):
     inner_dim = num_attention_heads * attention_head_dim
     out_channels = out_channels or in_channels
@@ -417,6 +419,8 @@ def init_block(rngs):
           attention=attention,
       )
 
+    self.gradient_checkpoint = GradientCheckpointType.from_str(remat_policy)
+
     self.blocks = init_block(rngs)
 
     self.norm_out = FP32LayerNorm(rngs=rngs, dim=inner_dim, eps=eps, elementwise_affine=False)
@@ -469,8 +473,9 @@ def scan_fn(carry, block):
       return (hidden_states, encoder_hidden_states, timestep_proj, rotary_emb)
 
     initial_carry = (hidden_states, encoder_hidden_states, timestep_proj, rotary_emb)
+    rematted_block_forward = self.gradient_checkpoint.apply(scan_fn)
     final_carry = nnx.scan(
-        scan_fn,
+        rematted_block_forward,
         length=self.num_layers,
         in_axes=(nnx.Carry, 0),
         out_axes=nnx.Carry,
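
The scan body is rematted before it is handed to nnx.scan, so each of the num_layers iterations recomputes its activations during the backward pass instead of keeping them all live. A toy sketch of that wiring, scanning over stacked per-layer weight matrices rather than the actual nnx block modules (all names and shapes are illustrative; GradientCheckpointType is the class added in gradient_checkpoint.py above):

import jax.numpy as jnp
from flax import nnx

num_layers, dim = 4, 8
stacked_w = jnp.stack([jnp.eye(dim)] * num_layers)  # one weight matrix per layer

def scan_fn(carry, w):
  # One layer of the loop; only the carry is threaded through, matching the
  # diff (out_axes=nnx.Carry means the body returns just the new carry).
  return jnp.tanh(carry @ w)

ckpt = GradientCheckpointType.from_str("FULL")
rematted_block_forward = ckpt.apply(scan_fn)  # nnx.remat around the scan body

final_carry = nnx.scan(
    rematted_block_forward,
    length=num_layers,
    in_axes=(nnx.Carry, 0),
    out_axes=nnx.Carry,
)(jnp.ones((2, dim)), stacked_w)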

src/maxdiffusion/pipelines/wan/wan_pipeline.py

Lines changed: 1 addition & 0 deletions

@@ -78,6 +78,7 @@ def create_model(rngs: nnx.Rngs, wan_config: dict):
   wan_config["attention"] = config.attention
   wan_config["precision"] = get_precision(config)
   wan_config["flash_block_sizes"] = get_flash_block_sizes(config)
+  wan_config["remat_policy"] = config.remat_policy
 
   # 2. eval_shape - will not use flops or create weights on device
   # thus not using HBM memory.
