Skip to content

Commit 09ee677

Browse files
committed
add cse remat
1 parent 552621b commit 09ee677

4 files changed

Lines changed: 95 additions & 88 deletions

File tree

src/maxtext/configs/base.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -968,7 +968,7 @@ xprof_e2e_enable_fw_power_level_event: False
968968
xprof_e2e_enable_fw_thermal_event: False
969969
profile_power_events: False # Set to True to enable TPU-specific power/thermal profiling events. Defaults to False to avoid breaking GPU xplane tracing.
970970

971-
log_config: False # Prints the config (after defaults have been set by pyconfig logic)
971+
log_config: True # Prints the config (after defaults have been set by pyconfig logic)
972972
debug_sharding: False # Prints model weights sharding info
973973

974974
# Checkpoint Structured logging

src/maxtext/configs/custom_mesh_and_rule/pipeline-large-moe.yml

Lines changed: 9 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -20,48 +20,47 @@
2020
# The `data` axis is preserved for two reasons: first, the pipeline stage acts as a
2121
# data parallel (DP) domain externally, making the `data` axis a necessary reference;
2222
# second, it may be required for DCN communication.
23+
#
24+
# The `context` axis is used to support a fractional per-device batch size
2325
#
2426
# Finally, the `tensor` axis is used to shard weights when `pipeline_fsdp_ag_once` or
2527
# `pipeline_fsdp_ag_per_repeat` is enabled, ensuring we have sufficient memory to
2628
# store prefetched weights.
27-
mesh_axes: ['data', 'stage', 'fsdp', 'tensor', 'expert']
28-
data_sharding: [['data', 'stage', 'fsdp', 'tensor', 'expert']]
29+
mesh_axes: ['data', 'stage', 'fsdp', 'context', 'tensor', 'expert']
30+
data_sharding: [['data', 'stage', 'fsdp', 'context', 'tensor', 'expert']]
2931
logical_axis_rules: [
3032
['activation_batch', ['data', 'fsdp', 'expert']],
3133
['activation_batch_moe', ['data', 'fsdp', 'expert']],
3234
['activation_batch_no_exp', ['data', 'fsdp']],
3335
['activation_batch_no_exp_moe', ['data', 'fsdp']],
3436
['activation_embed_and_logits_batch', ['data', 'stage', 'fsdp', 'expert']],
35-
['activation_embed_and_logits_batch_sequence', ['data', 'stage', 'fsdp', 'expert']],
37+
['activation_embed_and_logits_batch_sequence', ['data', 'stage', 'fsdp', 'context', 'expert']],
3638
['activation_heads', ['tensor']],
3739
['activation_kv_heads', ['tensor']],
38-
['activation_length', ['expert']],
39-
['activation_attn_length', ['expert']],
40-
['activation_q_length', ['expert']],
40+
['activation_length', ['context', 'expert']],
41+
['activation_attn_length', ['context', 'expert']],
42+
['activation_q_length', ['context', 'expert']],
4143
['activation_attn_embed', ['tensor']],
4244
['activation_embed', ['tensor']],
4345
['activation_embed_moe', ['tensor']],
4446
['activation_mlp', ['tensor']],
4547
['activation_kv', ['tensor']],
46-
['activation_prefill_kv_batch', ['data', 'fsdp', 'expert']],
4748
['activation_kv_batch', ['data', 'fsdp', 'expert']],
4849
['activation_kv_batch_no_exp', ['data', 'fsdp']],
4950
['activation_kv_head_dim', ['tensor']],
5051
['activation_vocab', ['tensor']],
5152
['activation_stage', 'stage'],
5253
['activation_exp', ['expert']],
53-
['decode_batch', ['data', 'fsdp', 'expert']],
5454
['mlp', ['tensor']],
5555
['mlp_no_fsdp', ['tensor']],
5656
['vocab', ['tensor']],
5757
['heads', ['tensor']],
5858
['q_heads', ['tensor']],
5959
['kv_heads', ['tensor']],
60-
['embed', ['fsdp', 'expert']],
60+
['embed', ['fsdp', 'expert']], # remove context from embed sharding
6161
['embed_moe', ['fsdp', 'expert']],
6262
['embed_no_exp', ['fsdp']],
6363
['embed_no_exp_moe', ['fsdp']],
64-
['embed_moe', ['fsdp']],
6564
['q_lora', ['fsdp']],
6665
['kv_lora', ['fsdp']],
6766
['norm', ['tensor']],

src/maxtext/layers/pipeline.py

Lines changed: 11 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -1173,11 +1173,12 @@ def from_repeat_weights_to_bsw(
11731173
self,
11741174
repeat_weights,
11751175
physical_partition_spec,
1176-
axes_to_gather=("fsdp", "fsdp_transpose", "expert"), # three major FSDP-like axes
1176+
axes_to_gather=("fsdp", "fsdp_transpose", "context", "expert"),
1177+
# TODO (chengnuojin) set use_shardmap=true after JAX >= 10.0.0 and use all_gather(..., to='invarying')
11771178
use_shardmap=False, # using shardmap produces additional reduce-scatter in backward pass
11781179
):
11791180
"""Executes the FSDP-like all-gathers to fully materialize a block of weights for the BSW."""
1180-
axes_to_remove = ["fsdp", "fsdp_transpose"]
1181+
axes_to_remove = ["fsdp", "fsdp_transpose", "context"]
11811182
bsw_pps = pipeline_utils.derive_stage_weight_partition_specs(physical_partition_spec, axes_to_remove)
11821183

11831184
def _from_repeat_weights_to_bsw_shardmap(
@@ -1244,20 +1245,7 @@ def _apply_sharding_hint(weight, pspec):
12441245
return _from_repeat_weights_to_bsw_shardmap(repeat_weights, physical_partition_spec, axes_to_gather=axes_to_gather)
12451246
return _from_repeat_weights_to_bsw_hint(repeat_weights)
12461247

1247-
def both_weight_prefetching(self, weights, physical_partition_spec, loop_iteration):
1248-
"""Triggers asynchronous FSDP-like all-gathers for the current and next pipeline steps.
1249-
1250-
By gathering weights for `loop_iteration + 1` right now, the network communication
1251-
can overlap with the compute happening in `loop_iteration`. The dual-buffers
1252-
are returned grouped in an explicit `jax.ad_checkpoint` to strictly control memory.
1253-
"""
1254-
cur_repeat_weights = self.from_all_variables_to_repeat_weights(weights, loop_iteration)
1255-
nxt_repeat_weights = self.from_all_variables_to_repeat_weights(weights, loop_iteration + 1)
1256-
bsw_0 = self.from_repeat_weights_to_bsw(cur_repeat_weights, physical_partition_spec)
1257-
bsw_1 = self.from_repeat_weights_to_bsw(nxt_repeat_weights, physical_partition_spec)
1258-
return bsw_0, bsw_1
1259-
1260-
def one_weight_prefetching(self, weights, physical_partition_spec, loop_iteration):
1248+
def weight_prefetching(self, weights, physical_partition_spec, loop_iteration):
12611249
"""Triggers asynchronous FSDP-like all-gathers for the next pipeline steps.
12621250
12631251
By gathering weights for `loop_iteration + 1` right now, the network communication
@@ -1351,7 +1339,6 @@ def __call__(
13511339
segment_idx = None
13521340

13531341
loop_state, bsw = self.init_states(inputs)
1354-
weights = self.layers.variables
13551342
physical_partition_spec = logical_to_mesh(
13561343
logical_partition_spec, mesh=self.mesh, rules=self.config.logical_axis_rules
13571344
)
@@ -1388,41 +1375,34 @@ def run_iteration_scannable(model, loop_state, bsw):
13881375

13891376
# base scannable function used twice for real and bubble runs
13901377
base_scannable = functools.partial(
1391-
pipeline_utils.create_rematerialized_pipeline_stage,
1378+
pipeline_utils.create_pipeline_stage,
13921379
deterministic=deterministic,
13931380
model_mode=model_mode,
13941381
logical_partition_spec=logical_partition_spec,
13951382
physical_partition_spec=physical_partition_spec,
13961383
positions=positions,
13971384
segment_ids=segment_ids,
1398-
pipeline_weights=weights,
13991385
)
14001386

14011387
run_one_repeat_scannable = base_scannable(length=self.config.num_pipeline_microbatches)
1402-
# run_one_repeat_scannable = nn.remat(
1403-
# run_one_repeat_scannable,
1404-
# prevent_cse=True,
1405-
# policy=self.get_pipeline_remat_policy()
1406-
# )
14071388
run_bubbles_scannable = base_scannable(length=bubble_iterations)
1408-
# run_bubbles_scannable = nn.remat(
1409-
# run_bubbles_scannable,
1410-
# prevent_cse=True,
1411-
# policy=self.get_pipeline_remat_policy()
1412-
# )
14131389

14141390
run_repeats_scanned = pipeline_utils.create_flax_pipeline_scan(
14151391
pipeline_stage_fn=run_one_repeat_scannable,
14161392
length=self.config.num_pipeline_repeats,
1393+
remat_policy=self.get_pipeline_remat_policy(),
14171394
use_scan=self.config.scan_pipeline_repeats,
14181395
)
14191396
run_bubbles_scanned = pipeline_utils.create_flax_pipeline_scan(
14201397
pipeline_stage_fn=run_bubbles_scannable,
14211398
length=1,
1399+
remat_policy=self.get_pipeline_remat_policy(),
14221400
use_scan=self.config.scan_pipeline_repeats,
14231401
)
1424-
(loop_state, w_curr), _ = run_repeats_scanned(self, (loop_state, bsw[0]))
1425-
(loop_state, _), _ = run_bubbles_scanned(self, (loop_state, w_curr))
1402+
initial_carry_repeats = (loop_state, bsw[0], self.layers.variables)
1403+
(loop_state, w_curr, pipeline_weights), _ = run_repeats_scanned(self, initial_carry_repeats)
1404+
initial_carry_bubbles = (loop_state, w_curr, pipeline_weights)
1405+
(loop_state, _, pipeline_weights), _ = run_bubbles_scanned(self, initial_carry_bubbles)
14261406

14271407
final_output = self.realign_output_microbatches(loop_state["state_io"])
14281408
final_output = jnp.reshape(

src/maxtext/utils/pipeline_utils.py

Lines changed: 74 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -248,24 +248,21 @@ def run_pipeline_microbatches_custom_bwd(residuals, g_final_state):
248248
return run_pipeline_microbatches_custom
249249

250250

251-
def create_rematerialized_pipeline_stage(
251+
def create_pipeline_stage(
252252
length,
253253
deterministic,
254254
model_mode,
255255
logical_partition_spec,
256256
physical_partition_spec,
257257
positions,
258258
segment_ids,
259-
pipeline_weights,
260259
):
261-
"""Builds a memory-checkpointed execution block for a single pipeline stage.
260+
"""Builds an execution block for a single pipeline stage.
262261
263262
This function prepares the state for a specific chunk of pipeline execution by:
264-
1. Prefetching the required weights for the current stage/loop iteration.
265-
2. Executing `length` microbatches using either a memory-efficient `jax.lax.scan`
266-
(if `scan_pipeline_iterations` is True) or an unrolled Python `for` loop.
267-
3. Wrapping the entire stage block in `flax.linen.remat` to discard and recompute
268-
activations during the backward pass based on the model's policy.
263+
1. Prefetching the required weights (e.g., FSDP-gathered) for the current stage/loop iteration.
264+
2. Executing `length` microbatches using a memory-efficient `jax.lax.scan` via a custom VJP
265+
that manages collective communication overlap.
269266
270267
Args:
271268
length: The number of microbatches to process in this stage.
@@ -275,14 +272,27 @@ def create_rematerialized_pipeline_stage(
275272
physical_partition_spec: Rules for physical device mesh mappings (used in prefetching).
276273
positions: Position IDs for the sequence.
277274
segment_ids: Segment/Attention routing IDs for the sequence.
278-
pipeline_weights: The fully gathered pipeline weights explicitly passed via closure.
279275
280276
Returns:
281-
A function decorated with `nn.remat` that takes `(model, loop_state)` and returns
282-
the updated `loop_state`.
277+
A function that takes `(model, carry)` and returns the updated `carry` and `None` for the scan outputs.
283278
"""
284279

285-
def execute_pipeline_stage_outer(model, loop_state_and_bsw):
280+
def execute_pipeline_stage_flax(model, carry):
281+
"""
282+
A non-pure Flax closure of the pipeline stage.
283+
284+
This function bridges the pure JAX custom VJP logic with Flax's object-oriented
285+
lifting mechanisms. It unpacks the carry state and routes it through the pure VJP function.
286+
287+
Args:
288+
model: CircularPipeline Flax linen model instance.
289+
carry: A tuple containing (loop_state, w_curr, pipeline_weights).
290+
- loop_state: The current execution state of the pipeline.
291+
- w_curr: The gathered weights used for the current pipeline step.
292+
- pipeline_weights: The fully sharded baseline weights.
293+
"""
294+
295+
loop_state, w_curr, pipeline_weights = carry
286296

287297
scan_microbatches_fn = create_gradient_accumulation_scan(
288298
model=model,
@@ -292,71 +302,89 @@ def execute_pipeline_stage_outer(model, loop_state_and_bsw):
292302
logical_partition_spec=logical_partition_spec,
293303
)
294304

295-
remat_weight_prefetching = model.one_weight_prefetching
296-
305+
# Establish a pure function boundary to allow for custom VJP definition
297306
@jax.custom_vjp
298-
def execute_pipeline_stage(loop_state_and_bsw, pipeline_weights):
299-
return execute_pipeline_stage_custom_fwd(loop_state_and_bsw, pipeline_weights)[0]
307+
def execute_pipeline_stage_pure(loop_state, w_curr, pipeline_weights):
308+
return execute_pipeline_stage_pure_fwd(loop_state, w_curr, pipeline_weights)[0]
300309

301-
def execute_pipeline_stage_custom_fwd(loop_state_and_bsw, pipeline_weights):
302-
loop_state, w_curr = loop_state_and_bsw
303-
# # Retrieve the specific weights needed for this pipeline chunk
304-
w_next = remat_weight_prefetching(
310+
def execute_pipeline_stage_pure_fwd(loop_state, w_curr, pipeline_weights):
311+
# Prefetch FSDP-sharded weights for the upcoming pipeline repeat
312+
w_next = model.weight_prefetching(
305313
pipeline_weights,
306314
physical_partition_spec,
307315
loop_state["loop_iteration"],
308316
)
317+
# Construct a buffered sliding window (BSW) of weights.
318+
# w_curr: Weights actively used for the current microbatch steps.
319+
# w_next: Newly gathered weights that will be carried forward as the new w_curr.
309320
bsw = (w_curr, w_next)
310-
p_remat_weight_prefetching = functools.partial(
311-
remat_weight_prefetching,
321+
# Bind arguments to the weight prefetching function to prepare it for linear transpose
322+
p_weight_prefetching = functools.partial(
323+
model.weight_prefetching,
312324
physical_partition_spec=physical_partition_spec,
313325
loop_iteration=loop_state["loop_iteration"],
314326
)
315-
remat_weight_prefetching_t = jax.linear_transpose(
316-
p_remat_weight_prefetching,
327+
# Since weight gathering (all-gather) is a linear operation, we can derive its dual
328+
# (reduce-scatter) via jax.linear_transpose. This avoids redundant forward passes
329+
weight_prefetching_t = jax.linear_transpose(
330+
p_weight_prefetching,
317331
pipeline_weights,
318332
)
319-
(loop_state, bsw), scan_fn_vjp = jax.vjp(scan_microbatches_fn, loop_state, bsw, positions, segment_ids)
320-
w_curr, w_next = bsw
321-
return (loop_state, w_next), (scan_fn_vjp, remat_weight_prefetching_t)
322-
323-
def execute_pipeline_stage_custom_bwd(residuals, g_outputs):
333+
# Execute the forward pass of the microbatches and generate its VJP.
334+
# The VJP captures necessary checkpoints to evaluate gradients later.
335+
(loop_state, bsw), scan_microbatches_vjp = jax.vjp(scan_microbatches_fn, loop_state, bsw, positions, segment_ids)
336+
# Discard the old weights (w_curr) and advance w_next to act as the current weights in the next iteration
337+
_, w_next = bsw
338+
return (loop_state, w_next), (scan_microbatches_vjp, weight_prefetching_t)
339+
340+
def execute_pipeline_stage_pure_bwd(residuals, g_outputs):
341+
# Unpack forward pass residuals (VJP closures) and the incoming output gradients
324342
g_loop_state, g_w_next = g_outputs
325-
scan_fn_vjp, remat_weight_prefetching_t = residuals
343+
scan_microbatches_vjp, weight_prefetching_t = residuals
344+
# Initialize zero cotangents for w_curr, as it was consumed in the forward pass
326345
g_w_curr = jax.tree.map(jnp.zeros_like, g_w_next)
327346
g_bsw = (g_w_curr, g_w_next)
328-
g_loop_state, g_bsw, _, _ = scan_fn_vjp((g_loop_state, g_bsw))
347+
# Backpropagate gradients through the dual microbatch execution block
348+
g_loop_state, g_bsw, _, _ = scan_microbatches_vjp((g_loop_state, g_bsw))
349+
# Apply the linear transpose of the weight prefetch to execute the reduce-scatter
350+
# This maps the gradients of the gathered weights back to the FSDP-sharded parameter space
329351
g_w_curr, g_w_next = g_bsw
330-
(g_pipeline_weights,) = remat_weight_prefetching_t(g_w_next)
331-
return (g_loop_state, g_w_curr), g_pipeline_weights
332-
333-
execute_pipeline_stage.defvjp(execute_pipeline_stage_custom_fwd, execute_pipeline_stage_custom_bwd)
352+
(g_pipeline_weights,) = weight_prefetching_t(g_w_next)
353+
# Return gradients corresponding to the three original inputs of execute_pipeline_stage_pure
354+
return g_loop_state, g_w_curr, g_pipeline_weights
334355

335-
return execute_pipeline_stage(loop_state_and_bsw, pipeline_weights), None
356+
execute_pipeline_stage_pure.defvjp(execute_pipeline_stage_pure_fwd, execute_pipeline_stage_pure_bwd)
357+
# Execute the pure pipeline stage. We unpack the two modified outputs (loop_state, w_next)
358+
# and repack them alongside the unmodified pipeline_weights to maintain a consistent carry shape for nn.scan.
359+
return (*execute_pipeline_stage_pure(loop_state, w_curr, pipeline_weights), pipeline_weights), None
336360

337-
return execute_pipeline_stage_outer
361+
return execute_pipeline_stage_flax
338362

339363

340-
def create_flax_pipeline_scan(pipeline_stage_fn, length, use_scan=True):
341-
"""Wraps the pipeline stage execution in a `flax.linen.scan`.
364+
def create_flax_pipeline_scan(pipeline_stage_fn, length, remat_policy, use_scan=True):
365+
"""Wraps the pipeline stage execution in `flax.linen.remat` and `flax.linen.scan`.
342366
343-
This lifts the pipeline stage function so it can be repeated sequentially over
344-
the specified length. It safely handles Flax-specific state collections, ensuring
345-
that metrics, intermediate values, and PRNG keys do not collide or overwrite
346-
each other across the loop iterations.
367+
This explicitly wraps the pipeline step in a gradient checkpointing policy
368+
and then lifts it so it can be repeated sequentially over the specified length.
369+
It safely handles Flax-specific state collections, ensuring that metrics, intermediate
370+
values, and PRNG keys do not collide or overwrite each other across loop iterations.
347371
348372
Args:
349373
pipeline_stage_fn: The function representing a single pipeline stage
350-
(usually created by `create_rematerialized_pipeline_stage`).
374+
(usually created by `create_pipeline_stage`).
375+
remat_policy: The checkpointing policy used by `nn.remat` to manage activation memory.
351376
length: The total number of pipeline stages/repeats to scan over.
352-
use_scan: Either scan over repeats or unroll the scan.
377+
use_scan: Whether to use `jax.lax.scan` (True) or unroll the loop (False).
353378
354379
Returns:
355380
A Flax scanned function that executes the full pipeline schedule.
356381
"""
357382
unroll_length = 1 if use_scan else length
358383
return nn.scan(
359-
pipeline_stage_fn,
384+
nn.remat(
385+
pipeline_stage_fn,
386+
policy=remat_policy,
387+
),
360388
variable_axes={
361389
"summaries": 0,
362390
"aux_loss": 0,

0 commit comments

Comments
 (0)