Remove the activation_length_no_exp logical axis name (use activation_length instead)

NuojCheng · NuojCheng · commit 83f4af0e6090 · 2026-04-06T18:40:50.000Z
diff --git a/src/maxtext/common/common_types.py b/src/maxtext/common/common_types.py
@@ -37,7 +37,6 @@
 ATTN_LENGTH_NO_EXP = "activation_attn_length_no_exp"
 
 LENGTH = "activation_length"
-LENGTH_NO_EXP = "activation_length_no_exp"
 PREFILL_LENGTH = "prefill_activation_length"
 Q_LENGTH = "activation_q_length"
 Q_LENGTH_NO_EXP = "activation_q_length_no_exp"
diff --git a/src/maxtext/configs/base.yml b/src/maxtext/configs/base.yml
@@ -455,16 +455,12 @@ logical_axis_rules: [
                       ['activation_embed_and_logits_batch_sequence', ['data', 'stage', 'fsdp', 'fsdp_transpose', 'sequence', 'context', 'expert']],
                       ['activation_heads', ['tensor', 'tensor_transpose', 'sequence','tensor_sequence','autoregressive']],
                       ['activation_kv_heads', ['tensor', 'tensor_transpose', 'sequence','tensor_sequence']],
-                      ['activation_length', ['sequence', 'context', 'expert']],
-                      ['activation_length', ['context', 'expert']],
-                      ['activation_attn_length', ['sequence', 'context', 'expert']],
-                      ['activation_attn_length', ['context', 'expert']],
-                      ['activation_attn_length_no_exp', ['sequence', 'context']],
-                      ['activation_attn_length_no_exp', ['context']],
-                      ['activation_length_no_exp', ['sequence', 'context']],
-                      ['activation_length_no_exp', ['context']],
-                      ['activation_length_no_exp_moe', ['sequence', 'context']],
-                      ['activation_length_no_exp_moe', ['context']],
+                      ['activation_length', ['sequence', 'context']],
+                      ['activation_length', ['context']],
+                      ['activation_attn_length', ['sequence', 'context']],
+                      ['activation_attn_length', ['context']],
+                      ['activation_length_moe', ['sequence', 'context']],
+                      ['activation_length_moe', ['context']],
                       ['activation_norm_length', ['tensor_sequence', 'context', 'sequence']],
                       ['activation_norm_length_moe', ['tensor_sequence', 'context', 'sequence']],
                       ['activation_q_length', ['context', 'expert']],
diff --git a/src/maxtext/configs/inference/vllm.yml b/src/maxtext/configs/inference/vllm.yml
@@ -36,12 +36,11 @@ logical_axis_rules: [
                       ['activation_embed_and_logits_batch_sequence', ['data', 'expert']],
                       ['activation_heads', ['model', 'expert']],
                       ['activation_kv_heads', ['model', 'expert']],
-                      ['activation_attn_length', ['expert']],
+                      ['activation_attn_length', ["expert"]],
                       ['activation_attn_length_no_exp', []],
-                      ['activation_length', ['data', 'expert']],
+                      ['activation_length', ['data']],
                       ['activation_length_moe', ['data', 'expert']],
-                      ['activation_length_no_exp', 'data'],
-                      ['activation_length_no_exp_moe', 'data'],
+                      ['activation_length_moe', 'data'],
                       ['activation_q_length', ['expert', 'attn_dp_expert']],
                       ['activation_attn_embed', 'model'],
                       ['activation_embed', ['model', 'attn_dp']],
diff --git a/src/maxtext/layers/attention_mla.py b/src/maxtext/layers/attention_mla.py
@@ -52,12 +52,10 @@
     HEAD,
     Q_LORA_UP_PROJ,
     KV_BATCH,
-    KV_BATCH_NO_EXP,
     KV_HEAD,
     KV_HEAD_DIM,
     KV_LORA_UP_PROJ,
     LENGTH,
-    LENGTH_NO_EXP,
     MODEL_MODE_PREFILL,
     MODEL_MODE_TRAIN,
     PREFILL_KV_BATCH,
@@ -424,14 +422,11 @@ def mla_as_linen(
     prefill_query_axis_names: AxisNames = (PREFILL_KV_BATCH, PREFILL_LENGTH, KV_HEAD, KV_HEAD_DIM),
     prefill_key_axis_names: AxisNames = (PREFILL_KV_BATCH, PREFILL_LENGTH, KV_HEAD, KV_HEAD_DIM),
     prefill_value_axis_names: AxisNames = (PREFILL_KV_BATCH, PREFILL_LENGTH, KV_HEAD, KV_HEAD_DIM),
-    query_axis_names: AxisNames = (KV_BATCH, LENGTH_NO_EXP, KV_HEAD, KV_HEAD_DIM),
-    key_axis_names: AxisNames = (KV_BATCH, LENGTH_NO_EXP, KV_HEAD, KV_HEAD_DIM),
-    value_axis_names: AxisNames = (KV_BATCH, LENGTH_NO_EXP, KV_HEAD, KV_HEAD_DIM),
-    ep_query_axis_names: AxisNames = (KV_BATCH_NO_EXP, LENGTH, KV_HEAD, KV_HEAD_DIM),
-    ep_key_axis_names: AxisNames = (KV_BATCH_NO_EXP, LENGTH, KV_HEAD, KV_HEAD_DIM),
-    ep_value_axis_names: AxisNames = (KV_BATCH_NO_EXP, LENGTH, KV_HEAD, KV_HEAD_DIM),
-    input_axis_names: AxisNames = (BATCH, LENGTH_NO_EXP, EMBED),
-    out_axis_names: AxisNames = (BATCH, LENGTH_NO_EXP, HEAD, D_KV),
+    query_axis_names: AxisNames = (KV_BATCH, LENGTH, KV_HEAD, KV_HEAD_DIM),
+    key_axis_names: AxisNames = (KV_BATCH, LENGTH, KV_HEAD, KV_HEAD_DIM),
+    value_axis_names: AxisNames = (KV_BATCH, LENGTH, KV_HEAD, KV_HEAD_DIM),
+    input_axis_names: AxisNames = (BATCH, LENGTH, EMBED),
+    out_axis_names: AxisNames = (BATCH, LENGTH, HEAD, D_KV),
     prefill_input_axis_names: AxisNames = (PREFILL_KV_BATCH, PREFILL_LENGTH, EMBED),
     decode_input_axis_names: AxisNames = (DECODE_BATCH, DECODE_LENGTH, EMBED),
     prefill_out_axis_names: AxisNames = (PREFILL_KV_BATCH, PREFILL_LENGTH, HEAD, D_KV),
@@ -496,9 +491,6 @@ def mla_as_linen(
       query_axis_names=query_axis_names,
       key_axis_names=key_axis_names,
       value_axis_names=value_axis_names,
-      ep_query_axis_names=ep_query_axis_names,
-      ep_key_axis_names=ep_key_axis_names,
-      ep_value_axis_names=ep_value_axis_names,
       input_axis_names=input_axis_names,
       out_axis_names=out_axis_names,
       prefill_input_axis_names=prefill_input_axis_names,
@@ -568,14 +560,11 @@ def __init__(
       prefill_query_axis_names: AxisNames = (PREFILL_KV_BATCH, PREFILL_LENGTH, KV_HEAD, KV_HEAD_DIM),
       prefill_key_axis_names: AxisNames = (PREFILL_KV_BATCH, PREFILL_LENGTH, KV_HEAD, KV_HEAD_DIM),
       prefill_value_axis_names: AxisNames = (PREFILL_KV_BATCH, PREFILL_LENGTH, KV_HEAD, KV_HEAD_DIM),
-      query_axis_names: AxisNames = (KV_BATCH, LENGTH_NO_EXP, KV_HEAD, KV_HEAD_DIM),
-      key_axis_names: AxisNames = (KV_BATCH, LENGTH_NO_EXP, KV_HEAD, KV_HEAD_DIM),
-      value_axis_names: AxisNames = (KV_BATCH, LENGTH_NO_EXP, KV_HEAD, KV_HEAD_DIM),
-      ep_query_axis_names: AxisNames = (KV_BATCH_NO_EXP, LENGTH, KV_HEAD, KV_HEAD_DIM),
-      ep_key_axis_names: AxisNames = (KV_BATCH_NO_EXP, LENGTH, KV_HEAD, KV_HEAD_DIM),
-      ep_value_axis_names: AxisNames = (KV_BATCH_NO_EXP, LENGTH, KV_HEAD, KV_HEAD_DIM),
-      input_axis_names: AxisNames = (BATCH, LENGTH_NO_EXP, EMBED),
-      out_axis_names: AxisNames = (BATCH, LENGTH_NO_EXP, HEAD, D_KV),
+      query_axis_names: AxisNames = (KV_BATCH, LENGTH, KV_HEAD, KV_HEAD_DIM),
+      key_axis_names: AxisNames = (KV_BATCH, LENGTH, KV_HEAD, KV_HEAD_DIM),
+      value_axis_names: AxisNames = (KV_BATCH, LENGTH, KV_HEAD, KV_HEAD_DIM),
+      input_axis_names: AxisNames = (BATCH, LENGTH, EMBED),
+      out_axis_names: AxisNames = (BATCH, LENGTH, HEAD, D_KV),
       prefill_input_axis_names: AxisNames = (PREFILL_KV_BATCH, PREFILL_LENGTH, EMBED),
       decode_input_axis_names: AxisNames = (DECODE_BATCH, DECODE_LENGTH, EMBED),
       prefill_out_axis_names: AxisNames = (PREFILL_KV_BATCH, PREFILL_LENGTH, HEAD, D_KV),
@@ -657,9 +646,6 @@ def __init__(
         query_axis_names=query_axis_names,
         key_axis_names=key_axis_names,
         value_axis_names=value_axis_names,
-        ep_query_axis_names=ep_query_axis_names,
-        ep_key_axis_names=ep_key_axis_names,
-        ep_value_axis_names=ep_value_axis_names,
         input_axis_names=input_axis_names,
         out_axis_names=out_axis_names,
         prefill_input_axis_names=prefill_input_axis_names,
@@ -873,12 +859,9 @@ def mla_query_projection(
     if model_mode == MODEL_MODE_PREFILL:
       query_logical_name = self.prefill_query_axis_names
       wqa_logical_name = (PREFILL_KV_BATCH, PREFILL_LENGTH, Q_LORA_UP_PROJ)
-    elif model_mode == MODEL_MODE_TRAIN and self.config.expert_shard_attention_option == EP_AS_CONTEXT:
-      query_logical_name = self.ep_query_axis_names
-      wqa_logical_name = (KV_BATCH_NO_EXP, LENGTH, Q_LORA_UP_PROJ)
     else:
       query_logical_name = self.query_axis_names
-      wqa_logical_name = (KV_BATCH, LENGTH_NO_EXP, Q_LORA_UP_PROJ)
+      wqa_logical_name = (KV_BATCH, LENGTH, Q_LORA_UP_PROJ)
     query_sharding = create_sharding(self.mesh, query_logical_name)
     wqa_out_sharding = create_sharding(self.mesh, wqa_logical_name)
     # Set softmax scaling.
@@ -1029,10 +1012,8 @@ def mla_kv_projection(self, inputs: Array, inputs_positions: Array, decoder_segm
     """MLA key/value projection with integrated rotary embedding."""
     if model_mode == MODEL_MODE_PREFILL:
       wka_logical_name = (PREFILL_KV_BATCH, PREFILL_LENGTH, KV_LORA_UP_PROJ)
-    elif model_mode == MODEL_MODE_TRAIN and self.config.expert_shard_attention_option == EP_AS_CONTEXT:
-      wka_logical_name = (KV_BATCH_NO_EXP, LENGTH, KV_LORA_UP_PROJ)
     else:
-      wka_logical_name = (KV_BATCH, LENGTH_NO_EXP, KV_LORA_UP_PROJ)
+      wka_logical_name = (KV_BATCH, LENGTH, KV_LORA_UP_PROJ)
     wkva_out_sharding = create_sharding(self.mesh, wka_logical_name)
     low_rank = self.wkv_a(inputs, out_sharding=wkva_out_sharding)
     low_rank = checkpoint_name(low_rank, "kv_wa_proj")
@@ -1172,7 +1153,7 @@ def __call__(
     else:
       inputs_q = self._maybe_shard_with_logical(inputs_q, self.input_axis_names)
       inputs_kv = self._maybe_shard_with_logical(inputs_kv, self.input_axis_names)
-      out_logical_name = (BATCH, LENGTH_NO_EXP, HEAD, D_KV)
+      out_logical_name = (BATCH, LENGTH, HEAD, D_KV)
 
     if model_mode != MODEL_MODE_TRAIN and decoder_segment_ids is None:
       decoder_segment_ids = jnp.ones(inputs_q.shape[:2], dtype=jnp.int32)
diff --git a/src/maxtext/layers/attention_op.py b/src/maxtext/layers/attention_op.py
@@ -1142,7 +1142,7 @@ def tpu_flash_attention(
     axis_names_splash_kernel = self._logical_to_mesh_axes(self.flash_axis_names_splash_kernel)
     axis_names_q = self._logical_to_mesh_axes(self.flash_axis_names_q)
     axis_names_kv = self._logical_to_mesh_axes(self.flash_axis_names_kv)
-    indexer_mask_axis_names = self._logical_to_mesh_axes((BATCH, Q_LENGTH, KV_LENGTH))
+    indexer_mask_axis_names = self._logical_to_mesh_axes((BATCH, Q_LENGTH_NO_EXP, KV_LENGTH))
 
     global global_block_q, global_block_kv, global_block_kv_compute, global_block_q_dkv, global_block_kv_dkv
     global global_block_kv_dkv_compute, global_block_q_dq, global_block_kv_dq, global_use_fused_bwd_kernel
diff --git a/src/maxtext/layers/decoders.py b/src/maxtext/layers/decoders.py
@@ -107,7 +107,7 @@ def __call__(
     if self.model_mode == MODEL_MODE_PREFILL:
       logical_axis_names = ("activation_batch", "prefill_activation_length", "activation_embed")
     else:
-      logical_axis_names = ("activation_batch", "activation_length_no_exp", "activation_embed")
+      logical_axis_names = ("activation_batch", "activation_length", "activation_embed")
 
     if model_mode == MODEL_MODE_PREFILL:
       inputs = _maybe_shard_with_logical(inputs, logical_axis_names)
@@ -690,7 +690,7 @@ def apply_output_head(self, shared_embedding: nn.Module | nnx.Module, y, determi
 
     cfg = self.config
     if cfg.shard_mode == ShardMode.EXPLICIT:
-      norm_out_sharding = create_sharding(self.mesh, ("activation_batch", "activation_length_no_exp", "activation_embed"))
+      norm_out_sharding = create_sharding(self.mesh, ("activation_batch", "activation_length", "activation_embed"))
     else:
       norm_out_sharding = None
 
@@ -708,7 +708,7 @@ def apply_output_head(self, shared_embedding: nn.Module | nnx.Module, y, determi
       out_sharding = create_sharding(self.mesh, (None, None, "activation_vocab"))
     else:
       out_sharding = create_sharding(
-          self.mesh, ("activation_embed_and_logits_batch", "activation_length_no_exp", "activation_vocab")
+          self.mesh, ("activation_embed_and_logits_batch", "activation_length", "activation_vocab")
       )
 
     # [batch, length, emb_dim] -> [batch, length, vocab_size]
diff --git a/src/maxtext/layers/embeddings.py b/src/maxtext/layers/embeddings.py
@@ -165,7 +165,7 @@ def __call__(self, inputs: Array, model_mode: str = MODEL_MODE_TRAIN) -> Array:
         if model_mode == MODEL_MODE_PREFILL
         else (
             "activation_embed_and_logits_batch",
-            "activation_length_no_exp",
+            "activation_length",
             "activation_embed",
         )
     )
@@ -850,7 +850,7 @@ def __init__(
     self.attention_scaling = attention_scaling
 
     self.freqs_sharding = (
-        create_sharding(mesh, ("activation_batch", "activation_length_no_exp", "q_heads"))
+        create_sharding(mesh, ("activation_batch", "activation_length", "q_heads"))
         if shard_mode == ShardMode.EXPLICIT
         else None
     )
@@ -976,7 +976,7 @@ def __call__(self, inputs: Array, position: None | Array = None) -> Array:
     inputs_complex = first_half + 1j * second_half  # shape: [B, S, N, half_dim]
     # Apply the rotary transformation via complex multiplication.
     rotated_sharding = (
-        create_sharding(self.mesh, ("activation_batch", "activation_length_no_exp", None, None))
+        create_sharding(self.mesh, ("activation_batch", "activation_length", None, None))
         if self.shard_mode == ShardMode.EXPLICIT
         else None
     )
diff --git a/src/maxtext/layers/linears.py b/src/maxtext/layers/linears.py
@@ -405,7 +405,7 @@ def __init__(
     if self.model_mode == MODEL_MODE_PREFILL:
       self.intermediate_logical = ("activation_batch", "prefill_activation_length", "activation_mlp")
     else:
-      self.intermediate_logical = ("activation_batch", "activation_length_no_exp", "activation_mlp")
+      self.intermediate_logical = ("activation_batch", "activation_length", "activation_mlp")
 
     if config.fused_mlp:
       self.wi = DenseGeneral(
diff --git a/src/maxtext/layers/moe.py b/src/maxtext/layers/moe.py
@@ -278,7 +278,7 @@ def __call__(self, inputs: jax.Array, _initializing: bool = False) -> Tuple[jax.
 
     contract_ind = tuple(range(0, len(norm_axis)))
     output_sharding = (
-        create_sharding(self.mesh, ("activation_batch_no_exp_moe", "activation_length_no_exp_moe", None))
+        create_sharding(self.mesh, ("activation_batch_no_exp_moe", "activation_length_moe", None))
         if self.shard_mode == ShardMode.EXPLICIT
         else None
     )
@@ -1452,11 +1452,11 @@ def reshape_and_update_weights(self, weights, indices):
         self._maybe_shard_with_logical(
             jnp.arange(weights.shape[0])[:, None, None], ("activation_batch_no_exp_moe", None, None)
         ),
-        self._maybe_shard_with_logical(jnp.arange(weights.shape[1])[:, None], ("activation_length_no_exp_moe", None)),
+        self._maybe_shard_with_logical(jnp.arange(weights.shape[1])[:, None], ("activation_length_moe", None)),
         indices,
     )
     weight_sharding = (
-        create_sharding(self.mesh, ("activation_batch_no_exp_moe", "activation_length_no_exp_moe", None))
+        create_sharding(self.mesh, ("activation_batch_no_exp_moe", "activation_length_moe", None))
         if self.config.shard_mode == ShardMode.EXPLICIT
         else None
     )
@@ -1705,13 +1705,11 @@ def dense_matmul(
   ) -> tuple[jax.Array, Optional[jax.Array], Optional[jax.Array]]:
     """Dense matrix multiplication."""
     # gate_logits: batch, length, expert
-    gate_logits = self._maybe_shard_with_logical(
-        gate_logits, ("activation_batch_moe", "activation_length_no_exp_moe", None)
-    )
+    gate_logits = self._maybe_shard_with_logical(gate_logits, ("activation_batch_moe", "activation_length_moe", None))
     if self.config.model_name.startswith("deepseek3"):
       # pre_bias_logits is None for non-DeepSeek v3 models
       pre_bias_logits = self._maybe_shard_with_logical(
-          pre_bias_logits, ("activation_batch_moe", "activation_length_no_exp_moe", None)
+          pre_bias_logits, ("activation_batch_moe", "activation_length_moe", None)
       )
     top_k_weights, top_k_indices = self.get_topk(gate_logits, pre_bias_logits, self.rngs)
     is_llama4_decoder_layer = self.config.decoder_block == ctypes.DecoderBlockType.LLAMA4
diff --git a/src/maxtext/layers/nnx_decoders.py b/src/maxtext/layers/nnx_decoders.py
@@ -169,7 +169,7 @@ def __call__(
     if self.model_mode == MODEL_MODE_PREFILL:
       logical_axis_names = ("activation_batch", "prefill_activation_length", "activation_embed")
     else:
-      logical_axis_names = ("activation_batch", "activation_length_no_exp", "activation_embed")
+      logical_axis_names = ("activation_batch", "activation_length", "activation_embed")
 
     inputs = _maybe_shard_with_logical(inputs, logical_axis_names)
     inputs = checkpoint_name(inputs, "decoder_layer_input")
@@ -736,7 +736,7 @@ def apply_output_head(self, shared_embedding, y, deterministic, model_mode):
 
     cfg = self.config
     if cfg.shard_mode == ShardMode.EXPLICIT:
-      norm_out_sharding = create_sharding(self.mesh, ("activation_batch", "activation_length_no_exp", "activation_embed"))
+      norm_out_sharding = create_sharding(self.mesh, ("activation_batch", "activation_length", "activation_embed"))
     else:
       norm_out_sharding = None
 
@@ -747,7 +747,7 @@ def apply_output_head(self, shared_embedding, y, deterministic, model_mode):
       out_sharding = create_sharding(self.mesh, (None, None, "activation_vocab"))
     else:
       out_sharding = create_sharding(
-          self.mesh, ("activation_embed_and_logits_batch", "activation_length_no_exp", "activation_vocab")
+          self.mesh, ("activation_embed_and_logits_batch", "activation_length", "activation_vocab")
       )
 
     # [batch, length, emb_dim] -> [batch, length, vocab_size]
diff --git a/src/maxtext/layers/pipeline.py b/src/maxtext/layers/pipeline.py
@@ -57,7 +57,7 @@ def setup(self):
     self.use_circ_storage = self.need_circ_storage()
 
     self.batch_axis_name = "activation_batch"
-    self.seq_len_axis_name = "activation_length_no_exp"
+    self.seq_len_axis_name = "activation_length"
 
     self.spmd_axis_name = "stage" if self.config.shard_mode == ShardMode.AUTO else None
 
diff --git a/src/maxtext/layers/pipeline_deprecated.py b/src/maxtext/layers/pipeline_deprecated.py
@@ -68,7 +68,7 @@ def setup(self):  # pylint: disable=missing-function-docstring
     self.use_circ_storage = self.need_circ_storage()
 
     self.batch_axis_name = "activation_batch"
-    self.seq_len_axis_name = "activation_length_no_exp"
+    self.seq_len_axis_name = "activation_length"
 
     # TODO(b/470167805): replace self.spmd_axis_name with "stage" when JAX >= 0.8.2.
     self.spmd_axis_name = "stage" if self.config.shard_mode == ShardMode.AUTO else None
diff --git a/src/maxtext/models/llama2.py b/src/maxtext/models/llama2.py
@@ -184,9 +184,7 @@ def __call__(
     hidden_states = self._maybe_shard_with_logical(hidden_states, self.activation_axis_names)
 
     # MLP block.
-    mlp_intermediate_sharding = create_sharding(
-        self.mesh, ("activation_batch", "activation_length_no_exp", "activation_mlp")
-    )
+    mlp_intermediate_sharding = create_sharding(self.mesh, ("activation_batch", "activation_length", "activation_mlp"))
     mlp_lnx = self.mlp(
         hidden_states,
         deterministic=deterministic,
diff --git a/src/maxtext/models/qwen3.py b/src/maxtext/models/qwen3.py
@@ -29,7 +29,7 @@
 from flax import linen as nn
 from flax import nnx
 
-from maxtext.common.common_types import AttentionType, Config, DType, Array, BATCH, LENGTH_NO_EXP, EMBED, MODEL_MODE_TRAIN
+from maxtext.common.common_types import AttentionType, Config, DType, Array, BATCH, EMBED, MODEL_MODE_TRAIN, LENGTH
 from maxtext.layers import attentions
 from maxtext.layers import initializers as max_initializers
 from maxtext.layers import moe
@@ -723,7 +723,7 @@ def __init__(
         attention_kernel=cfg.attention,
         inputs_q_shape=dummy_inputs_shape,
         inputs_kv_shape=dummy_inputs_shape,
-        out_axis_names=(BATCH, LENGTH_NO_EXP, EMBED),
+        out_axis_names=(BATCH, LENGTH, EMBED),
         mesh=self.mesh,
         dtype=cfg.dtype,
         weight_dtype=cfg.weight_dtype,
diff --git a/src/maxtext/utils/vocabulary_tiling.py b/src/maxtext/utils/vocabulary_tiling.py
@@ -61,11 +61,11 @@ def vocab_tiling_linen_loss(
   param_spec = nn.get_partition_spec(params)
   hidden_spec = create_sharding(
       model.mesh,
-      ("activation_embed_and_logits_batch", "activation_length_no_exp", "activation_embed"),
+      ("activation_embed_and_logits_batch", "activation_length", "activation_embed"),
   )
   label_spec = create_sharding(
       model.mesh,
-      ("activation_embed_and_logits_batch", "activation_length_no_exp"),
+      ("activation_embed_and_logits_batch", "activation_length"),
   )
   reshaped_hidden_spec = create_sharding(
       model.mesh,
diff --git a/tests/unit/attention_test.py b/tests/unit/attention_test.py
diff --git a/tests/unit/moe_test.py b/tests/unit/moe_test.py
diff --git a/tests/utils/attention_test_util.py b/tests/utils/attention_test_util.py

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
| | | `@@ -165,7 +165,7 @@ def __call__(self, inputs: Array, model_mode: str = MODEL_MODE_TRAIN) -> Array:` |
| `165` | `165` | `if model_mode == MODEL_MODE_PREFILL` |
| `166` | `166` | `else (` |
| `167` | `167` | `"activation_embed_and_logits_batch",` |
| `168` | | `- "activation_length_no_exp",` |
| | `168` | `+ "activation_length",` |
| `169` | `169` | `"activation_embed",` |
| `170` | `170` | `)` |
| `171` | `171` | `)` |
| | | `@@ -850,7 +850,7 @@ def __init__(` |
| `850` | `850` | `self.attention_scaling = attention_scaling` |
| `851` | `851` | |
| `852` | `852` | `self.freqs_sharding = (` |
| `853` | | `- create_sharding(mesh, ("activation_batch", "activation_length_no_exp", "q_heads"))` |
| | `853` | `+ create_sharding(mesh, ("activation_batch", "activation_length", "q_heads"))` |
| `854` | `854` | `if shard_mode == ShardMode.EXPLICIT` |
| `855` | `855` | `else None` |
| `856` | `856` | `)` |
| | | `@@ -976,7 +976,7 @@ def __call__(self, inputs: Array, position: None \| Array = None) -> Array:` |
| `976` | `976` | `inputs_complex = first_half + 1j * second_half # shape: [B, S, N, half_dim]` |
| `977` | `977` | `# Apply the rotary transformation via complex multiplication.` |
| `978` | `978` | `rotated_sharding = (` |
| `979` | | `- create_sharding(self.mesh, ("activation_batch", "activation_length_no_exp", None, None))` |
| | `979` | `+ create_sharding(self.mesh, ("activation_batch", "activation_length", None, None))` |
| `980` | `980` | `if self.shard_mode == ShardMode.EXPLICIT` |
| `981` | `981` | `else None` |
| `982` | `982` | `)` |