@@ -453,86 +453,96 @@ compile_xla_flags: "" # Compiler options e.g. compile_xla_flags="--xla_tpu_num_s
 shard_mode: "auto" # can be either auto or explicit
 custom_mesh_and_rule: "" # replace default mesh and logical rule by specifying yml name under config/mesh_and_rule/.
 mesh_axes: ['diloco', 'data', 'stage', 'fsdp', 'fsdp_transpose', 'sequence', 'context', 'context_autoregressive', 'tensor', 'tensor_transpose', 'tensor_sequence', 'expert', 'autoregressive']
-logical_axis_rules: [
-  ['activation_batch', ['data', 'fsdp', 'fsdp_transpose', 'expert']],
-  # Vocab activation
+logical_axis_rules: [
+  # ==========================================
+  # Vocabulary Embedding
+  # ==========================================
+  # Vocab Activations
   ['activation_embed_and_logits_batch', ['data', 'stage', 'fsdp', 'fsdp_transpose', 'expert']],
   ['activation_embed_and_logits_batch_sequence', ['data', 'stage', 'fsdp', 'fsdp_transpose', 'sequence', 'context', 'expert']],
   ['activation_vocab', ['tensor', 'tensor_transpose', 'tensor_sequence']],
-  ['activation_vocab', ['tensor', 'tensor_transpose']],
-  ['activation_vocab', 'tensor_sequence'],
-  ['activation_vocab', ['sequence','context']],
-  # Vocab weight
+  # ['activation_vocab', ['tensor', 'tensor_transpose']],
+  # ['activation_vocab', 'tensor_sequence'],
+  # ['activation_vocab', ['sequence', 'context']],
+  # Vocab Weights
   ['vocab', ['tensor', 'tensor_transpose', 'tensor_sequence', 'autoregressive']],
   ['embed_vocab', ['fsdp', 'fsdp_transpose', 'sequence', 'context', 'expert']],
-  # MoE activation
-  ['activation_batch_moe', ['data', 'fsdp', 'fsdp_transpose']],
-  ['activation_length_moe', ['sequence', 'context']],
-  ['activation_length_moe', ['context']],
-  ['activation_norm_length_moe', ['tensor_sequence', 'context', 'sequence']],
-  ['activation_embed_moe', ['tensor', 'tensor_transpose']],
-  ['activation_mlp_moe', ['tensor', 'tensor_transpose', 'tensor_sequence']],
-  ['activation_exp', ['expert']],
-  # MoE weight
-  ['mlp_moe', ['fsdp_transpose', 'tensor', 'tensor_sequence', 'autoregressive']],
-  ['mlp_no_fsdp', ['tensor', 'tensor_sequence', 'autoregressive']], # should be deprecated
-  ['embed_moe', ['fsdp', 'fsdp_transpose', 'sequence', 'tensor_transpose', 'context']],
-  ['embed_moe', ['fsdp', 'sequence', 'tensor_transpose', 'context']],
-  ['embed_moe', ['fsdp', 'fsdp_transpose', 'sequence', 'context']],
-  ['embed_moe', ['fsdp', 'sequence', 'context']],
-  ['embed_tensor_transpose', ['tensor_transpose']], # should be deprecated
-  ['exp_with_fsdp', 'fsdp'], # should be deprecated
-  # Attn activation
+  # ==========================================
+  # Attention
+  # ==========================================
+  # Attention Activations
+  ['activation_heads', ['tensor', 'tensor_transpose', 'sequence', 'tensor_sequence', 'autoregressive']],
+  ['activation_kv_heads', ['tensor', 'tensor_transpose', 'sequence', 'tensor_sequence']],
   ['activation_attn_length', ['sequence', 'context']],
-  ['activation_attn_length', ['context']],
+  # ['activation_attn_length', ['context']],
   ['activation_q_length', ['context']],
   ['activation_kv_length', []],
   ['activation_attn_embed', ['tensor', 'tensor_transpose']],
   ['activation_kv', ['tensor', 'tensor_transpose', 'tensor_sequence']],
   ['activation_kv_batch', ['data', 'fsdp', 'fsdp_transpose', 'expert']],
   ['activation_kv_head_dim', ['tensor', 'tensor_transpose', 'tensor_sequence']],
-  ['activation_vocab', ['tensor', 'tensor_transpose', 'tensor_sequence']],
-  ['activation_vocab', ['tensor', 'tensor_transpose']],
-  ['activation_vocab', 'tensor_sequence'],
-  ['activation_vocab', ['sequence','context']],
-  ['activation_stage', 'stage'],
-  ['activation_exp', ['expert']],
-  ['decode_batch', ['data', 'fsdp', 'fsdp_transpose', 'expert']],
-  ['decode_length', ['sequence']],
-  ['mlp', ['fsdp_transpose', 'tensor', 'tensor_sequence', 'autoregressive']],
-  ['mlp_no_fsdp', ['tensor', 'tensor_sequence', 'autoregressive']],
-  ['vocab', ['tensor', 'tensor_transpose', 'tensor_sequence', 'autoregressive']],
+  # Attention Weights
   ['heads', ['tensor', 'tensor_transpose', 'tensor_sequence', 'autoregressive']],
   ['q_heads', ['tensor', 'tensor_transpose', 'tensor_sequence', 'autoregressive']],
   ['kv_heads', ['tensor', 'tensor_transpose', 'tensor_sequence', 'autoregressive']],
+  ['qkv', []],
+  ['kv', []],
+  ['kv_head_dim', []],
   ['q_lora', ['fsdp', 'fsdp_transpose', 'sequence', 'context', 'tensor_transpose', 'expert']],
-  ['q_lora', ['fsdp', 'sequence', 'context', 'tensor_transpose', 'expert']],
-  ['q_lora', ['fsdp', 'fsdp_transpose', 'sequence', 'context', 'expert']],
-  ['q_lora', ['fsdp', 'sequence', 'context', 'expert']],
-  ["q_lora_up_proj",[]],
+  # ['q_lora', ['fsdp', 'sequence', 'context', 'tensor_transpose', 'expert']],
+  # ['q_lora', ['fsdp', 'fsdp_transpose', 'sequence', 'context', 'expert']],
+  # ['q_lora', ['fsdp', 'sequence', 'context', 'expert']],
+  ["q_lora_up_proj", []],
   ['kv_lora', ['fsdp', 'fsdp_transpose', 'sequence', 'context', 'tensor_transpose', 'expert']],
-  ['kv_lora', ['fsdp', 'sequence', 'context', 'tensor_transpose', 'expert']],
-  ['kv_lora', ['fsdp', 'fsdp_transpose', 'sequence', 'context', 'expert']],
-  ['kv_lora', ['fsdp', 'sequence', 'context', 'expert']],
-  ["kv_lora_up_proj",[]],
-  # Other activation
-  ['activation_heads', ['tensor', 'tensor_transpose', 'sequence','tensor_sequence','autoregressive']],
-  ['activation_kv_heads', ['tensor', 'tensor_transpose', 'sequence','tensor_sequence']],
+  # ['kv_lora', ['fsdp', 'sequence', 'context', 'tensor_transpose', 'expert']],
+  # ['kv_lora', ['fsdp', 'fsdp_transpose', 'sequence', 'context', 'expert']],
+  # ['kv_lora', ['fsdp', 'sequence', 'context', 'expert']],
+  ["kv_lora_up_proj", []],
+  # ==========================================
+  # Mixture of Experts (MoE)
+  # ==========================================
+  # MoE Activations
+  ['activation_batch_moe', ['data', 'fsdp', 'fsdp_transpose']],
+  ['activation_length_moe', ['sequence', 'context']],
+  # ['activation_length_moe', ['context']],
+  ['activation_norm_length_moe', ['tensor_sequence', 'context', 'sequence']],
+  ['activation_embed_moe', ['tensor', 'tensor_transpose']],
+  ['activation_mlp_moe', ['tensor', 'tensor_transpose', 'tensor_sequence']],
+  ['activation_exp', ['expert']],
+  # MoE Weights
+  ['exp', 'expert'],
+  ['mlp_moe', ['fsdp_transpose', 'tensor', 'tensor_sequence', 'autoregressive']],
+  ['embed_moe', ['fsdp', 'fsdp_transpose', 'sequence', 'tensor_transpose', 'context']],
+  # ['embed_moe', ['fsdp', 'sequence', 'tensor_transpose', 'context']],
+  # ['embed_moe', ['fsdp', 'fsdp_transpose', 'sequence', 'context']],
+  # ['embed_moe', ['fsdp', 'sequence', 'context']],
+  # ==========================================
+  # Standard MLP / Dense Layers / Model Structure
+  # ==========================================
+  # Dense Activations
+  ['activation_mlp', ['tensor', 'tensor_transpose', 'tensor_sequence']],
+  ['activation_batch', ['data', 'fsdp', 'fsdp_transpose', 'expert']],
   ['activation_length', ['sequence', 'context']],
-  ['activation_length', ['context']],
+  # ['activation_length', ['context']],
   ['activation_norm_length', ['tensor_sequence', 'context', 'sequence']],
   ['activation_embed', ['tensor', 'tensor_transpose']],
-  ['activation_mlp', ['tensor', 'tensor_transpose', 'tensor_sequence']],
   ['activation_stage', 'stage'],
-  # Other weight
+  # General Weights
   ['mlp', ['fsdp_transpose', 'tensor', 'tensor_sequence', 'autoregressive']],
   ['embed', ['fsdp', 'fsdp_transpose', 'sequence', 'tensor_transpose', 'context', 'expert']],
-  ['embed', ['fsdp', 'sequence', 'tensor_transpose', 'context', 'expert']],
-  ['embed', ['fsdp', 'fsdp_transpose', 'sequence', 'context', 'expert']],
-  ['embed', ['fsdp', 'sequence', 'context', 'expert']],
+  # ['embed', ['fsdp', 'sequence', 'tensor_transpose', 'context', 'expert']],
+  # ['embed', ['fsdp', 'fsdp_transpose', 'sequence', 'context', 'expert']],
+  # ['embed', ['fsdp', 'sequence', 'context', 'expert']],
   ['norm', ['tensor', 'tensor_transpose']],
   ['layers', 'stage'],
-  # Others (inference etc.)
+  ['diloco', 'diloco'],
+  ['engram_dim', ['tensor']],
+  ['dense_layers', []],
+  ['moe_layers', []],
+  ['mhc', []],
+  # ==========================================
+  # Inference (Prefill, Decode, Cache)
+  # ==========================================
   ['prefill_activation_length', ['sequence', 'context']],
   ['prefill_activation_norm_length', ['tensor_sequence', 'context', 'sequence']],
   ['activation_prefill_kv_batch', ['data', 'fsdp', 'fsdp_transpose', 'expert']],
@@ -541,25 +551,21 @@ logical_axis_rules: [
   ['cache_heads', ['autoregressive', 'tensor', 'tensor_transpose', 'tensor_sequence']],
   ['cache_heads', ['autoregressive', 'tensor', 'tensor_sequence']],
   ['paged_kv_heads', ['tensor']],
-  ['diloco', 'diloco'],
-  ['engram_dim', ['tensor']],
-  # Should remove following names as they duplicate shardings
-  ['qkv', []],
-  ['kv', []],
-  ['kv_head_dim', []],
   ['cache_batch_prefill', []],
   ['cache_batch', []],
   ['cache_heads_none', []],
   ['cache_kv', []],
   ['cache_sequence', []],
-  ['exp', 'expert'],
   ['num_pages', []],
   ['tokens_per_page', []],
   ['paged_kv_head_dim_size', []],
-  ['dense_layers', []],
-  ['moe_layers', []],
-  ['mhc', []],
-]
+  # ==========================================
+  # Deprecated / Scheduled for Removal
+  # ==========================================
+  ['mlp_no_fsdp', ['tensor', 'tensor_sequence', 'autoregressive']],
+  ['embed_tensor_transpose', ['tensor_transpose']],
+  ['exp_with_fsdp', 'fsdp'],
+]
 # Axes used for DCN must be earlier in this list than ICI, see (b/339009148) for details
 data_sharding: [['data', 'stage', 'fsdp', 'fsdp_transpose', 'sequence', 'context', 'context_autoregressive', 'tensor', 'tensor_transpose', 'tensor_sequence', 'expert', 'autoregressive']]
 input_data_sharding_logical_axes: ['activation_embed_and_logits_batch', 'activation_norm_length']
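
Note on how these rules are consumed: each ['logical_axis', [mesh_axes]] entry maps a named array dimension onto one or more mesh axes, and rules are matched in order; under Flax's logical-partitioning semantics (which MaxText builds on), a rule is skipped when its mesh axes are already claimed by an earlier dimension of the same array, so the duplicate entries commented out above act as ordered fallbacks. A minimal sketch of that first-match-with-fallback behavior, using Flax's public flax.linen.logical_to_mesh_axes helper rather than MaxText's own wrappers; the three-axis mesh and toy rules are invented for illustration:

# Illustrative sketch, not MaxText code: ordered logical_axis_rules
# resolve to a PartitionSpec via flax.linen.logical_to_mesh_axes.
import jax
import jax.numpy as jnp
import numpy as np
from flax import linen as nn
from jax.sharding import Mesh, NamedSharding

# Toy 3-axis mesh standing in for the 13-axis mesh_axes above;
# all available devices are placed on the 'tensor' axis.
mesh = Mesh(np.array(jax.devices()).reshape(1, 1, -1), ('data', 'fsdp', 'tensor'))

# Duplicate logical names act as fallbacks: a mesh axis already claimed
# by an earlier dimension of the same array cannot be reused in one spec.
rules = (
    ('vocab', ('tensor',)),
    ('embed', ('tensor',)),  # preferred, but 'tensor' is claimed by 'vocab'
    ('embed', ('fsdp',)),    # fallback, like the commented-out entries above
)

# For a [vocab, embed] weight, 'vocab' claims 'tensor' first, so the first
# 'embed' rule is skipped and 'embed' falls back to 'fsdp'.
spec = nn.logical_to_mesh_axes(('vocab', 'embed'), rules)
w = jax.device_put(jnp.zeros((32, 16)), NamedSharding(mesh, spec))
print(spec, w.sharding)

Because only the first matching, non-conflicting rule applies, the ordering of entries in logical_axis_rules is significant: moving a rule above another changes which mesh axes a given tensor dimension can receive.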
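
The DCN-ordering comment above reflects how hybrid meshes spanning multiple slices are laid out: axes used for slower DCN (data-center network) parallelism sit on the leading mesh dimensions, and axes used for faster ICI parallelism within a slice sit on the trailing ones. A rough sketch of that construction, assuming JAX's jax.experimental.mesh_utils.create_hybrid_device_mesh and invented slice counts; this is not MaxText's actual mesh-building code and requires a multi-slice environment to run:

# Rough sketch with invented sizes: DCN-parallel axes lead, ICI-parallel
# axes trail, matching the ordering required by data_sharding above.
import jax
from jax.experimental import mesh_utils
from jax.sharding import Mesh

ici_parallelism = (1, 4)  # within a slice: shard over 'fsdp' on fast ICI
dcn_parallelism = (2, 1)  # across slices: data parallelism over slow DCN

devices = mesh_utils.create_hybrid_device_mesh(ici_parallelism, dcn_parallelism)
mesh = Mesh(devices, ('data', 'fsdp'))  # 'data' spans DCN, 'fsdp' spans ICI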