
Commit 484ac77

Merge pull request #3473 from AI-Hypercomputer:chengnuojin-separate-moe
PiperOrigin-RevId: 888698420
2 parents: ff34f1b + 79c778a

30 files changed: 338 additions & 298 deletions (five of the changed files are shown below)

src/maxtext/configs/base.yml

Lines changed: 14 additions & 0 deletions
@@ -435,7 +435,9 @@ custom_mesh_and_rule: "" # replace default mesh and logical rule by specifying y
 mesh_axes: ['diloco', 'data', 'stage', 'fsdp', 'fsdp_transpose', 'sequence', 'context', 'context_autoregressive', 'tensor', 'tensor_transpose', 'tensor_sequence', 'expert', 'autoregressive']
 logical_axis_rules: [
   ['activation_batch', ['data', 'fsdp', 'fsdp_transpose', 'expert']],
+  ['activation_batch_moe', ['data', 'fsdp', 'fsdp_transpose', 'expert']],
   ['activation_batch_no_exp', ['data', 'fsdp', 'fsdp_transpose']],
+  ['activation_batch_no_exp_moe', ['data', 'fsdp', 'fsdp_transpose']],
   ['activation_embed_and_logits_batch', ['data', 'stage', 'fsdp', 'fsdp_transpose', 'expert']],
   ['activation_embed_and_logits_batch_sequence', ['data', 'stage', 'fsdp', 'fsdp_transpose', 'sequence', 'context', 'expert']],
   ['activation_heads', ['tensor', 'tensor_transpose', 'sequence', 'tensor_sequence', 'autoregressive']],
@@ -448,14 +450,18 @@ logical_axis_rules: [
   ['activation_attn_length_no_exp', ['context']],
   ['activation_length_no_exp', ['sequence', 'context']],
   ['activation_length_no_exp', ['context']],
+  ['activation_length_no_exp_moe', ['sequence', 'context']],
+  ['activation_length_no_exp_moe', ['context']],
   ['activation_norm_length', ['tensor_sequence', 'context', 'sequence']],
+  ['activation_norm_length_moe', ['tensor_sequence', 'context', 'sequence']],
   ['activation_q_length', ['context', 'expert']],
   ['activation_q_length_no_exp', ['context']],
   ['prefill_activation_length', ['sequence', 'context']],
   ['prefill_activation_norm_length', ['tensor_sequence', 'context', 'sequence']],
   ['activation_kv_length', []],
   ['activation_attn_embed', ['tensor', 'tensor_transpose']],
   ['activation_embed', ['tensor', 'tensor_transpose']],
+  ['activation_embed_moe', ['tensor', 'tensor_transpose']],
   ['activation_mlp', ['tensor', 'tensor_transpose', 'tensor_sequence']],
   ['activation_kv', ['tensor', 'tensor_transpose', 'tensor_sequence']],
   ['activation_prefill_kv_batch', ['data', 'fsdp', 'fsdp_transpose', 'expert']],
@@ -484,6 +490,14 @@ logical_axis_rules: [
   ['embed_no_exp', ['fsdp', 'sequence', 'tensor_transpose', 'context']],
   ['embed_no_exp', ['fsdp', 'fsdp_transpose', 'sequence', 'context']],
   ['embed_no_exp', ['fsdp', 'sequence', 'context']],
+  ['embed_moe', ['fsdp', 'fsdp_transpose', 'sequence', 'tensor_transpose', 'context', 'expert']],
+  ['embed_moe', ['fsdp', 'sequence', 'tensor_transpose', 'context', 'expert']],
+  ['embed_moe', ['fsdp', 'fsdp_transpose', 'sequence', 'context', 'expert']],
+  ['embed_moe', ['fsdp', 'sequence', 'context', 'expert']],
+  ['embed_no_exp_moe', ['fsdp', 'fsdp_transpose', 'sequence', 'tensor_transpose', 'context']],
+  ['embed_no_exp_moe', ['fsdp', 'sequence', 'tensor_transpose', 'context']],
+  ['embed_no_exp_moe', ['fsdp', 'fsdp_transpose', 'sequence', 'context']],
+  ['embed_no_exp_moe', ['fsdp', 'sequence', 'context']],
   ['embed_tensor_transpose', ['tensor_transpose']],
   ['q_lora', ['fsdp', 'fsdp_transpose', 'sequence', 'context', 'tensor_transpose', 'expert']],
   ['q_lora', ['fsdp', 'sequence', 'context', 'tensor_transpose', 'expert']],
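
The pattern, repeated in every config touched by this commit, is visible in the base.yml hunks above: each logical axis name used by MoE tensors gains a `_moe` twin that initially maps to the same mesh axes as the original. Below is a minimal sketch of how such rule lists resolve to shardings, assuming flax's logical-partitioning helper `flax.linen.logical_to_mesh_axes` (the flax/JAX primitive that MaxText's sharding machinery builds on); the rule subset is copied from the hunk above.

# Sketch only: resolving logical names against a rule subset from the
# base.yml hunk above, via flax's logical_to_mesh_axes.
from flax import linen as nn

rules = (
    ("activation_batch", ("data", "fsdp", "fsdp_transpose", "expert")),
    ("activation_batch_moe", ("data", "fsdp", "fsdp_transpose", "expert")),
    ("activation_embed", ("tensor", "tensor_transpose")),
    ("activation_embed_moe", ("tensor", "tensor_transpose")),
)

# Dense and MoE activations now carry distinct logical names, even though
# both currently resolve to the same mesh axes.
dense = nn.logical_to_mesh_axes(("activation_batch", "activation_embed"), rules)
moe = nn.logical_to_mesh_axes(("activation_batch_moe", "activation_embed_moe"), rules)
print(dense)  # PartitionSpec(('data', 'fsdp', 'fsdp_transpose', 'expert'), ('tensor', 'tensor_transpose'))
print(moe)    # identical today, but independently overridable per config

Because the twins start out identical, resolved shardings are unchanged; the point of the split is that the two names can now diverge per config.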

src/maxtext/configs/custom_mesh_and_rule/pipeline-large-moe.yml

Lines changed: 6 additions & 0 deletions
@@ -28,7 +28,9 @@ mesh_axes: ['data', 'stage', 'fsdp', 'tensor', 'expert']
 data_sharding: [['data', 'stage', 'fsdp', 'tensor', 'expert']]
 logical_axis_rules: [
   ['activation_batch', ['data', 'fsdp', 'expert']],
+  ['activation_batch_moe', ['data', 'fsdp', 'expert']],
   ['activation_batch_no_exp', ['data', 'fsdp']],
+  ['activation_batch_no_exp_moe', ['data', 'fsdp']],
   ['activation_embed_and_logits_batch', ['data', 'stage', 'fsdp', 'expert']],
   ['activation_embed_and_logits_batch_sequence', ['data', 'stage', 'fsdp', 'expert']],
   ['activation_heads', ['tensor']],
@@ -38,6 +40,7 @@ logical_axis_rules: [
   ['activation_q_length', ['expert']],
   ['activation_attn_embed', ['tensor']],
   ['activation_embed', ['tensor']],
+  ['activation_embed_moe', ['tensor']],
   ['activation_mlp', ['tensor']],
   ['activation_kv', ['tensor']],
   ['activation_prefill_kv_batch', ['data', 'fsdp', 'expert']],
@@ -55,7 +58,10 @@ logical_axis_rules: [
   ['q_heads', ['tensor']],
   ['kv_heads', ['tensor']],
   ['embed', ['fsdp', 'expert']],
+  ['embed_moe', ['fsdp', 'expert']],
   ['embed_no_exp', ['fsdp']],
+  ['embed_no_exp_moe', ['fsdp']],
+  ['embed_moe', ['fsdp']],
   ['q_lora', ['fsdp']],
   ['kv_lora', ['fsdp']],
   ['norm', ['tensor']],
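
Note the two `embed_moe` rules in the last hunk, one mapping to `['fsdp', 'expert']` and one to `['fsdp']`. As I read flax's rule resolution, duplicate rules for one logical name act as ordered fallbacks: a rule is skipped for an array when any of its mesh axes is already claimed by another dimension of that array. A sketch of that behavior; the `('exp', ('expert',))` rule is assumed for illustration and is not part of this diff.

# Sketch of ordered-fallback resolution (my reading of flax's
# logical_to_mesh_axes): a rule is skipped when one of its mesh axes is
# already assigned to another dimension of the same array.
from flax import linen as nn

rules = (
    ("exp", ("expert",)),               # assumed rule, for illustration
    ("embed_moe", ("fsdp", "expert")),  # preferred sharding for embed_moe
    ("embed_moe", ("fsdp",)),           # fallback when 'expert' is taken
    ("mlp", ("tensor",)),
)

# For a per-expert weight, 'exp' claims the 'expert' mesh axis first, so
# 'embed_moe' falls through to its second rule and shards on 'fsdp' alone.
spec = nn.logical_to_mesh_axes(("exp", "embed_moe", "mlp"), rules)
print(spec)  # roughly PartitionSpec(('expert',), ('fsdp',), ('tensor',))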

src/maxtext/configs/custom_mesh_and_rule/pure-fsdp.yml

Lines changed: 4 additions & 0 deletions
@@ -19,6 +19,8 @@ data_sharding: [['fsdp']]
 logical_axis_rules: [
   ['activation_batch', ['fsdp']],
   ['activation_batch_no_exp', ['fsdp']],
+  ['activation_batch_moe', ['fsdp']],
+  ['activation_batch_no_exp_moe', ['fsdp']],
   ['activation_embed_and_logits_batch', ['fsdp']],
   ['activation_embed_and_logits_batch_sequence', ['fsdp']],
   ['activation_prefill_kv_batch', ['fsdp']],
@@ -27,6 +29,8 @@ logical_axis_rules: [
   ['decode_batch', ['fsdp']],
   ['embed', ['fsdp']],
   ['embed_no_exp', ['fsdp']],
+  ['embed_moe', ['fsdp']],
+  ['embed_no_exp_moe', ['fsdp']],
   ['q_lora', ['fsdp']],
   ['kv_lora', ['fsdp']],
   ['exp_with_fsdp', 'fsdp'],
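
In this pure-FSDP config the `_moe` rules are exact copies of their originals, so resolution is unchanged here until some config maps a `_moe` name to different axes. A quick sanity sketch, with the same assumed flax helper as above:

# Sanity sketch: with the pure-fsdp rules above, a *_moe name resolves to
# the same sharding as its original.
from flax import linen as nn

rules = (
    ("activation_batch", ("fsdp",)),
    ("activation_batch_no_exp", ("fsdp",)),
    ("activation_batch_moe", ("fsdp",)),
    ("activation_batch_no_exp_moe", ("fsdp",)),
)

assert nn.logical_to_mesh_axes(("activation_batch",), rules) == nn.logical_to_mesh_axes(
    ("activation_batch_moe",), rules
)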

src/maxtext/configs/inference/vllm.yml

Lines changed: 8 additions & 0 deletions
@@ -30,18 +30,23 @@ weight_dtype: bfloat16
 mesh_axes: ['data', 'attn_dp', 'model', 'expert', 'attn_dp_expert']
 logical_axis_rules: [
   ['activation_batch', ['expert']],
+  ['activation_batch_moe', ['expert']],
   ['activation_batch_no_exp', []],
+  ['activation_batch_no_exp_moe', []],
   ['activation_embed_and_logits_batch', ['expert']],
   ['activation_embed_and_logits_batch_sequence', ['expert']],
   ['activation_heads', ['model']],
   ['activation_kv_heads', ['model']],
   ['activation_attn_length', ['expert']],
   ['activation_attn_length_no_exp', []],
   ['activation_length', ['data', 'expert']],
+  ['activation_length_moe', ['data', 'expert']],
   ['activation_length_no_exp', 'data'],
+  ['activation_length_no_exp_moe', 'data'],
   ['activation_q_length', ['expert', 'attn_dp_expert']],
   ['activation_attn_embed', 'model'],
   ['activation_embed', ['model', 'attn_dp']],
+  ['activation_embed_moe', ['model', 'attn_dp']],
   ['activation_mlp', ['model', 'attn_dp']],
   ['activation_kv', ['model']],
   ['activation_prefill_kv_batch', ['expert', 'attn_dp_expert']],
@@ -50,6 +55,7 @@ logical_axis_rules: [
   ['activation_kv_head_dim', ['model']],
   ['activation_vocab', ['model', 'attn_dp']],
   ['activation_norm_length', []],
+  ['activation_norm_length_moe', []],
   ['activation_exp', ['expert', 'attn_dp_expert']],
   ['decode_batch', ['expert', 'attn_dp_expert']],
   ['decode_length', []],
@@ -63,8 +69,10 @@ logical_axis_rules: [
   ['kv_head_dim', []],
   ['kv', []],
   ['embed', ['expert', 'attn_dp_expert']],
+  ['embed_moe', ['expert', 'attn_dp_expert']],
   ['embed_tensor_transpose', ['attn_dp', 'model']],
   ['embed_no_exp', []],
+  ['embed_no_exp_moe', []],
   ['q_lora', ['expert', 'attn_dp_expert']],
   ['kv_lora', ['expert', 'attn_dp_expert']],
   ['norm', []],
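
On the model side, the split is consumed by annotating MoE activations with the `_moe` logical names. A toy sketch using flax's `nn.with_logical_constraint`; the module and its dimension names are illustrative, not MaxText's actual MoE code.

# Toy consumer of the split names (illustrative module only): dense
# activations keep the original logical names, MoE activations switch to
# the *_moe twins, so the rules above can steer each path separately.
import jax.numpy as jnp
from flax import linen as nn


class ToyMoEBlock(nn.Module):

  @nn.compact
  def __call__(self, x: jnp.ndarray) -> jnp.ndarray:
    # Dense path: constrained via the original names.
    x = nn.with_logical_constraint(
        x, ("activation_batch", "activation_length", "activation_embed"))
    x = x * 2.0  # stand-in for the dense computation
    # MoE path: constrained via the *_moe names; same mesh axes today,
    # but overridable per config without touching the dense path.
    x = nn.with_logical_constraint(
        x, ("activation_batch_moe", "activation_length_moe", "activation_embed_moe"))
    return x

In practice this runs inside a `jax.sharding.Mesh` and a `nn.logical_axis_rules(...)` context built from the config's rule list, so each constraint binds to real mesh axes.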

src/maxtext/configs/models/deepseek3-671b-2dfsdp.yml

Lines changed: 4 additions & 0 deletions
@@ -60,14 +60,18 @@ mesh_axes: ['data', 'stage', 'fsdp', 'fsdp_transpose', 'expert', 'context']
 data_sharding: [['data', 'stage', 'fsdp', 'fsdp_transpose', 'expert', 'context']]
 logical_axis_rules: [
   ['activation_batch', ['data', 'fsdp', 'fsdp_transpose', 'expert', 'context']],
+  ['activation_batch_moe', ['data', 'fsdp', 'fsdp_transpose', 'expert', 'context']],
   ['activation_embed_and_logits_batch', ['data', 'stage', 'fsdp', 'fsdp_transpose', 'expert', 'context']],
   ['activation_kv_batch', ['data', 'fsdp', 'fsdp_transpose', 'expert', 'context']],
   ['activation_embed_and_logits_batch', ['data', 'fsdp', 'fsdp_transpose', 'expert']],
   ['activation_norm_length', ['context']],
+  ['activation_norm_length_moe', ['context']],
   ['activation_heads', []],
   ['activation_stage', 'stage'],
   ['embed', ['fsdp']],
+  ['embed_moe', ['fsdp']],
   ['embed_no_exp', ['fsdp']],
+  ['embed_no_exp_moe', ['fsdp']],
   ['q_lora', ['fsdp']],
   ['kv_lora', ['fsdp']],
   ['layers', 'stage'],
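
Finally, what the separation buys, shown with hypothetical rules that appear in no config in this commit: once MoE tensors carry their own logical names, a single config can shard the MoE path differently from the dense path without rewriting any model code.

# Hypothetical rules (not from this commit) showing the payoff of the
# name split: divergent dense vs. MoE sharding from config alone.
from flax import linen as nn

rules = (
    ("activation_embed", ("tensor",)),  # dense activations: tensor-parallel
    ("activation_embed_moe", None),     # MoE activations: replicated, an illustrative choice
)

print(nn.logical_to_mesh_axes(("activation_embed",), rules))      # ~PartitionSpec(('tensor',))
print(nn.logical_to_mesh_axes(("activation_embed_moe",), rules))  # ~PartitionSpec(None)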
