Skip to content

Commit aeb7510

Browse files
Merge pull request #3497 from AI-Hypercomputer:qinwen/add_batch_split
PiperOrigin-RevId: 889940120
2 parents 7ac9fb4 + ad4f663 commit aeb7510

3 files changed

Lines changed: 90 additions & 0 deletions

File tree

Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,88 @@
# Copyright 2023–2026 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# model config for DeepSeek V3 - 671B that uses fsdp on two logical axes

# For DeepSeek default device-limited routing,
# please set n_routing_groups=8 and topk_routing_group=4 in your command-line arguments.

base_emb_dim: 7168
base_num_query_heads: 128
base_num_kv_heads: 128
base_mlp_dim: 18432
base_moe_mlp_dim: 2048
base_num_decoder_layers: 61
first_num_dense_layers: 3
mlp_activations: ["silu","linear"]
vocab_size: 129280
enable_dropout: False
logits_via_embedding: False
normalization_layer_epsilon: 1.0e-6
num_experts: 256
num_experts_per_tok: 8
shared_experts: 1
routed_scaling_factor: 2.5
routed_score_func: "sigmoid"
routed_bias: True
decoder_block: "deepseek"
# MLA
attention_type: "mla"
q_lora_rank: 1536
kv_lora_rank: 512
qk_nope_head_dim: 128
qk_rope_head_dim: 64
v_head_dim: 128
mscale: 1.0
# RoPE
rope_type: "yarn"
rope_max_timescale: 10_000 # DeepSeek uses "rope_theta": 10000
max_position_embeddings: 163840
original_max_position_embeddings: 4096
rope_factor: 40
beta_fast: 32
rope_interleave: True
rope_truncate: True
rope_attention_scaling: False

override_logical_axis_rules: True
mesh_axes: ['data', 'stage', 'fsdp', 'fsdp_transpose', 'expert', 'context']
data_sharding: [['data', 'stage', 'fsdp', 'fsdp_transpose', 'expert', 'context']]
logical_axis_rules: [
  ['activation_batch', ['data', 'fsdp', 'fsdp_transpose', 'expert', 'context']],
  ['activation_batch_moe', ['data', 'fsdp', 'fsdp_transpose', 'expert', 'context']],
  ['activation_embed_and_logits_batch', ['data', 'stage', 'fsdp', 'fsdp_transpose', 'expert', 'context']],
  ['activation_kv_batch', ['data', 'fsdp', 'fsdp_transpose', 'expert', 'context']],
  # NOTE(review): 'activation_embed_and_logits_batch' also appears above with a
  # different axis list; if rule resolution is first-match-wins, this entry is
  # never used — confirm which of the two is intended.
  ['activation_embed_and_logits_batch', ['data', 'fsdp', 'fsdp_transpose', 'expert']],
  ['activation_norm_length', ['context']],
  ['activation_norm_length_moe', ['context']],
  ['activation_heads', []],
  ['activation_stage', 'stage'],
  ['embed', ['fsdp']],
  ['embed_moe', ['fsdp']],
  ['embed_no_exp', ['fsdp']],
  ['embed_no_exp_moe', ['fsdp']],
  ['q_lora', ['fsdp']],
  ['kv_lora', ['fsdp']],
  ['layers', 'stage'],
  ['q_lora_up_proj', ['fsdp_transpose']],
  ['kv_lora_up_proj', ['fsdp_transpose']],
  ['q_heads', ['fsdp_transpose']],
  ['kv_heads', ['fsdp_transpose']],
  ['heads', ['fsdp_transpose']],
  ['mlp', ['fsdp_transpose']],
  ['mlp_only_fsdp_transpose', ['fsdp_transpose']],
  ['expert_only', ['expert']],
  ['fsdp_transpose_only', ['fsdp_transpose']],
  ['fsdp_transpose_and_expert', ['fsdp_transpose', 'expert']],
]

src/maxtext/configs/types.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -220,6 +220,7 @@ class ProfilerType(str, Enum):
     "deepseek2-236b",
     "deepseek3-671b",
     "deepseek3-671b-2dfsdp",
+    "deepseek3-671b-batchsplit",
     "deepseek3-test",
     "deepseek3-tiny",
     "deepseek3.2-671b",

tests/unit/configs_test.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -201,6 +201,7 @@ def test_gpt_configs(config_file):
     os.path.join(CONFIGS_DIR, "models", "deepseek3-test.yml"),
     os.path.join(CONFIGS_DIR, "models", "deepseek3-671b.yml"),
     os.path.join(CONFIGS_DIR, "models", "deepseek3-671b-2dfsdp.yml"),
+    os.path.join(CONFIGS_DIR, "models", "deepseek3-671b-batchsplit.yml"),
 ]

0 commit comments

Comments
 (0)