Skip to content

Commit aeb7510

Browse files
Merge pull request #3497 from AI-Hypercomputer:qinwen/add_batch_split
PiperOrigin-RevId: 889940120
2 parents 7ac9fb4 + ad4f663 commit aeb7510

3 files changed

Lines changed: 90 additions & 0 deletions

File tree

Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,88 @@
# Copyright 2023–2026 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# model config for DeepSeek V3 - 671B that uses fsdp on two logical axes

# For DeepSeek default device-limited routing,
# please set n_routing_groups=8 and topk_routing_group=4 in your command-line arguments.

base_emb_dim: 7168
base_num_query_heads: 128
base_num_kv_heads: 128
base_mlp_dim: 18432
base_moe_mlp_dim: 2048
base_num_decoder_layers: 61
first_num_dense_layers: 3
mlp_activations: ["silu","linear"]
vocab_size: 129280
enable_dropout: False
logits_via_embedding: False
normalization_layer_epsilon: 1.0e-6
num_experts: 256
num_experts_per_tok: 8
shared_experts: 1
routed_scaling_factor: 2.5
routed_score_func: "sigmoid"
routed_bias: True
decoder_block: "deepseek"
# MLA
attention_type: "mla"
q_lora_rank: 1536
kv_lora_rank: 512
qk_nope_head_dim: 128
qk_rope_head_dim: 64
v_head_dim: 128
mscale: 1.0
# RoPE
rope_type: "yarn"
rope_max_timescale: 10_000 # DeepSeek uses "rope_theta": 10000
max_position_embeddings: 163840
original_max_position_embeddings: 4096
rope_factor: 40
beta_fast: 32
rope_interleave: True
rope_truncate: True
rope_attention_scaling: False

override_logical_axis_rules: True
mesh_axes: ['data', 'stage', 'fsdp', 'fsdp_transpose', 'expert', 'context']
data_sharding: [['data', 'stage', 'fsdp', 'fsdp_transpose', 'expert', 'context']]
logical_axis_rules: [
  ['activation_batch', ['data', 'fsdp', 'fsdp_transpose', 'expert', 'context']],
  ['activation_batch_moe', ['data', 'fsdp', 'fsdp_transpose', 'expert', 'context']],
  ['activation_embed_and_logits_batch', ['data', 'stage', 'fsdp', 'fsdp_transpose', 'expert', 'context']],
  ['activation_kv_batch', ['data', 'fsdp', 'fsdp_transpose', 'expert', 'context']],
  # NOTE(review): 'activation_embed_and_logits_batch' also appears above with a
  # different axis list; if rule resolution is first-match-wins, this entry is
  # never used — confirm which of the two is intended.
  ['activation_embed_and_logits_batch', ['data', 'fsdp', 'fsdp_transpose', 'expert']],
  ['activation_norm_length', ['context']],
  ['activation_norm_length_moe', ['context']],
  ['activation_heads', []],
  ['activation_stage', 'stage'],
  ['embed', ['fsdp']],
  ['embed_moe', ['fsdp']],
  ['embed_no_exp', ['fsdp']],
  ['embed_no_exp_moe', ['fsdp']],
  ['q_lora', ['fsdp']],
  ['kv_lora', ['fsdp']],
  ['layers', 'stage'],
  ['q_lora_up_proj', ['fsdp_transpose']],
  ['kv_lora_up_proj', ['fsdp_transpose']],
  ['q_heads', ['fsdp_transpose']],
  ['kv_heads', ['fsdp_transpose']],
  ['heads', ['fsdp_transpose']],
  ['mlp', ['fsdp_transpose']],
  ['mlp_only_fsdp_transpose', ['fsdp_transpose']],
  ['expert_only', ['expert']],
  ['fsdp_transpose_only', ['fsdp_transpose']],
  ['fsdp_transpose_and_expert', ['fsdp_transpose', 'expert']],
]

src/maxtext/configs/types.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -220,6 +220,7 @@ class ProfilerType(str, Enum):
     "deepseek2-236b",
     "deepseek3-671b",
     "deepseek3-671b-2dfsdp",
+    "deepseek3-671b-batchsplit",
     "deepseek3-test",
     "deepseek3-tiny",
     "deepseek3.2-671b",

tests/unit/configs_test.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -201,6 +201,7 @@ def test_gpt_configs(config_file):
     os.path.join(CONFIGS_DIR, "models", "deepseek3-test.yml"),
     os.path.join(CONFIGS_DIR, "models", "deepseek3-671b.yml"),
     os.path.join(CONFIGS_DIR, "models", "deepseek3-671b-2dfsdp.yml"),
+    os.path.join(CONFIGS_DIR, "models", "deepseek3-671b-batchsplit.yml"),
 ]

0 commit comments

Comments
 (0)