Skip to content

Commit 4910293

Browse files
Merge pull request #3506 from AI-Hypercomputer:jimmytsai/bring-up-qwen2_5-1_5b
PiperOrigin-RevId: 892103358
2 parents 5478bad + 387df2d commit 4910293

9 files changed

Lines changed: 69 additions & 12 deletions

File tree

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -107,7 +107,7 @@ MaxText aims to provide you with the best OSS models, whether as a reference imp
107107
* Gemma 2 (2B, 9B, 27B)
108108
* Gemma 1 (2B, 7B)
109109
* Alibaba
110-
* Qwen 2.5 (7B, 14B)
110+
* Qwen 2.5 (1.5B, 7B, 14B)
111111
* Qwen 3 MoE 2507 (235B, 480B)
112112
* Qwen 3 MoE (30B, 235B)
113113
* Qwen 3 Dense (0.6B, 1.7B, 4B, 8B, 14B, 32B)

docs/guides/checkpointing_solutions/convert_checkpoint.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ The following models are supported:
1111
| **Gemma2** | 2B, 9B, 27B |||||
1212
| **Gemma3** (Multimodal) | 4B, 12B, 27B |||||
1313
| **Llama3.1** | 8B, 70B, 450B |||||
14-
| **Qwen2.5** | 7B, 14B |||||
14+
| **Qwen2.5** | 1.5B, 7B, 14B |||||
1515
| **Qwen3** | 0.6B, 4B, 8B, 14B, 32B |||||
1616
| **Qwen3 MoE** | 30B, 235B, 480B |||||
1717
| **Mixtral** | 8x7B, 8x22B |||||

src/maxtext/checkpoint_conversion/utils/hf_model_configs.py

Lines changed: 27 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,16 @@
1-
# Copyright 2023–2025 Google LLC
1+
# Copyright 2023–2026 Google LLC
22
#
3-
# Licensed under the Apache License, Version 2.0 (the "License");
4-
# you may not use this file except in compliance with the License.
5-
# You may obtain a copy of the License at
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
66
#
7-
# https://www.apache.org/licenses/LICENSE-2.0
7+
# https://www.apache.org/licenses/LICENSE-2.0
88
#
9-
# Unless required by applicable law or agreed to in writing, software
10-
# distributed under the License is distributed on an "AS IS" BASIS,
11-
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12-
# See the License for the specific language governing permissions and
13-
# limitations under the License.
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
1414

1515
"""
1616
This config defines the architectural configurations of the Hugging Face version of a model.
@@ -215,6 +215,22 @@
215215
query_pre_attn_scalar=144,
216216
)
217217

218+
qwen25_1_5b_config = transformers.Qwen2Config(
219+
vocab_size=151936,
220+
hidden_size=1536,
221+
intermediate_size=8960,
222+
num_hidden_layers=28,
223+
num_attention_heads=12,
224+
num_key_value_heads=2,
225+
hidden_act="silu",
226+
max_position_embeddings=32768,
227+
rms_norm_eps=1e-06,
228+
rope_theta=1000000.0,
229+
tie_word_embeddings=True,
230+
torch_dtype="bfloat16",
231+
attention_bias=True,
232+
)
233+
218234
qwen25_7b_config = transformers.Qwen2Config(
219235
vocab_size=152064,
220236
hidden_size=3584,
@@ -990,6 +1006,7 @@
9901006
"gemma3-4b": gemma3_4b_config,
9911007
"gemma3-12b": gemma3_12b_config,
9921008
"gemma3-27b": gemma3_27b_config,
1009+
"qwen2.5-1.5b": qwen25_1_5b_config,
9931010
"qwen2.5-7b": qwen25_7b_config,
9941011
"qwen2.5-14b": qwen25_14b_config,
9951012
"qwen3-0.6b": qwen3_0_6b_config,

src/maxtext/checkpoint_conversion/utils/hf_shape.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -787,6 +787,7 @@ def MIXTRAL_HF_WEIGHTS_TO_SHAPE(config):
787787
"gemma3-4b": GEMMA3_HF_WEIGHTS_TO_SHAPE,
788788
"gemma3-12b": GEMMA3_HF_WEIGHTS_TO_SHAPE,
789789
"gemma3-27b": GEMMA3_HF_WEIGHTS_TO_SHAPE,
790+
"qwen2.5-1.5b": QWEN_HF_WEIGHTS_TO_SHAPE,
790791
"qwen2.5-7b": QWEN_HF_WEIGHTS_TO_SHAPE,
791792
"qwen2.5-14b": QWEN_HF_WEIGHTS_TO_SHAPE,
792793
"qwen3-0.6b": QWEN_HF_WEIGHTS_TO_SHAPE,

src/maxtext/checkpoint_conversion/utils/param_mapping.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2395,6 +2395,7 @@ def pad_hf_embedding_layer(input_tensor, target_shape):
23952395
"gemma3-4b": GEMMA3_MAXTEXT_TO_HF_PARAM_MAPPING,
23962396
"gemma3-12b": GEMMA3_MAXTEXT_TO_HF_PARAM_MAPPING,
23972397
"gemma3-27b": GEMMA3_MAXTEXT_TO_HF_PARAM_MAPPING,
2398+
"qwen2.5-1.5b": QWEN_MAXTEXT_TO_HF_PARAM_MAPPING,
23982399
"qwen2.5-7b": QWEN_MAXTEXT_TO_HF_PARAM_MAPPING,
23992400
"qwen2.5-14b": QWEN_MAXTEXT_TO_HF_PARAM_MAPPING,
24002401
"qwen3-0.6b": QWEN_MAXTEXT_TO_HF_PARAM_MAPPING,
@@ -2437,6 +2438,7 @@ def pad_hf_embedding_layer(input_tensor, target_shape):
24372438
"gemma3-4b": GEMMA3_MAXTEXT_TO_HF_PARAM_HOOK_FN,
24382439
"gemma3-12b": GEMMA3_MAXTEXT_TO_HF_PARAM_HOOK_FN,
24392440
"gemma3-27b": GEMMA3_MAXTEXT_TO_HF_PARAM_HOOK_FN,
2441+
"qwen2.5-1.5b": QWEN_MAXTEXT_TO_HF_PARAM_HOOK_FN,
24402442
"qwen2.5-7b": QWEN_MAXTEXT_TO_HF_PARAM_HOOK_FN,
24412443
"qwen2.5-14b": QWEN_MAXTEXT_TO_HF_PARAM_HOOK_FN,
24422444
"qwen3-0.6b": QWEN_MAXTEXT_TO_HF_PARAM_HOOK_FN,
src/maxtext/configs/models/qwen2.5-1.5b.yaml (filename header missing in this capture; path inferred from the commit's other file paths and the "qwen2.5-1.5b" model key — verify against the original commit)

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
# Copyright 2023–2026 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# https://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
# Qwen 2.5 1.5B Instruct Configuration
16+
# https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct
17+
18+
base_emb_dim: 1536
19+
base_num_query_heads: 12
20+
base_num_kv_heads: 2
21+
base_mlp_dim: 8960
22+
base_num_decoder_layers: 28
23+
head_dim: 128
24+
mlp_activations: ["silu", "linear"]
25+
vocab_size: 151936
26+
decoder_block: "qwen2"
27+
normalization_layer_epsilon: 1e-06
28+
rope_max_timescale: 1000000.0
29+
use_qk_norm: False
30+
# Bias for q, k, v proj.
31+
attention_bias: True
32+
logits_via_embedding: True
33+
normalize_embedding_logits: False
34+
tokenizer_type: "huggingface"

src/maxtext/configs/pyconfig_deprecated.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -460,6 +460,7 @@ def validate_model_name(s: str) -> bool:
460460
"gemma3-4b",
461461
"gemma3-12b",
462462
"gemma3-27b",
463+
"qwen2.5-1.5b",
463464
"qwen2.5-7b",
464465
"qwen2.5-14b",
465466
"qwen3-0.6b",

src/maxtext/configs/types.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -234,6 +234,7 @@ class ProfilerType(str, Enum):
234234
"gemma3-4b",
235235
"gemma3-12b",
236236
"gemma3-27b",
237+
"qwen2.5-1.5b",
237238
"qwen2.5-7b",
238239
"qwen2.5-14b",
239240
"qwen3-0.6b",

src/maxtext/utils/globals.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@
5050
"gemma3-4b": "google/gemma-3-4b-it", # hf multi-modal should also support the pure-text
5151
"gemma3-12b": "google/gemma-3-12b-it",
5252
"gemma3-27b": "google/gemma-3-27b-it",
53+
"qwen2.5-1.5b": "Qwen/Qwen2.5-1.5B-Instruct",
5354
"qwen2.5-7b": "Qwen/Qwen2.5-7B-Instruct",
5455
"qwen2.5-14b": "Qwen/Qwen2.5-14B-Instruct",
5556
"qwen3-0.6b": "Qwen/Qwen3-0.6B",

0 commit comments

Comments (0)