Skip to content

Commit 387df2d

Browse files
committed
bringup qwen2.5-1.5B
1 parent 7ac9fb4 commit 387df2d

9 files changed

Lines changed: 69 additions & 12 deletions

File tree

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -107,7 +107,7 @@ MaxText aims to provide you with the best OSS models, whether as a reference imp
107107
* Gemma 2 (2B, 9B, 27B)
108108
* Gemma 1 (2B, 7B)
109109
* Alibaba
110-
* Qwen 2.5 (7B, 14B)
110+
* Qwen 2.5 (1.5B, 7B, 14B)
111111
* Qwen 3 MoE 2507 (235B, 480B)
112112
* Qwen 3 MoE (30B, 235B)
113113
* Qwen 3 Dense (0.6B, 1.7B, 4B, 8B, 14B, 32B)

docs/guides/checkpointing_solutions/convert_checkpoint.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ The following models are supported:
1111
| **Gemma2** | 2B, 9B, 27B |||||
1212
| **Gemma3** (Multimodal) | 4B, 12B, 27B |||||
1313
| **Llama3.1** | 8B, 70B, 405B |||||
14-
| **Qwen2.5** | 7B, 14B |||||
14+
| **Qwen2.5** | 1.5B, 7B, 14B |||||
1515
| **Qwen3** | 0.6B, 4B, 8B, 14B, 32B |||||
1616
| **Qwen3 MoE** | 30B, 235B, 480B |||||
1717
| **Mixtral** | 8x7B, 8x22B |||||

src/maxtext/checkpoint_conversion/utils/hf_model_configs.py

Lines changed: 27 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,16 @@
1-
# Copyright 2023–2025 Google LLC
1+
# Copyright 2023–2026 Google LLC
22
#
3-
# Licensed under the Apache License, Version 2.0 (the "License");
4-
# you may not use this file except in compliance with the License.
5-
# You may obtain a copy of the License at
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
66
#
7-
# https://www.apache.org/licenses/LICENSE-2.0
7+
# https://www.apache.org/licenses/LICENSE-2.0
88
#
9-
# Unless required by applicable law or agreed to in writing, software
10-
# distributed under the License is distributed on an "AS IS" BASIS,
11-
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12-
# See the License for the specific language governing permissions and
13-
# limitations under the License.
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
1414

1515
"""
1616
This config defines the architectural configurations of the Hugging Face version of a model.
@@ -210,6 +210,22 @@
210210
query_pre_attn_scalar=144,
211211
)
212212

213+
qwen25_1_5b_config = transformers.Qwen2Config(
214+
vocab_size=151936,
215+
hidden_size=1536,
216+
intermediate_size=8960,
217+
num_hidden_layers=28,
218+
num_attention_heads=12,
219+
num_key_value_heads=2,
220+
hidden_act="silu",
221+
max_position_embeddings=32768,
222+
rms_norm_eps=1e-06,
223+
rope_theta=1000000.0,
224+
tie_word_embeddings=True,
225+
torch_dtype="bfloat16",
226+
attention_bias=True,
227+
)
228+
213229
qwen25_7b_config = transformers.Qwen2Config(
214230
vocab_size=152064,
215231
hidden_size=3584,
@@ -866,6 +882,7 @@
866882
"gemma3-4b": gemma3_4b_config,
867883
"gemma3-12b": gemma3_12b_config,
868884
"gemma3-27b": gemma3_27b_config,
885+
"qwen2.5-1.5b": qwen25_1_5b_config,
869886
"qwen2.5-7b": qwen25_7b_config,
870887
"qwen2.5-14b": qwen25_14b_config,
871888
"qwen3-0.6b": qwen3_0_6b_config,

src/maxtext/checkpoint_conversion/utils/hf_shape.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -766,6 +766,7 @@ def MIXTRAL_HF_WEIGHTS_TO_SHAPE(config):
766766
"gemma3-4b": GEMMA3_HF_WEIGHTS_TO_SHAPE,
767767
"gemma3-12b": GEMMA3_HF_WEIGHTS_TO_SHAPE,
768768
"gemma3-27b": GEMMA3_HF_WEIGHTS_TO_SHAPE,
769+
"qwen2.5-1.5b": QWEN_HF_WEIGHTS_TO_SHAPE,
769770
"qwen2.5-7b": QWEN_HF_WEIGHTS_TO_SHAPE,
770771
"qwen2.5-14b": QWEN_HF_WEIGHTS_TO_SHAPE,
771772
"qwen3-0.6b": QWEN_HF_WEIGHTS_TO_SHAPE,

src/maxtext/checkpoint_conversion/utils/param_mapping.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2359,6 +2359,7 @@ def pad_hf_embedding_layer(input_tensor, target_shape):
23592359
"gemma3-4b": GEMMA3_MAXTEXT_TO_HF_PARAM_MAPPING,
23602360
"gemma3-12b": GEMMA3_MAXTEXT_TO_HF_PARAM_MAPPING,
23612361
"gemma3-27b": GEMMA3_MAXTEXT_TO_HF_PARAM_MAPPING,
2362+
"qwen2.5-1.5b": QWEN_MAXTEXT_TO_HF_PARAM_MAPPING,
23622363
"qwen2.5-7b": QWEN_MAXTEXT_TO_HF_PARAM_MAPPING,
23632364
"qwen2.5-14b": QWEN_MAXTEXT_TO_HF_PARAM_MAPPING,
23642365
"qwen3-0.6b": QWEN_MAXTEXT_TO_HF_PARAM_MAPPING,
@@ -2399,6 +2400,7 @@ def pad_hf_embedding_layer(input_tensor, target_shape):
23992400
"gemma3-4b": GEMMA3_MAXTEXT_TO_HF_PARAM_HOOK_FN,
24002401
"gemma3-12b": GEMMA3_MAXTEXT_TO_HF_PARAM_HOOK_FN,
24012402
"gemma3-27b": GEMMA3_MAXTEXT_TO_HF_PARAM_HOOK_FN,
2403+
"qwen2.5-1.5b": QWEN_MAXTEXT_TO_HF_PARAM_HOOK_FN,
24022404
"qwen2.5-7b": QWEN_MAXTEXT_TO_HF_PARAM_HOOK_FN,
24032405
"qwen2.5-14b": QWEN_MAXTEXT_TO_HF_PARAM_HOOK_FN,
24042406
"qwen3-0.6b": QWEN_MAXTEXT_TO_HF_PARAM_HOOK_FN,
(new file — path missing from extraction; presumably the Qwen 2.5 1.5B model config YAML, e.g. src/maxtext/configs/models/qwen2.5-1.5b.yml — verify against the commit)

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
# Copyright 2023–2026 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# https://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
# Qwen 2.5 1.5B Instruct Configuration
16+
# https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct
17+
18+
base_emb_dim: 1536
19+
base_num_query_heads: 12
20+
base_num_kv_heads: 2
21+
base_mlp_dim: 8960
22+
base_num_decoder_layers: 28
23+
head_dim: 128
24+
mlp_activations: ["silu", "linear"]
25+
vocab_size: 151936
26+
decoder_block: "qwen2"
27+
normalization_layer_epsilon: 1e-06
28+
rope_max_timescale: 1000000.0
29+
use_qk_norm: False
30+
# Bias for q, k, v proj.
31+
attention_bias: True
32+
logits_via_embedding: True
33+
normalize_embedding_logits: False
34+
tokenizer_type: "huggingface"

src/maxtext/configs/pyconfig_deprecated.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -460,6 +460,7 @@ def validate_model_name(s: str) -> bool:
460460
"gemma3-4b",
461461
"gemma3-12b",
462462
"gemma3-27b",
463+
"qwen2.5-1.5b",
463464
"qwen2.5-7b",
464465
"qwen2.5-14b",
465466
"qwen3-0.6b",

src/maxtext/configs/types.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -233,6 +233,7 @@ class ProfilerType(str, Enum):
233233
"gemma3-4b",
234234
"gemma3-12b",
235235
"gemma3-27b",
236+
"qwen2.5-1.5b",
236237
"qwen2.5-7b",
237238
"qwen2.5-14b",
238239
"qwen3-0.6b",

src/maxtext/utils/globals.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@
5050
"gemma3-4b": "google/gemma-3-4b-it", # hf multi-modal should also support the pure-text
5151
"gemma3-12b": "google/gemma-3-12b-it",
5252
"gemma3-27b": "google/gemma-3-27b-it",
53+
"qwen2.5-1.5b": "Qwen/Qwen2.5-1.5B-Instruct",
5354
"qwen2.5-7b": "Qwen/Qwen2.5-7B-Instruct",
5455
"qwen2.5-14b": "Qwen/Qwen2.5-14B-Instruct",
5556
"qwen3-0.6b": "Qwen/Qwen3-0.6B",

0 commit comments

Comments
 (0)