Skip to content

Commit 4910293

Browse files
Merge pull request #3506 from AI-Hypercomputer:jimmytsai/bring-up-qwen2_5-1_5b
PiperOrigin-RevId: 892103358
2 parents 5478bad + 387df2d commit 4910293

9 files changed

Lines changed: 69 additions & 12 deletions

File tree

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -107,7 +107,7 @@ MaxText aims to provide you with the best OSS models, whether as a reference imp
107107
* Gemma 2 (2B, 9B, 27B)
108108
* Gemma 1 (2B, 7B)
109109
* Alibaba
110-
* Qwen 2.5 (7B, 14B)
110+
* Qwen 2.5 (1.5B, 7B, 14B)
111111
* Qwen 3 MoE 2507 (235B, 480B)
112112
* Qwen 3 MoE (30B, 235B)
113113
* Qwen 3 Dense (0.6B, 1.7B, 4B, 8B, 14B, 32B)

docs/guides/checkpointing_solutions/convert_checkpoint.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ The following models are supported:
1111
| **Gemma2** | 2B, 9B, 27B |||||
1212
| **Gemma3** (Multimodal) | 4B, 12B, 27B |||||
1313
| **Llama3.1** | 8B, 70B, 450B |||||
14-
| **Qwen2.5** | 7B, 14B |||||
14+
| **Qwen2.5** | 1.5B, 7B, 14B |||||
1515
| **Qwen3** | 0.6B, 4B, 8B, 14B, 32B |||||
1616
| **Qwen3 MoE** | 30B, 235B, 480B |||||
1717
| **Mixtral** | 8x7B, 8x22B |||||

src/maxtext/checkpoint_conversion/utils/hf_model_configs.py

Lines changed: 27 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,16 @@
1-
# Copyright 2023–2025 Google LLC
1+
# Copyright 2023–2026 Google LLC
22
#
3-
# Licensed under the Apache License, Version 2.0 (the "License");
4-
# you may not use this file except in compliance with the License.
5-
# You may obtain a copy of the License at
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
66
#
7-
# https://www.apache.org/licenses/LICENSE-2.0
7+
# https://www.apache.org/licenses/LICENSE-2.0
88
#
9-
# Unless required by applicable law or agreed to in writing, software
10-
# distributed under the License is distributed on an "AS IS" BASIS,
11-
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12-
# See the License for the specific language governing permissions and
13-
# limitations under the License.
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
1414

1515
"""
1616
This config defines the architectural configurations of the Hugging Face version of a model.
@@ -215,6 +215,22 @@
215215
query_pre_attn_scalar=144,
216216
)
217217

218+
qwen25_1_5b_config = transformers.Qwen2Config(
219+
vocab_size=151936,
220+
hidden_size=1536,
221+
intermediate_size=8960,
222+
num_hidden_layers=28,
223+
num_attention_heads=12,
224+
num_key_value_heads=2,
225+
hidden_act="silu",
226+
max_position_embeddings=32768,
227+
rms_norm_eps=1e-06,
228+
rope_theta=1000000.0,
229+
tie_word_embeddings=True,
230+
torch_dtype="bfloat16",
231+
attention_bias=True,
232+
)
233+
218234
qwen25_7b_config = transformers.Qwen2Config(
219235
vocab_size=152064,
220236
hidden_size=3584,
@@ -990,6 +1006,7 @@
9901006
"gemma3-4b": gemma3_4b_config,
9911007
"gemma3-12b": gemma3_12b_config,
9921008
"gemma3-27b": gemma3_27b_config,
1009+
"qwen2.5-1.5b": qwen25_1_5b_config,
9931010
"qwen2.5-7b": qwen25_7b_config,
9941011
"qwen2.5-14b": qwen25_14b_config,
9951012
"qwen3-0.6b": qwen3_0_6b_config,

src/maxtext/checkpoint_conversion/utils/hf_shape.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -787,6 +787,7 @@ def MIXTRAL_HF_WEIGHTS_TO_SHAPE(config):
787787
"gemma3-4b": GEMMA3_HF_WEIGHTS_TO_SHAPE,
788788
"gemma3-12b": GEMMA3_HF_WEIGHTS_TO_SHAPE,
789789
"gemma3-27b": GEMMA3_HF_WEIGHTS_TO_SHAPE,
790+
"qwen2.5-1.5b": QWEN_HF_WEIGHTS_TO_SHAPE,
790791
"qwen2.5-7b": QWEN_HF_WEIGHTS_TO_SHAPE,
791792
"qwen2.5-14b": QWEN_HF_WEIGHTS_TO_SHAPE,
792793
"qwen3-0.6b": QWEN_HF_WEIGHTS_TO_SHAPE,

src/maxtext/checkpoint_conversion/utils/param_mapping.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2395,6 +2395,7 @@ def pad_hf_embedding_layer(input_tensor, target_shape):
23952395
"gemma3-4b": GEMMA3_MAXTEXT_TO_HF_PARAM_MAPPING,
23962396
"gemma3-12b": GEMMA3_MAXTEXT_TO_HF_PARAM_MAPPING,
23972397
"gemma3-27b": GEMMA3_MAXTEXT_TO_HF_PARAM_MAPPING,
2398+
"qwen2.5-1.5b": QWEN_MAXTEXT_TO_HF_PARAM_MAPPING,
23982399
"qwen2.5-7b": QWEN_MAXTEXT_TO_HF_PARAM_MAPPING,
23992400
"qwen2.5-14b": QWEN_MAXTEXT_TO_HF_PARAM_MAPPING,
24002401
"qwen3-0.6b": QWEN_MAXTEXT_TO_HF_PARAM_MAPPING,
@@ -2437,6 +2438,7 @@ def pad_hf_embedding_layer(input_tensor, target_shape):
24372438
"gemma3-4b": GEMMA3_MAXTEXT_TO_HF_PARAM_HOOK_FN,
24382439
"gemma3-12b": GEMMA3_MAXTEXT_TO_HF_PARAM_HOOK_FN,
24392440
"gemma3-27b": GEMMA3_MAXTEXT_TO_HF_PARAM_HOOK_FN,
2441+
"qwen2.5-1.5b": QWEN_MAXTEXT_TO_HF_PARAM_HOOK_FN,
24402442
"qwen2.5-7b": QWEN_MAXTEXT_TO_HF_PARAM_HOOK_FN,
24412443
"qwen2.5-14b": QWEN_MAXTEXT_TO_HF_PARAM_HOOK_FN,
24422444
"qwen3-0.6b": QWEN_MAXTEXT_TO_HF_PARAM_HOOK_FN,
src/maxtext/configs/models/qwen2.5-1.5b.yaml (filename header missing in this capture; path inferred from the commit's other file paths and the "qwen2.5-1.5b" model key — verify against the original commit)

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
# Copyright 2023–2026 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# https://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
# Qwen 2.5 1.5B Instruct Configuration
16+
# https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct
17+
18+
base_emb_dim: 1536
19+
base_num_query_heads: 12
20+
base_num_kv_heads: 2
21+
base_mlp_dim: 8960
22+
base_num_decoder_layers: 28
23+
head_dim: 128
24+
mlp_activations: ["silu", "linear"]
25+
vocab_size: 151936
26+
decoder_block: "qwen2"
27+
normalization_layer_epsilon: 1e-06
28+
rope_max_timescale: 1000000.0
29+
use_qk_norm: False
30+
# Bias for q, k, v proj.
31+
attention_bias: True
32+
logits_via_embedding: True
33+
normalize_embedding_logits: False
34+
tokenizer_type: "huggingface"

src/maxtext/configs/pyconfig_deprecated.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -460,6 +460,7 @@ def validate_model_name(s: str) -> bool:
460460
"gemma3-4b",
461461
"gemma3-12b",
462462
"gemma3-27b",
463+
"qwen2.5-1.5b",
463464
"qwen2.5-7b",
464465
"qwen2.5-14b",
465466
"qwen3-0.6b",

src/maxtext/configs/types.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -234,6 +234,7 @@ class ProfilerType(str, Enum):
234234
"gemma3-4b",
235235
"gemma3-12b",
236236
"gemma3-27b",
237+
"qwen2.5-1.5b",
237238
"qwen2.5-7b",
238239
"qwen2.5-14b",
239240
"qwen3-0.6b",

src/maxtext/utils/globals.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@
5050
"gemma3-4b": "google/gemma-3-4b-it", # hf multi-modal should also support the pure-text
5151
"gemma3-12b": "google/gemma-3-12b-it",
5252
"gemma3-27b": "google/gemma-3-27b-it",
53+
"qwen2.5-1.5b": "Qwen/Qwen2.5-1.5B-Instruct",
5354
"qwen2.5-7b": "Qwen/Qwen2.5-7B-Instruct",
5455
"qwen2.5-14b": "Qwen/Qwen2.5-14B-Instruct",
5556
"qwen3-0.6b": "Qwen/Qwen3-0.6B",

0 commit comments

Comments (0)