|
1 | | -# Copyright 2023–2025 Google LLC |
| 1 | +# Copyright 2023–2026 Google LLC |
2 | 2 | # |
3 | | -# Licensed under the Apache License, Version 2.0 (the "License"); |
4 | | -# you may not use this file except in compliance with the License. |
5 | | -# You may obtain a copy of the License at |
| 3 | +# Licensed under the Apache License, Version 2.0 (the "License"); |
| 4 | +# you may not use this file except in compliance with the License. |
| 5 | +# You may obtain a copy of the License at |
6 | 6 | # |
7 | | -# https://www.apache.org/licenses/LICENSE-2.0 |
| 7 | +# https://www.apache.org/licenses/LICENSE-2.0 |
8 | 8 | # |
9 | | -# Unless required by applicable law or agreed to in writing, software |
10 | | -# distributed under the License is distributed on an "AS IS" BASIS, |
11 | | -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
12 | | -# See the License for the specific language governing permissions and |
13 | | -# limitations under the License. |
| 9 | +# Unless required by applicable law or agreed to in writing, software |
| 10 | +# distributed under the License is distributed on an "AS IS" BASIS, |
| 11 | +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 12 | +# See the License for the specific language governing permissions and |
| 13 | +# limitations under the License. |
14 | 14 |
|
15 | 15 | """ |
16 | 16 | This config defines the architectural configurations of the Hugging Face version of a model. |
|
210 | 210 | query_pre_attn_scalar=144, |
211 | 211 | ) |
212 | 212 |
|
| 213 | +qwen25_1_5b_config = transformers.Qwen2Config( |
| 214 | + vocab_size=151936, |
| 215 | + hidden_size=1536, |
| 216 | + intermediate_size=8960, |
| 217 | + num_hidden_layers=28, |
| 218 | + num_attention_heads=12, |
| 219 | + num_key_value_heads=2, |
| 220 | + hidden_act="silu", |
| 221 | + max_position_embeddings=32768, |
| 222 | + rms_norm_eps=1e-06, |
| 223 | + rope_theta=1000000.0, |
| 224 | + tie_word_embeddings=True, |
| 225 | + torch_dtype="bfloat16", |
| 226 | + attention_bias=True, |
| 227 | +) |
| 228 | + |
213 | 229 | qwen25_7b_config = transformers.Qwen2Config( |
214 | 230 | vocab_size=152064, |
215 | 231 | hidden_size=3584, |
|
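Not part of the PR, but a minimal sketch of how the newly added Qwen2.5-1.5B config could be sanity-checked by building a randomly initialized Hugging Face model from it. The `Qwen2ForCausalLM` class and the parameter-count check are assumptions for illustration, not code from this repository; dtype and bias flags from the diff are omitted for brevity.

    import transformers

    # Rebuild the config added above (architecture values copied from the diff).
    config = transformers.Qwen2Config(
        vocab_size=151936,
        hidden_size=1536,
        intermediate_size=8960,
        num_hidden_layers=28,
        num_attention_heads=12,
        num_key_value_heads=2,
        hidden_act="silu",
        max_position_embeddings=32768,
        rms_norm_eps=1e-06,
        rope_theta=1000000.0,
        tie_word_embeddings=True,
    )

    # Instantiate with random weights and confirm the size lands near 1.5B parameters.
    model = transformers.Qwen2ForCausalLM(config)
    n_params = sum(p.numel() for p in model.parameters())
    print(f"{n_params / 1e9:.2f}B parameters")
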
866 | 882 | "gemma3-4b": gemma3_4b_config, |
867 | 883 | "gemma3-12b": gemma3_12b_config, |
868 | 884 | "gemma3-27b": gemma3_27b_config, |
| 885 | + "qwen2.5-1.5b": qwen25_1_5b_config, |
869 | 886 | "qwen2.5-7b": qwen25_7b_config, |
870 | 887 | "qwen2.5-14b": qwen25_14b_config, |
871 | 888 | "qwen3-0.6b": qwen3_0_6b_config, |
|
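Likewise a hedged sketch, assuming a loader consumes the name-to-config mapping extended above; the registry variable `HF_MODEL_CONFIGS` and the `get_hf_config` helper are hypothetical names, since the real dict identifier is not visible in this excerpt.

    import transformers

    # Hypothetical registry name; only a few entries shown, including the one added here.
    HF_MODEL_CONFIGS = {
        "qwen2.5-1.5b": qwen25_1_5b_config,
        "qwen2.5-7b": qwen25_7b_config,
        "qwen2.5-14b": qwen25_14b_config,
    }

    def get_hf_config(model_name: str) -> transformers.PretrainedConfig:
        """Return the Hugging Face config registered for `model_name`."""
        try:
            return HF_MODEL_CONFIGS[model_name]
        except KeyError as e:
            raise ValueError(
                f"Unsupported model {model_name!r}; expected one of {sorted(HF_MODEL_CONFIGS)}"
            ) from e

    config = get_hf_config("qwen2.5-1.5b")  # the entry added in this change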