|
24 | 24 | else: |
25 | 25 | from transformers.configuration_utils import PretrainedConfig as PTConfig |
26 | 26 |
|
| 27 | + |
| 28 | +gemma4_26b_dict = { |
| 29 | + "architectures": ["Gemma4ForConditionalGeneration"], |
| 30 | + "audio_config": None, |
| 31 | + "audio_token_id": 258881, |
| 32 | + "boa_token_id": 256000, |
| 33 | + "boi_token_id": 255999, |
| 34 | + "dtype": "bfloat16", |
| 35 | + "eoa_token_id": 258883, |
| 36 | + "eoa_token_index": 258883, |
| 37 | + "eoi_token_id": 258882, |
| 38 | + "eos_token_id": [1, 106], |
| 39 | + "image_token_id": 258880, |
| 40 | + "initializer_range": 0.02, |
| 41 | + "model_type": "gemma4", |
| 42 | + "text_config": { |
| 43 | + "attention_bias": False, |
| 44 | + "attention_dropout": 0.0, |
| 45 | + "attention_k_eq_v": True, |
| 46 | + "bos_token_id": 2, |
| 47 | + "dtype": "bfloat16", |
| 48 | + "enable_moe_block": True, |
| 49 | + "eos_token_id": 1, |
| 50 | + "expert_intermediate_size": 704, |
| 51 | + "final_logit_softcapping": 30.0, |
| 52 | + "global_head_dim": 512, |
| 53 | + "head_dim": 256, |
| 54 | + "hidden_activation": "gelu_pytorch_tanh", |
| 55 | + "hidden_size": 2816, |
| 56 | + "hidden_size_per_layer_input": 0, |
| 57 | + "initializer_range": 0.02, |
| 58 | + "intermediate_size": 2112, |
| 59 | + "layer_types": [ |
| 60 | + "sliding_attention", |
| 61 | + "sliding_attention", |
| 62 | + "sliding_attention", |
| 63 | + "sliding_attention", |
| 64 | + "sliding_attention", |
| 65 | + "full_attention", |
| 66 | + ] |
| 67 | + * 5, |
| 68 | + "max_position_embeddings": 262144, |
| 69 | + "model_type": "gemma4_text", |
| 70 | + "num_attention_heads": 16, |
| 71 | + "num_experts": 128, |
| 72 | + "num_global_key_value_heads": 2, |
| 73 | + "num_hidden_layers": 30, |
| 74 | + "num_key_value_heads": 8, |
| 75 | + "num_kv_shared_layers": 0, |
| 76 | + "pad_token_id": 0, |
| 77 | + "rms_norm_eps": 1e-06, |
| 78 | + "rope_parameters": { |
| 79 | + "full_attention": {"partial_rotary_factor": 0.25, "rope_theta": 1_000_000.0, "rope_type": "proportional"}, |
| 80 | + "sliding_attention": {"rope_theta": 10_000.0, "rope_type": "default"}, |
| 81 | + }, |
| 82 | + "sliding_window": 1024, |
| 83 | + "tie_word_embeddings": True, |
| 84 | + "top_k_experts": 8, |
| 85 | + "use_bidirectional_attention": "vision", |
| 86 | + "use_cache": True, |
| 87 | + "use_double_wide_mlp": False, |
| 88 | + "vocab_size": 262144, |
| 89 | + "vocab_size_per_layer_input": 262144, |
| 90 | + }, |
| 91 | + "tie_word_embeddings": True, |
| 92 | + "transformers_version": "5.5.0.dev0", |
| 93 | + "video_token_id": 258884, |
| 94 | + "vision_config": { |
| 95 | + "attention_bias": False, |
| 96 | + "attention_dropout": 0.0, |
| 97 | + "default_output_length": 280, |
| 98 | + "dtype": "bfloat16", |
| 99 | + "global_head_dim": 72, |
| 100 | + "head_dim": 72, |
| 101 | + "hidden_activation": "gelu_pytorch_tanh", |
| 102 | + "hidden_size": 1152, |
| 103 | + "intermediate_size": 4304, |
| 104 | + "max_position_embeddings": 131072, |
| 105 | + "model_type": "gemma4_vision", |
| 106 | + "num_attention_heads": 16, |
| 107 | + "num_hidden_layers": 27, |
| 108 | + "num_key_value_heads": 16, |
| 109 | + "patch_size": 16, |
| 110 | + "pooling_kernel_size": 3, |
| 111 | + "position_embedding_size": 10240, |
| 112 | + "rms_norm_eps": 1e-06, |
| 113 | + "rope_parameters": {"rope_theta": 100.0, "rope_type": "default"}, |
| 114 | + "standardize": True, |
| 115 | + "use_clipped_linears": False, |
| 116 | + }, |
| 117 | + "vision_soft_tokens_per_image": 280, |
| 118 | +} |
| 119 | + |
| 120 | + |
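| | +# The 31B dict reuses the 26B dict above; text_config is copied separately because |
| | +# dict.copy() is shallow, so updating it in place would otherwise mutate the 26B config too. |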
| 121 | +gemma4_31b_dict = gemma4_26b_dict.copy() |
| 122 | +gemma4_31b_dict["text_config"] = gemma4_26b_dict["text_config"].copy() |
| 123 | +gemma4_31b_dict["text_config"].update( |
| 124 | + { |
| 125 | + "enable_moe_block": False, |
| 126 | + "expert_intermediate_size": None, |
| 127 | + "hidden_size": 5376, |
| 128 | + "intermediate_size": 21504, |
| 129 | + "layer_types": [ |
| 130 | + "sliding_attention", |
| 131 | + "sliding_attention", |
| 132 | + "sliding_attention", |
| 133 | + "sliding_attention", |
| 134 | + "sliding_attention", |
| 135 | + "full_attention", |
| 136 | + ] |
| 137 | + * 10, |
| 138 | + "num_attention_heads": 32, |
| 139 | + "num_experts": None, |
| 140 | + "num_global_key_value_heads": 4, |
| 141 | + "num_hidden_layers": 60, |
| 142 | + "num_key_value_heads": 16, |
| 143 | + "top_k_experts": None, |
| 144 | + } |
| 145 | +) |
| 146 | + |
| 147 | + |
| 148 | +try: |
| 149 | +    # Succeeds if the installed transformers version already ships Gemma 4 support |
| 150 | + gemma4_26b_config = transformers.Gemma4Config(**gemma4_26b_dict) |
| 151 | + gemma4_31b_config = transformers.Gemma4Config(**gemma4_31b_dict) |
| 152 | +except AttributeError: |
| 153 | +    # Fall back to a generic PretrainedConfig built from the raw dict when Gemma4Config is unavailable |
| 154 | + gemma4_26b_config = PTConfig(**gemma4_26b_dict) |
| 155 | + gemma4_31b_config = PTConfig(**gemma4_31b_dict) |
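| | +# Note: an equivalent selection is getattr(transformers, "Gemma4Config", PTConfig); the |
| | +# explicit try/except above keeps the fallback path easy to see. |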
| 156 | + |
| 157 | + |
27 | 158 | gemma3_4b_config = transformers.Gemma3Config( |
28 | 159 | architectures=["Gemma3ForConditionalGeneration"], |
29 | 160 | boi_token_index=255999, |
|
584 | 715 | "mscale": 0.707, |
585 | 716 | "mscale_all_dim": 0.707, |
586 | 717 | "original_max_position_embeddings": 4096, |
| 718 | + "rope_theta": 10_000, |
587 | 719 | "type": "yarn", |
588 | 720 | }, |
589 | | - "rope_theta": 10000, |
| 721 | + "rope_theta": 10_000, |
590 | 722 | "routed_scaling_factor": 1.0, |
591 | 723 | "scoring_func": "softmax", |
592 | 724 | "seq_aux": True, |
|
645 | 777 | "mscale": 1.0, |
646 | 778 | "mscale_all_dim": 1.0, |
647 | 779 | "original_max_position_embeddings": 4096, |
| 780 | + "rope_theta": 10_000, |
648 | 781 | "type": "yarn", |
649 | 782 | }, |
650 | | - "rope_theta": 10000, |
| 783 | + "rope_theta": 10_000, |
651 | 784 | "routed_scaling_factor": 2.5, |
652 | 785 | "scoring_func": "sigmoid", |
653 | 786 | "tie_word_embeddings": False, |
|
697 | 830 | "qk_rope_head_dim": 64, |
698 | 831 | "rms_norm_eps": 1e-06, |
699 | 832 | "rope_scaling": { |
700 | | - "beta_fast": 32, |
701 | | - "beta_slow": 1, |
702 | | - "factor": 40, |
| 833 | + "beta_fast": 32.0, |
| 834 | + "beta_slow": 1.0, |
| 835 | + "factor": 40.0, |
703 | 836 | "mscale": 1.0, |
704 | 837 | "mscale_all_dim": 1.0, |
705 | 838 | "original_max_position_embeddings": 4096, |
| 839 | + "rope_theta": 10_000, |
706 | 840 | "type": "yarn", |
707 | 841 | }, |
708 | | - "rope_theta": 10000, |
| 842 | + "rope_theta": 10_000, |
709 | 843 | "routed_scaling_factor": 2.5, |
710 | 844 | "scoring_func": "sigmoid", |
711 | 845 | "tie_word_embeddings": False, |
|
717 | 851 | "v_head_dim": 128, |
718 | 852 | "vocab_size": 129280, |
719 | 853 | } |
| 854 | + |
| 855 | + |
720 | 856 | # TODO(shuningjin): replace with DeepseekV32Config when available in transformers library |
721 | | -deepseek32_671b_config = PTConfig(**deepseek32_671b_dict) |
| 857 | +class DeepseekV32Config(PTConfig): |
| 858 | + |
| 859 | + def __init__(self, **kwargs): |
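| | +        # Ensure max_position_embeddings always exists, defaulting to 163840 when the dict omits it |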
| 860 | + self.max_position_embeddings = kwargs.get("max_position_embeddings", 163840) |
| 861 | + super().__init__(**kwargs) |
| 862 | + |
| 863 | + |
| 864 | +deepseek32_671b_config = DeepseekV32Config(**deepseek32_671b_dict) |
722 | 865 |
|
723 | 866 | # from https://huggingface.co/openai/gpt-oss-20b/blob/main/config.json |
724 | 867 | # remove mxfp4 quantization_config, since we are using bf16 |
|
775 | 918 | "beta_slow": 1.0, |
776 | 919 | "factor": 32.0, |
777 | 920 | "original_max_position_embeddings": 4096, |
| 921 | + "rope_theta": 150_000, |
778 | 922 | "rope_type": "yarn", |
779 | 923 | "truncate": False, |
780 | 924 | }, |
781 | | - "rope_theta": 150000, |
| 925 | + "rope_theta": 150_000, |
782 | 926 | "router_aux_loss_coef": 0.9, |
783 | 927 | "sliding_window": 128, |
784 | 928 | "swiglu_limit": 7.0, |
|
856 | 1000 | "beta_slow": 1.0, |
857 | 1001 | "factor": 32.0, |
858 | 1002 | "original_max_position_embeddings": 4096, |
| 1003 | + "rope_theta": 150_000, |
859 | 1004 | "rope_type": "yarn", |
860 | 1005 | "truncate": False, |
861 | 1006 | }, |
862 | | - "rope_theta": 150000, |
| 1007 | + "rope_theta": 150_000, |
863 | 1008 | "router_aux_loss_coef": 0.9, |
864 | 1009 | "sliding_window": 128, |
865 | 1010 | "swiglu_limit": 7.0, |
|
1006 | 1151 | "gemma3-4b": gemma3_4b_config, |
1007 | 1152 | "gemma3-12b": gemma3_12b_config, |
1008 | 1153 | "gemma3-27b": gemma3_27b_config, |
| 1154 | + "gemma4-26b": gemma4_26b_config, |
| 1155 | + "gemma4-31b": gemma4_31b_config, |
1009 | 1156 | "qwen2.5-1.5b": qwen25_1_5b_config, |
1010 | 1157 | "qwen2.5-7b": qwen25_7b_config, |
1011 | 1158 | "qwen2.5-14b": qwen25_14b_config, |
|