
Commit 447a940

Merge pull request #3554 from AI-Hypercomputer:agagik-gemma4-moe
PiperOrigin-RevId: 893708237
2 parents: 0ee5a04 + bfc1e43

5 files changed: 32 additions & 30 deletions


src/maxtext/checkpoint_conversion/utils/param_mapping.py

Lines changed: 16 additions & 12 deletions
@@ -2362,16 +2362,16 @@ def GEMMA4_MAXTEXT_TO_HF_PARAM_MAPPING(config, maxtext_config, scan_layers=False
           f"{text_base}.layers.{i}.router.proj.weight" if num_experts > 1 else None for i in hf_indices
       ],
       f"{prefix}-mlp-moe_block-MoeBlock_0-wi_0": [
-          f"{text_base}.layers.{i}.moe.gate_up_proj" if num_experts > 1 else None for i in hf_indices
+          f"{text_base}.layers.{i}.experts.gate_up_proj" if num_experts > 1 else None for i in hf_indices
       ],
       f"{prefix}-mlp-moe_block-MoeBlock_0-wi_1": [
-          f"{text_base}.layers.{i}.moe.gate_up_proj" if num_experts > 1 else None for i in hf_indices
+          f"{text_base}.layers.{i}.experts.gate_up_proj" if num_experts > 1 else None for i in hf_indices
       ],
       f"{prefix}-mlp-moe_block-MoeBlock_0-wo": [
-          f"{text_base}.layers.{i}.moe.down_proj" if num_experts > 1 else None for i in hf_indices
+          f"{text_base}.layers.{i}.experts.down_proj" if num_experts > 1 else None for i in hf_indices
       ],
       f"{prefix}-mlp-moe_block-MoeBlock_0-per_expert_scale": [
-          f"{text_base}.layers.{i}.moe.per_expert_scale" if num_experts > 1 else None for i in hf_indices
+          f"{text_base}.layers.{i}.router.per_expert_scale" if num_experts > 1 else None for i in hf_indices
       ],
       f"{prefix}-mlp-moe_block-shared_experts-wi_0-kernel": [
           f"{text_base}.layers.{i}.mlp.gate_proj.weight" if num_experts > 1 else None for i in hf_indices
@@ -2440,10 +2440,14 @@ def GEMMA4_MAXTEXT_TO_HF_PARAM_MAPPING(config, maxtext_config, scan_layers=False
       f"{prefix}-mlp-moe_block-MoeBlock_0-gate-kernel": f"{hf_prefix}.router.proj.weight"
       if num_experts > 1
       else None,
-      f"{prefix}-mlp-moe_block-MoeBlock_0-wi_0": f"{hf_prefix}.moe.gate_up_proj" if num_experts > 1 else None,
-      f"{prefix}-mlp-moe_block-MoeBlock_0-wi_1": f"{hf_prefix}.moe.gate_up_proj" if num_experts > 1 else None,
-      f"{prefix}-mlp-moe_block-MoeBlock_0-wo": f"{hf_prefix}.moe.down_proj" if num_experts > 1 else None,
-      f"{prefix}-mlp-moe_block-MoeBlock_0-per_expert_scale": f"{hf_prefix}.moe.per_expert_scale"
+      f"{prefix}-mlp-moe_block-MoeBlock_0-wi_0": f"{hf_prefix}.experts.gate_up_proj"
+      if num_experts > 1
+      else None,
+      f"{prefix}-mlp-moe_block-MoeBlock_0-wi_1": f"{hf_prefix}.experts.gate_up_proj"
+      if num_experts > 1
+      else None,
+      f"{prefix}-mlp-moe_block-MoeBlock_0-wo": f"{hf_prefix}.experts.down_proj" if num_experts > 1 else None,
+      f"{prefix}-mlp-moe_block-MoeBlock_0-per_expert_scale": f"{hf_prefix}.router.per_expert_scale"
       if num_experts > 1
       else None,
       f"{prefix}-mlp-moe_block-shared_experts-wi_0-kernel": f"{hf_prefix}.mlp.gate_proj.weight"
@@ -2502,10 +2506,10 @@ def GEMMA4_MAXTEXT_TO_HF_PARAM_MAPPING(config, maxtext_config, scan_layers=False
       f"{prefix}-mlp-moe_block-MoeBlock_0-gate-kernel": f"{hf_prefix}.router.proj.weight"
       if num_experts > 1
       else None,
-      f"{prefix}-mlp-moe_block-MoeBlock_0-wi_0": f"{hf_prefix}.moe.gate_up_proj" if num_experts > 1 else None,
-      f"{prefix}-mlp-moe_block-MoeBlock_0-wi_1": f"{hf_prefix}.moe.gate_up_proj" if num_experts > 1 else None,
-      f"{prefix}-mlp-moe_block-MoeBlock_0-wo": f"{hf_prefix}.moe.down_proj" if num_experts > 1 else None,
-      f"{prefix}-mlp-moe_block-MoeBlock_0-per_expert_scale": f"{hf_prefix}.moe.per_expert_scale"
+      f"{prefix}-mlp-moe_block-MoeBlock_0-wi_0": f"{hf_prefix}.experts.gate_up_proj" if num_experts > 1 else None,
+      f"{prefix}-mlp-moe_block-MoeBlock_0-wi_1": f"{hf_prefix}.experts.gate_up_proj" if num_experts > 1 else None,
+      f"{prefix}-mlp-moe_block-MoeBlock_0-wo": f"{hf_prefix}.experts.down_proj" if num_experts > 1 else None,
+      f"{prefix}-mlp-moe_block-MoeBlock_0-per_expert_scale": f"{hf_prefix}.router.per_expert_scale"
       if num_experts > 1
       else None,
       f"{prefix}-mlp-moe_block-shared_experts-wi_0-kernel": f"{hf_prefix}.mlp.gate_proj.weight"

tests/end_to_end/tpu/gemma4/26b/convert_gemma4.sh

Lines changed: 5 additions & 5 deletions
@@ -7,18 +7,19 @@ MODEL_NAME='gemma4-26b'
 export MODEL_VARIATION='26b'
 TOKENIZER_PATH='google/gemma-4-26b-a4b-it'
 # To convert the multimodal model, make sure the use_multimodal is set to be true
-USE_MULTIMODAL=true
+USE_MULTIMODAL=false
 USE_SCAN_LAYERS=false
 
 
 # Installing torch for deps in forward_pass_logit_checker.py
 python3 -m pip install torch --index-url https://download.pytorch.org/whl/cpu
 
-# After downloading checkpoints, copy them to GCS bucket at $CHKPT_BUCKET
+# After downloading checkpoints, copy them to GCS bucket at $MODEL_BUCKET
 export MODEL_BUCKET='gs://maxtext-gemma/gemma4'
+export HF_MODEL='path/to/your/hf/gemma-4-26b-a4b-it'
 
 # To get converted ckpt:
-python3 -m maxtext.checkpoint_conversion.to_maxtext "${MAXTEXT_CONFIGS_DIR:-${MAXTEXT_REPO_ROOT:-$PWD}/src/maxtext/configs}"//base.yml \
+python3 -m maxtext.checkpoint_conversion.to_maxtext "${MAXTEXT_CONFIGS_DIR:-${MAXTEXT_REPO_ROOT:-$PWD}/src/maxtext/configs}"/base.yml \
   model_name=${MODEL_NAME} \
   hf_access_token=${HF_TOKEN} \
   --hf_model_path=${HF_MODEL} \
@@ -28,7 +29,6 @@ python3 -m maxtext.checkpoint_conversion.to_maxtext "${MAXTEXT_CONFIGS_DIR:-${MA
 
 
 export MAXTEXT_CKPT_PATH=${MODEL_BUCKET}/${MODEL_VARIATION}/converted/${idx}/0/items
-export HF_MODEL='path/to/your/hf/gemma-4-26b-a4b-it'
 
 
 if [ ${USE_MULTIMODAL} == true ]; then
@@ -62,7 +62,7 @@ if [ ${USE_MULTIMODAL} == true ]; then
     --max_kl_div=0.03 \
     --golden_logits_path=${GOLDEN_LOGITS_PATH}
 else
-  python3 -m tests.utils.forward_pass_logit_checker "${MAXTEXT_CONFIGS_DIR:-${MAXTEXT_REPO_ROOT:-$PWD}/src/maxtext/configs}"//base.yml \
+  python3 -m tests.utils.forward_pass_logit_checker "${MAXTEXT_CONFIGS_DIR:-${MAXTEXT_REPO_ROOT:-$PWD}/src/maxtext/configs}"/base.yml \
     tokenizer_path=${TOKENIZER_PATH} \
     load_parameters_path=${MAXTEXT_CKPT_PATH} \
     model_name=${MODEL_NAME} \
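
Note: each of these conversion scripts validates the converted checkpoint by running tests.utils.forward_pass_logit_checker with --max_kl_div=0.03, i.e. the converted model's per-token output distribution may diverge from the saved golden logits by at most 0.03 of KL divergence. A minimal sketch of what such a check computes (assumed behavior, not the checker's actual implementation):

# Minimal sketch of a max-KL logit check (assumed behavior, not the actual
# forward_pass_logit_checker implementation).
import numpy as np

def max_kl_divergence(golden_logits, test_logits):
  """Max per-token KL(golden || test) over [num_tokens, vocab] logit arrays."""
  def log_softmax(x):
    x = x - x.max(axis=-1, keepdims=True)  # stabilize before exponentiating
    return x - np.log(np.exp(x).sum(axis=-1, keepdims=True))
  log_p = log_softmax(golden_logits)
  log_q = log_softmax(test_logits)
  kl = (np.exp(log_p) * (log_p - log_q)).sum(axis=-1)  # KL per token
  return float(kl.max())

# Stand-ins for the saved golden logits and the converted model's logits:
golden = np.random.randn(4, 32)
converted = golden + 0.01 * np.random.randn(4, 32)
print(max_kl_divergence(golden, converted) <= 0.03)  # mirrors --max_kl_div=0.03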

tests/end_to_end/tpu/gemma4/26b/convert_gemma4_pt.sh

Lines changed: 5 additions & 5 deletions
@@ -7,18 +7,18 @@ MODEL_NAME='gemma4-26b'
 export MODEL_VARIATION='26b'
 TOKENIZER_PATH='google/gemma-4-26b-a4b'
 # To convert the multimodal model, make sure the use_multimodal is set to be true
-USE_MULTIMODAL=true
+USE_MULTIMODAL=false
 USE_SCAN_LAYERS=false
 
 # Installing torch for deps in forward_pass_logit_checker.py
 python3 -m pip install torch --index-url https://download.pytorch.org/whl/cpu
 
-# After downloading checkpoints, copy them to GCS bucket at $CHKPT_BUCKET
+# After downloading checkpoints, copy them to GCS bucket at $MODEL_BUCKET
 export MODEL_BUCKET='gs://maxtext-gemma/gemma4'
-export HF_MODEL='path/to/your/gemma4-26b-a4b'
+export HF_MODEL='path/to/your/hf/gemma-4-26b-a4b'
 
 # To get converted ckpt:
-python3 -m maxtext.checkpoint_conversion.to_maxtext "${MAXTEXT_CONFIGS_DIR:-${MAXTEXT_REPO_ROOT:-$PWD}/src/maxtext/configs}"//base.yml \
+python3 -m maxtext.checkpoint_conversion.to_maxtext "${MAXTEXT_CONFIGS_DIR:-${MAXTEXT_REPO_ROOT:-$PWD}/src/maxtext/configs}"/base.yml \
   model_name=${MODEL_NAME} \
   hf_access_token=${HF_TOKEN} \
   --hf_model_path=${HF_MODEL} \
@@ -61,7 +61,7 @@ if [ ${USE_MULTIMODAL} == true ]; then
     --max_kl_div=0.03 \
     --golden_logits_path=${GOLDEN_LOGITS_PATH}
 else
-  python3 -m tests.utils.forward_pass_logit_checker "${MAXTEXT_CONFIGS_DIR:-${MAXTEXT_REPO_ROOT:-$PWD}/src/maxtext/configs}"//base.yml \
+  python3 -m tests.utils.forward_pass_logit_checker "${MAXTEXT_CONFIGS_DIR:-${MAXTEXT_REPO_ROOT:-$PWD}/src/maxtext/configs}"/base.yml \
     tokenizer_path=${TOKENIZER_PATH} \
     load_parameters_path=${MAXTEXT_CKPT_PATH} \
     model_name=${MODEL_NAME} \

tests/end_to_end/tpu/gemma4/31b/convert_gemma4.sh

Lines changed: 3 additions & 4 deletions
@@ -14,13 +14,12 @@ USE_SCAN_LAYERS=false
 # Installing torch for deps in forward_pass_logit_checker.py
 python3 -m pip install torch --index-url https://download.pytorch.org/whl/cpu
 
-# After downloading checkpoints, copy them to GCS bucket at $CHKPT_BUCKET
+# After downloading checkpoints, copy them to GCS bucket at $MODEL_BUCKET
 export MODEL_BUCKET='gs://maxtext-gemma/gemma4'
-
 export HF_MODEL='path/to/your/hf/gemma-4-31b-it'
 
 # To get converted ckpt:
-python3 -m maxtext.checkpoint_conversion.to_maxtext "${MAXTEXT_CONFIGS_DIR:-${MAXTEXT_REPO_ROOT:-$PWD}/src/maxtext/configs}"//base.yml \
+python3 -m maxtext.checkpoint_conversion.to_maxtext "${MAXTEXT_CONFIGS_DIR:-${MAXTEXT_REPO_ROOT:-$PWD}/src/maxtext/configs}"/base.yml \
   model_name=${MODEL_NAME} \
   hf_access_token=${HF_TOKEN} \
   --hf_model_path=${HF_MODEL} \
@@ -63,7 +62,7 @@ if [ ${USE_MULTIMODAL} == true ]; then
     --max_kl_div=0.03 \
     --golden_logits_path=${GOLDEN_LOGITS_PATH}
 else
-  python3 -m tests.utils.forward_pass_logit_checker "${MAXTEXT_CONFIGS_DIR:-${MAXTEXT_REPO_ROOT:-$PWD}/src/maxtext/configs}"//base.yml \
+  python3 -m tests.utils.forward_pass_logit_checker "${MAXTEXT_CONFIGS_DIR:-${MAXTEXT_REPO_ROOT:-$PWD}/src/maxtext/configs}"/base.yml \
     tokenizer_path=${TOKENIZER_PATH} \
     load_parameters_path=${MAXTEXT_CKPT_PATH} \
     model_name=${MODEL_NAME} \

tests/end_to_end/tpu/gemma4/31b/convert_gemma4_pt.sh

Lines changed: 3 additions & 4 deletions
@@ -14,13 +14,12 @@ USE_SCAN_LAYERS=false
 # Installing torch for deps in forward_pass_logit_checker.py
 python3 -m pip install torch --index-url https://download.pytorch.org/whl/cpu
 
-# After downloading checkpoints, copy them to GCS bucket at $CHKPT_BUCKET \
+# After downloading checkpoints, copy them to GCS bucket at $MODEL_BUCKET
 export MODEL_BUCKET='gs://maxtext-gemma/gemma4'
-
 export HF_MODEL='path/to/your/hf/gemma-4-31b'
 
 # To get converted ckpt:
-python3 -m maxtext.checkpoint_conversion.to_maxtext "${MAXTEXT_CONFIGS_DIR:-${MAXTEXT_REPO_ROOT:-$PWD}/src/maxtext/configs}"//base.yml \
+python3 -m maxtext.checkpoint_conversion.to_maxtext "${MAXTEXT_CONFIGS_DIR:-${MAXTEXT_REPO_ROOT:-$PWD}/src/maxtext/configs}"/base.yml \
   model_name=${MODEL_NAME} \
   hf_access_token=${HF_TOKEN} \
   --hf_model_path=${HF_MODEL} \
@@ -63,7 +62,7 @@ if [ ${USE_MULTIMODAL} == true ]; then
     --max_kl_div=0.03 \
     --golden_logits_path=${GOLDEN_LOGITS_PATH}
 else
-  python3 -m tests.utils.forward_pass_logit_checker "${MAXTEXT_CONFIGS_DIR:-${MAXTEXT_REPO_ROOT:-$PWD}/src/maxtext/configs}"//base.yml \
+  python3 -m tests.utils.forward_pass_logit_checker "${MAXTEXT_CONFIGS_DIR:-${MAXTEXT_REPO_ROOT:-$PWD}/src/maxtext/configs}"/base.yml \
     tokenizer_path=${TOKENIZER_PATH} \
     load_parameters_path=${MAXTEXT_CKPT_PATH} \
     model_name=${MODEL_NAME} \
