
Commit cdc587f

gagika authored and Google-ML-Automation committed
Add support for Gemma 4 architectures in MaxText.
This PR brings up the Gemma 4 architectures (31B Dense, 26B MoE) for both text and vision modalities. Key changes:

* Adds the core Gemma 4 text and vision architectures.
* Introduces the specialized layers and building blocks required for Gemma 4.
* Adds the necessary configurations and pipeline logic to support checkpoint conversion.

PiperOrigin-RevId: 893570076
1 parent 612162a · commit cdc587f

35 files changed: 3204 additions & 127 deletions

README.md

Lines changed: 3 additions & 1 deletion
```diff
@@ -41,6 +41,7 @@ See our guide on running MaxText in decoupled mode, without any GCP dependencies
 
 ## 🔥 Latest news 🔥
 
+* \[April 2, 2026\] Gemma 4 multi-modal models (26B MoE, 31B dense) are now supported! Try them out with our [gemma4-26b](https://github.com/AI-Hypercomputer/maxtext/blob/main/src/maxtext/configs/models/gemma4-26b.yml) and [gemma4-31b](https://github.com/AI-Hypercomputer/maxtext/blob/main/src/maxtext/configs/models/gemma4-31b.yml) configs. For more details, see [Run_Gemma4.md](https://github.com/AI-Hypercomputer/maxtext/blob/main/tests/end_to_end/tpu/gemma4/Run_Gemma4.md).
 * \[March 6, 2026\] New features from DeepSeek-AI are now supported: Conditional Memory via Scalable Lookup ([Engram](https://arxiv.org/abs/2601.07372)) and Manifold-Constrained Hyper-Connections ([mHC](https://arxiv.org/abs/2512.24880)). Try them out with our [deepseek-custom](https://github.com/AI-Hypercomputer/maxtext/blob/main/src/maxtext/configs/models/deepseek-custom.yml) starter config.
 * \[March 5, 2026\] New `tpu-post-train` [target in PyPI](https://pypi.org/project/maxtext). Please also use this installation option for running vllm_decode. See the [MaxText installation instructions](https://maxtext.readthedocs.io/en/latest/install_maxtext.html) for more info.
 * \[March 5, 2026\] [Qwen3-Next](https://github.com/AI-Hypercomputer/maxtext/blob/7656eb8d1c9eb0dd91e617a6fdf6ad805221221a/tests/end_to_end/tpu/qwen/next/run_qwen3_next.md) is now supported.
```
```diff
@@ -73,7 +74,7 @@ MaxText provides a library of models and demonstrates how to perform pre-trainin
 
 MaxText leverages [JAX AI libraries](https://docs.jaxstack.ai/en/latest/getting_started.html) and presents a cohesive and comprehensive demonstration of training at scale by using [Flax](https://flax.readthedocs.io/en/latest/) (neural networks), [Tunix](https://github.com/google/tunix) (post-training), [Orbax](https://orbax.readthedocs.io/en/latest/) (checkpointing), [Optax](https://optax.readthedocs.io/en/latest/) (optimization), and [Grain](https://google-grain.readthedocs.io/en/latest/) (dataloading).
 
-In addition to pure text-based LLMs, we also support multi-modal training with Gemma 3 and Llama 4 VLMs.
+In addition to pure text-based LLMs, we also support multi-modal training with Gemma 3, Gemma 4, and Llama 4 VLMs.
 
 ### Pre-training
 
```
```diff
@@ -103,6 +104,7 @@ MaxText aims to provide you with the best OSS models, whether as a reference imp
 **Supported JAX models in MaxText**
 
 * Google
+  * Gemma 4 (26B MoE, 31B Dense)
   * Gemma 3 (4B, 12B, 27B)
   * Gemma 2 (2B, 9B, 27B)
   * Gemma 1 (2B, 7B)
```
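The two linked YAML files are the entry points for trying the new models. As a hedged illustration (not part of this commit), one way to peek at a config from a local checkout is plain PyYAML; the keys inside the file are not shown in this diff, so the printout below is only indicative:

```python
# Hedged sketch, not from this PR: inspect the new Gemma 4 model config from a
# local checkout of the repository. The YAML's actual keys are not part of this
# diff, so treat the printed names as unknown until you run it.
import yaml

with open("src/maxtext/configs/models/gemma4-26b.yml") as f:
  cfg = yaml.safe_load(f)

print(sorted(cfg.keys()))
```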

src/maxtext/checkpoint_conversion/standalone_scripts/llama_or_mistral_ckpt.py

Lines changed: 6 additions & 1 deletion
```diff
@@ -1653,6 +1653,8 @@ def shard_checkpoint(jax_weights, device_count, mem_info):
   max_logging.log("Note: Axis 0 sharding is the default and will not be logged individually.")
   # Pre-define sharding specs
   mesh = jax.sharding.Mesh(jax.devices(), "checkpoint_sharding_axis")
+  # No sharding (replicated specifically for 0D scalars)
+  s0 = jax.sharding.NamedSharding(mesh, jax.sharding.PartitionSpec())
   # Sharding along axis 0
   s1 = jax.sharding.NamedSharding(mesh, jax.sharding.PartitionSpec("checkpoint_sharding_axis"))
   # Sharding along axis 1
@@ -1673,7 +1675,10 @@ def checkpoint_device_put(arr):
     # materialize lazy tensor
     arr = np.array(arr)
 
-    if arr.shape[0] % device_count == 0:
+    if len(arr.shape) == 0:
+      max_logging.log("0D scalar detected, replicating")
+      return jax.device_put(arr, device=s0)
+    elif arr.shape[0] % device_count == 0:
       # Sharding axis 0: Omit log for brevity per the summary log above.
       return jax.device_put(arr, device=s1)
     elif len(arr.shape) > 1 and arr.shape[1] % device_count == 0:
```
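The new branch exists because a 0D scalar has no axis 0: the old code's `arr.shape[0]` would raise an `IndexError` before any sharding decision could be made, so scalars are now replicated across the mesh instead. A minimal, self-contained sketch of the same fallback logic, with `device_put_with_fallback` as a hypothetical stand-in for the real `checkpoint_device_put` (which also handles axis-1 sharding and memory logging):

```python
# Sketch of the 0D fallback; device_put_with_fallback is a hypothetical
# stand-in for MaxText's checkpoint_device_put.
import jax
import numpy as np

mesh = jax.sharding.Mesh(jax.devices(), "checkpoint_sharding_axis")
s0 = jax.sharding.NamedSharding(mesh, jax.sharding.PartitionSpec())  # replicated
s1 = jax.sharding.NamedSharding(mesh, jax.sharding.PartitionSpec("checkpoint_sharding_axis"))

def device_put_with_fallback(arr, device_count):
  arr = np.array(arr)  # materialize lazy tensors
  if arr.ndim == 0:
    # A scalar has no axis 0, so arr.shape[0] would raise IndexError; replicate it.
    return jax.device_put(arr, device=s0)
  if arr.shape[0] % device_count == 0:
    return jax.device_put(arr, device=s1)  # shard along axis 0
  return jax.device_put(arr, device=s0)  # no evenly divisible axis: replicate

scalar = np.float32(30.0)  # e.g. a softcapping constant stored in a checkpoint
print(device_put_with_fallback(scalar, jax.device_count()).sharding)
```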

src/maxtext/checkpoint_conversion/utils/hf_model_configs.py

Lines changed: 156 additions & 9 deletions
```diff
@@ -24,6 +24,137 @@
 else:
   from transformers.configuration_utils import PretrainedConfig as PTConfig
 
+
+gemma4_26b_dict = {
+    "architectures": ["Gemma4ForConditionalGeneration"],
+    "audio_config": None,
+    "audio_token_id": 258881,
+    "boa_token_id": 256000,
+    "boi_token_id": 255999,
+    "dtype": "bfloat16",
+    "eoa_token_id": 258883,
+    "eoa_token_index": 258883,
+    "eoi_token_id": 258882,
+    "eos_token_id": [1, 106],
+    "image_token_id": 258880,
+    "initializer_range": 0.02,
+    "model_type": "gemma4",
+    "text_config": {
+        "attention_bias": False,
+        "attention_dropout": 0.0,
+        "attention_k_eq_v": True,
+        "bos_token_id": 2,
+        "dtype": "bfloat16",
+        "enable_moe_block": True,
+        "eos_token_id": 1,
+        "expert_intermediate_size": 704,
+        "final_logit_softcapping": 30.0,
+        "global_head_dim": 512,
+        "head_dim": 256,
+        "hidden_activation": "gelu_pytorch_tanh",
+        "hidden_size": 2816,
+        "hidden_size_per_layer_input": 0,
+        "initializer_range": 0.02,
+        "intermediate_size": 2112,
+        "layer_types": [
+            "sliding_attention",
+            "sliding_attention",
+            "sliding_attention",
+            "sliding_attention",
+            "sliding_attention",
+            "full_attention",
+        ]
+        * 5,
+        "max_position_embeddings": 262144,
+        "model_type": "gemma4_text",
+        "num_attention_heads": 16,
+        "num_experts": 128,
+        "num_global_key_value_heads": 2,
+        "num_hidden_layers": 30,
+        "num_key_value_heads": 8,
+        "num_kv_shared_layers": 0,
+        "pad_token_id": 0,
+        "rms_norm_eps": 1e-06,
+        "rope_parameters": {
+            "full_attention": {"partial_rotary_factor": 0.25, "rope_theta": 1_000_000.0, "rope_type": "proportional"},
+            "sliding_attention": {"rope_theta": 10_000.0, "rope_type": "default"},
+        },
+        "sliding_window": 1024,
+        "tie_word_embeddings": True,
+        "top_k_experts": 8,
+        "use_bidirectional_attention": "vision",
+        "use_cache": True,
+        "use_double_wide_mlp": False,
+        "vocab_size": 262144,
+        "vocab_size_per_layer_input": 262144,
+    },
+    "tie_word_embeddings": True,
+    "transformers_version": "5.5.0.dev0",
+    "video_token_id": 258884,
+    "vision_config": {
+        "attention_bias": False,
+        "attention_dropout": 0.0,
+        "default_output_length": 280,
+        "dtype": "bfloat16",
+        "global_head_dim": 72,
+        "head_dim": 72,
+        "hidden_activation": "gelu_pytorch_tanh",
+        "hidden_size": 1152,
+        "intermediate_size": 4304,
+        "max_position_embeddings": 131072,
+        "model_type": "gemma4_vision",
+        "num_attention_heads": 16,
+        "num_hidden_layers": 27,
+        "num_key_value_heads": 16,
+        "patch_size": 16,
+        "pooling_kernel_size": 3,
+        "position_embedding_size": 10240,
+        "rms_norm_eps": 1e-06,
+        "rope_parameters": {"rope_theta": 100.0, "rope_type": "default"},
+        "standardize": True,
+        "use_clipped_linears": False,
+    },
+    "vision_soft_tokens_per_image": 280,
+}
+
+
+gemma4_31b_dict = gemma4_26b_dict.copy()
+gemma4_31b_dict["text_config"] = gemma4_26b_dict["text_config"].copy()
+gemma4_31b_dict["text_config"].update(
+    {
+        "enable_moe_block": False,
+        "expert_intermediate_size": None,
+        "hidden_size": 5376,
+        "intermediate_size": 21504,
+        "layer_types": [
+            "sliding_attention",
+            "sliding_attention",
+            "sliding_attention",
+            "sliding_attention",
+            "sliding_attention",
+            "full_attention",
+        ]
+        * 10,
+        "num_attention_heads": 32,
+        "num_experts": None,
+        "num_global_key_value_heads": 4,
+        "num_hidden_layers": 60,
+        "num_key_value_heads": 16,
+        "top_k_experts": None,
+    }
+)
+
+
+try:
+  # Will execute successfully if Transformers is updated with Gemma 4 support
+  gemma4_26b_config = transformers.Gemma4Config(**gemma4_26b_dict)
+  gemma4_31b_config = transformers.Gemma4Config(**gemma4_31b_dict)
+except AttributeError:
+  # Graceful fallback to a raw dict-based PTConfig if native Gemma 4 support is missing
+  gemma4_26b_config = PTConfig(**gemma4_26b_dict)
+  gemma4_31b_config = PTConfig(**gemma4_31b_dict)
+
+
 gemma3_4b_config = transformers.Gemma3Config(
   architectures=["Gemma3ForConditionalGeneration"],
   boi_token_index=255999,
```
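Two details in the block above are worth a second look. First, the sliding/global attention schedule is a repeated 5:1 pattern: five sliding-window layers plus one full-attention layer per repeat gives 6 × 5 = 30 layers for the 26B MoE and 6 × 10 = 60 for the 31B dense model, matching each config's `num_hidden_layers`. Second, `dict.copy()` is shallow, which is why the 31B variant explicitly re-copies `text_config` before calling `update`; a small self-contained illustration of the pitfall being avoided:

```python
# Why the explicit text_config.copy() matters: dict.copy() is shallow, so the
# nested text_config dict would otherwise be shared between the two variants.
base = {"text_config": {"hidden_size": 2816}}

naive = base.copy()  # shallow: naive["text_config"] IS base["text_config"]
naive["text_config"]["hidden_size"] = 5376
assert base["text_config"]["hidden_size"] == 5376  # the 26B dict got mutated!

base = {"text_config": {"hidden_size": 2816}}
safe = base.copy()
safe["text_config"] = base["text_config"].copy()  # re-copy the nested dict
safe["text_config"]["hidden_size"] = 5376
assert base["text_config"]["hidden_size"] == 2816  # the 26B dict is intact
```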
```diff
@@ -584,9 +715,10 @@
         "mscale": 0.707,
         "mscale_all_dim": 0.707,
         "original_max_position_embeddings": 4096,
+        "rope_theta": 10_000,
         "type": "yarn",
     },
-    "rope_theta": 10000,
+    "rope_theta": 10_000,
     "routed_scaling_factor": 1.0,
     "scoring_func": "softmax",
     "seq_aux": True,
```
```diff
@@ -645,9 +777,10 @@
         "mscale": 1.0,
         "mscale_all_dim": 1.0,
         "original_max_position_embeddings": 4096,
+        "rope_theta": 10_000,
         "type": "yarn",
     },
-    "rope_theta": 10000,
+    "rope_theta": 10_000,
     "routed_scaling_factor": 2.5,
     "scoring_func": "sigmoid",
     "tie_word_embeddings": False,
```
```diff
@@ -697,15 +830,16 @@
     "qk_rope_head_dim": 64,
     "rms_norm_eps": 1e-06,
     "rope_scaling": {
-        "beta_fast": 32,
-        "beta_slow": 1,
-        "factor": 40,
+        "beta_fast": 32.0,
+        "beta_slow": 1.0,
+        "factor": 40.0,
         "mscale": 1.0,
         "mscale_all_dim": 1.0,
         "original_max_position_embeddings": 4096,
+        "rope_theta": 10_000,
         "type": "yarn",
     },
-    "rope_theta": 10000,
+    "rope_theta": 10_000,
     "routed_scaling_factor": 2.5,
     "scoring_func": "sigmoid",
     "tie_word_embeddings": False,
```
```diff
@@ -717,8 +851,17 @@
     "v_head_dim": 128,
     "vocab_size": 129280,
 }
+
+
 # TODO(shuningjin): replace with DeepseekV32Config when available in transformers library
-deepseek32_671b_config = PTConfig(**deepseek32_671b_dict)
+class DeepseekV32Config(PTConfig):
+
+  def __init__(self, **kwargs):
+    self.max_position_embeddings = kwargs.get("max_position_embeddings", 163840)
+    super().__init__(**kwargs)
+
+
+deepseek32_671b_config = DeepseekV32Config(**deepseek32_671b_dict)
 
 # from https://huggingface.co/openai/gpt-oss-20b/blob/main/config.json
 # remove mxfp4 quantization_config, since we are using bf16
```
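The subclass above exists only to pin a default: when the config dict omits `max_position_embeddings`, the attribute falls back to 163840 before `PretrainedConfig.__init__` applies the remaining kwargs. A quick check of that behavior, assuming standard `PretrainedConfig` semantics (leftover kwargs are set as attributes):

```python
# Hedged sketch verifying the default-pinning behavior of the subclass above.
from transformers.configuration_utils import PretrainedConfig as PTConfig

class DeepseekV32Config(PTConfig):

  def __init__(self, **kwargs):
    # Fall back to a 163840-token context when the dict omits the key.
    self.max_position_embeddings = kwargs.get("max_position_embeddings", 163840)
    super().__init__(**kwargs)

cfg = DeepseekV32Config(vocab_size=129280)
assert cfg.max_position_embeddings == 163840
assert cfg.vocab_size == 129280
```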
```diff
@@ -775,10 +918,11 @@
         "beta_slow": 1.0,
         "factor": 32.0,
         "original_max_position_embeddings": 4096,
+        "rope_theta": 150_000,
         "rope_type": "yarn",
         "truncate": False,
     },
-    "rope_theta": 150000,
+    "rope_theta": 150_000,
     "router_aux_loss_coef": 0.9,
     "sliding_window": 128,
     "swiglu_limit": 7.0,
```
```diff
@@ -856,10 +1000,11 @@
         "beta_slow": 1.0,
         "factor": 32.0,
         "original_max_position_embeddings": 4096,
+        "rope_theta": 150_000,
         "rope_type": "yarn",
         "truncate": False,
     },
-    "rope_theta": 150000,
+    "rope_theta": 150_000,
     "router_aux_loss_coef": 0.9,
     "sliding_window": 128,
     "swiglu_limit": 7.0,
```
```diff
@@ -1006,6 +1151,8 @@
     "gemma3-4b": gemma3_4b_config,
     "gemma3-12b": gemma3_12b_config,
     "gemma3-27b": gemma3_27b_config,
+    "gemma4-26b": gemma4_26b_config,
+    "gemma4-31b": gemma4_31b_config,
     "qwen2.5-1.5b": qwen25_1_5b_config,
     "qwen2.5-7b": qwen25_7b_config,
     "qwen2.5-14b": qwen25_14b_config,
```
