
Commit 47defb1

feat: add DeepSeek-V3 support for MaxText to Hugging Face conversion
- Update architecture validation in checkpoint conversion to include MLA and MoE parameters.
- Implement output projection initialization for MLA layers.
1 parent 3971206 commit 47defb1

3 files changed

Lines changed: 23 additions & 4 deletions


src/maxtext/checkpoint_conversion/to_huggingface.py

Lines changed: 13 additions & 2 deletions
@@ -140,11 +140,22 @@ def _validate_or_update_architecture(hf_config, max_config, override: bool):
   # Mapping from Hugging Face config attribute -> MaxText config attribute
   # Note: We use derived MaxText attributes (e.g. emb_dim) which account for scale factors.
   attributes_to_check = [
-      ("num_attention_heads", "num_query_heads"),
-      ("num_key_value_heads", "num_kv_heads"),
       ("hidden_size", "emb_dim"),
       ("intermediate_size", "mlp_dim"),
+      ("kv_lora_rank", "kv_lora_rank"),
+      ("moe_intermediate_size", "moe_mlp_dim"),
+      ("n_routed_experts", "num_experts"),
+      ("n_shared_experts", "shared_experts"),
+      ("num_attention_heads", "num_query_heads"),
+      ("num_experts", "num_experts"),
+      ("num_experts_per_tok", "num_experts_per_tok"),
       ("num_hidden_layers", "num_decoder_layers"),
+      ("num_key_value_heads", "num_kv_heads"),
+      ("num_local_experts", "num_experts"),
+      ("q_lora_rank", "q_lora_rank"),
+      ("qk_nope_head_dim", "qk_nope_head_dim"),
+      ("qk_rope_head_dim", "qk_rope_head_dim"),
+      ("v_head_dim", "v_head_dim"),
       ("vocab_size", "vocab_size"),
   ]
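For context, a minimal sketch of how a mapping table like this is typically consumed. The loop below is illustrative only (the actual body of _validate_or_update_architecture is not part of this diff), and it assumes hf_config and max_config expose the listed attributes; getattr with a default guards the MLA/MoE fields that plain dense configs lack:

# Illustrative sketch only; not the real loop in to_huggingface.py.
def _sketch_validate_or_update(hf_config, max_config, attributes_to_check, override: bool):
  for hf_attr, mt_attr in attributes_to_check:
    hf_value = getattr(hf_config, hf_attr, None)  # may be absent on non-MLA/MoE configs
    mt_value = getattr(max_config, mt_attr, None)
    if mt_value is None or hf_value == mt_value:
      continue  # matching, or not applicable to this model family
    if override:
      setattr(hf_config, hf_attr, mt_value)  # trust the MaxText checkpoint config
    else:
      raise ValueError(f"Mismatch: HF {hf_attr}={hf_value} vs MaxText {mt_attr}={mt_value}")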

src/maxtext/layers/attention_mla.py

Lines changed: 4 additions & 0 deletions
@@ -786,6 +786,10 @@ def _init_projections(self, inputs_q_shape: Tuple, inputs_kv_shape: Tuple) -> No
         rngs=self.rngs,
     )

+  @property
+  def out_head_dim(self) -> int:
+    return self.v_head_dim
+
   def mla_query_projection(
       self, inputs_q: Array, inputs_positions: Array, model_mode
   ) -> tuple[jax.Array, Optional[jax.Array]]:
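Why v_head_dim here: the attention output is a weighted sum of value vectors, so the per-head width entering the output projection is v_head_dim, which in MLA differs from the query/key head width (qk_nope_head_dim + qk_rope_head_dim). A toy shape check; the numbers below mirror DeepSeek-V3's published config and are illustrative only:

# Illustrative shape check; values follow DeepSeek-V3's published config.
num_query_heads, emb_dim = 128, 7168
qk_nope_head_dim, qk_rope_head_dim, v_head_dim = 128, 64, 128

qk_head_dim = qk_nope_head_dim + qk_rope_head_dim  # 192: query/key width per head
out_in_features = (num_query_heads, v_head_dim)    # (128, 128), not (128, 192)
out_kernel_shape = (*out_in_features, emb_dim)     # (128, 128, 7168)
print(qk_head_dim, out_in_features, out_kernel_shape)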

src/maxtext/layers/attentions.py

Lines changed: 6 additions & 2 deletions
@@ -696,17 +696,21 @@ def qkv_projection(self, inputs: Array, proj_name: str, out_sharding: NamedShard
     query, key, value = qkv_proj[:, :, 0, ...], qkv_proj[:, :, 1, ...], qkv_proj[:, :, 2, ...]
     return query, key, value

+  @property
+  def out_head_dim(self) -> int:
+    return self.head_dim
+
   def init_out_w(self, output_dim: int) -> nnx.Module:
     """out projection"""
-    in_features = (self.num_query_heads, self.head_dim)
+    in_features = (self.num_query_heads, self.out_head_dim)
     out_features = output_dim
     out_kernel_axis = (
         (None, None, None) if self.config.ici_context_autoregressive_parallelism > 1 else ("heads", "kv", "embed")
     )
     axis = (-2, -1)

     if self.is_qwen3_next:
-      in_features = self.num_query_heads * self.head_dim
+      in_features = self.num_query_heads * self.out_head_dim
       out_kernel_axis = ("mlp", "embed")
       axis = (-1,)
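Taken together, the two layer hunks implement a simple property-override pattern: the base attention keeps out_head_dim equal to head_dim, and the MLA class overrides it with v_head_dim, so init_out_w sizes the output kernel correctly for both without branching. A condensed sketch of the pattern, with class names and fields simplified (the real modules are nnx.Module subclasses with many more parameters):

# Condensed sketch of the override pattern; not the real MaxText classes.
class Attention:
  def __init__(self, num_query_heads: int, head_dim: int):
    self.num_query_heads = num_query_heads
    self.head_dim = head_dim

  @property
  def out_head_dim(self) -> int:
    return self.head_dim  # default: output width equals the shared head width

  def out_proj_in_features(self) -> tuple:
    return (self.num_query_heads, self.out_head_dim)

class MLAttention(Attention):
  def __init__(self, num_query_heads: int, head_dim: int, v_head_dim: int):
    super().__init__(num_query_heads, head_dim)
    self.v_head_dim = v_head_dim

  @property
  def out_head_dim(self) -> int:
    return self.v_head_dim  # MLA outputs are value vectors of width v_head_dim

print(MLAttention(num_query_heads=128, head_dim=192, v_head_dim=128).out_proj_in_features())
# -> (128, 128)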
