
Commit 94d2010

Merge pull request #3529 from AI-Hypercomputer:dsv32_decode_clean
PiperOrigin-RevId: 894126759
2 parents e735af1 + f4a4f21 commit 94d2010

8 files changed

Lines changed: 695 additions & 75 deletions


benchmarks/api_server/encoding/encoding_dsv32.py

Lines changed: 403 additions & 0 deletions
Large diffs are not rendered by default.
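
The new 403-line encoding module is collapsed in this view. From its call sites in maxtext_server.py and server_utils.py below, its response-parsing contract is roughly the following. This is a hypothetical sketch only, not the module's contents; the <think>/</think> delimiter is an assumption borrowed from other DeepSeek reasoning models and is not shown by this diff:

    # Hypothetical sketch of parse_message_from_completion_text -- NOT the real encoding_dsv32.py.
    # Assumes the model wraps its reasoning in <think>...</think>; the real delimiters may differ.
    def parse_message_from_completion_text(text: str, thinking_mode: str = "thinking") -> dict:
      """Splits a raw completion into a reasoning block and the final answer."""
      if thinking_mode != "thinking" or "</think>" not in text:
        return {"content": text, "reasoning_content": None}
      reasoning, _, content = text.partition("</think>")
      reasoning = reasoning.replace("<think>", "", 1)
      return {"content": content.lstrip("\n"), "reasoning_content": reasoning.strip()}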

benchmarks/api_server/maxtext_server.py

Lines changed: 19 additions & 9 deletions
@@ -1,4 +1,4 @@
-# Copyright 2023–2025 Google LLC
+# Copyright 2023–2026 Google LLC
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -59,6 +59,7 @@
     ChatMessage,
 )
 from benchmarks.api_server import server_utils
+from benchmarks.api_server.encoding import encoding_dsv32

 # ----------------------------
 # Init
@@ -95,10 +96,13 @@
 response_dict = {}
 response_lock = threading.Lock()

-# Batching configuration
-BATCH_TIMEOUT_S = 0.1  # 100ms
+# Batching configuration.
+BATCH_TIMEOUT_S = float(os.environ.get("MAXTEXT_BATCH_TIMEOUT_S", "0.1"))
 # Timeout for a client waiting for a response.
 REQUEST_TIMEOUT_S = int(os.environ.get("MAXTEXT_REQUEST_TIMEOUT_S", "36000"))
+# Define a maximum size for the request payload to be broadcasted.
+# This avoids broadcasting variable-sized arrays, which can be complex.
+MAX_REQUEST_SIZE = int(os.environ.get("MAXTEXT_REQUEST_SIZE", "655360"))


 async def _queue_and_wait_for_response(request: Union[CompletionRequest, ChatCompletionRequest]):
@@ -165,14 +169,11 @@ def run_server():
   uvicorn.run(app, host="0.0.0.0", port=8000)


-# Define a maximum size for the request payload to be broadcasted.
-# This avoids broadcasting variable-sized arrays, which can be complex.
-MAX_REQUEST_SIZE = 65536 * 10
-
-
 def _build_chat_completion_response(request, completion_result, llm):
   """Builds a ChatCompletionResponse from a single completion result."""
   text_out = completion_result.text
+  reasoning_out = None
+
   if "gpt-oss" in request.model and harmony_enc:
     try:
       parsed_messages = harmony_enc.parse_messages_from_completion_tokens(completion_result.tokens, role=Role.ASSISTANT)
@@ -186,6 +187,15 @@ def _build_chat_completion_response(request, completion_result, llm):
     except (ValueError, IndexError) as e:
       logger.error("Harmony parsing failed for gpt-oss: %s. Falling back to raw text.", e, exc_info=True)

+  if server_utils.is_dsv32_encoding_enabled(request.model):
+    try:
+      # DeepSeek-V3.2 models often generate thinking block.
+      parsed = encoding_dsv32.parse_message_from_completion_text(text_out, thinking_mode="thinking")
+      text_out = parsed.get("content", text_out)
+      reasoning_out = parsed.get("reasoning_content")
+    except (AssertionError, ValueError, IndexError) as e:
+      logger.error("DeepSeek-V3.2 parsing failed: %s. Falling back to raw text.", e, exc_info=True)
+
   want_top_logprobs = (
       (request.top_logprobs or 0) > 0 if isinstance(request, ChatCompletionRequest) else (request.logprobs or 0) > 0
   )
@@ -206,7 +216,7 @@
       choices=[
           ChatCompletionChoice(
               index=0,
-              message=ChatMessage(role="assistant", content=text_out),
+              message=ChatMessage(role="assistant", content=text_out, reasoning_content=reasoning_out),
              finish_reason=finish_reason,
              logprobs=lp_payload,
          )
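
A hypothetical client-side view of the new behavior. The endpoint path and model name below are assumptions made for illustration; the diff only shows that the server runs on port 8000 via uvicorn and that assistant messages now carry an optional reasoning_content field:

    # Hypothetical request -- endpoint path and model name are illustrative assumptions.
    import requests

    resp = requests.post(
        "http://localhost:8000/v1/chat/completions",
        json={
            "model": "deepseek3.2",
            "messages": [{"role": "user", "content": "What is 17 * 24?"}],
        },
        timeout=600,
    )
    message = resp.json()["choices"][0]["message"]
    print(message.get("reasoning_content"))  # parsed thinking block, or None
    print(message["content"])                # final answer with the thinking block stripped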

benchmarks/api_server/server_models.py

Lines changed: 2 additions & 0 deletions
@@ -172,10 +172,12 @@ class ChatMessage(BaseModel):
   Attributes:
     role: The role of the message's author (e.g., 'user', 'assistant').
     content: The text content of the message.
+    reasoning_content: The text content for reasoning/thinking.
   """

   role: str
   content: str
+  reasoning_content: Optional[str] = None


 class ChatCompletionRequest(SamplingParams):
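
A minimal sketch of the new field's behavior, assuming ChatMessage stays a plain pydantic BaseModel as shown above (only the reasoning_content line is new):

    from typing import Optional

    from pydantic import BaseModel

    class ChatMessage(BaseModel):
      role: str
      content: str
      reasoning_content: Optional[str] = None

    reply = ChatMessage(role="assistant", content="408", reasoning_content="17 * 24 = 408")
    print(reply.model_dump())
    # {'role': 'assistant', 'content': '408', 'reasoning_content': '17 * 24 = 408'}

    legacy = ChatMessage(role="user", content="hi")
    print(legacy.model_dump(exclude_none=True))
    # {'role': 'user', 'content': 'hi'}

Because the field defaults to None and drops out under exclude_none, existing clients and the exclude_none path in server_utils.get_prompts_for_request are unaffected.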

benchmarks/api_server/server_utils.py

Lines changed: 19 additions & 0 deletions
@@ -32,6 +32,7 @@

 from benchmarks.api_server.maxtext_generator import MaxTextGenerator
 from benchmarks.api_server.server_models import LogProbsPayload
+from benchmarks.api_server.encoding import encoding_dsv32

 # ----------------------------
 # Debugging
@@ -41,6 +42,19 @@
 DEBUG_LOG_FILE = os.environ.get("MAXTEXT_DEBUG_LOG_FILE", "benchmarks/api_server/server_debug_log.jsonl")
 logger = logging.getLogger(__name__)

+# Indicate if we should disable specific encoding for DeepSeek-V3.2 family.
+# Encoding is needed for v3.2 and v3.2-speciale, but not deepseek v3.2-exp.
+DISABLE_DSV32_ENCODING = os.environ.get("DISABLE_DSV32_ENCODING", "0") == "1"
+
+
+def is_dsv32_encoding_enabled(model_name: str) -> bool:
+  """
+  Checks if DeepSeek-V3.2 specific encoding should be applied to the given model.
+  """
+  if DISABLE_DSV32_ENCODING:
+    return False
+  return "deepseek3.2" in model_name.lower()
+

 def log_debug_event(request_id: str, event_type: str, content: dict):
   """
@@ -114,6 +128,11 @@ def get_prompts_for_request(req: any, llm: MaxTextGenerator) -> List[str]:
     A list of string prompts.
   """
   if hasattr(req, "messages"):  # ChatCompletionRequest
+    if is_dsv32_encoding_enabled(req.model):
+      messages = [m.model_dump(exclude_none=True) for m in req.messages]
+      encode_config = {"thinking_mode": "thinking", "drop_thinking": True, "add_default_bos_token": True}
+      return [encoding_dsv32.encode_messages(messages, **encode_config)]
+
     messages = [m.model_dump() for m in req.messages]
     formatted_prompt = llm.tokenizer.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
     return [formatted_prompt]
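
Illustrative use of the new gate; the model names are made up and DISABLE_DSV32_ENCODING is assumed to be unset, so the module-level default (encoding enabled) applies:

    import os

    os.environ.pop("DISABLE_DSV32_ENCODING", None)  # must happen before the module is imported
    from benchmarks.api_server import server_utils

    print(server_utils.is_dsv32_encoding_enabled("DeepSeek3.2"))      # True (case-insensitive substring match)
    print(server_utils.is_dsv32_encoding_enabled("deepseek3.2-exp"))  # True; export DISABLE_DSV32_ENCODING=1 to opt out
    print(server_utils.is_dsv32_encoding_enabled("gpt-oss-20b"))      # False

Note the substring check also matches a "deepseek3.2-exp" style name, so per the comment above, that family is opted out via the environment variable rather than by name.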

src/maxtext/layers/attention_mla.py

Lines changed: 102 additions & 10 deletions
@@ -38,6 +38,11 @@
     AxisNames,
     BATCH,
     BATCH_NO_EXP,
+    CACHE_BATCH,
+    CACHE_BATCH_PREFILL,
+    CACHE_SEQUENCE,
+    CACHE_HEADS_NONE,
+    CACHE_KV,
     Config,
     DECODE_BATCH,
     DECODE_LENGTH,
@@ -76,6 +81,9 @@
 from maxtext.utils.globals import EPS


+PLACEHOLDER_SEQ_LEN = 1
+
+
 class Indexer(nnx.Module):
   """Indexer for DeepSeek Sparse Attention (DSA).

@@ -109,6 +117,7 @@ def __init__(
     self.rngs = rngs
     self.dtype = config.dtype
     self.weight_dtype = config.weight_dtype
+    self.max_target_length = config.max_target_length

     self.n_heads = config.indexer_n_heads
     self.head_dim = config.indexer_head_dim
@@ -168,6 +177,31 @@ def __init__(
         rngs=self.rngs,
     )

+  def update_indexer_cache(self, kv_cache, k, decoder_segment_ids, model_mode, previous_chunk):
+    """Updates Indexer buffers by processing KV cache results."""
+    k_expanded = k[:, :, jnp.newaxis, :]
+    p_res, a_res = kv_cache(
+        key=k_expanded,
+        value=k_expanded,
+        decoder_segment_ids=decoder_segment_ids,
+        model_mode=model_mode,
+        use_ragged_attention=self.config.use_ragged_attention,
+        previous_chunk=previous_chunk,
+    )
+
+    # Filter out None values to handle PREFILL vs AR modes uniformly
+    active_results = [res for res in [p_res, a_res] if res is not None]
+
+    if not active_results:
+      return None, None
+
+    # Extract keys (index 0) and segment IDs (index 2)
+    keys = jnp.concatenate([res[0] for res in active_results], axis=1)
+    segs = jnp.concatenate([res[2] for res in active_results], axis=1)
+
+    # squeeze(2) removes the jnp.newaxis added above
+    return keys.squeeze(2), segs
+
   def apply_partial_rope(
       self,
       inputs: Array,
@@ -221,6 +255,10 @@ def __call__(
       inputs_kv: Array,
       inputs_positions: Optional[Array | None] = None,
       attention_mask: Optional[Array | None] = None,
+      decoder_segment_ids: Optional[Array | None] = None,
+      previous_chunk: Any = None,
+      kv_cache: Any = None,
+      model_mode: str = MODEL_MODE_TRAIN,
   ):
     """Computes the index score to determine the top-k relevant tokens.

@@ -245,6 +283,10 @@
         `DEFAULT_MASK_VALUE` (a large negative number) prevent it.
         Returns `None` if no masking is determined to be necessary based on
         the inputs and configuration.
+      decoder_segment_ids: Segment IDs for decoder masking.
+      previous_chunk: Previous chunk info for prefill.
+      kv_cache: Key-value cache used when serving models.
+      model_mode: "train", "prefill", or "autoregressive".

     Returns:
       indexer_mask: A sparse mask [b, t, s] with 0.0 for top-k selected tokens
@@ -259,10 +301,6 @@
       h: Number of Indexer Heads (indexer_n_heads)
       d: Indexer Head Dimension (indexer_head_dim)
     """
-    # NOTE: If sequence length <= topk, indexer always selects all tokens.
-    if self.config.max_target_length <= self.indexer_topk:
-      return None, None, None
-
     bsz, seqlen, _ = inputs_q.shape  # s = t = seqlen
     # ==============================================================================
     # Gradient Isolation Strategy: Main Model vs. Indexer
@@ -300,6 +338,16 @@
     k = self.apply_partial_rope(k, inputs_positions=inputs_positions)
     k = k.squeeze(2)  # [b, s, 1, d] -> [b, s, d]

+    # Update and retrieve from cache if not training
+    cached_s = None
+    if model_mode != MODEL_MODE_TRAIN:
+      k_cached, cached_s = self.update_indexer_cache(kv_cache, k, decoder_segment_ids, model_mode, previous_chunk)
+      k = k_cached if k_cached is not None else k
+
+    # NOTE: If the total available sequence length <= topk, indexer always selects all tokens.
+    if k.shape[1] <= self.indexer_topk:
+      return None, None, None
+
     # Compute Index Scores
     # QK product: relu(q @ k.T), [b, t, s, h]
     # Similar to MQA, each key is shared by h query head
@@ -313,6 +361,12 @@
     # Aggregate head-wise logits: logits @ weights
     indexer_score = jnp.einsum("btsh, bth -> bts", logits, weights, precision=self.config.matmul_precision)  # [b, t, s]

+    internal_padding_mask = None
+    if cached_s is not None:
+      # cached_s marks valid tokens from the original prefill step and all subsequent AR steps
+      internal_padding_mask = jnp.where(cached_s > 0, 0.0, DEFAULT_MASK_VALUE)
+      indexer_score += internal_padding_mask[:, None, :]
+
     # Apply attention mask before TopK
     if attention_mask is not None:
       indexer_score += attention_mask
@@ -321,12 +375,15 @@
     _, topk_indices = jax.lax.top_k(indexer_score, k=self.indexer_topk)  # topk_indices [b, t, k]

     # Create Sparse Index Mask: 0 and large negatives
-    indexer_mask = self.generate_mask(topk_indices, seqlen)  # [b, t, s]
+    indexer_mask = self.generate_mask(topk_indices, k.shape[1])  # [b, t, s]

     # Re-apply attention mask after TopK: in case number of unmasked tokens < TopK
     if attention_mask is not None:
       indexer_mask += attention_mask

+    if internal_padding_mask is not None:
+      indexer_mask += internal_padding_mask[:, None, :]
+
     return indexer_mask, topk_indices, indexer_score


@@ -645,10 +702,41 @@ def __init__(
           quant=quant,
           model_mode=model_mode,
       )
+      self.IndexerKVCache_0 = self.init_indexer_cache(inputs_kv_shape) if model_mode != MODEL_MODE_TRAIN else None
+    else:
+      self.indexer = None
+      self.IndexerKVCache_0 = None

     # Module attribute names must match names previously passed to Linen for checkpointing
     self.MlaKVCache_0 = self.init_mla_kv_caches(inputs_kv_shape) if model_mode != MODEL_MODE_TRAIN else None

+  def init_indexer_cache(self, inputs_kv_shape: Tuple):
+    """Initializes Indexer Cache."""
+    batch_size, _, _ = inputs_kv_shape
+    # Use standard KVCache to store keys. Values are unused but required by KVCache API.
+    # KVCache expects key_heads and value_heads. Since k is shared (MQA-like for Indexer),
+    # we use key_heads=1, value_heads=1.
+    return kvcache.KVCache(
+        max_prefill_length=self.max_prefill_predict_length,
+        max_target_length=self.max_target_length,
+        batch=batch_size,
+        key_seq_len=PLACEHOLDER_SEQ_LEN,
+        value_seq_len=PLACEHOLDER_SEQ_LEN,
+        key_heads=1,
+        value_heads=1,
+        key_head_size=self.config.indexer_head_dim,
+        value_head_size=self.config.indexer_head_dim,
+        dtype=self.dtype,
+        kv_quant=None,  # Quantization is not yet supported by the indexer.
+        prefill_cache_logical_axis_names=(CACHE_BATCH_PREFILL, CACHE_SEQUENCE, CACHE_HEADS_NONE, CACHE_KV),
+        cache_logical_axis_names=(CACHE_BATCH, CACHE_SEQUENCE, CACHE_HEADS_NONE, CACHE_KV),
+        prefill_cache_axis_order=(1, 2, 0, 3),
+        ar_cache_axis_order=(1, 2, 0, 3),
+        use_chunked_prefill=self.config.use_chunked_prefill,
+        model_mode=self.model_mode,
+        rngs=self.rngs,
+    )
+
   def _init_projections(self, inputs_q_shape: Tuple, inputs_kv_shape: Tuple) -> None:
     """Initializes the MLA-specific projections."""
     # Assert required configuration parameters for MLA attention.
@@ -881,14 +969,13 @@ def init_mla_kv_caches(self, inputs_kv_shape: Tuple):
     # and max_target_length, not the passed seq_len.
     # We can use a placeholder value. The correct fix might involve refactoring
     # MlaKVCache.
-    placeholder_seq_len = 1

     return kvcache.MlaKVCache(
         max_prefill_length=self.max_prefill_predict_length,
         max_target_length=self.max_target_length,
         batch=batch_size,
-        key_seq_len=placeholder_seq_len,
-        value_seq_len=placeholder_seq_len,
+        key_seq_len=PLACEHOLDER_SEQ_LEN,
+        value_seq_len=PLACEHOLDER_SEQ_LEN,
         key_head_size=self.kv_lora_rank,
         value_head_size=self.qk_rope_head_dim,
         dtype=self.dtype,
@@ -1100,6 +1187,9 @@ def __call__(
     inputs_kv = self._maybe_shard_with_logical(inputs_kv, self.input_axis_names)
     out_logical_name = (BATCH, LENGTH_NO_EXP, HEAD, D_KV)

+    if model_mode != MODEL_MODE_TRAIN and decoder_segment_ids is None:
+      decoder_segment_ids = jnp.ones(inputs_q.shape[:2], dtype=jnp.int32)
+
     query, low_rank_q = self.mla_query_projection(inputs_q, inputs_positions, model_mode)
     if self.config.force_q_layout:
       query = layout.with_layout_constraint(query, DLL(major_to_minor=(0, 2, 3, 1)))
@@ -1113,8 +1203,6 @@
     # Indexer Logic
     indexer_mask = None
     if self.use_indexer:
-      if model_mode != MODEL_MODE_TRAIN:
-        raise NotImplementedError("Sparse indexer has not implemented for inference yet.")
       # generate mask: with 0 and large negative, [b, 1, 1, q_len, kv_len] -> [b, q_len, kv_len]
       attention_mask = self.attention_op.generate_attention_mask(
           query, key, decoder_segment_ids, model_mode, previous_chunk, bidirectional_mask
@@ -1128,6 +1216,10 @@
           inputs_kv=inputs_kv,
           inputs_positions=inputs_positions,
           attention_mask=attention_mask,
+          decoder_segment_ids=decoder_segment_ids,
+          previous_chunk=previous_chunk,
+          kv_cache=self.IndexerKVCache_0,
+          model_mode=model_mode,
       )

     if indexer_mask is not None and self.config.indexer_loss_scaling_factor > 0.0:
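
As a companion to the Indexer changes above, a small self-contained JAX sketch (not the MaxText implementation) of the sparse top-k mask contract this diff relies on: 0.0 for the top-k keys per query position, a large negative value elsewhere, with the mask sized by the key axis (k.shape[1]) rather than the query length; the mask constant is defined locally to stand in for MaxText's DEFAULT_MASK_VALUE:

    import jax
    import jax.numpy as jnp

    DEFAULT_MASK_VALUE = -0.7 * float(jnp.finfo(jnp.float32).max)  # stand-in constant

    def sparse_topk_mask(indexer_score: jax.Array, topk: int) -> jax.Array:
      """indexer_score: [b, t, s] -> mask [b, t, s] with 0.0 on the top-k keys per query."""
      _, topk_indices = jax.lax.top_k(indexer_score, k=topk)           # [b, t, k]
      kv_len = indexer_score.shape[-1]
      one_hot = jax.nn.one_hot(topk_indices, kv_len, dtype=jnp.bool_)  # [b, t, k, s]
      keep = jnp.any(one_hot, axis=2)                                  # [b, t, s]
      return jnp.where(keep, 0.0, DEFAULT_MASK_VALUE)

    scores = jax.random.normal(jax.random.PRNGKey(0), (1, 4, 8))
    mask = sparse_topk_mask(scores, topk=3)
    print((mask == 0.0).sum(axis=-1))  # 3 kept keys per query position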

src/maxtext/layers/attention_op.py

Lines changed: 11 additions & 2 deletions
@@ -677,7 +677,7 @@ def generate_attention_mask(
       Chunked Prefills - ArXiv:2308.16369 (https://arxiv.org/abs/2308.16369)
     """
     mask = None
-    if model_mode == MODEL_MODE_AUTOREGRESSIVE:
+    if model_mode == MODEL_MODE_AUTOREGRESSIVE and decoder_segment_ids is not None:
       mask = decoder_segment_ids[:, None, None, None, :] == DECODING_ACTIVE_SEQUENCE_INDICATOR
     elif decoder_segment_ids is not None:
       mask = decoder_segment_ids[:, :, None] == decoder_segment_ids[:, None, :]
@@ -2047,6 +2047,14 @@ def __call__(
       assert prefill_kv_cache
       key, value, decoder_segment_ids = prefill_kv_cache

+      indexer_mask_prefill = None
+      indexer_mask_ar = None
+      if indexer_mask is not None:
+        prefill_len = key.shape[1]
+        indexer_mask_prefill = indexer_mask[:, :, :prefill_len]
+        if ar_kv_cache is not None:
+          indexer_mask_ar = indexer_mask[:, :, prefill_len:]
+
       prefill_unnormalized_output, prefill_exponentials_max, prefill_exponentials_sum = self.apply_attention(
           query=query,
           key=key,
@@ -2058,7 +2066,7 @@
           previous_chunk=previous_chunk,
           bidirectional_mask=bidirectional_mask,
           sinks=sinks,
-          indexer_mask=indexer_mask,
+          indexer_mask=indexer_mask_prefill,
           record_max_logits=record_max_logits,
           qk_product_einsum=self.AqtEinsum_0,
           wv_product_einsum=self.AqtEinsum_1,
@@ -2081,6 +2089,7 @@
           model_mode=model_mode,
           use_ragged_attention=self.use_ragged_attention,
           bidirectional_mask=bidirectional_mask,
+          indexer_mask=indexer_mask_ar,
           qk_product_einsum=self.AqtEinsum_2,
           wv_product_einsum=self.AqtEinsum_3,
       )
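
A tiny illustration of the new split, with made-up shapes: indexer_mask covers the full key axis (prefill cache followed by the autoregressive cache), so it is sliced at the prefill boundary before being passed to the two apply_attention calls:

    import jax.numpy as jnp

    b, t, prefill_len, ar_len = 1, 1, 6, 2
    indexer_mask = jnp.zeros((b, t, prefill_len + ar_len))

    indexer_mask_prefill = indexer_mask[:, :, :prefill_len]   # goes with the prefill KV cache
    indexer_mask_ar = indexer_mask[:, :, prefill_len:]        # goes with the AR KV cache
    print(indexer_mask_prefill.shape, indexer_mask_ar.shape)  # (1, 1, 6) (1, 1, 2)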
