Commit 1f04ad1

Merge pull request #3540 from AI-Hypercomputer:nicogrande/async-rollouts
PiperOrigin-RevId: 894247267
2 parents: d5cbf3c + daac9e0

4 files changed: 182 additions & 39 deletions


src/maxtext/configs/post_train/rl.yml

Lines changed: 15 additions & 0 deletions
```diff
@@ -54,6 +54,21 @@ rl:
   grpo_epsilon: 0.2
   loss_algo: 'grpo' # grpo or gspo-token
 
+  # ====== Agentic Rollout ======
+  # If True, uses the async AgenticGRPOLearner, which overlaps rollout generation
+  # with training for faster throughput via online vLLM inference.
+  use_agentic_rollout: False
+  # Max concurrent rollout requests when using agentic rollout.
+  max_concurrency: 256
+  # Number of off-policy steps tolerated before requiring a policy update.
+  off_policy_steps: 0
+  # System prompt injected into the agent at rollout time.
+  system_prompt: ''
+  # If True, mask degenerate groups (all-zero advantages) from contributing to the loss.
+  degenerate_group_masking: True
+  # Upper-bound clipping epsilon for GRPO loss; defaults to grpo_epsilon when null.
+  epsilon_high: null
+
 
 # ====== Models ======
 # for MaxText
```
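The `epsilon_high` knob above enables asymmetric clipping: the lower bound still comes from `grpo_epsilon`, while the upper bound can be loosened independently. A minimal JAX sketch of how such a fallback and clipped objective typically fit together (illustrative only, not the Tunix loss implementation):

```python
import jax.numpy as jnp

def clipped_objective(ratio, advantages, epsilon=0.2, epsilon_high=None):
  # epsilon_high falls back to the symmetric epsilon when unset (null in YAML).
  eps_high = epsilon if epsilon_high is None else epsilon_high
  clipped = jnp.clip(ratio, 1.0 - epsilon, 1.0 + eps_high)
  # PPO/GRPO-style pessimistic objective: take the smaller of the two terms.
  return jnp.minimum(ratio * advantages, clipped * advantages)
```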

src/maxtext/configs/types.py

Lines changed: 14 additions & 0 deletions
```diff
@@ -1709,6 +1709,20 @@ class RL(BaseModel):
   grpo_beta: float = Field(0.08, description="Coefficient for the KL divergence penalty (β).")
   grpo_epsilon: float = Field(0.2, description="Epsilon value for clipping in the GRPO loss.")
   loss_algo: Literal["grpo", "gspo-token"] = Field("grpo", description="Loss algorithm, i.e., 'grpo' or 'gspo-token'.")
+  use_agentic_rollout: bool = Field(
+      False, description="If True, uses the asynchronous AgenticGRPOLearner for online vLLM rollouts."
+  )
+  max_concurrency: int = Field(256, description="Maximum number of concurrent rollout requests (agentic rollout only).")
+  off_policy_steps: int = Field(
+      0, description="Number of off-policy steps tolerated before requiring a policy update (agentic only)."
+  )
+  system_prompt: str = Field("", description="System prompt injected into the agent at rollout time (agentic only).")
+  degenerate_group_masking: bool = Field(
+      True, description="Mask degenerate groups (all-zero advantages) from contributing to loss (agentic only)."
+  )
+  epsilon_high: Optional[float] = Field(
+      None, description="Upper-bound clipping epsilon for GRPO loss. Defaults to epsilon when None (agentic only)."
+  )
 
 
 class RLDataset(BaseModel):
```
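Since these are plain Pydantic fields, the YAML defaults in rl.yml and the `Field` defaults here must be kept in sync by hand. A pared-down stand-in (hypothetical, not the real `RL` class) showing how the validated config behaves:

```python
from typing import Optional
from pydantic import BaseModel, Field

class RL(BaseModel):  # hypothetical stand-in for maxtext.configs.types.RL
  grpo_epsilon: float = Field(0.2)
  use_agentic_rollout: bool = Field(False)
  max_concurrency: int = Field(256)
  epsilon_high: Optional[float] = Field(None)

rl = RL(use_agentic_rollout=True)
# Omitted fields take their declared defaults; epsilon_high stays None until
# the loss code resolves it against grpo_epsilon.
assert rl.max_concurrency == 256 and rl.epsilon_high is None
```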

src/maxtext/trainers/post_train/rl/train_rl.py

Lines changed: 65 additions & 19 deletions
```diff
@@ -67,6 +67,7 @@
 from tunix.rl import rl_cluster as rl_cluster_lib
 from tunix.rl.rollout import base_rollout
 from tunix.rl.grpo.grpo_learner import GrpoConfig, GrpoLearner
+from tunix.rl.agentic.agentic_grpo_learner import GrpoConfig as AgenticGrpoConfig, GrpoLearner as AgenticGrpoLearner
 from tunix.sft import metrics_logger, profiler
 
 # for vLLM we can skip JAX precompilation with this flag, it makes startup faster
```
```diff
@@ -386,6 +387,16 @@ def _filter_long_prompts(x):
     return len(tokens) <= trainer_config.max_prefill_predict_length
 
   train_dataset = train_dataset.filter(_filter_long_prompts)
+
+  # AgenticGRPOLearner uses a built-in chat parser that expects raw prompts
+  if getattr(trainer_config.rl, "use_agentic_rollout", False):
+
+    def _use_raw_prompt(x):
+      x["prompts"] = x["question"]
+      return x
+
+    train_dataset = train_dataset.map(_use_raw_prompt)
+
   dataset_size = int(trainer_config.num_batches * trainer_config.batch_size * trainer_config.train_fraction)
   train_dataset = train_dataset[:dataset_size]
   train_dataset = train_dataset.repeat(trainer_config.num_epoch)
```
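The remap is a plain dictionary update; with a hypothetical record it behaves as follows (illustrative values, not the real dataset):

```python
# Hypothetical record: "prompts" holds the chat-templated text produced by
# process_data, "question" the raw problem statement.
example = {"prompts": "<chat-templated prompt>", "question": "What is 2 + 2?"}

def _use_raw_prompt(x):
  x["prompts"] = x["question"]
  return x

# After the map, the learner's own chat parser sees only the raw question.
assert _use_raw_prompt(example)["prompts"] == "What is 2 + 2?"
```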
```diff
@@ -525,27 +536,24 @@ def create_rl_components(
           rollout_vllm_max_num_batched_tokens=trainer_config.max_num_batched_tokens,
           rollout_vllm_max_num_seqs=trainer_config.max_num_seqs,
           rollout_vllm_async_scheduling=trainer_config.async_scheduling,
+          rollout_vllm_server_mode=trainer_config.rl.use_agentic_rollout,
           rollout_vllm_kwargs={
               "hf_overrides": trainer_config.vllm_hf_overrides,
               "enable_expert_parallel": sampler_config.rollout_expert_parallelism > 1,
+              "enable_prefix_caching": True,  # Enable prefix caching to speed up generation for long prompts
           },
           rollout_vllm_sampling_kwargs={
               "stop": trainer_config.stop_strings,
               "detokenize": trainer_config.stop_strings is not None,
               "include_stop_str_in_output": trainer_config.stop_strings is not None,
           },
+          # AgenticGRPOLearner requires log-probabilities from the rollout engine
+          # to support off-policy filtering and multi-iteration training.
+          **({"return_logprobs": True} if trainer_config.rl.use_agentic_rollout else {}),
           **get_rollout_kwargs_for_parallelism(sampler_config, len(sampler_devices)),
       ),
   )
 
-  grpo_config = GrpoConfig(
-      num_generations=trainer_config.rl.num_generations,
-      num_iterations=trainer_config.rl.num_iterations,
-      beta=trainer_config.rl.grpo_beta,
-      epsilon=trainer_config.rl.grpo_epsilon,
-      loss_algo=trainer_config.rl.loss_algo,
-  )
-
   # Create RL cluster
   max_logging.log("Creating RL cluster...")
   rl_cluster_kwargs = {}
```
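The returned log-probabilities let the learner measure how stale a rollout is relative to the current policy. A hedged sketch of the kind of staleness bound `off_policy_steps` implies (an assumption for illustration, not Tunix's actual implementation):

```python
def is_sample_usable(sample_version: int, policy_version: int, off_policy_steps: int) -> bool:
  """Hypothetical staleness gate: accept a rollout only if the policy has
  advanced at most `off_policy_steps` updates since it was generated."""
  # off_policy_steps == 0 (the default) admits strictly on-policy samples only.
  return (policy_version - sample_version) <= off_policy_steps
```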
```diff
@@ -578,19 +586,57 @@ def _reward_fn(**kwargs):
 
     return _reward_fn
 
+  reward_fns = [  # type: ignore
+      make_reward_fn(utils_rl.match_format_exactly),
+      make_reward_fn(utils_rl.match_format_approximately),
+      # TODO(atwigg): comment out to simplify reward and overlap with check_numbers
+      make_reward_fn(utils_rl.check_answer),
+      make_reward_fn(utils_rl.check_numbers),
+  ]
+
   # Create RL trainer
   max_logging.log("Setting up RL trainer...")
-  rl_trainer = GrpoLearner(
-      rl_cluster=rl_cluster,
-      reward_fns=[  # type: ignore
-          make_reward_fn(utils_rl.match_format_exactly),
-          make_reward_fn(utils_rl.match_format_approximately),
-          # TODO(atwigg): comment out to simplify reward and overlap with check_numbers
-          make_reward_fn(utils_rl.check_answer),
-          make_reward_fn(utils_rl.check_numbers),
-      ],
-      algo_config=grpo_config,
-  )
+  if trainer_config.rl.use_agentic_rollout:
+    max_logging.log("Using AgenticGRPOLearner with async online rollouts.")
+    grpo_config = AgenticGrpoConfig(
+        num_generations=trainer_config.rl.num_generations,
+        num_iterations=trainer_config.rl.num_iterations,
+        beta=trainer_config.rl.grpo_beta,
+        epsilon=trainer_config.rl.grpo_epsilon,
+        loss_algo=trainer_config.rl.loss_algo,
+        max_response_length=trainer_config.max_target_length - trainer_config.max_prefill_predict_length,
+        max_concurrency=trainer_config.rl.max_concurrency,
+        off_policy_steps=trainer_config.rl.off_policy_steps,
+        system_prompt=trainer_config.rl.system_prompt,
+        degenerate_group_masking=trainer_config.rl.degenerate_group_masking,
+        epsilon_high=trainer_config.rl.epsilon_high,
+    )
+    # Instantiate the custom MaxText chat parser
+    template_config = load_template_from_file(trainer_config.chat_template_path)
+    chat_parser = utils_rl.MaxTextChatParser(
+        model_tokenizer=model_tokenizer, template_config=template_config, tmvp_config=trainer_config
+    )
+    rl_trainer = AgenticGrpoLearner(
+        rl_cluster=rl_cluster,
+        reward_fns=reward_fns,
+        algo_config=grpo_config,
+        chat_parser=chat_parser,
+        metric_fns=[utils_rl.get_correctness_metrics],
+    )
+  else:
+    max_logging.log("Using standard GRPOLearner with offline rollouts.")
+    grpo_config = GrpoConfig(
+        num_generations=trainer_config.rl.num_generations,
+        num_iterations=trainer_config.rl.num_iterations,
+        beta=trainer_config.rl.grpo_beta,
+        epsilon=trainer_config.rl.grpo_epsilon,
+        loss_algo=trainer_config.rl.loss_algo,
+    )
+    rl_trainer = GrpoLearner(
+        rl_cluster=rl_cluster,
+        reward_fns=reward_fns,
+        algo_config=grpo_config,
+    )
 
   return rl_cluster, rl_trainer, optimizer
```
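Note the derived budget passed to `AgenticGrpoConfig`: the response length is whatever remains of the sequence budget after the prompt allowance. With illustrative numbers (the real values come from `trainer_config`):

```python
max_target_length = 2048           # total tokens per sequence (hypothetical)
max_prefill_predict_length = 512   # tokens reserved for the prompt (hypothetical)
max_response_length = max_target_length - max_prefill_predict_length
assert max_response_length == 1536  # tokens left for the generated rollout
```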

src/maxtext/trainers/post_train/rl/utils_rl.py

Lines changed: 88 additions & 20 deletions
```diff
@@ -17,13 +17,17 @@
 import re
 import optax
 from maxtext.utils import max_logging
+import numpy as np
 
 
 from math_verify.errors import TimeoutException
 from math_verify.metric import math_metric
 from math_verify.parser import ExprExtractionConfig, LatexExtractionConfig
 from math_verify import parse
 
+from tunix.rl.agentic.parser.chat_template_parser import parser as agentic_chat_template_parser
+
+
 # initialize math_verify_func once
 math_verify_func = math_metric(
     gold_extraction_target=(LatexExtractionConfig(),),
```
```diff
@@ -514,6 +518,23 @@ def make_optimizer(learning_rate):
   return optax.inject_hyperparams(make_optimizer)(learning_rate=schedule)
 
 
+def format_maxtext_messages(messages: list[dict[str, str]], template_config: dict, tmvp_config) -> list[dict[str, str]]:
+  """Helper to inject MaxText's system prompt into the input user messages."""
+  formatted_messages = []
+  for msg in messages:
+    formatted_content = template_config["TEMPLATE"].format(
+        system_prompt=template_config["SYSTEM_PROMPT"].format(
+            reasoning_start_token=tmvp_config.reasoning_start_token,
+            reasoning_end_token=tmvp_config.reasoning_end_token,
+            solution_start_token=tmvp_config.solution_start_token,
+            solution_end_token=tmvp_config.solution_end_token,
+        ),
+        question=msg,
+    )
+    formatted_messages.append({"role": "user", "content": formatted_content})
+  return formatted_messages
+
+
 def process_data(dataset_name, model_tokenizer, template_config, tmvp_config, x):
   """Function to process input dataset"""
 
```
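How the helper behaves with hypothetical template and token values (`process_data` below passes a list of raw question strings, each filling the `question` slot of one user message):

```python
from types import SimpleNamespace
# Assumes the module path from this commit; adjust if your layout differs.
from maxtext.trainers.post_train.rl.utils_rl import format_maxtext_messages

# Hypothetical template_config and special tokens, for illustration only.
template_config = {
    "SYSTEM_PROMPT": "Reason between {reasoning_start_token} and {reasoning_end_token}; "
                     "answer between {solution_start_token} and {solution_end_token}.",
    "TEMPLATE": "{system_prompt}\n\n{question}",
}
tmvp_config = SimpleNamespace(
    reasoning_start_token="<think>",
    reasoning_end_token="</think>",
    solution_start_token="<answer>",
    solution_end_token="</answer>",
)

formatted = format_maxtext_messages(["What is 7 * 6?"], template_config, tmvp_config)
# -> [{"role": "user", "content": "Reason between <think> and </think>; "
#     "answer between <answer> and </answer>.\n\nWhat is 7 * 6?"}]
```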
```diff
@@ -552,28 +573,75 @@ def _to_str(val):
   if dataset_name == "gsm8k":
     answer = extract_hash_answer(answer)
 
+  messages = [question]
+  formatted_messages = format_maxtext_messages(messages, template_config, tmvp_config)
+
+  prompts = model_tokenizer.apply_chat_template(
+      formatted_messages,
+      tokenize=False,
+      add_generation_prompt=True,
+  )
+
   return {
-      # passed to model forward pass
-      "prompts": model_tokenizer.apply_chat_template(
-          [
-              {
-                  "role": "user",
-                  "content": template_config["TEMPLATE"].format(
-                      system_prompt=template_config["SYSTEM_PROMPT"].format(
-                          reasoning_start_token=tmvp_config.reasoning_start_token,
-                          reasoning_end_token=tmvp_config.reasoning_end_token,
-                          solution_start_token=tmvp_config.solution_start_token,
-                          solution_end_token=tmvp_config.solution_end_token,
-                      ),
-                      question=question,
-                  ),
-              },
-          ],
-          tokenize=False,
-          add_generation_prompt=True,
-      ),
-      # passed to reward functions
+      # pre-formatted prompts for evaluation
+      "prompts": prompts,
+      # raw question for AgenticGRPOLearner to bypass formatting
       "question": question,
       # passed to reward functions
       "answer": answer,
   }
+
+
+def get_correctness_metrics(prompts, completions, rewards, advantages, **kwargs):
+  """Compute correctness statistics based on rewards."""
+  del prompts, completions, advantages, kwargs
+  solve_all = (rewards > 0.1).all()
+  solve_none = (rewards == 0).all()
+  solve_partial = (~solve_all) and (~solve_none)
+  solve_ratio = (rewards > 0.1).mean()
+  return {
+      "rewards/solve_all": (
+          1 if solve_all else 0,
+          np.mean,
+      ),
+      "rewards/solve_none": (
+          1 if solve_none else 0,
+          np.mean,
+      ),
+      "rewards/solve_partial": (
+          1 if solve_partial else 0,
+          np.mean,
+      ),
+      "rewards/solve_ratio": (
+          solve_ratio,
+          np.mean,
+      ),
+  }
+
+
+class MaxTextChatParser(agentic_chat_template_parser.DefaultChatTemplateParser):
+  """
+  Custom chat parser for MaxText that intercepts message lists dynamically
+  during agentic rollouts and injects the necessary system templates and
+  special tokens using the shared helper.
+  """
+
+  def __init__(self, model_tokenizer, template_config, tmvp_config):
+    super().__init__(model_tokenizer)
+    self.template_config = template_config
+    self.tmvp_config = tmvp_config
+
+  def parse(
+      self,
+      messages: list[dict[str, str]],
+      add_generation_prompt: bool = False,
+      is_first_msg: bool = False,
+  ) -> str:
+    """Overrides the default parse method to apply MaxText-specific formatting to the messages."""
+    # Apply MaxText-specific formatting to the messages
+    formatted_messages = format_maxtext_messages(messages, self.template_config, self.tmvp_config)
+
+    # Delegate to the Tunix default parser to apply the tokenizer's chat template
+    return super().parse(
+        messages=formatted_messages, add_generation_prompt=add_generation_prompt, is_first_msg=is_first_msg
+    )
```