Skip to content

Commit 24e42d1

Browse files
committed
feat(generate): add reasoning effort for o1 and o3
1 parent 3314ebe commit 24e42d1

4 files changed

Lines changed: 12 additions & 2 deletions

File tree

bigcodebench/gen/util/openai_request.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
```diff
@@ -10,16 +10,19 @@ def make_request(
     model: str,
     max_tokens: int = 512,
     temperature: float = 1,
+    reasoning_effort: str = "medium",
     n: int = 1,
     **kwargs
 ) -> ChatCompletion:
     kwargs["top_p"] = 0.95
     kwargs["max_completion_tokens"] = max_tokens
     kwargs["temperature"] = temperature
-    if model.startswith("o1-"): # pop top-p and max_completion_tokens
+    if model.startswith("o1-") or model.startswith("o3-"): # pop top-p and max_completion_tokens
         kwargs.pop("top_p")
         kwargs.pop("max_completion_tokens")
         kwargs.pop("temperature")
+        kwargs["reasoning_effort"] = reasoning_effort
+
     return client.chat.completions.create(
         model=model,
         messages=[
```

bigcodebench/generate.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
```diff
@@ -132,6 +132,7 @@ def run_codegen(
     temperature: float = 0.0,
     max_new_tokens: int = 1280,
     greedy: bool = False,
+    reasoning_effort: str = "medium", # o1 and o3 only
     strip_newlines: bool = False,
     direct_completion: bool = False,
     resume: bool = True,
@@ -175,6 +176,7 @@ def run_codegen(
         split=split,
         temperature=temperature,
         max_new_tokens=max_new_tokens,
+        reasoning_effort=reasoning_effort,
         instruction_prefix=instruction_prefix,
         response_prefix=response_prefix,
         base_url=base_url,
```

bigcodebench/provider/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
```diff
@@ -9,6 +9,8 @@ def make_model(
     dataset: str = "bigcodebench",
     temperature: float = 0.0,
     max_new_tokens: int = 1280,
+    # o1 and o3 only
+    reasoning_effort: str = "medium",
     # instruction model only
     instruction_prefix: str = None,
     response_prefix: str = None,
@@ -73,6 +75,7 @@ def make_model(
         split=split,
         temperature=temperature,
         max_new_tokens=max_new_tokens,
+        reasoning_effort=reasoning_effort,
         base_url=base_url,
         instruction_prefix=instruction_prefix,
         response_prefix=response_prefix,
```

bigcodebench/provider/openai.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
```diff
@@ -9,9 +9,10 @@
 from bigcodebench.provider.utility import concurrent_call

 class OpenAIChatDecoder(DecoderBase):
-    def __init__(self, name: str, base_url=None, **kwargs) -> None:
+    def __init__(self, name: str, base_url=None, reasoning_effort="medium", **kwargs) -> None:
         super().__init__(name, **kwargs)
         self.base_url = base_url
+        self.reasoning_effort = reasoning_effort

     def codegen(
         self, prompts: List[str], do_sample: bool = True, num_samples: int = 200
@@ -45,6 +46,7 @@ def _codegen_api_batch(self, messages: List[str], num_samples: int) -> List[str]
             model=self.name,
             max_tokens=self.max_new_tokens,
             temperature=self.temperature,
+            reasoning_effort=self.reasoning_effort,
             n=num_samples,
         )
         outputs = []
```

0 commit comments

Comments
 (0)