Skip to content

Commit 468eece

Browse files
committed
add e2b support
1 parent 342aed8 commit 468eece

7 files changed

Lines changed: 113 additions & 7 deletions

File tree

bigcodebench/evaluate.py

Lines changed: 27 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
from typing import Any, Dict, List, Tuple, Optional
1313
from warnings import warn
1414
from gradio_client import Client, handle_file
15+
from e2b import Sandbox
1516

1617
import httpx
1718
import numpy as np
@@ -118,9 +119,10 @@ def evaluate(
118119
subset: str,
119120
samples: Optional[str] = None,
120121
no_execute: bool = False,
121-
local_execute: bool = False,
122+
execution: str = "e2b", # "e2b", "gradio", "local"
122123
selective_evaluate: str = "",
123-
remote_execute_api: str = "https://bigcode-bigcodebench-evaluator.hf.space/",
124+
e2b_endpoint: str = "bigcodebench-evaluator",
125+
gradio_endpoint: str = "https://bigcode-bigcodebench-evaluator.hf.space/",
124126
pass_k: str = "1,5,10",
125127
save_pass_rate: bool = True,
126128
calibrated: bool = True,
@@ -152,10 +154,10 @@ def evaluate(
152154
assert samples.endswith(".jsonl")
153155
result_path = samples.replace(".jsonl", "_eval_results.json")
154156

155-
if not local_execute:
157+
if execution == "gradio":
156158
while True:
157159
try:
158-
client = Client(remote_execute_api)
160+
client = Client(gradio_endpoint)
159161
results, pass_at_k = client.predict(
160162
split=split,
161163
subset=subset,
@@ -178,7 +180,28 @@ def evaluate(
178180
time.sleep(4)
179181
gt_pass_rate = pass_at_k["gt_pass_rate"]
180182
failed_tasks = pass_at_k["failed_tasks"]
183+
184+
elif execution == "e2b":
185+
sandbox = Sandbox(e2b_endpoint, timeout=60*10)
186+
187+
# upload file to sandbox
188+
with open(samples, "r") as file:
189+
sandbox.files.write(samples, file)
181190

191+
# run the evaluation
192+
sandbox.commands.run("python3 -m bigcodebench.evaluate \
193+
--split {} --subset {} --samples {} \
194+
--pass_k {} --save_pass_rate {} --calibrated {} \
195+
--parallel {} --min_time_limit {} --max_as_limit {} \
196+
--max_data_limit {} --max_stack_limit {} --check_gt_only {} --no_gt {} \
197+
".format(split, subset, samples, pass_k, save_pass_rate, calibrated, parallel,
198+
min_time_limit, max_as_limit, max_data_limit, max_stack_limit, check_gt_only, no_gt))
199+
200+
# download the results
201+
content = sandbox.files.read(result_path)
202+
with open(result_path, "w") as file:
203+
file.write(content)
204+
182205
else:
183206

184207
pass_at_k = dict()

bigcodebench/gen/util/openai_request.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ def make_request(
1717
kwargs["top_p"] = 0.95
1818
kwargs["max_completion_tokens"] = max_tokens
1919
kwargs["temperature"] = temperature
20-
if model.startswith("o1-") or model.startswith("o3-"): # pop top-p and max_completion_tokens
20+
if model.startswith("o1-") or model.startswith("o3-") or model.endswith("-reasoner"): # pop top_p, max_completion_tokens and temperature for reasoning models
2121
kwargs.pop("top_p")
2222
kwargs.pop("max_completion_tokens")
2323
kwargs.pop("temperature")

bigcodebench/generate.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -132,7 +132,7 @@ def run_codegen(
132132
temperature: float = 0.0,
133133
max_new_tokens: int = 1280,
134134
greedy: bool = False,
135-
reasoning_effort: str = "medium", # o1 and o3 only
135+
reasoning_effort: str = "medium",
136136
strip_newlines: bool = False,
137137
direct_completion: bool = False,
138138
resume: bool = True,

bigcodebench/provider/openai.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ def codegen(
2828
tokenizer=None,
2929
) for prompt in prompts]
3030
# use concurrency-based batching for o1/o3 and deepseek models
31-
if self.name.startswith("o1-") or self.name == "deepseek-chat":
31+
if self.name.startswith("o1-") or self.name.startswith("o3-") or self.name.startswith("deepseek"):
3232
return self._codegen_batch_via_concurrency(messages, num_samples)
3333

3434
return self._codegen_api_batch(messages, num_samples)

sandbox-templates/e2b.Dockerfile

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
# Prefer a newer Python, since generated code may rely on newer language features
2+
FROM python:3.10-slim
3+
4+
# install system dependencies: git, g++, python3-tk, archive tools, R, GDAL, and matplotlib build requirements
5+
RUN apt-get update && apt-get install -y \
6+
git \
7+
g++ \
8+
python3-tk \
9+
zip \
10+
unzip \
11+
procps \
12+
r-base \
13+
libgdal-dev \
14+
# dependencies required to build matplotlib
15+
libfreetype6-dev \
16+
libpng-dev \
17+
pkg-config \
18+
python3-dev \
19+
python3-matplotlib \
20+
&& rm -rf /var/lib/apt/lists/*
21+
22+
# upgrade to latest pip
23+
RUN pip install --upgrade pip
24+
25+
# Add a new user "bigcodebenchuser"
26+
RUN adduser --disabled-password --gecos "" bigcodebenchuser
27+
28+
RUN rm -rf /bigcodebench
29+
30+
# Acquire benchmark code to local
31+
ADD "https://api.github.com/repos/bigcode-project/bigcodebench/commits?per_page=1" latest_commit
32+
RUN git clone https://github.com/bigcode-project/bigcodebench.git /bigcodebench
33+
34+
RUN pip install numpy==1.24.3 pyarrow==14.0.1
35+
36+
RUN cd /bigcodebench && \
37+
pip install . --no-deps && \
38+
pip install \
39+
appdirs>=1.4.4 \
40+
fire>=0.6.0 \
41+
multipledispatch>=0.6.0 \
42+
pqdm>=0.2.0 \
43+
tempdir>=0.7.1 \
44+
termcolor>=2.0.0 \
45+
tqdm>=4.56.0 \
46+
tree_sitter_languages>=1.10.2 \
47+
tree-sitter==0.21.3 \
48+
wget>=3.2 \
49+
gradio-client \
50+
rich
51+
52+
RUN pip install -I --timeout 2000 -r https://raw.githubusercontent.com/bigcode-project/bigcodebench/refs/heads/main/Requirements/requirements-eval.txt
53+
54+
# Ensure the numpy version is compatible with the datasets version
55+
RUN pip install datasets==2.17.0
56+
57+
# Pre-install the dataset
58+
RUN python3 -c "from bigcodebench.data import get_bigcodebench; get_bigcodebench(subset='full'); get_bigcodebench(subset='hard')"
59+
60+
WORKDIR /app
61+
62+
RUN chown -R bigcodebenchuser:bigcodebenchuser /app
63+
64+
RUN chmod -R 777 /app
65+
66+
USER bigcodebenchuser

sandbox-templates/e2b.toml

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
# This is a config for E2B sandbox template.
2+
# You can use template ID (tbjhnhg5e3bd22i8jqgk) or template name (bigcodebench-evaluator) to create a sandbox:
3+
4+
# Python SDK
5+
# from e2b import Sandbox, AsyncSandbox
6+
# sandbox = Sandbox("bigcodebench-evaluator") # Sync sandbox
7+
# sandbox = await AsyncSandbox.create("bigcodebench-evaluator") # Async sandbox
8+
9+
# JS SDK
10+
# import { Sandbox } from 'e2b'
11+
# const sandbox = await Sandbox.create('bigcodebench-evaluator')
12+
13+
team_id = "f317d0d2-ba02-44c5-8b77-e4a2d7830c7c"
14+
dockerfile = "e2b.Dockerfile"
15+
template_name = "bigcodebench-evaluator"
16+
template_id = "tbjhnhg5e3bd22i8jqgk"

setup.cfg

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ install_requires =
3838
google-generativeai>=0.5.4
3939
mistralai>=0.2.0,<1.0.0
4040
openai>=1.11.1
41+
e2b
4142

4243
[options.entry_points]
4344
console_scripts =

0 commit comments

Comments
 (0)