Skip to content

Commit 468eece

Browse files
committed
add e2b support
1 parent 342aed8 commit 468eece

7 files changed

Lines changed: 113 additions & 7 deletions

File tree

bigcodebench/evaluate.py

Lines changed: 27 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
from typing import Any, Dict, List, Tuple, Optional
1313
from warnings import warn
1414
from gradio_client import Client, handle_file
15+
from e2b import Sandbox
1516

1617
import httpx
1718
import numpy as np
@@ -118,9 +119,10 @@ def evaluate(
118119
subset: str,
119120
samples: Optional[str] = None,
120121
no_execute: bool = False,
121-
local_execute: bool = False,
122+
execution: str = "e2b", # "e2b", "gradio", "local"
122123
selective_evaluate: str = "",
123-
remote_execute_api: str = "https://bigcode-bigcodebench-evaluator.hf.space/",
124+
e2b_endpoint: str = "bigcodebench-evaluator",
125+
gradio_endpoint: str = "https://bigcode-bigcodebench-evaluator.hf.space/",
124126
pass_k: str = "1,5,10",
125127
save_pass_rate: bool = True,
126128
calibrated: bool = True,
@@ -152,10 +154,10 @@ def evaluate(
152154
assert samples.endswith(".jsonl")
153155
result_path = samples.replace(".jsonl", "_eval_results.json")
154156

155-
if not local_execute:
157+
if execution == "gradio":
156158
while True:
157159
try:
158-
client = Client(remote_execute_api)
160+
client = Client(gradio_endpoint)
159161
results, pass_at_k = client.predict(
160162
split=split,
161163
subset=subset,
@@ -178,7 +180,28 @@ def evaluate(
178180
time.sleep(4)
179181
gt_pass_rate = pass_at_k["gt_pass_rate"]
180182
failed_tasks = pass_at_k["failed_tasks"]
183+
184+
elif execution == "e2b":
185+
sandbox = Sandbox(e2b_endpoint, timeout=60*10)
186+
187+
# upload file to sandbox
188+
with open(samples, "r") as file:
189+
sandbox.files.write(samples, file)
181190

191+
# run the evaluation
192+
sandbox.commands.run("python3 -m bigcodebench.evaluate \
193+
--split {} --subset {} --samples {} \
194+
--pass_k {} --save_pass_rate {} --calibrated {} \
195+
--parallel {} --min_time_limit {} --max_as_limit {} \
196+
--max_data_limit {} --max_stack_limit {} --check_gt_only {} --no_gt {} \
197+
".format(split, subset, samples, pass_k, save_pass_rate, calibrated, parallel,
198+
min_time_limit, max_as_limit, max_data_limit, max_stack_limit, check_gt_only, no_gt))
199+
200+
# download the results
201+
content = sandbox.files.read(result_path)
202+
with open(result_path, "w") as file:
203+
file.write(content)
204+
182205
else:
183206

184207
pass_at_k = dict()

bigcodebench/gen/util/openai_request.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ def make_request(
1717
kwargs["top_p"] = 0.95
1818
kwargs["max_completion_tokens"] = max_tokens
1919
kwargs["temperature"] = temperature
20-
if model.startswith("o1-") or model.startswith("o3-"): # pop top-p and max_completion_tokens
20+
if model.startswith("o1-") or model.startswith("o3-") or model.endswith("-reasoner"): # pop top_p, max_completion_tokens and temperature for reasoning models
2121
kwargs.pop("top_p")
2222
kwargs.pop("max_completion_tokens")
2323
kwargs.pop("temperature")

bigcodebench/generate.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -132,7 +132,7 @@ def run_codegen(
132132
temperature: float = 0.0,
133133
max_new_tokens: int = 1280,
134134
greedy: bool = False,
135-
reasoning_effort: str = "medium", # o1 and o3 only
135+
reasoning_effort: str = "medium",
136136
strip_newlines: bool = False,
137137
direct_completion: bool = False,
138138
resume: bool = True,

bigcodebench/provider/openai.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ def codegen(
2828
tokenizer=None,
2929
) for prompt in prompts]
3030
# use concurrency-based batching for o1/o3 and deepseek models
31-
if self.name.startswith("o1-") or self.name == "deepseek-chat":
31+
if self.name.startswith("o1-") or self.name.startswith("o3-") or self.name.startswith("deepseek"):
3232
return self._codegen_batch_via_concurrency(messages, num_samples)
3333

3434
return self._codegen_api_batch(messages, num_samples)

sandbox-templates/e2b.Dockerfile

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
# Prefer a newer Python, since generated code may rely on newer language features
2+
FROM python:3.10-slim
3+
4+
# install system dependencies: git, g++, python3-tk, archive tools, R, GDAL, and matplotlib build requirements
5+
RUN apt-get update && apt-get install -y \
6+
git \
7+
g++ \
8+
python3-tk \
9+
zip \
10+
unzip \
11+
procps \
12+
r-base \
13+
libgdal-dev \
14+
# dependencies required to build matplotlib
15+
libfreetype6-dev \
16+
libpng-dev \
17+
pkg-config \
18+
python3-dev \
19+
python3-matplotlib \
20+
&& rm -rf /var/lib/apt/lists/*
21+
22+
# upgrade to latest pip
23+
RUN pip install --upgrade pip
24+
25+
# Add a new user "bigcodebenchuser"
26+
RUN adduser --disabled-password --gecos "" bigcodebenchuser
27+
28+
RUN rm -rf /bigcodebench
29+
30+
# Acquire benchmark code to local
31+
ADD "https://api.github.com/repos/bigcode-project/bigcodebench/commits?per_page=1" latest_commit
32+
RUN git clone https://github.com/bigcode-project/bigcodebench.git /bigcodebench
33+
34+
RUN pip install numpy==1.24.3 pyarrow==14.0.1
35+
36+
RUN cd /bigcodebench && \
37+
pip install . --no-deps && \
38+
pip install \
39+
appdirs>=1.4.4 \
40+
fire>=0.6.0 \
41+
multipledispatch>=0.6.0 \
42+
pqdm>=0.2.0 \
43+
tempdir>=0.7.1 \
44+
termcolor>=2.0.0 \
45+
tqdm>=4.56.0 \
46+
tree_sitter_languages>=1.10.2 \
47+
tree-sitter==0.21.3 \
48+
wget>=3.2 \
49+
gradio-client \
50+
rich
51+
52+
RUN pip install -I --timeout 2000 -r https://raw.githubusercontent.com/bigcode-project/bigcodebench/refs/heads/main/Requirements/requirements-eval.txt
53+
54+
# Ensure the numpy version is compatible with the datasets version
55+
RUN pip install datasets==2.17.0
56+
57+
# Pre-install the dataset
58+
RUN python3 -c "from bigcodebench.data import get_bigcodebench; get_bigcodebench(subset='full'); get_bigcodebench(subset='hard')"
59+
60+
WORKDIR /app
61+
62+
RUN chown -R bigcodebenchuser:bigcodebenchuser /app
63+
64+
RUN chmod -R 777 /app
65+
66+
USER bigcodebenchuser

sandbox-templates/e2b.toml

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
# This is a config for E2B sandbox template.
2+
# You can use template ID (tbjhnhg5e3bd22i8jqgk) or template name (bigcodebench-evaluator) to create a sandbox:
3+
4+
# Python SDK
5+
# from e2b import Sandbox, AsyncSandbox
6+
# sandbox = Sandbox("bigcodebench-evaluator") # Sync sandbox
7+
# sandbox = await AsyncSandbox.create("bigcodebench-evaluator") # Async sandbox
8+
9+
# JS SDK
10+
# import { Sandbox } from 'e2b'
11+
# const sandbox = await Sandbox.create('bigcodebench-evaluator')
12+
13+
team_id = "f317d0d2-ba02-44c5-8b77-e4a2d7830c7c"
14+
dockerfile = "e2b.Dockerfile"
15+
template_name = "bigcodebench-evaluator"
16+
template_id = "tbjhnhg5e3bd22i8jqgk"

setup.cfg

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ install_requires =
3838
google-generativeai>=0.5.4
3939
mistralai>=0.2.0,<1.0.0
4040
openai>=1.11.1
41+
e2b
4142

4243
[options.entry_points]
4344
console_scripts =

0 commit comments

Comments
 (0)