Skip to content

Commit 62f387c

Browse files
committed
update e2b env setup
1 parent 468eece commit 62f387c

4 files changed

Lines changed: 55 additions & 41 deletions

File tree

Docker/Evaluate.Dockerfile

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -35,19 +35,29 @@ RUN pip install numpy==1.24.3 pyarrow==14.0.1
3535

3636
RUN cd /bigcodebench && \
3737
pip install . --no-deps && \
38-
pip install \
38+
39+
RUN pip install \
3940
appdirs>=1.4.4 \
4041
fire>=0.6.0 \
4142
multipledispatch>=0.6.0 \
4243
pqdm>=0.2.0 \
4344
tempdir>=0.7.1 \
4445
termcolor>=2.0.0 \
4546
tqdm>=4.56.0 \
46-
tree_sitter_languages>=1.10.2 \
47-
tree-sitter==0.21.3 \
47+
tree_sitter>=0.22.0 \
48+
tree-sitter-python>=0.21.0 \
4849
wget>=3.2 \
50+
transformers \
51+
datasets \
4952
gradio-client \
50-
rich
53+
numpy \
54+
rich \
55+
accelerate>=0.30.1 \
56+
anthropic>=0.26.1 \
57+
google-generativeai>=0.5.4 \
58+
mistralai<1.0.0 \
59+
openai>=1.11.1 \
60+
e2b
5161

5262
RUN pip install -I --timeout 2000 -r https://raw.githubusercontent.com/bigcode-project/bigcodebench/refs/heads/main/Requirements/requirements-eval.txt
5363

bigcodebench/evaluate.py

Lines changed: 17 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -121,7 +121,7 @@ def evaluate(
121121
no_execute: bool = False,
122122
execution: str = "e2b", # "e2b", "gradio", "local"
123123
selective_evaluate: str = "",
124-
e2b_endpoint: str = "bigcodebench-evaluator",
124+
e2b_endpoint: str = "bigcodebench_evaluator",
125125
gradio_endpoint: str = "https://bigcode-bigcodebench-evaluator.hf.space/",
126126
pass_k: str = "1,5,10",
127127
save_pass_rate: bool = True,
@@ -135,7 +135,6 @@ def evaluate(
135135
no_gt: bool = False,
136136
**model_kwargs,
137137
):
138-
139138
if not samples and model_kwargs:
140139
samples = run_codegen(
141140
split=split,
@@ -182,31 +181,32 @@ def evaluate(
182181
failed_tasks = pass_at_k["failed_tasks"]
183182

184183
elif execution == "e2b":
185-
sandbox = Sandbox(e2b_endpoint, timeout=60*10)
186-
184+
sandbox = Sandbox(e2b_endpoint, api_key=os.environ["E2B_API_KEY"], timeout=60*60)
185+
187186
# upload file to sandbox
188187
with open(samples, "r") as file:
189188
sandbox.files.write(samples, file)
190189

191190
# run the evaluation
192-
sandbox.commands.run("python3 -m bigcodebench.evaluate \
193-
--split {} --subset {} --samples {} \
194-
--pass_k {} --save_pass_rate {} --calibrated {} \
195-
--parallel {} --min_time_limit {} --max_as_limit {} \
196-
--max_data_limit {} --max_stack_limit {} --check_gt_only {} --no_gt {} \
197-
".format(split, subset, samples, pass_k, save_pass_rate, calibrated, parallel,
198-
min_time_limit, max_as_limit, max_data_limit, max_stack_limit, check_gt_only, no_gt))
191+
print(f"Command run in sandbox {e2b_endpoint}")
192+
sandbox.commands.run("bigcodebench.evaluate --execution 'local' "
193+
f"--split {split} --subset {subset} --samples {samples} "
194+
f"--pass_k {pass_k} --save_pass_rate {save_pass_rate} --calibrated {calibrated} "
195+
f"--parallel {parallel} --selective_evaluate {selective_evaluate} --min_time_limit {min_time_limit} "
196+
f"--max_as_limit {max_as_limit} --max_data_limit {max_data_limit} --max_stack_limit {max_stack_limit} "
197+
f"--check_gt_only {check_gt_only} --no_gt {no_gt}", on_stderr=lambda x: print(x), on_stdout=lambda x: print(x), timeout=60*50)
199198

200-
# download the results
201-
content = sandbox.files.read(result_path)
202-
with open(result_path, "w") as file:
203-
file.write(content)
199+
if not check_gt_only:
200+
# download the results
201+
content = sandbox.files.read(result_path)
202+
with open(result_path, "w") as file:
203+
file.write(content)
204204

205205
else:
206206

207207
pass_at_k = dict()
208208

209-
pass_k = [int(k) for k in pass_k.split(",")]
209+
passk = [int(k) for k in pass_k.split(",")]
210210

211211
if parallel < 1:
212212
n_workers = max(1, multiprocessing.cpu_count() // 2)
@@ -350,7 +350,7 @@ def stucking_checker():
350350

351351
pass_at_k.update({
352352
f"pass@{k}": estimate_pass_at_k(total, base_correct, k).mean()
353-
for k in pass_k
353+
for k in passk
354354
if total.min() >= k
355355
})
356356

run.sh

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,13 @@ MODEL=meta-llama/Meta-Llama-3.1-8B-Instruct
33
BACKEND=vllm
44
NUM_GPU=2
55
SPLIT=complete
6-
SUBSET=hard
6+
SUBSET=full
7+
export E2B_API_KEY="<REDACTED>"  # NOTE(review): a real API key was committed here — revoke it and load the key from the environment or a secrets manager instead of hard-coding it
78

89
bigcodebench.evaluate \
9-
--tp $NUM_GPU \
10+
--samples bcb_results/gemini-2.0-flash-exp--main--bigcodebench-instruct--google-0-1-sanitized_calibrated.jsonl \
1011
--model $MODEL \
1112
--split $SPLIT \
1213
--subset $SUBSET \
13-
--backend $BACKEND
14+
--backend $BACKEND \
15+
--check_gt_only

sandbox-templates/e2b.Dockerfile

Lines changed: 19 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -34,33 +34,35 @@ RUN git clone https://github.com/bigcode-project/bigcodebench.git /bigcodebench
3434
RUN pip install numpy==1.24.3 pyarrow==14.0.1
3535

3636
RUN cd /bigcodebench && \
37-
pip install . --no-deps && \
38-
pip install \
39-
appdirs>=1.4.4 \
40-
fire>=0.6.0 \
41-
multipledispatch>=0.6.0 \
42-
pqdm>=0.2.0 \
43-
tempdir>=0.7.1 \
44-
termcolor>=2.0.0 \
45-
tqdm>=4.56.0 \
46-
tree_sitter_languages>=1.10.2 \
47-
tree-sitter==0.21.3 \
48-
wget>=3.2 \
37+
pip install . --no-deps
38+
39+
RUN pip install --timeout 2000 \
40+
appdirs \
41+
fire \
42+
multipledispatch \
43+
pqdm \
44+
tempdir \
45+
termcolor \
46+
tqdm \
47+
transformers \
48+
tree_sitter \
49+
tree-sitter-python \
50+
wget \
51+
datasets \
4952
gradio-client \
50-
rich
53+
numpy \
54+
rich \
55+
e2b
5156

5257
RUN pip install -I --timeout 2000 -r https://raw.githubusercontent.com/bigcode-project/bigcodebench/refs/heads/main/Requirements/requirements-eval.txt
5358

5459
# Ensure the numpy version is compatible with the datasets version
5560
RUN pip install datasets==2.17.0
5661

57-
# Pre-install the dataset
58-
RUN python3 -c "from bigcodebench.data import get_bigcodebench; get_bigcodebench(subset='full'); get_bigcodebench(subset='hard')"
59-
6062
WORKDIR /app
6163

6264
RUN chown -R bigcodebenchuser:bigcodebenchuser /app
6365

64-
RUN chmod -R 777 /app
66+
RUN chmod -R 777 /app && rm -rf /root/.cache/pip
6567

6668
USER bigcodebenchuser

0 commit comments

Comments
 (0)