update e2b env setup

terryyz · terryyz · commit 62f387c41a4e · 2025-01-22T21:16:25.000+08:00
diff --git a/Docker/Evaluate.Dockerfile b/Docker/Evaluate.Dockerfile
@@ -35,19 +35,29 @@ RUN pip install numpy==1.24.3 pyarrow==14.0.1
 
 RUN cd /bigcodebench && \
-    pip install . --no-deps && \
-    pip install \
-    appdirs>=1.4.4 \
-    fire>=0.6.0 \
-    multipledispatch>=0.6.0 \
-    pqdm>=0.2.0 \
-    tempdir>=0.7.1 \
-    termcolor>=2.0.0 \
-    tqdm>=4.56.0 \
-    tree_sitter_languages>=1.10.2 \
-    tree-sitter==0.21.3 \
-    wget>=3.2 \
-    gradio-client \
-    rich
+    pip install . --no-deps
+# Specifiers are quoted: in shell-form RUN an unquoted ">=x" or "<x" is a redirection, not a version constraint.
+RUN pip install \
+    "appdirs>=1.4.4" \
+    "fire>=0.6.0" \
+    "multipledispatch>=0.6.0" \
+    "pqdm>=0.2.0" \
+    "tempdir>=0.7.1" \
+    "termcolor>=2.0.0" \
+    "tqdm>=4.56.0" \
+    "tree_sitter>=0.22.0" \
+    "tree-sitter-python>=0.21.0" \
+    "wget>=3.2" \
+    transformers \
+    datasets \
+    gradio-client \
+    numpy \
+    rich \
+    "accelerate>=0.30.1" \
+    "anthropic>=0.26.1" \
+    "google-generativeai>=0.5.4" \
+    "mistralai<1.0.0" \
+    "openai>=1.11.1" \
+    e2b
 
 RUN pip install -I --timeout 2000 -r https://raw.githubusercontent.com/bigcode-project/bigcodebench/refs/heads/main/Requirements/requirements-eval.txt
 
diff --git a/bigcodebench/evaluate.py b/bigcodebench/evaluate.py
@@ -121,7 +121,7 @@ def evaluate(
     no_execute: bool = False,
     execution: str = "e2b", # "e2b", "gradio", "local"
     selective_evaluate: str = "",
-    e2b_endpoint: str = "bigcodebench-evaluator",
+    e2b_endpoint: str = "bigcodebench_evaluator",
     gradio_endpoint: str = "https://bigcode-bigcodebench-evaluator.hf.space/",
     pass_k: str = "1,5,10",
     save_pass_rate: bool = True,
@@ -135,7 +135,6 @@ def evaluate(
     no_gt: bool = False,
     **model_kwargs,
 ):  
-    
     if not samples and model_kwargs:
         samples = run_codegen(
             split=split,
@@ -182,31 +181,32 @@ def evaluate(
         failed_tasks = pass_at_k["failed_tasks"]
     
     elif execution == "e2b":
-        sandbox = Sandbox(e2b_endpoint, timeout=60*10)
-        
+        sandbox = Sandbox(e2b_endpoint, api_key=os.environ["E2B_API_KEY"], timeout=60*60)
+
         # upload file to sandbox
         with open(samples, "r") as file:
             sandbox.files.write(samples, file)
         
         # run the evaluation
-        sandbox.commands.run("python3 -m bigcodebench.evaluate \
-                            --split {} --subset {} --samples {} \
-                            --pass_k {} --save_pass_rate {} --calibrated {} \
-                            --parallel {} --min_time_limit {} --max_as_limit {} \
-                            --max_data_limit {} --max_stack_limit {} --check_gt_only {} --no_gt {} \
-                            ".format(split, subset, samples, pass_k, save_pass_rate, calibrated, parallel, 
-                                     min_time_limit, max_as_limit, max_data_limit, max_stack_limit, check_gt_only, no_gt))
+        print(f"Command run in sandbox {e2b_endpoint}")
+        sandbox.commands.run("bigcodebench.evaluate  --execution 'local' "
+                        f"--split {split} --subset {subset} --samples '{samples}' "  # quoted so a path with spaces stays one argument
+                        f"--pass_k {pass_k} --save_pass_rate {save_pass_rate} --calibrated {calibrated} "
+                        f"--parallel {parallel} --selective_evaluate '{selective_evaluate}' --min_time_limit {min_time_limit} "  # quoted so the default "" still reaches the CLI as a value
+                        f"--max_as_limit {max_as_limit} --max_data_limit {max_data_limit} --max_stack_limit {max_stack_limit} "
+                        f"--check_gt_only {check_gt_only} --no_gt {no_gt}", on_stderr=lambda x: print(x), on_stdout=lambda x: print(x), timeout=60*50)
         
-        # download the results
-        content = sandbox.files.read(result_path)
-        with open(result_path, "w") as file:
-            file.write(content)
+        if not check_gt_only:
+            # download the results
+            content = sandbox.files.read(result_path)
+            with open(result_path, "w") as file:
+                file.write(content)
 
     else:
         
         pass_at_k = dict()
 
-        pass_k = [int(k) for k in pass_k.split(",")]
+        passk = [int(k) for k in pass_k.split(",")]
         
         if parallel < 1:
             n_workers = max(1, multiprocessing.cpu_count() // 2)
@@ -350,7 +350,7 @@ def stucking_checker():
 
         pass_at_k.update({
             f"pass@{k}": estimate_pass_at_k(total, base_correct, k).mean()
-            for k in pass_k
+            for k in passk
             if total.min() >= k
         })
 
diff --git a/run.sh b/run.sh
@@ -3,11 +3,13 @@ MODEL=meta-llama/Meta-Llama-3.1-8B-Instruct
 BACKEND=vllm
 NUM_GPU=2
 SPLIT=complete
-SUBSET=hard
+SUBSET=full
+export E2B_API_KEY="${E2B_API_KEY:?set E2B_API_KEY in the environment; never commit the key to the repository}"
 
 bigcodebench.evaluate \
-  --tp $NUM_GPU \
+  --samples bcb_results/gemini-2.0-flash-exp--main--bigcodebench-instruct--google-0-1-sanitized_calibrated.jsonl \
   --model $MODEL \
   --split $SPLIT \
   --subset $SUBSET \
-  --backend $BACKEND
+  --backend $BACKEND \
+  --check_gt_only
diff --git a/sandbox-templates/e2b.Dockerfile b/sandbox-templates/e2b.Dockerfile
@@ -34,33 +34,35 @@ RUN git clone https://github.com/bigcode-project/bigcodebench.git /bigcodebench
 RUN pip install numpy==1.24.3 pyarrow==14.0.1
 
 RUN cd /bigcodebench && \
-    pip install . --no-deps && \
-    pip install \
-    appdirs>=1.4.4 \
-    fire>=0.6.0 \
-    multipledispatch>=0.6.0 \
-    pqdm>=0.2.0 \
-    tempdir>=0.7.1 \
-    termcolor>=2.0.0 \
-    tqdm>=4.56.0 \
-    tree_sitter_languages>=1.10.2 \
-    tree-sitter==0.21.3 \
-    wget>=3.2 \
+    pip install . --no-deps
+# --no-cache-dir: removing /root/.cache/pip in a later layer cannot shrink this one.
+RUN pip install --no-cache-dir --timeout 2000 \
+    appdirs \
+    fire \
+    multipledispatch \
+    pqdm \
+    tempdir \
+    termcolor \
+    tqdm \
+    transformers \
+    tree_sitter \
+    tree-sitter-python \
+    wget \
+    datasets \
     gradio-client \
-    rich
+    numpy \
+    rich \
+    e2b
 
 RUN pip install -I --timeout 2000 -r https://raw.githubusercontent.com/bigcode-project/bigcodebench/refs/heads/main/Requirements/requirements-eval.txt
 
 # Ensure the numpy version is compatible with the datasets version
 RUN pip install datasets==2.17.0
 
-# Pre-install the dataset
-RUN python3 -c "from bigcodebench.data import get_bigcodebench; get_bigcodebench(subset='full'); get_bigcodebench(subset='hard')"
-
 WORKDIR /app
 
 RUN chown -R bigcodebenchuser:bigcodebenchuser /app
 
-RUN chmod -R 777 /app
+RUN chmod -R 777 /app && rm -rf /root/.cache/pip
 
 USER bigcodebenchuser