1212from typing import Any , Dict , List , Tuple , Optional
1313from warnings import warn
1414from gradio_client import Client , handle_file
15+ from e2b import Sandbox
1516
1617import httpx
1718import numpy as np
@@ -118,9 +119,10 @@ def evaluate(
118119 subset : str ,
119120 samples : Optional [str ] = None ,
120121 no_execute : bool = False ,
121- local_execute : bool = False ,
122+ execution : str = "e2b" , # "e2b", "gradio", "local"
122123 selective_evaluate : str = "" ,
123- remote_execute_api : str = "https://bigcode-bigcodebench-evaluator.hf.space/" ,
124+ e2b_endpoint : str = "bigcodebench-evaluator" ,
125+ gradio_endpoint : str = "https://bigcode-bigcodebench-evaluator.hf.space/" ,
124126 pass_k : str = "1,5,10" ,
125127 save_pass_rate : bool = True ,
126128 calibrated : bool = True ,
@@ -152,10 +154,10 @@ def evaluate(
152154 assert samples .endswith (".jsonl" )
153155 result_path = samples .replace (".jsonl" , "_eval_results.json" )
154156
155- if not local_execute :
157+ if execution == "gradio" :
156158 while True :
157159 try :
158- client = Client (remote_execute_api )
160+ client = Client (gradio_endpoint )
159161 results , pass_at_k = client .predict (
160162 split = split ,
161163 subset = subset ,
@@ -178,7 +180,28 @@ def evaluate(
178180 time .sleep (4 )
179181 gt_pass_rate = pass_at_k ["gt_pass_rate" ]
180182 failed_tasks = pass_at_k ["failed_tasks" ]
183+
184+ elif execution == "e2b" :
185+ sandbox = Sandbox (e2b_endpoint , timeout = 60 * 10 )
186+
187+ # upload file to sandbox
188+ with open (samples , "r" ) as file :
189+ sandbox .files .write (samples , file )
181190
191+ # run the evaluation
192+ sandbox .commands .run ("python3 -m bigcodebench.evaluate \
193+ --split {} --subset {} --samples {} \
194+ --pass_k {} --save_pass_rate {} --calibrated {} \
195+ --parallel {} --min_time_limit {} --max_as_limit {} \
196+ --max_data_limit {} --max_stack_limit {} --check_gt_only {} --no_gt {} \
197+ " .format (split , subset , samples , pass_k , save_pass_rate , calibrated , parallel ,
198+ min_time_limit , max_as_limit , max_data_limit , max_stack_limit , check_gt_only , no_gt ))
199+
200+ # download the results
201+ content = sandbox .files .read (result_path )
202+ with open (result_path , "w" ) as file :
203+ file .write (content )
204+
182205 else :
183206
184207 pass_at_k = dict ()
0 commit comments