@@ -121,7 +121,7 @@ def evaluate(
121121 no_execute : bool = False ,
122122 execution : str = "e2b" , # "e2b", "gradio", "local"
123123 selective_evaluate : str = "" ,
124- e2b_endpoint : str = "bigcodebench-evaluator " ,
124+ e2b_endpoint : str = "bigcodebench_evaluator " ,
125125 gradio_endpoint : str = "https://bigcode-bigcodebench-evaluator.hf.space/" ,
126126 pass_k : str = "1,5,10" ,
127127 save_pass_rate : bool = True ,
@@ -135,7 +135,6 @@ def evaluate(
135135 no_gt : bool = False ,
136136 ** model_kwargs ,
137137):
138-
139138 if not samples and model_kwargs :
140139 samples = run_codegen (
141140 split = split ,
@@ -182,31 +181,32 @@ def evaluate(
182181 failed_tasks = pass_at_k ["failed_tasks" ]
183182
184183 elif execution == "e2b" :
185- sandbox = Sandbox (e2b_endpoint , timeout = 60 * 10 )
186-
184+ sandbox = Sandbox (e2b_endpoint , api_key = os . environ [ "E2B_API_KEY" ], timeout = 60 * 60 )
185+
187186 # upload file to sandbox
188187 with open (samples , "r" ) as file :
189188 sandbox .files .write (samples , file )
190189
191190 # run the evaluation
192- sandbox . commands . run ( "python3 -m bigcodebench.evaluate \
193- --split {} --subset {} --samples {} \
194- --pass_k { } --save_pass_rate { } --calibrated {} \
195- --parallel { } --min_time_limit { } --max_as_limit {} \
196- --max_data_limit { } --max_stack_limit { } --check_gt_only {} --no_gt {} \
197- " . format ( split , subset , samples , pass_k , save_pass_rate , calibrated , parallel ,
198- min_time_limit , max_as_limit , max_data_limit , max_stack_limit , check_gt_only , no_gt ) )
191+ print ( f"Command run in sandbox { e2b_endpoint } " )
192+ sandbox . commands . run ( "bigcodebench.evaluate --execution 'local' "
193+ f"--split { split } --subset { subset } --samples { samples } "
194+ f"--pass_k { pass_k } --save_pass_rate { save_pass_rate } --calibrated { calibrated } "
195+ f"--parallel { parallel } --selective_evaluate { selective_evaluate } --min_time_limit { min_time_limit } "
196+ f"--max_as_limit { max_as_limit } --max_data_limit { max_data_limit } --max_stack_limit { max_stack_limit } "
197+ f"--check_gt_only { check_gt_only } --no_gt { no_gt } " , on_stderr = lambda x : print ( x ), on_stdout = lambda x : print ( x ), timeout = 60 * 50 )
199198
200- # download the results
201- content = sandbox .files .read (result_path )
202- with open (result_path , "w" ) as file :
203- file .write (content )
199+ if not check_gt_only :
200+ # download the results
201+ content = sandbox .files .read (result_path )
202+ with open (result_path , "w" ) as file :
203+ file .write (content )
204204
205205 else :
206206
207207 pass_at_k = dict ()
208208
209- pass_k = [int (k ) for k in pass_k .split ("," )]
209+ passk = [int (k ) for k in pass_k .split ("," )]
210210
211211 if parallel < 1 :
212212 n_workers = max (1 , multiprocessing .cpu_count () // 2 )
@@ -350,7 +350,7 @@ def stucking_checker():
350350
351351 pass_at_k .update ({
352352 f"pass@{ k } " : estimate_pass_at_k (total , base_correct , k ).mean ()
353- for k in pass_k
353+ for k in passk
354354 if total .min () >= k
355355 })
356356
0 commit comments