@@ -328,6 +328,12 @@ def compute_beyond_eval(generations_list, reference_list, timeout=10):
328328 "Average" : dict (total_c = list (), correct_c = list (), beyond_c = list ()),
329329 }
330330
331+ errors = {
332+ "Easy" : dict (failed_load = 0 , failed_eval = 0 , failed_cases = 0 , failed_timeout = 0 , failed_error = 0 , passed = 0 ),
333+ "Medium" : dict (failed_load = 0 , failed_eval = 0 , failed_cases = 0 , failed_timeout = 0 , failed_error = 0 , passed = 0 ),
334+ "Hard" : dict (failed_load = 0 , failed_eval = 0 , failed_cases = 0 , failed_timeout = 0 , failed_error = 0 , passed = 0 ),
335+ }
336+
331337 for generations , instance in tqdm (zip (generations_list , reference_list ), total = len (generations_list ), desc = 'compute_beyond_eval' ):
332338 # Construct runtime distribution from sample solutions
333339 runtimes = list ()
@@ -377,6 +383,9 @@ def compute_beyond_eval(generations_list, reference_list, timeout=10):
377383 p_c += 1
378384 else :
379385 runtime = float ('inf' )
386+
387+ # Record error statistics
388+ errors [difficulty ][results [0 ]['result' ]] += 1
380389
381390 runtime = min (runtime , max_runtime )
382391 runtime = max (runtime , min_runtime )
@@ -390,10 +399,10 @@ def compute_beyond_eval(generations_list, reference_list, timeout=10):
390399 scores ['Average' ]['correct_c' ] += [p_c ]
391400 scores ['Average' ]['beyond_c' ] += [b_l ]
392401
393- print (f'total: { t_c } ' )
394- print (f'correct: { p_c } ' )
395- print (f'beyond: { b_l } ' )
396- print ("-" * 60 )
402+ # print(f'total: {t_c}')
403+ # print(f'correct: {p_c}')
404+ # print(f'beyond: {b_l}')
405+ # print("-" * 60)
397406
398407 results = dict ()
399408 for difficulty in ['Easy' , "Medium" , "Hard" , "Average" ]:
@@ -406,6 +415,8 @@ def compute_beyond_eval(generations_list, reference_list, timeout=10):
406415
407416 results .update (pass_at_k )
408417 results .update (beyond_at_k )
418+
419+ results .update (errors )
409420
410421 return results
411422
0 commit comments