
Commit 7117b47

update
1 parent 74f8e19 commit 7117b47

15 files changed: 1192 additions & 160 deletions
CodeLlama-7b-hf-mercury-result.json

Lines changed: 69 additions & 0 deletions
@@ -0,0 +1,69 @@
+{
+    "mercury": {
+        "Easy_pass@1": 0.5568181818181818,
+        "Easy_pass@3": 0.7022727272727273,
+        "Easy_pass@5": 0.7613636363636364,
+        "Easy_beyond@1": 0.3868611560557632,
+        "Easy_beyond@3": 0.41686202322642835,
+        "Easy_beyond@5": 0.4255333541664179,
+        "Medium_pass@1": 0.4172839506172839,
+        "Medium_pass@3": 0.6061728395061728,
+        "Medium_pass@5": 0.6666666666666666,
+        "Medium_beyond@1": 0.36229925158807574,
+        "Medium_beyond@3": 0.32596671221913265,
+        "Medium_beyond@5": 0.30994619901730974,
+        "Hard_pass@1": 0.12873563218390804,
+        "Hard_pass@3": 0.2206896551724138,
+        "Hard_pass@5": 0.25287356321839083,
+        "Hard_beyond@1": 0.07089000795798558,
+        "Hard_beyond@3": 0.08270522875441813,
+        "Hard_beyond@5": 0.08881214320311258,
+        "Average_pass@1": 0.3671875,
+        "Average_pass@3": 0.508203125,
+        "Average_pass@5": 0.55859375,
+        "Average_beyond@1": 0.27170879610892984,
+        "Average_beyond@3": 0.274541080606679,
+        "Average_beyond@5": 0.27452825681920967
+    },
+    "config": {
+        "prefix": "",
+        "do_sample": true,
+        "temperature": 0.2,
+        "top_k": 0,
+        "top_p": 0.95,
+        "n_samples": 5,
+        "eos": "<|endoftext|>",
+        "seed": 0,
+        "model": "codellama/CodeLlama-7b-hf",
+        "modeltype": "causal",
+        "peft_model": null,
+        "revision": null,
+        "use_auth_token": false,
+        "trust_remote_code": false,
+        "tasks": "mercury",
+        "instruction_tokens": null,
+        "batch_size": 10,
+        "max_length_generation": 2048,
+        "precision": "fp32",
+        "load_in_8bit": false,
+        "load_in_4bit": true,
+        "left_padding": false,
+        "limit": null,
+        "limit_start": 0,
+        "save_every_k_tasks": -1,
+        "postprocess": true,
+        "allow_code_execution": true,
+        "generation_only": false,
+        "load_generations_path": null,
+        "load_data_path": null,
+        "metric_output_path": "CodeLlama-7b-hf-mercury-result.json",
+        "save_generations": true,
+        "load_generations_intermediate_paths": null,
+        "save_generations_path": "generations.json",
+        "save_references": false,
+        "save_references_path": "references.json",
+        "prompt": "prompt",
+        "max_memory_per_gpu": null,
+        "check_references": false
+    }
+}
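With n_samples set to 5, only k in {1, 3, 5} clear the `(total >= k).all()` guard in `compute_beyond_eval` (see the beyond_eval.py diff below), which is why exactly pass@1/3/5 are reported. For intuition, here is a minimal sketch of the unbiased pass@k estimator that `estimate_pass_at_k` implements (the 1 - C(n-c, k) / C(n, k) formula named in its docstring; the example counts are made up):

import numpy as np

def pass_at_k(n: int, c: int, k: int) -> float:
    """Unbiased pass@k: 1 - C(n - c, k) / C(n, k)."""
    if n - c < k:
        return 1.0  # every size-k draw contains a passing sample
    return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))

# A problem where 1 of 5 generations passes contributes:
print(pass_at_k(5, 1, 1))  # 0.2
print(pass_at_k(5, 1, 3))  # 0.6
print(pass_at_k(5, 1, 5))  # 1.0

The reported values are these per-problem estimates averaged over the problems in each difficulty bucket.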

CodeQwen1.5-7B-mercury-generations_mercury.json

Lines changed: 1 addition & 0 deletions
Large diffs are not rendered by default.

CodeQwen1.5-7B-mercury-result.json

Lines changed: 69 additions & 0 deletions
@@ -0,0 +1,69 @@
+{
+    "mercury": {
+        "Easy_pass@1": 0.7136363636363636,
+        "Easy_pass@3": 0.7363636363636363,
+        "Easy_pass@5": 0.7386363636363636,
+        "Easy_beyond@1": 0.5346616692949514,
+        "Easy_beyond@3": 0.543085393353663,
+        "Easy_beyond@5": 0.5445871487558871,
+        "Medium_pass@1": 0.6938271604938272,
+        "Medium_pass@3": 0.7444444444444445,
+        "Medium_pass@5": 0.7530864197530864,
+        "Medium_beyond@1": 0.5439194762152021,
+        "Medium_beyond@3": 0.5416128864524683,
+        "Medium_beyond@5": 0.549752418636262,
+        "Hard_pass@1": 0.5218390804597701,
+        "Hard_pass@3": 0.6505747126436782,
+        "Hard_pass@5": 0.6781609195402298,
+        "Hard_beyond@1": 0.4255225142333653,
+        "Hard_beyond@3": 0.4055039346171537,
+        "Hard_beyond@5": 0.40262274727501585,
+        "Average_pass@1": 0.6421874999999999,
+        "Average_pass@3": 0.7097656250000001,
+        "Average_pass@5": 0.72265625,
+        "Average_beyond@1": 0.5005006375378508,
+        "Average_beyond@3": 0.4958632840994711,
+        "Average_beyond@5": 0.49797575786320986
+    },
+    "config": {
+        "prefix": "",
+        "do_sample": true,
+        "temperature": 0.2,
+        "top_k": 0,
+        "top_p": 0.95,
+        "n_samples": 5,
+        "eos": "<|endoftext|>",
+        "seed": 0,
+        "model": "Qwen/CodeQwen1.5-7B",
+        "modeltype": "causal",
+        "peft_model": null,
+        "revision": null,
+        "use_auth_token": false,
+        "trust_remote_code": false,
+        "tasks": "mercury",
+        "instruction_tokens": null,
+        "batch_size": 5,
+        "max_length_generation": 2048,
+        "precision": "fp32",
+        "load_in_8bit": true,
+        "load_in_4bit": false,
+        "left_padding": false,
+        "limit": null,
+        "limit_start": 0,
+        "save_every_k_tasks": -1,
+        "postprocess": true,
+        "allow_code_execution": true,
+        "generation_only": false,
+        "load_generations_path": null,
+        "load_data_path": null,
+        "metric_output_path": "CodeQwen1.5-7B-mercury-result.json",
+        "save_generations": true,
+        "load_generations_intermediate_paths": null,
+        "save_generations_path": "CodeQwen1.5-7B-mercury-generations.json",
+        "save_references": false,
+        "save_references_path": "references.json",
+        "prompt": "prompt",
+        "max_memory_per_gpu": null,
+        "check_references": false
+    }
+}
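The two runs are not perfectly matched (CodeLlama-7b-hf was loaded in 4-bit with batch_size 10, CodeQwen1.5-7B in 8-bit with batch_size 5; temperature 0.2 and n_samples 5 are identical), but the headline numbers are easy to pull side by side. A quick hypothetical snippet over the two result files above:

import json

for path in ("CodeLlama-7b-hf-mercury-result.json",
             "CodeQwen1.5-7B-mercury-result.json"):
    with open(path) as f:
        metrics = json.load(f)["mercury"]
    # Average_pass@1: 0.367 vs 0.642; Average_beyond@1: 0.272 vs 0.501
    print(path, metrics["Average_pass@1"], metrics["Average_beyond@1"])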

bigcode_eval/generation.py

Lines changed: 4 additions & 2 deletions
@@ -37,7 +37,6 @@ def __init__(self, input_length, multiplier):
     def __call__(self, input_ids, scores, **kwargs):
         """Returns true if generated sequence is too long."""
         return input_ids.shape[1] > int(self.input_length * self.multiplier)
-
 
 def parallel_generations(
     task,
@@ -108,6 +107,7 @@ def parallel_generations(
     print(f"number of problems for this task is {n_tasks}")
     n_copies = ceil(args.n_samples / args.batch_size)
 
+    print("TokenizedDataset...")
     ds_tokenized = TokenizedDataset(
         task,
         dataset,
@@ -137,7 +137,8 @@ def parallel_generations(
     else:
         # model.to() is not supported for 8bit and 4bit models
         model, ds_loader = accelerator.prepare(model, ds_loader)
-
+
+    print("complete_code...")
     generations = complete_code(
         task,
         accelerator,
@@ -156,4 +157,5 @@
         intermediate_save_generations_path=intermediate_save_generations_path,
         **gen_kwargs,
     )
+
     return generations

bigcode_eval/tasks/custom_metrics/beyond_eval.py

Lines changed: 19 additions & 23 deletions
@@ -291,15 +291,13 @@ def run_samples(samples, n_workers=4):
             args = (sample,)
             future = executor.submit(Sandbox.run_sample, *args)
             futures.append(future)
-            n_samples += 1
 
         for future in tqdm(as_completed(futures), total=len(futures), desc='Reading futures'):
             result = future.result()
             results.append(result)
 
     return results
 
-
 def estimate_pass_at_k(num_samples, num_correct, k):
     """Estimates pass@k of each problem and returns them in an array."""
 
@@ -317,18 +315,10 @@ def estimator(n: int, c: int, k: int) -> float:
 
     return np.array([estimator(int(n), int(c), k) for n, c in zip(num_samples_it, num_correct)])
 
-def estimate_beyond_at_k(runtimes, k):
-    """Estimates pass@k of each problem and returns them in an array."""
-
-    def estimator(runtimes: list, k: int) -> float:
-        """Calculates 1 - comb(n - c, k) / comb(n, k)."""
-        print(runtimes)
-        print("============")
-        return sum(runtimes[:k])/len(runtimes)
-
-    return np.array([estimator(r, k) for r in runtimes])
+def estimate_beyond_at_k(beyonds, k):
+    return sum([sum(b[:k]) / k for b in beyonds]) / len(beyonds)
 
-def compute_beyond_eval(generations_list, reference_list, timeout=30):
+def compute_beyond_eval(generations_list, reference_list, timeout=10):
     sandbox = Sandbox()
 
     scores = {
@@ -338,10 +328,10 @@ def compute_beyond_eval(generations_list, reference_list, timeout=30):
         "Average": dict(total_c=list(), correct_c=list(), beyond_c=list()),
     }
 
-    for generations, instance in tqdm(zip(generations_list, reference_list), total=len(generations_list)):
+    for generations, instance in tqdm(zip(generations_list, reference_list), total=len(generations_list), desc='compute_beyond_eval'):
         # Construct runtime distribution from sample solutions
         runtimes = list()
-        for index, solution in enumerate(instance['solutions']):
+        for index, solution in tqdm(enumerate(instance['solutions']), desc="Construct runtime distribution from sample solutions"):
             sample = {
                 "solution": solution['solution'],
                 "convert_offline": instance['convert_offline'],
@@ -366,23 +356,24 @@ def compute_beyond_eval(generations_list, reference_list, timeout=30):
         b_l = list()
         difficulty = instance['difficulty']
 
-        for index, solution in enumerate(generations):
+        for index, solution in tqdm(enumerate(generations), desc="generation execution", total=len(generations)):
             sample = {
-                "solution": instance['prompt'] + solution,
+                "solution": solution,
                 "convert_offline": instance['convert_offline'],
                 "evaluate_offline": instance['evaluate_offline'],
                 "entry_point": instance['entry_point'],
                 "test_cases": json.loads(instance['test_cases']),
                 "solution_index": index,
                 "timeout": timeout,
             }
-
-            result = sandbox.run_sample(sample)
+
+            results = [sandbox.run_sample(sample) for _ in range(3)]
+            print(results[0])
             t_c += 1
 
             # Calculate Beyond
-            if result['result'] == "passed":
-                runtime = result['runtime']
+            if results[0]['result'] == "passed":
+                runtime = sum([r['runtime'] for r in results]) / len(results)
                 p_c += 1
             else:
                 runtime = float('inf')
@@ -398,15 +389,20 @@
         scores['Average']['total_c'] += [t_c]
         scores['Average']['correct_c'] += [p_c]
         scores['Average']['beyond_c'] += [b_l]
+
+        print(f'total: {t_c}')
+        print(f'correct: {p_c}')
+        print(f'beyond: {b_l}')
+        print("-" * 60)
 
     results = dict()
     for difficulty in ['Easy', "Medium", "Hard", "Average"]:
         total = np.array(scores[difficulty]['total_c'])
         correct = np.array(scores[difficulty]['correct_c'])
         beyond = scores[difficulty]['beyond_c']
 
-        pass_at_k = {f"{difficulty}_pass@{k}": estimate_pass_at_k(total, correct, k).mean() for k in [1,3,5] if (total >= k).all()}
-        beyond_at_k = {f"{difficulty}_beyond@{k}": estimate_beyond_at_k(beyond, k).mean() for k in [1,3,5] if (total >= k).all()}
+        pass_at_k = {f"{difficulty}_pass@{k}": estimate_pass_at_k(total, correct, k).mean() for k in [1,3,5,10,15,20,30,50,100] if (total >= k).all()}
+        beyond_at_k = {f"{difficulty}_beyond@{k}": estimate_beyond_at_k(beyond, k) for k in [1,3,5,10,15,20,30,50,100] if (total >= k).all()}
 
         results.update(pass_at_k)
         results.update(beyond_at_k)
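The rewritten `estimate_beyond_at_k` drops the old per-problem estimator in favor of a plain average: mean of the first k beyond scores per problem, then a mean across problems. A self-contained check with made-up beyond scores (a failed generation presumably contributes 0.0, its runtime being set to float('inf') above):

def estimate_beyond_at_k(beyonds, k):
    # per problem: mean of the first k beyond scores; then average problems
    return sum([sum(b[:k]) / k for b in beyonds]) / len(beyonds)

beyonds = [
    [0.8, 0.6, 0.0, 0.5, 0.7],  # hypothetical problem 1, 5 generations
    [0.0, 0.0, 0.4, 0.0, 0.3],  # hypothetical problem 2
]
print(estimate_beyond_at_k(beyonds, 1))  # (0.8 + 0.0) / 2 = 0.4
print(estimate_beyond_at_k(beyonds, 5))  # (0.52 + 0.14) / 2 = 0.33

Unlike pass@k, this is order-dependent (it always takes the first k samples) rather than an unbiased estimate over all size-k subsets. Note also that the commit now averages each passing generation's runtime over three sandbox runs before it enters the beyond calculation.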

bigcode_eval/tasks/mercury.py

Lines changed: 36 additions & 4 deletions
@@ -6,7 +6,8 @@
 
 Homepage: https://github.com/Elfsong/Mercury
 """
-
+import os
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
 
 from bigcode_eval.base import Task
 from bigcode_eval.tasks.custom_metrics.beyond_eval import compute_beyond_eval
@@ -20,7 +21,6 @@
 }
 """
 
-
 class Mercury(Task):
     """
     A task represents an entire benchmark including its dataset, problems,
@@ -31,18 +31,50 @@ class Mercury(Task):
 
     def __init__(self, prompt):
         super().__init__(
-            stop_words=["\nclass", "\ndef", "\n#", "\n@", "\nprint", "\nif", "\n```", "<file_sep>", "<|end▁of▁sentence|>"],
+            # stop_words=["\nclass", "\ndef", "\n#", "\n@", "\nprint", "\nif", "\n```", "<file_sep>", "<|end▁of▁sentence|>"],
+            stop_words=["\nclass", "\ndef", "\n#", "\n@", "\nprint", "\nif", "\n```", "<file_sep>", "<|end▁of▁sentence|>", "\n###", "\n\n\n\n\n", "<|endoftext|>"],
             requires_execution=True,
         )
         self.prompt = prompt
 
     def get_dataset(self):
         """Returns dataset for the task or an iterable of any object, that get_prompt can handle"""
         return self.dataset["eval"]
+
+    @staticmethod
+    def prompt_generate(question_content, starter_code):
+        examples_json = {
+            "question": "You are given a 0-indexed array of positive integers nums. Find the number of triplets (i, j, k) that meet the following conditions:\n\n0 <= i < j < k < nums.length\nnums[i], nums[j], and nums[k] are pairwise distinct.\n\t\nIn other words, nums[i] != nums[j], nums[i] != nums[k], and nums[j] != nums[k].\n\n\n\nReturn the number of triplets that meet the conditions.\n \nExample 1:\n\nInput: nums = [4,4,2,4,3]\nOutput: 3\nExplanation: The following triplets meet the conditions:\n- (0, 2, 4) because 4 != 2 != 3\n- (1, 2, 4) because 4 != 2 != 3\n- (2, 3, 4) because 2 != 4 != 3\nSince there are 3 triplets, we return 3.\nNote that (2, 0, 4) is not a valid triplet because 2 > 0.\n\nExample 2:\n\nInput: nums = [1,1,1,1,1]\nOutput: 0\nExplanation: No triplets meet the conditions so we return 0.\n\n \nConstraints:\n\n3 <= nums.length <= 100\n1 <= nums[i] <= 1000\n\n",
+            "sample_code": 'class Solution(object):\n def unequalTriplets(self, nums: List[int]) -> int:\n """\n\t:type nums: List[int]\n\t:rtype: int\n\t"""\n \n',
+            "answer": 'class Solution(object):\n def unequalTriplets(self, nums: List[int]) -> int:\n """\n\t:type nums: List[int]\n\t:rtype: int\n\t"""\n \n ans = 0\n n = len(a)\n for i in range(n):\n for j in range(i + 1, n):\n for k in range(j + 1, n):\n ans += len({a[i], a[j], a[k]}) == 3\n return ans'
+        }
+
+        def get_example_prompt(example):
+            prompt = ""
+            prompt += "### Question\n"
+            prompt += example["question"]
+            prompt += "\n\n"
+            if starter_code:
+                prompt += "### Code Prompt\n"
+                prompt += example["sample_code"]
+                prompt += "\n\n"
+            prompt += "### Completion\n"
+            prompt += example["answer"]
+            if example["answer"]:
+                prompt += "\n\n"
+            return prompt
+
+        prompt = ""
+        # one-shot generation example
+        prompt += get_example_prompt(examples_json)
+        # code generation
+        prompt += get_example_prompt({"question": question_content,"sample_code": starter_code,"answer": ""})
+
+        return prompt
 
     def get_prompt(self, doc):
         """Builds the prompt for the LM to generate from."""
-        return f'\'\'\'{doc["pretty_content"][0]}\'\'\'\n{doc["prompt"]}'
+        return self.prompt_generate(doc["pretty_content"][0], doc["prompt"])
 
     def get_reference(self, doc):
         """Builds the reference solutions for the doc (sample from the test dataset)."""

bigcode_eval/utils.py

Lines changed: 3 additions & 2 deletions
@@ -120,7 +120,7 @@ def __iter__(self):
                 "n_copies (n_samples/batch_size) was changed from 1 to 2 because n_tasks isn't proportional to num devices"
             )
 
-        for sample in range(self.n_tasks):
+        for sample in tqdm(range(self.n_tasks), desc="Task Encoding"):
             for _ in range(self.n_copies):
                 if self.has_encoder:
                     yield {
@@ -220,7 +220,6 @@ def _parse_instruction(code, instruction_tokens):
     shift = len("```python")
     return code[idx + shift :]
 
-
 def complete_code(
     task,
     accelerator,
@@ -249,11 +248,13 @@ def complete_code(
     code_gens: List[List[Optional[str]]] = [[] for _ in range(n_tasks)]
     generations = [] if not intermediate_generations else intermediate_generations
     gen_token_dict = defaultdict(list)  # dict of list of generated tokens
+
    for step, batch in tqdm(
        enumerate(dataloader),
        total=math.ceil(
            n_tasks * dataloader.dataset.n_copies / accelerator.num_processes
        ),
+        desc="batch generation",
    ):
        with torch.no_grad():
            if task.stop_words:
