Skip to content

Commit 74f8e19

Browse files
committed
Update
1 parent 3c5cb6e commit 74f8e19

4 files changed

Lines changed: 238 additions & 18 deletions

File tree

bigcode_eval/tasks/custom_metrics/beyond_eval.py

Lines changed: 26 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -300,16 +300,14 @@ def run_samples(samples, n_workers=4):
300300
return results
301301

302302

303-
def estimate_at_k(num_samples, num_correct, k):
304-
"""Estimates beyond@k of each problem and returns them in an array."""
305-
306-
def cf(n, k):
307-
return math.gamma(n+1) / (math.gamma(k+1) * (math.gamma(n-k+1)))
303+
def estimate_pass_at_k(num_samples, num_correct, k):
304+
"""Estimates pass@k of each problem and returns them in an array."""
308305

309306
def estimator(n: int, c: int, k: int) -> float:
307+
"""Calculates 1 - comb(n - c, k) / comb(n, k)."""
310308
if n - c < k:
311309
return 1.0
312-
return 1 - cf(n-c, k) / cf(n, k)
310+
return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))
313311

314312
if isinstance(num_samples, int):
315313
num_samples_it = itertools.repeat(num_samples, len(num_correct))
@@ -319,6 +317,17 @@ def estimator(n: int, c: int, k: int) -> float:
319317

320318
return np.array([estimator(int(n), int(c), k) for n, c in zip(num_samples_it, num_correct)])
321319

320+
def estimate_beyond_at_k(runtimes, k):
321+
"""Estimates beyond@k of each problem and returns them in an array."""
322+
323+
def estimator(runtimes: list, k: int) -> float:
324+
"""Calculates sum(runtimes[:k]) / len(runtimes)."""
325+
print(runtimes)
326+
print("============")
327+
return sum(runtimes[:k])/len(runtimes)
328+
329+
return np.array([estimator(r, k) for r in runtimes])
330+
322331
def compute_beyond_eval(generations_list, reference_list, timeout=30):
323332
sandbox = Sandbox()
324333

@@ -353,7 +362,8 @@ def compute_beyond_eval(generations_list, reference_list, timeout=30):
353362
max_runtime = max(runtimes)
354363

355364
# Evaluate generated solutions
356-
t_c, p_c, b_c = 0, 0, 0
365+
t_c, p_c = 0, 0
366+
b_l = list()
357367
difficulty = instance['difficulty']
358368

359369
for index, solution in enumerate(generations):
@@ -373,29 +383,30 @@ def compute_beyond_eval(generations_list, reference_list, timeout=30):
373383
# Calculate Beyond
374384
if result['result'] == "passed":
375385
runtime = result['runtime']
376-
runtime = min(runtime, max_runtime)
377-
runtime = max(runtime, min_runtime)
378-
b_c += (max_runtime - runtime) / (max_runtime - min_runtime)
379386
p_c += 1
380387
else:
381388
runtime = float('inf')
389+
390+
runtime = min(runtime, max_runtime)
391+
runtime = max(runtime, min_runtime)
392+
b_l += [(max_runtime - runtime) / (max_runtime - min_runtime)]
382393

383394
scores[difficulty]['total_c'] += [t_c]
384395
scores[difficulty]['correct_c'] += [p_c]
385-
scores[difficulty]['beyond_c'] += [b_c]
396+
scores[difficulty]['beyond_c'] += [b_l]
386397

387398
scores['Average']['total_c'] += [t_c]
388399
scores['Average']['correct_c'] += [p_c]
389-
scores['Average']['beyond_c'] += [b_c]
400+
scores['Average']['beyond_c'] += [b_l]
390401

391402
results = dict()
392403
for difficulty in ['Easy', "Medium", "Hard", "Average"]:
393404
total = np.array(scores[difficulty]['total_c'])
394405
correct = np.array(scores[difficulty]['correct_c'])
395-
beyond = np.array(scores[difficulty]['beyond_c'])
406+
beyond = scores[difficulty]['beyond_c']
396407

397-
pass_at_k = {f"{difficulty}_pass@{k}": estimate_at_k(total, correct, k).mean() for k in [1,3,5] if (total >= k).all()}
398-
beyond_at_k = {f"{difficulty}_beyond@{k}": estimate_at_k(total, beyond, k).mean() for k in [1,3,5] if (total >= k).all()}
408+
pass_at_k = {f"{difficulty}_pass@{k}": estimate_pass_at_k(total, correct, k).mean() for k in [1,3,5] if (total >= k).all()}
409+
beyond_at_k = {f"{difficulty}_beyond@{k}": estimate_beyond_at_k(beyond, k).mean() for k in [1,3,5] if (total >= k).all()}
399410

400411
results.update(pass_at_k)
401412
results.update(beyond_at_k)

main.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -266,7 +266,7 @@ def main():
266266
model_kwargs = {
267267
"revision": args.revision,
268268
"trust_remote_code": args.trust_remote_code,
269-
"use_auth_token": args.use_auth_token,
269+
"token": args.use_auth_token,
270270
}
271271
if args.load_in_8bit:
272272
print("Loading model in 8bit")
@@ -275,6 +275,8 @@ def main():
275275
elif args.load_in_4bit:
276276
print("Loading model in 4bit")
277277
model_kwargs["load_in_4bit"] = args.load_in_4bit
278+
model_kwargs["torch_dtype"] = torch.float16
279+
model_kwargs["bnb_4bit_compute_dtype"] = torch.float16
278280
model_kwargs["device_map"] = {"": accelerator.process_index}
279281
else:
280282
print(f"Loading model in {args.precision}")
@@ -322,7 +324,7 @@ def main():
322324
args.model,
323325
revision=args.revision,
324326
trust_remote_code=args.trust_remote_code,
325-
use_auth_token=args.use_auth_token,
327+
token=args.use_auth_token,
326328
padding_side="left",
327329
)
328330
else:
@@ -331,7 +333,7 @@ def main():
331333
args.model,
332334
revision=args.revision,
333335
trust_remote_code=args.trust_remote_code,
334-
use_auth_token=args.use_auth_token,
336+
token=args.use_auth_token,
335337
truncation_side="left",
336338
padding_side="right",
337339
)

mercury.sh

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
accelerate launch --main_process_port 29501 main.py \
2+
--model deepseek-ai/deepseek-coder-6.7b-base \
3+
--load_in_4bit \
4+
--limit 256 \
5+
--max_length_generation 1024 \
6+
--tasks mercury \
7+
--n_samples 5 \
8+
--temperature 0.2 \
9+
--batch_size 6 \
10+
--allow_code_execution
11+
12+
accelerate launch --main_process_port 29502 main.py \
13+
--model deepseek-ai/deepseek-coder-6.7b-instruct \
14+
--load_in_4bit \
15+
--limit 256 \
16+
--max_length_generation 1024 \
17+
--tasks mercury \
18+
--n_samples 5 \
19+
--temperature 0.2 \
20+
--batch_size 6 \
21+
--allow_code_execution
22+

playground.ipynb

Lines changed: 185 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,185 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": 2,
6+
"metadata": {},
7+
"outputs": [
8+
{
9+
"data": {
10+
"application/vnd.jupyter.widget-view+json": {
11+
"model_id": "070e463eddd94abf8d5d2384fcd7674a",
12+
"version_major": 2,
13+
"version_minor": 0
14+
},
15+
"text/plain": [
16+
"tokenizer_config.json: 0%| | 0.00/749 [00:00<?, ?B/s]"
17+
]
18+
},
19+
"metadata": {},
20+
"output_type": "display_data"
21+
},
22+
{
23+
"data": {
24+
"application/vnd.jupyter.widget-view+json": {
25+
"model_id": "7122460fac9643cda39186565c42eeb4",
26+
"version_major": 2,
27+
"version_minor": 0
28+
},
29+
"text/plain": [
30+
"tokenizer.model: 0%| | 0.00/500k [00:00<?, ?B/s]"
31+
]
32+
},
33+
"metadata": {},
34+
"output_type": "display_data"
35+
},
36+
{
37+
"data": {
38+
"application/vnd.jupyter.widget-view+json": {
39+
"model_id": "b51157e2cc5f436cb49b972b4bd23ecc",
40+
"version_major": 2,
41+
"version_minor": 0
42+
},
43+
"text/plain": [
44+
"tokenizer.json: 0%| | 0.00/1.84M [00:00<?, ?B/s]"
45+
]
46+
},
47+
"metadata": {},
48+
"output_type": "display_data"
49+
},
50+
{
51+
"data": {
52+
"application/vnd.jupyter.widget-view+json": {
53+
"model_id": "cf661fb7a0224e68be5f5d1255b608c2",
54+
"version_major": 2,
55+
"version_minor": 0
56+
},
57+
"text/plain": [
58+
"special_tokens_map.json: 0%| | 0.00/411 [00:00<?, ?B/s]"
59+
]
60+
},
61+
"metadata": {},
62+
"output_type": "display_data"
63+
},
64+
{
65+
"data": {
66+
"application/vnd.jupyter.widget-view+json": {
67+
"model_id": "06601ec5d98847809cd3d4d1f0af3fb5",
68+
"version_major": 2,
69+
"version_minor": 0
70+
},
71+
"text/plain": [
72+
"config.json: 0%| | 0.00/588 [00:00<?, ?B/s]"
73+
]
74+
},
75+
"metadata": {},
76+
"output_type": "display_data"
77+
},
78+
{
79+
"data": {
80+
"application/vnd.jupyter.widget-view+json": {
81+
"model_id": "843d8e08d9a94b268c8f7c7df3030765",
82+
"version_major": 2,
83+
"version_minor": 0
84+
},
85+
"text/plain": [
86+
"model.safetensors.index.json: 0%| | 0.00/37.6k [00:00<?, ?B/s]"
87+
]
88+
},
89+
"metadata": {},
90+
"output_type": "display_data"
91+
},
92+
{
93+
"data": {
94+
"application/vnd.jupyter.widget-view+json": {
95+
"model_id": "cefc9c5dbd1b4bc2b9a2a903192b33f8",
96+
"version_major": 2,
97+
"version_minor": 0
98+
},
99+
"text/plain": [
100+
"Downloading shards: 0%| | 0/7 [00:00<?, ?it/s]"
101+
]
102+
},
103+
"metadata": {},
104+
"output_type": "display_data"
105+
},
106+
{
107+
"data": {
108+
"application/vnd.jupyter.widget-view+json": {
109+
"model_id": "a5bedd3959124f1f8875178b13450682",
110+
"version_major": 2,
111+
"version_minor": 0
112+
},
113+
"text/plain": [
114+
"model-00001-of-00007.safetensors: 0%| | 0.00/9.85G [00:00<?, ?B/s]"
115+
]
116+
},
117+
"metadata": {},
118+
"output_type": "display_data"
119+
}
120+
],
121+
"source": [
122+
"# Load model directly\n",
123+
"from transformers import AutoTokenizer, AutoModelForCausalLM\n",
124+
"\n",
125+
"tokenizer = AutoTokenizer.from_pretrained(\"codellama/CodeLlama-34b-hf\")\n",
126+
"model = AutoModelForCausalLM.from_pretrained(\"codellama/CodeLlama-34b-hf\")"
127+
]
128+
},
129+
{
130+
"cell_type": "code",
131+
"execution_count": null,
132+
"metadata": {},
133+
"outputs": [],
134+
"source": [
135+
"from transformers import AutoTokenizer\n",
136+
"import transformers\n",
137+
"import torch\n",
138+
"\n",
139+
"model = \"codellama/CodeLlama-34b-hf\"\n",
140+
"\n",
141+
"tokenizer = AutoTokenizer.from_pretrained(model)\n",
142+
"pipeline = transformers.pipeline(\n",
143+
" \"text-generation\",\n",
144+
" model=model,\n",
145+
" torch_dtype=torch.float16,\n",
146+
" device_map=\"auto\",\n",
147+
")\n",
148+
"\n",
149+
"sequences = pipeline(\n",
150+
" 'import socket\\n\\ndef ping_exponential_backoff(host: str):',\n",
151+
" do_sample=True,\n",
152+
" top_k=10,\n",
153+
" temperature=0.1,\n",
154+
" top_p=0.95,\n",
155+
" num_return_sequences=1,\n",
156+
" eos_token_id=tokenizer.eos_token_id,\n",
157+
" max_length=200,\n",
158+
")\n",
159+
"for seq in sequences:\n",
160+
" print(f\"Result: {seq['generated_text']}\")\n"
161+
]
162+
}
163+
],
164+
"metadata": {
165+
"kernelspec": {
166+
"display_name": "workspace",
167+
"language": "python",
168+
"name": "python3"
169+
},
170+
"language_info": {
171+
"codemirror_mode": {
172+
"name": "ipython",
173+
"version": 3
174+
},
175+
"file_extension": ".py",
176+
"mimetype": "text/x-python",
177+
"name": "python",
178+
"nbconvert_exporter": "python",
179+
"pygments_lexer": "ipython3",
180+
"version": "3.9.16"
181+
}
182+
},
183+
"nbformat": 4,
184+
"nbformat_minor": 2
185+
}

0 commit comments

Comments
 (0)