Skip to content

Commit 2873e21

Browse files
committed
feat: add humaneval+ evaluation task
1 parent 3910745 commit 2873e21

2 files changed

Lines changed: 49 additions & 1 deletion

File tree

bigcode_eval/tasks/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
from pprint import pprint
33

44
from . import (apps, codexglue_code_to_text, codexglue_text_to_text, conala,
5-
concode, ds1000, gsm, humaneval, humanevalpack,
5+
concode, ds1000, gsm, humaneval, humanevalplus, humanevalpack,
66
instruct_humaneval, instruct_wizard_humaneval, mbpp, multiple,
77
parity, python_bugs, quixbugs, recode, santacoder_fim)
88

@@ -16,6 +16,7 @@
1616
"concode": concode.Concode,
1717
**ds1000.create_all_tasks(),
1818
**humaneval.create_all_tasks(),
19+
**humanevalplus.create_all_tasks(),
1920
**humanevalpack.create_all_tasks(),
2021
"mbpp": mbpp.MBPP,
2122
"parity": parity.Parity,
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
"""Is Your Code Generated by ChatGPT Really Correct? Rigorous Evaluation of Large Language Models for Code Generation
2+
https://openreview.net/forum?id=1qvx610Cu7
3+
4+
The HumanEval+ dataset is created by the EvalPlus framework which extends the original HumanEval dataset
5+
by adding more automatically generated test cases to each problem.
6+
7+
Homepage: https://github.com/evalplus/evalplus
8+
"""
9+
10+
from bigcode_eval.tasks.humaneval import GeneralHumanEval
11+
12+
_CITATION = """
13+
@inproceedings{evalplus,
14+
title = {Is Your Code Generated by Chat{GPT} Really Correct? Rigorous Evaluation of Large Language Models for Code Generation},
15+
author = {Liu, Jiawei and Xia, Chunqiu Steven and Wang, Yuyao and Zhang, Lingming},
16+
booktitle = {Thirty-seventh Conference on Neural Information Processing Systems},
17+
year = {2023},
18+
url = {https://openreview.net/forum?id=1qvx610Cu7},
19+
}
20+
"""
21+
22+
23+
class GeneralHumanEvalPlus(GeneralHumanEval):
    """HumanEval+ task: identical to HumanEval except that it loads the
    EvalPlus-extended dataset, which augments each original problem with
    additional automatically generated test cases.

    All prompt construction, generation settings, and evaluation logic are
    inherited unchanged from ``GeneralHumanEval``; only the dataset source
    differs.
    """

    # HuggingFace Hub dataset id published by the EvalPlus project.
    DATASET_PATH = "evalplus/humanevalplus"
29+
30+
31+
def create_task(strip_prompt):
    """Build a concrete HumanEval+ task class with the prompt-stripping
    behavior pre-bound.

    :param strip_prompt: forwarded as the first positional argument of
        ``GeneralHumanEvalPlus.__init__``; controls whether prompts are
        stripped before generation.
    :return: a ``GeneralHumanEvalPlus`` subclass whose constructor only
        takes the remaining keyword arguments.
    """

    class HumanEvalPlus(GeneralHumanEvalPlus):
        """HumanEval+ task specialized for one prompt-stripping mode."""

        def __init__(self, **kwargs):
            # Inject the captured strip_prompt flag ahead of caller kwargs.
            super().__init__(strip_prompt, **kwargs)

    return HumanEvalPlus
37+
38+
39+
def create_all_tasks():
    """Create the registry entries for the HumanEval+ task variants.

    :return: mapping of task name to task class,
        i.e. {"humanevalplus": Task, "humanevalplus-unstripped": Task}.
        The default variant strips the prompt before generation; the
        "-unstripped" variant leaves it untouched.
    """
    return {
        "humanevalplus": create_task(True),
        "humanevalplus-unstripped": create_task(False),
    }

0 commit comments

Comments
 (0)