Skip to content

Commit 2873e21

Browse files
committed
feat: add humaneval+ evaluation task
1 parent 3910745 commit 2873e21

2 files changed

Lines changed: 49 additions & 1 deletion

File tree

bigcode_eval/tasks/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
from pprint import pprint
33

44
from . import (apps, codexglue_code_to_text, codexglue_text_to_text, conala,
5-
concode, ds1000, gsm, humaneval, humanevalpack,
5+
concode, ds1000, gsm, humaneval, humanevalplus, humanevalpack,
66
instruct_humaneval, instruct_wizard_humaneval, mbpp, multiple,
77
parity, python_bugs, quixbugs, recode, santacoder_fim)
88

@@ -16,6 +16,7 @@
1616
"concode": concode.Concode,
1717
**ds1000.create_all_tasks(),
1818
**humaneval.create_all_tasks(),
19+
**humanevalplus.create_all_tasks(),
1920
**humanevalpack.create_all_tasks(),
2021
"mbpp": mbpp.MBPP,
2122
"parity": parity.Parity,
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
"""Is Your Code Generated by ChatGPT Really Correct? Rigorous Evaluation of Large Language Models for Code Generation
2+
https://openreview.net/forum?id=1qvx610Cu7
3+
4+
The HumanEval+ dataset is created by the EvalPlus framework which extends the original HumanEval dataset
5+
by adding more automatically generated test cases to each problem.
6+
7+
Homepage: https://github.com/evalplus/evalplus
8+
"""
9+
10+
from bigcode_eval.tasks.humaneval import GeneralHumanEval
11+
12+
_CITATION = """
13+
@inproceedings{evalplus,
14+
title = {Is Your Code Generated by Chat{GPT} Really Correct? Rigorous Evaluation of Large Language Models for Code Generation},
15+
author = {Liu, Jiawei and Xia, Chunqiu Steven and Wang, Yuyao and Zhang, Lingming},
16+
booktitle = {Thirty-seventh Conference on Neural Information Processing Systems},
17+
year = {2023},
18+
url = {https://openreview.net/forum?id=1qvx610Cu7},
19+
}
20+
"""
21+
22+
23+
class GeneralHumanEvalPlus(GeneralHumanEval):
    """HumanEval+ task: identical to HumanEval except that it loads the
    EvalPlus-extended dataset, which augments each original problem with
    additional automatically generated test cases.

    All prompt construction, generation settings, and evaluation logic are
    inherited unchanged from ``GeneralHumanEval``; only the dataset source
    differs.
    """

    # HuggingFace Hub dataset id published by the EvalPlus project.
    DATASET_PATH = "evalplus/humanevalplus"
29+
30+
31+
def create_task(strip_prompt):
    """Build a concrete HumanEval+ task class with the prompt-stripping
    behavior pre-bound.

    :param strip_prompt: forwarded as the first positional argument of
        ``GeneralHumanEvalPlus.__init__``; controls whether prompts are
        stripped before generation.
    :return: a ``GeneralHumanEvalPlus`` subclass whose constructor only
        takes the remaining keyword arguments.
    """

    class HumanEvalPlus(GeneralHumanEvalPlus):
        """HumanEval+ task specialized for one prompt-stripping mode."""

        def __init__(self, **kwargs):
            # Inject the captured strip_prompt flag ahead of caller kwargs.
            super().__init__(strip_prompt, **kwargs)

    return HumanEvalPlus
37+
38+
39+
def create_all_tasks():
    """Create the registry entries for the HumanEval+ task variants.

    :return: mapping of task name to task class,
        i.e. {"humanevalplus": Task, "humanevalplus-unstripped": Task}.
        The default variant strips the prompt before generation; the
        "-unstripped" variant leaves it untouched.
    """
    return {
        "humanevalplus": create_task(True),
        "humanevalplus-unstripped": create_task(False),
    }

0 commit comments

Comments
 (0)