
Commit 7117b47

update
1 parent 74f8e19 commit 7117b47

15 files changed: 1192 additions & 160 deletions
CodeLlama-7b-hf-mercury-result.json

Lines changed: 69 additions & 0 deletions
@@ -0,0 +1,69 @@
+{
+    "mercury": {
+        "Easy_pass@1": 0.5568181818181818,
+        "Easy_pass@3": 0.7022727272727273,
+        "Easy_pass@5": 0.7613636363636364,
+        "Easy_beyond@1": 0.3868611560557632,
+        "Easy_beyond@3": 0.41686202322642835,
+        "Easy_beyond@5": 0.4255333541664179,
+        "Medium_pass@1": 0.4172839506172839,
+        "Medium_pass@3": 0.6061728395061728,
+        "Medium_pass@5": 0.6666666666666666,
+        "Medium_beyond@1": 0.36229925158807574,
+        "Medium_beyond@3": 0.32596671221913265,
+        "Medium_beyond@5": 0.30994619901730974,
+        "Hard_pass@1": 0.12873563218390804,
+        "Hard_pass@3": 0.2206896551724138,
+        "Hard_pass@5": 0.25287356321839083,
+        "Hard_beyond@1": 0.07089000795798558,
+        "Hard_beyond@3": 0.08270522875441813,
+        "Hard_beyond@5": 0.08881214320311258,
+        "Average_pass@1": 0.3671875,
+        "Average_pass@3": 0.508203125,
+        "Average_pass@5": 0.55859375,
+        "Average_beyond@1": 0.27170879610892984,
+        "Average_beyond@3": 0.274541080606679,
+        "Average_beyond@5": 0.27452825681920967
+    },
+    "config": {
+        "prefix": "",
+        "do_sample": true,
+        "temperature": 0.2,
+        "top_k": 0,
+        "top_p": 0.95,
+        "n_samples": 5,
+        "eos": "<|endoftext|>",
+        "seed": 0,
+        "model": "codellama/CodeLlama-7b-hf",
+        "modeltype": "causal",
+        "peft_model": null,
+        "revision": null,
+        "use_auth_token": false,
+        "trust_remote_code": false,
+        "tasks": "mercury",
+        "instruction_tokens": null,
+        "batch_size": 10,
+        "max_length_generation": 2048,
+        "precision": "fp32",
+        "load_in_8bit": false,
+        "load_in_4bit": true,
+        "left_padding": false,
+        "limit": null,
+        "limit_start": 0,
+        "save_every_k_tasks": -1,
+        "postprocess": true,
+        "allow_code_execution": true,
+        "generation_only": false,
+        "load_generations_path": null,
+        "load_data_path": null,
+        "metric_output_path": "CodeLlama-7b-hf-mercury-result.json",
+        "save_generations": true,
+        "load_generations_intermediate_paths": null,
+        "save_generations_path": "generations.json",
+        "save_references": false,
+        "save_references_path": "references.json",
+        "prompt": "prompt",
+        "max_memory_per_gpu": null,
+        "check_references": false
+    }
+}
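With n_samples set to 5, only k in {1, 3, 5} clear the `(total >= k).all()` guard in `compute_beyond_eval` (see the beyond_eval.py diff below), which is why exactly pass@1/3/5 are reported. For intuition, here is a minimal sketch of the unbiased pass@k estimator that `estimate_pass_at_k` implements (the 1 - C(n-c, k) / C(n, k) formula named in its docstring; the example counts are made up):

import numpy as np

def pass_at_k(n: int, c: int, k: int) -> float:
    """Unbiased pass@k: 1 - C(n - c, k) / C(n, k)."""
    if n - c < k:
        return 1.0  # every size-k draw contains a passing sample
    return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))

# A problem where 1 of 5 generations passes contributes:
print(pass_at_k(5, 1, 1))  # 0.2
print(pass_at_k(5, 1, 3))  # 0.6
print(pass_at_k(5, 1, 5))  # 1.0

The reported values are these per-problem estimates averaged over the problems in each difficulty bucket.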

CodeQwen1.5-7B-mercury-generations_mercury.json

Lines changed: 1 addition & 0 deletions
Large diffs are not rendered by default.

CodeQwen1.5-7B-mercury-result.json

Lines changed: 69 additions & 0 deletions
@@ -0,0 +1,69 @@
+{
+    "mercury": {
+        "Easy_pass@1": 0.7136363636363636,
+        "Easy_pass@3": 0.7363636363636363,
+        "Easy_pass@5": 0.7386363636363636,
+        "Easy_beyond@1": 0.5346616692949514,
+        "Easy_beyond@3": 0.543085393353663,
+        "Easy_beyond@5": 0.5445871487558871,
+        "Medium_pass@1": 0.6938271604938272,
+        "Medium_pass@3": 0.7444444444444445,
+        "Medium_pass@5": 0.7530864197530864,
+        "Medium_beyond@1": 0.5439194762152021,
+        "Medium_beyond@3": 0.5416128864524683,
+        "Medium_beyond@5": 0.549752418636262,
+        "Hard_pass@1": 0.5218390804597701,
+        "Hard_pass@3": 0.6505747126436782,
+        "Hard_pass@5": 0.6781609195402298,
+        "Hard_beyond@1": 0.4255225142333653,
+        "Hard_beyond@3": 0.4055039346171537,
+        "Hard_beyond@5": 0.40262274727501585,
+        "Average_pass@1": 0.6421874999999999,
+        "Average_pass@3": 0.7097656250000001,
+        "Average_pass@5": 0.72265625,
+        "Average_beyond@1": 0.5005006375378508,
+        "Average_beyond@3": 0.4958632840994711,
+        "Average_beyond@5": 0.49797575786320986
+    },
+    "config": {
+        "prefix": "",
+        "do_sample": true,
+        "temperature": 0.2,
+        "top_k": 0,
+        "top_p": 0.95,
+        "n_samples": 5,
+        "eos": "<|endoftext|>",
+        "seed": 0,
+        "model": "Qwen/CodeQwen1.5-7B",
+        "modeltype": "causal",
+        "peft_model": null,
+        "revision": null,
+        "use_auth_token": false,
+        "trust_remote_code": false,
+        "tasks": "mercury",
+        "instruction_tokens": null,
+        "batch_size": 5,
+        "max_length_generation": 2048,
+        "precision": "fp32",
+        "load_in_8bit": true,
+        "load_in_4bit": false,
+        "left_padding": false,
+        "limit": null,
+        "limit_start": 0,
+        "save_every_k_tasks": -1,
+        "postprocess": true,
+        "allow_code_execution": true,
+        "generation_only": false,
+        "load_generations_path": null,
+        "load_data_path": null,
+        "metric_output_path": "CodeQwen1.5-7B-mercury-result.json",
+        "save_generations": true,
+        "load_generations_intermediate_paths": null,
+        "save_generations_path": "CodeQwen1.5-7B-mercury-generations.json",
+        "save_references": false,
+        "save_references_path": "references.json",
+        "prompt": "prompt",
+        "max_memory_per_gpu": null,
+        "check_references": false
+    }
+}
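The two runs are not perfectly matched (CodeLlama-7b-hf was loaded in 4-bit with batch_size 10, CodeQwen1.5-7B in 8-bit with batch_size 5; temperature 0.2 and n_samples 5 are identical), but the headline numbers are easy to pull side by side. A quick hypothetical snippet over the two result files above:

import json

for path in ("CodeLlama-7b-hf-mercury-result.json",
             "CodeQwen1.5-7B-mercury-result.json"):
    with open(path) as f:
        metrics = json.load(f)["mercury"]
    # Average_pass@1: 0.367 vs 0.642; Average_beyond@1: 0.272 vs 0.501
    print(path, metrics["Average_pass@1"], metrics["Average_beyond@1"])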

bigcode_eval/generation.py

Lines changed: 4 additions & 2 deletions
@@ -37,7 +37,6 @@ def __init__(self, input_length, multiplier):
     def __call__(self, input_ids, scores, **kwargs):
         """Returns true if generated sequence is too long."""
         return input_ids.shape[1] > int(self.input_length * self.multiplier)
-
 
 def parallel_generations(
     task,
@@ -108,6 +107,7 @@ def parallel_generations(
     print(f"number of problems for this task is {n_tasks}")
     n_copies = ceil(args.n_samples / args.batch_size)
 
+    print("TokenizedDataset...")
     ds_tokenized = TokenizedDataset(
         task,
         dataset,
@@ -137,7 +137,8 @@ def parallel_generations(
     else:
         # model.to() is not supported for 8bit and 4bit models
         model, ds_loader = accelerator.prepare(model, ds_loader)
-
+
+    print("complete_code...")
     generations = complete_code(
         task,
         accelerator,
@@ -156,4 +157,5 @@
         intermediate_save_generations_path=intermediate_save_generations_path,
         **gen_kwargs,
     )
+
     return generations

bigcode_eval/tasks/custom_metrics/beyond_eval.py

Lines changed: 19 additions & 23 deletions
@@ -291,15 +291,13 @@ def run_samples(samples, n_workers=4):
             args = (sample,)
             future = executor.submit(Sandbox.run_sample, *args)
             futures.append(future)
-            n_samples += 1
 
         for future in tqdm(as_completed(futures), total=len(futures), desc='Reading futures'):
             result = future.result()
             results.append(result)
 
     return results
 
-
 def estimate_pass_at_k(num_samples, num_correct, k):
     """Estimates pass@k of each problem and returns them in an array."""
 
@@ -317,18 +315,10 @@ def estimator(n: int, c: int, k: int) -> float:
 
     return np.array([estimator(int(n), int(c), k) for n, c in zip(num_samples_it, num_correct)])
 
-def estimate_beyond_at_k(runtimes, k):
-    """Estimates pass@k of each problem and returns them in an array."""
-
-    def estimator(runtimes: list, k: int) -> float:
-        """Calculates 1 - comb(n - c, k) / comb(n, k)."""
-        print(runtimes)
-        print("============")
-        return sum(runtimes[:k])/len(runtimes)
-
-    return np.array([estimator(r, k) for r in runtimes])
+def estimate_beyond_at_k(beyonds, k):
+    return sum([sum(b[:k]) / k for b in beyonds]) / len(beyonds)
 
-def compute_beyond_eval(generations_list, reference_list, timeout=30):
+def compute_beyond_eval(generations_list, reference_list, timeout=10):
     sandbox = Sandbox()
 
     scores = {
@@ -338,10 +328,10 @@ def compute_beyond_eval(generations_list, reference_list, timeout=30):
         "Average": dict(total_c=list(), correct_c=list(), beyond_c=list()),
     }
 
-    for generations, instance in tqdm(zip(generations_list, reference_list), total=len(generations_list)):
+    for generations, instance in tqdm(zip(generations_list, reference_list), total=len(generations_list), desc='compute_beyond_eval'):
         # Construct runtime distribution from sample solutions
         runtimes = list()
-        for index, solution in enumerate(instance['solutions']):
+        for index, solution in tqdm(enumerate(instance['solutions']), desc="Construct runtime distribution from sample solutions"):
             sample = {
                 "solution": solution['solution'],
                 "convert_offline": instance['convert_offline'],
@@ -366,23 +356,24 @@ def compute_beyond_eval(generations_list, reference_list, timeout=30):
         b_l = list()
         difficulty = instance['difficulty']
 
-        for index, solution in enumerate(generations):
+        for index, solution in tqdm(enumerate(generations), desc="generation execution", total=len(generations)):
             sample = {
-                "solution": instance['prompt'] + solution,
+                "solution": solution,
                 "convert_offline": instance['convert_offline'],
                 "evaluate_offline": instance['evaluate_offline'],
                 "entry_point": instance['entry_point'],
                 "test_cases": json.loads(instance['test_cases']),
                 "solution_index": index,
                 "timeout": timeout,
             }
-
-            result = sandbox.run_sample(sample)
+
+            results = [sandbox.run_sample(sample) for _ in range(3)]
+            print(results[0])
             t_c += 1
 
             # Calculate Beyond
-            if result['result'] == "passed":
-                runtime = result['runtime']
+            if results[0]['result'] == "passed":
+                runtime = sum([r['runtime'] for r in results]) / len(results)
                 p_c += 1
             else:
                 runtime = float('inf')
@@ -398,15 +389,20 @@
         scores['Average']['total_c'] += [t_c]
         scores['Average']['correct_c'] += [p_c]
         scores['Average']['beyond_c'] += [b_l]
+
+        print(f'total: {t_c}')
+        print(f'correct: {p_c}')
+        print(f'beyond: {b_l}')
+        print("-" * 60)
 
     results = dict()
     for difficulty in ['Easy', "Medium", "Hard", "Average"]:
         total = np.array(scores[difficulty]['total_c'])
         correct = np.array(scores[difficulty]['correct_c'])
         beyond = scores[difficulty]['beyond_c']
 
-        pass_at_k = {f"{difficulty}_pass@{k}": estimate_pass_at_k(total, correct, k).mean() for k in [1,3,5] if (total >= k).all()}
-        beyond_at_k = {f"{difficulty}_beyond@{k}": estimate_beyond_at_k(beyond, k).mean() for k in [1,3,5] if (total >= k).all()}
+        pass_at_k = {f"{difficulty}_pass@{k}": estimate_pass_at_k(total, correct, k).mean() for k in [1,3,5,10,15,20,30,50,100] if (total >= k).all()}
+        beyond_at_k = {f"{difficulty}_beyond@{k}": estimate_beyond_at_k(beyond, k) for k in [1,3,5,10,15,20,30,50,100] if (total >= k).all()}
 
         results.update(pass_at_k)
         results.update(beyond_at_k)
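The rewritten `estimate_beyond_at_k` drops the old per-problem estimator in favor of a plain average: mean of the first k beyond scores per problem, then a mean across problems. A self-contained check with made-up beyond scores (a failed generation presumably contributes 0.0, its runtime being set to float('inf') above):

def estimate_beyond_at_k(beyonds, k):
    # per problem: mean of the first k beyond scores; then average problems
    return sum([sum(b[:k]) / k for b in beyonds]) / len(beyonds)

beyonds = [
    [0.8, 0.6, 0.0, 0.5, 0.7],  # hypothetical problem 1, 5 generations
    [0.0, 0.0, 0.4, 0.0, 0.3],  # hypothetical problem 2
]
print(estimate_beyond_at_k(beyonds, 1))  # (0.8 + 0.0) / 2 = 0.4
print(estimate_beyond_at_k(beyonds, 5))  # (0.52 + 0.14) / 2 = 0.33

Unlike pass@k, this is order-dependent (it always takes the first k samples) rather than an unbiased estimate over all size-k subsets. Note also that the commit now averages each passing generation's runtime over three sandbox runs before it enters the beyond calculation.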

bigcode_eval/tasks/mercury.py

Lines changed: 36 additions & 4 deletions
@@ -6,7 +6,8 @@
 
 Homepage: https://github.com/Elfsong/Mercury
 """
-
+import os
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
 
 from bigcode_eval.base import Task
 from bigcode_eval.tasks.custom_metrics.beyond_eval import compute_beyond_eval
@@ -20,7 +21,6 @@
 }
 """
 
-
 class Mercury(Task):
     """
     A task represents an entire benchmark including its dataset, problems,
@@ -31,18 +31,50 @@ class Mercury(Task):
 
     def __init__(self, prompt):
         super().__init__(
-            stop_words=["\nclass", "\ndef", "\n#", "\n@", "\nprint", "\nif", "\n```", "<file_sep>", "<|end▁of▁sentence|>"],
+            # stop_words=["\nclass", "\ndef", "\n#", "\n@", "\nprint", "\nif", "\n```", "<file_sep>", "<|end▁of▁sentence|>"],
+            stop_words=["\nclass", "\ndef", "\n#", "\n@", "\nprint", "\nif", "\n```", "<file_sep>", "<|end▁of▁sentence|>", "\n###", "\n\n\n\n\n", "<|endoftext|>"],
             requires_execution=True,
         )
         self.prompt = prompt
 
     def get_dataset(self):
         """Returns dataset for the task or an iterable of any object, that get_prompt can handle"""
         return self.dataset["eval"]
+
+    @staticmethod
+    def prompt_generate(question_content, starter_code):
+        examples_json = {
+            "question": "You are given a 0-indexed array of positive integers nums. Find the number of triplets (i, j, k) that meet the following conditions:\n\n0 <= i < j < k < nums.length\nnums[i], nums[j], and nums[k] are pairwise distinct.\n\t\nIn other words, nums[i] != nums[j], nums[i] != nums[k], and nums[j] != nums[k].\n\n\n\nReturn the number of triplets that meet the conditions.\n \nExample 1:\n\nInput: nums = [4,4,2,4,3]\nOutput: 3\nExplanation: The following triplets meet the conditions:\n- (0, 2, 4) because 4 != 2 != 3\n- (1, 2, 4) because 4 != 2 != 3\n- (2, 3, 4) because 2 != 4 != 3\nSince there are 3 triplets, we return 3.\nNote that (2, 0, 4) is not a valid triplet because 2 > 0.\n\nExample 2:\n\nInput: nums = [1,1,1,1,1]\nOutput: 0\nExplanation: No triplets meet the conditions so we return 0.\n\n \nConstraints:\n\n3 <= nums.length <= 100\n1 <= nums[i] <= 1000\n\n",
+            "sample_code": 'class Solution(object):\n def unequalTriplets(self, nums: List[int]) -> int:\n """\n\t:type nums: List[int]\n\t:rtype: int\n\t"""\n \n',
+            "answer": 'class Solution(object):\n def unequalTriplets(self, nums: List[int]) -> int:\n """\n\t:type nums: List[int]\n\t:rtype: int\n\t"""\n \n ans = 0\n n = len(a)\n for i in range(n):\n for j in range(i + 1, n):\n for k in range(j + 1, n):\n ans += len({a[i], a[j], a[k]}) == 3\n return ans'
+        }
+
+        def get_example_prompt(example):
+            prompt = ""
+            prompt += "### Question\n"
+            prompt += example["question"]
+            prompt += "\n\n"
+            if starter_code:
+                prompt += "### Code Prompt\n"
+                prompt += example["sample_code"]
+                prompt += "\n\n"
+            prompt += "### Completion\n"
+            prompt += example["answer"]
+            if example["answer"]:
+                prompt += "\n\n"
+            return prompt
+
+        prompt = ""
+        # one-shot generation example
+        prompt += get_example_prompt(examples_json)
+        # code generation
+        prompt += get_example_prompt({"question": question_content,"sample_code": starter_code,"answer": ""})
+
+        return prompt
 
     def get_prompt(self, doc):
         """Builds the prompt for the LM to generate from."""
-        return f'\'\'\'{doc["pretty_content"][0]}\'\'\'\n{doc["prompt"]}'
+        return self.prompt_generate(doc["pretty_content"][0], doc["prompt"])
 
     def get_reference(self, doc):
         """Builds the reference solutions for the doc (sample from the test dataset)."""

bigcode_eval/utils.py

Lines changed: 3 additions & 2 deletions
@@ -120,7 +120,7 @@ def __iter__(self):
                 "n_copies (n_samples/batch_size) was changed from 1 to 2 because n_tasks isn't proportional to num devices"
             )
 
-        for sample in range(self.n_tasks):
+        for sample in tqdm(range(self.n_tasks), desc="Task Encoding"):
             for _ in range(self.n_copies):
                 if self.has_encoder:
                     yield {
@@ -220,7 +220,6 @@ def _parse_instruction(code, instruction_tokens):
     shift = len("```python")
     return code[idx + shift :]
 
-
 def complete_code(
     task,
     accelerator,
@@ -249,11 +248,13 @@ def complete_code(
     code_gens: List[List[Optional[str]]] = [[] for _ in range(n_tasks)]
     generations = [] if not intermediate_generations else intermediate_generations
     gen_token_dict = defaultdict(list)  # dict of list of generated tokens
+
    for step, batch in tqdm(
        enumerate(dataloader),
        total=math.ceil(
            n_tasks * dataloader.dataset.n_copies / accelerator.num_processes
        ),
+        desc="batch generation",
    ):
        with torch.no_grad():
            if task.stop_words:
