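Update: split pass@k and beyond@k estimators, pass the Hugging Face auth token as `token`, set fp16 dtypes for 4-bit loading, and add mercury.sh and playground.ipynb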

Elfsong · Elfsong · commit 74f8e19978cd · 2024-04-15T08:43:10.000Z
diff --git a/bigcode_eval/tasks/custom_metrics/beyond_eval.py b/bigcode_eval/tasks/custom_metrics/beyond_eval.py
@@ -300,16 +300,14 @@ def run_samples(samples, n_workers=4):
         return results
 
 
-def estimate_at_k(num_samples, num_correct, k):
-    """Estimates beyond@k of each problem and returns them in an array."""
-    
-    def cf(n, k):
-        return math.gamma(n+1) / (math.gamma(k+1) * (math.gamma(n-k+1)))
+def estimate_pass_at_k(num_samples, num_correct, k):
+    """Estimates pass@k of each problem and returns them in an array."""
 
     def estimator(n: int, c: int, k: int) -> float:
+        """Calculates 1 - comb(n - c, k) / comb(n, k)."""
         if n - c < k:
             return 1.0
-        return 1 - cf(n-c, k) / cf(n, k)
+        return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))
 
     if isinstance(num_samples, int):
         num_samples_it = itertools.repeat(num_samples, len(num_correct))
@@ -319,6 +317,17 @@ def estimator(n: int, c: int, k: int) -> float:
 
     return np.array([estimator(int(n), int(c), k) for n, c in zip(num_samples_it, num_correct)])
 
+def estimate_beyond_at_k(runtimes, k):
+    """Estimates beyond@k of each problem and returns them in an array."""
+
+    def estimator(runtimes: list, k: int) -> float:
+        """Calculates sum(runtimes[:k]) / len(runtimes): the first k entries summed, divided by the total sample count."""
+        print(runtimes)
+        print("============")
+        return sum(runtimes[:k])/len(runtimes)
+
+    return np.array([estimator(r, k) for r in runtimes])
+
 def compute_beyond_eval(generations_list, reference_list, timeout=30):
     sandbox = Sandbox()
     
@@ -353,7 +362,8 @@ def compute_beyond_eval(generations_list, reference_list, timeout=30):
         max_runtime = max(runtimes)
         
         # Evaluate generated solutions
-        t_c, p_c, b_c = 0, 0, 0
+        t_c, p_c = 0, 0
+        b_l = list()
         difficulty = instance['difficulty']
         
         for index, solution in enumerate(generations):                   
@@ -373,29 +383,30 @@ def compute_beyond_eval(generations_list, reference_list, timeout=30):
             # Calculate Beyond
             if result['result'] == "passed":
                 runtime = result['runtime']
-                runtime = min(runtime, max_runtime)
-                runtime = max(runtime, min_runtime)
-                b_c += (max_runtime - runtime) / (max_runtime - min_runtime)          
                 p_c += 1
             else:
                 runtime = float('inf')
+                
+            runtime = min(runtime, max_runtime)
+            runtime = max(runtime, min_runtime)
+            b_l += [(max_runtime - runtime) / (max_runtime - min_runtime)]  
         
         scores[difficulty]['total_c'] += [t_c]
         scores[difficulty]['correct_c'] += [p_c]
-        scores[difficulty]['beyond_c'] += [b_c]
+        scores[difficulty]['beyond_c'] += [b_l]
         
         scores['Average']['total_c'] += [t_c]
         scores['Average']['correct_c'] += [p_c]
-        scores['Average']['beyond_c'] += [b_c]
+        scores['Average']['beyond_c'] += [b_l]
     
     results = dict()
     for difficulty in ['Easy', "Medium", "Hard", "Average"]:
         total = np.array(scores[difficulty]['total_c'])
         correct = np.array(scores[difficulty]['correct_c'])
-        beyond = np.array(scores[difficulty]['beyond_c'])
+        beyond = scores[difficulty]['beyond_c']
         
-        pass_at_k = {f"{difficulty}_pass@{k}": estimate_at_k(total, correct, k).mean() for k in [1,3,5] if (total >= k).all()}
-        beyond_at_k = {f"{difficulty}_beyond@{k}": estimate_at_k(total, beyond, k).mean() for k in [1,3,5] if (total >= k).all()}
+        pass_at_k = {f"{difficulty}_pass@{k}": estimate_pass_at_k(total, correct, k).mean() for k in [1,3,5] if (total >= k).all()}
+        beyond_at_k = {f"{difficulty}_beyond@{k}": estimate_beyond_at_k(beyond, k).mean() for k in [1,3,5] if (total >= k).all()}
         
         results.update(pass_at_k)
         results.update(beyond_at_k)
diff --git a/main.py b/main.py
@@ -266,7 +266,7 @@ def main():
         model_kwargs = {
             "revision": args.revision,
             "trust_remote_code": args.trust_remote_code,
-            "use_auth_token": args.use_auth_token,
+            "token": args.use_auth_token,
         }
         if args.load_in_8bit:
             print("Loading model in 8bit")
@@ -275,6 +275,8 @@ def main():
         elif args.load_in_4bit:
             print("Loading model in 4bit")
             model_kwargs["load_in_4bit"] = args.load_in_4bit
+            model_kwargs["torch_dtype"] = torch.float16
+            model_kwargs["bnb_4bit_compute_dtype"] = torch.float16            
             model_kwargs["device_map"] = {"": accelerator.process_index}
         else:
             print(f"Loading model in {args.precision}")
@@ -322,7 +324,7 @@ def main():
                 args.model,
                 revision=args.revision,
                 trust_remote_code=args.trust_remote_code,
-                use_auth_token=args.use_auth_token,
+                token=args.use_auth_token,
                 padding_side="left",  
             )
         else:
@@ -331,7 +333,7 @@ def main():
                 args.model,
                 revision=args.revision,
                 trust_remote_code=args.trust_remote_code,
-                use_auth_token=args.use_auth_token,
+                token=args.use_auth_token,
                 truncation_side="left",
                 padding_side="right",  
             )
diff --git a/mercury.sh b/mercury.sh
@@ -0,0 +1,22 @@
+accelerate  launch --main_process_port 29501  main.py  \
+    --model deepseek-ai/deepseek-coder-6.7b-base   \
+    --load_in_4bit   \
+    --limit 256 \
+    --max_length_generation 1024   \
+    --tasks mercury   \
+    --n_samples 5  \
+    --temperature 0.2  \
+    --batch_size 6   \
+    --allow_code_execution
+
+accelerate  launch --main_process_port 29502  main.py  \
+    --model deepseek-ai/deepseek-coder-6.7b-instruct   \
+    --load_in_4bit   \
+    --limit 256 \
+    --max_length_generation 1024   \
+    --tasks mercury   \
+    --n_samples 5  \
+    --temperature 0.2  \
+    --batch_size 6   \
+    --allow_code_execution
+
diff --git a/playground.ipynb b/playground.ipynb
@@ -0,0 +1,185 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "070e463eddd94abf8d5d2384fcd7674a",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "tokenizer_config.json:   0%|          | 0.00/749 [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "7122460fac9643cda39186565c42eeb4",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "b51157e2cc5f436cb49b972b4bd23ecc",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "cf661fb7a0224e68be5f5d1255b608c2",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "special_tokens_map.json:   0%|          | 0.00/411 [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "06601ec5d98847809cd3d4d1f0af3fb5",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "config.json:   0%|          | 0.00/588 [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "843d8e08d9a94b268c8f7c7df3030765",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "model.safetensors.index.json:   0%|          | 0.00/37.6k [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "cefc9c5dbd1b4bc2b9a2a903192b33f8",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading shards:   0%|          | 0/7 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "a5bedd3959124f1f8875178b13450682",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "model-00001-of-00007.safetensors:   0%|          | 0.00/9.85G [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "# Load model directly\n",
+    "from transformers import AutoTokenizer, AutoModelForCausalLM\n",
+    "\n",
+    "tokenizer = AutoTokenizer.from_pretrained(\"codellama/CodeLlama-34b-hf\")\n",
+    "model = AutoModelForCausalLM.from_pretrained(\"codellama/CodeLlama-34b-hf\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from transformers import AutoTokenizer\n",
+    "import transformers\n",
+    "import torch\n",
+    "\n",
+    "model = \"codellama/CodeLlama-34b-hf\"\n",
+    "\n",
+    "tokenizer = AutoTokenizer.from_pretrained(model)\n",
+    "pipeline = transformers.pipeline(\n",
+    "    \"text-generation\",\n",
+    "    model=model,\n",
+    "    torch_dtype=torch.float16,\n",
+    "    device_map=\"auto\",\n",
+    ")\n",
+    "\n",
+    "sequences = pipeline(\n",
+    "    'import socket\\n\\ndef ping_exponential_backoff(host: str):',\n",
+    "    do_sample=True,\n",
+    "    top_k=10,\n",
+    "    temperature=0.1,\n",
+    "    top_p=0.95,\n",
+    "    num_return_sequences=1,\n",
+    "    eos_token_id=tokenizer.eos_token_id,\n",
+    "    max_length=200,\n",
+    ")\n",
+    "for seq in sequences:\n",
+    "    print(f\"Result: {seq['generated_text']}\")\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "workspace",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.16"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}