
Commit 7d230eb

add: Google-genai for Gemini models (#58)
* chore: mv ollama_pull to scripts
* fix: openai ttc models
* use google.genai sdk for gemini models
* fix: google-genai sdk version
* remove max_token for max performance
* use direnv
* chore: remove unused docstring
1 parent b7a8c7f commit 7d230eb

11 files changed

Lines changed: 172 additions & 112 deletions


.envrc

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+export PYTHONPATH=$PYTHONPATH:`pwd`
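
This `.envrc` is a direnv hook (the commit message's "use direnv"): after a one-time `direnv allow`, entering the repository appends its root to `PYTHONPATH`, so the `src.*` imports used by the scripts resolve without an editable install.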

README.md

Lines changed: 2 additions & 2 deletions
@@ -21,7 +21,7 @@ uv sync # create a virtual environment, and install dependencies
 This script will build the benchmark (Prelude with NL) from the raw data.
 
 ```sh
-uv run --project . scripts/preprocess_benchmark.py
+uv run scripts/preprocess_benchmark.py
 ```
 
 ### TF-Bench_pure
@@ -68,7 +68,7 @@ We use [Ollama](https://ollama.com/) to manage and run the OSS models.
 ```sh
 curl -fsSL https://ollama.com/install.sh | sh # install ollama, you need sudo for this
 ollama serve # start your own instance instead of a system service
-uv run ollama_pull.sh # install required models
+uv run --project . scripts/ollama_pull.sh # install required models
 ```
 
 ```sh

main.py

Lines changed: 50 additions & 58 deletions
@@ -10,115 +10,102 @@
 from openai import OpenAI
 from ollama import Client as OllamaClient
 from anthropic import Anthropic
+from google import genai
 
 import fire
 from src.common import (
     BenchmarkTask,
-    SEED,
-    TEMPERATURE,
     get_prompt,
 )
 
 from src.experiment import (
-    O1_MODELS,
-    GPT_MODELS,
+    OAI_MODELS,
+    OAI_TTC_MODELS,
     CLAUDE_MODELS,
+    CLAUDE_TTC_MODELS,
     DEEPSEEK_MODELS,
     GEMINI_MODELS,
+    GEMINI_TTC_MODELS,
     get_ant_model,
     get_ant_ttc_model,
     get_oai_model,
-    get_o1_model,
+    get_oai_ttc_model,
+    get_gemini_model,
+    get_gemini_ttc_model,
 )
-from src.experiment_ollama import OLLAMA_MODELS, get_model as get_ollama_model
+from src.experiment_ollama import OLLAMA_MODELS, get_ollama_model
 from src.postprocessing import postprocess, RESPONSE_STRATEGIES
 from src.evaluation import evaluate
 
 
 def main(
-    input_file: str = "Benchmark-F.removed.json",
-    output_file: str | None = None,
-    log_file: str | None = None,
-    full_type: bool = True,
-    model: str = "gpt-3.5-turbo",
-    seed: int = SEED,
-    temperature: float = TEMPERATURE,
+    model: str,
     port: int = 11434,
     pure: bool = False,
-    reasoning: bool = False,
+    thinking_budget: int = 1000,
+    output_file: str | None = None,
+    log_file: str = "evaluation_log.jsonl",
 ):
     """
     Run an experiment using various AI models to generate and evaluate type signatures.
 
     Parameters:
-    input_file (str): Path to the input JSON file containing benchmark tasks.
-        Default is "Benchmark-F.removed.json".
-
-    output_file (str | None): Path to the output file where generated type signatures will be saved.
-        If None, the output will be saved to "result/{model}.txt". Default is None.
-
-    log_file (str | None): Path to the log file where evaluation metrics will be appended.
-        If None, defaults to "evaluation_log.jsonl". Default is None.
-
-    full_type (bool): Determines whether to ask the model to predict the full type signature in the prompt.
-        If True, the model will be asked to complete full type signature.
-        If False, the model will be asked to complete the return type in type signature. Default is True.
-
     model (str): Name of the model to use for generating type signatures. Must be one of:
         - GPT_MODELS: ["gpt-3.5-turbo-0125", "gpt-4-turbo-2024-04-09", ...]
         - OLLAMA_MODELS, CLAUDE_MODELS, or O1_MODELS.
         Default is "gpt-3.5-turbo".
 
-    seed (int): Random seed to ensure reproducibility in experiments. Default is 0.
-
-    temperature (float): Sampling temperature for the model's outputs. Higher values
-        produce more diverse outputs. Default is 0.0 (deterministic outputs).
-
     port (int): Port number for connecting to the Ollama server (if using Ollama models).
         Ignored for other models. Default is 11434.
 
     pure (bool): If True, uses the original variable naming in type inference.
        If False, uses rewritten variable naming (e.g., `v1`, `v2`, ...). Default is False.
-
-    reasoning (bool): If True, uses the reasoning prompt for the model. NOTE: this is not for claude-3-7-sonnet.
     """
     assert (
         model
-        in GPT_MODELS
+        in OAI_MODELS
+        + OAI_TTC_MODELS
         + OLLAMA_MODELS
-        + CLAUDE_MODELS
-        + O1_MODELS
         + DEEPSEEK_MODELS
+        + CLAUDE_MODELS
+        + CLAUDE_TTC_MODELS
         + GEMINI_MODELS
+        + GEMINI_TTC_MODELS
     ), f"{model} is not supported."
 
+    # hard-coding benchmark file path for experiment
+    input_file = "tfb.pure.json" if pure else "tfb.json"
+    input_file = os.path.abspath(input_file)
+    assert os.path.exists(
+        input_file
+    ), f"{input_file} does not exist! Please download or build it first."
+
     if output_file is None:
         os.makedirs("result", exist_ok=True)
         output_file = f"result/{model}.txt"
 
-    if log_file is None:
-        log_file = "evaluation_log.jsonl"
-
-    client: OpenAI | Anthropic | OllamaClient
+    client: OpenAI | Anthropic | OllamaClient | genai.Client
    generate: Callable[[str], str | None]
 
-    if model in GPT_MODELS:
+    if model in OAI_MODELS:
         assert "OPENAI_API_KEY" in os.environ, "Please set OPEN_API_KEY in environment!"
         client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
         generate = get_oai_model(client, model, pure)
-    elif model in O1_MODELS:
+
+    elif model in OAI_TTC_MODELS:
         assert "OPENAI_API_KEY" in os.environ, "Please set OPEN_API_KEY in environment!"
         client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
-        generate = get_o1_model(client, model, pure)
+        generate = get_oai_ttc_model(client, model, pure)
     elif model in CLAUDE_MODELS:
         assert (
             "ANTHROPIC_API_KEY" in os.environ
         ), "Please set ANTHROPIC_API_KEY in environment!"
         client = Anthropic(api_key=os.environ["ANTHROPIC_API_KEY"])
-        if reasoning:
-            generate = get_ant_ttc_model(client, model, pure)
-        else:
-            generate = get_ant_model(client, model, pure)
+        generate = get_ant_model(client, model, pure)
+    elif model in CLAUDE_TTC_MODELS:
+        client = Anthropic(api_key=os.environ["ANTHROPIC_API_KEY"])
+        generate = get_ant_ttc_model(client, model, pure, thinking_budget)
+
     elif model in DEEPSEEK_MODELS:
         assert (
             "DEEPSEEK_API_KEY" in os.environ
@@ -127,23 +114,28 @@ def main(
             api_key=os.environ["DEEPSEEK_API_KEY"], base_url="https://api.deepseek.com"
         )
         generate = get_oai_model(client, model, pure)
+
     elif model in GEMINI_MODELS:
         assert (
-            "GEMINI_API_KEY" in os.environ
-        ), "Please set GEMINI_API_KEY in environment!"
-        client = OpenAI(
-            api_key=os.environ["GEMINI_API_KEY"],
-            base_url="https://generativelanguage.googleapis.com/v1beta/openai/",
-        )
-        generate = get_oai_model(client, model)
+            "GOOGLE_API_KEY" in os.environ
+        ), "Please set GOOGLE_API_KEY in environment!"
+        client = genai.Client(api_key=os.environ["GOOGLE_API_KEY"])
+        generate = get_gemini_model(client, model, pure)
+    elif model in GEMINI_TTC_MODELS:
+        assert (
+            "GOOGLE_API_KEY" in os.environ
+        ), "Please set GOOGLE_API_KEY in environment!"
+        client = genai.Client(api_key=os.environ["GOOGLE_API_KEY"])
+        generate = get_gemini_ttc_model(client, model, pure, thinking_budget)
+
     else:
         client = OllamaClient(host=f"http://localhost:{port}")
-        generate = get_ollama_model(client, model, seed, temperature, pure)
+        generate = get_ollama_model(client, model, pure)
 
     with open(input_file, "r") as fp:
         tasks = [from_dict(data_class=BenchmarkTask, data=d) for d in json.load(fp)]
 
-    prompts = lmap(lambda x: get_prompt(x, full_type), tasks)
+    prompts = lmap(get_prompt, tasks)
     responses = lmap(generate, tqdm(prompts, desc=model))
     gen_results = (
         Chain(responses)
@@ -161,7 +153,7 @@ def main(
 
     os.makedirs(os.path.dirname(output_file), exist_ok=True)
     with open(log_file, "a") as fp:
-        logging_result = {"model_name": model, **eval_acc}
+        logging_result = {"model_name": model, **eval_acc, "pure": pure}
         fp.write(f"{json.dumps(logging_result)}\n")
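
The `get_gemini_model` and `get_gemini_ttc_model` factories imported above live in `src/experiment.py`, which is not among the files shown in this view. Below is a minimal sketch of what such wrappers could look like with the google-genai SDK, with the signatures inferred from the call sites here; the bodies are illustrative, not the repository's actual code.

```python
from typing import Callable

from google import genai
from google.genai import types

# NOTE: illustrative only. The real factories live in src/experiment.py,
# which is not part of the diff shown on this page.


def get_gemini_model(
    client: genai.Client, model: str, pure: bool
) -> Callable[[str], str | None]:
    """Plain Gemini call; `pure` presumably selects the prompt variant."""

    def generate(prompt: str) -> str | None:
        response = client.models.generate_content(
            model=model,  # e.g. "gemini-2.0-flash"
            contents=prompt,
        )
        return response.text

    return generate


def get_gemini_ttc_model(
    client: genai.Client, model: str, pure: bool, thinking_budget: int
) -> Callable[[str], str | None]:
    """Test-time-compute (TTC) variant: enable thinking with a token budget."""

    def generate(prompt: str) -> str | None:
        response = client.models.generate_content(
            model=model,  # e.g. a thinking-capable model such as "gemini-2.5-flash"
            contents=prompt,
            config=types.GenerateContentConfig(
                thinking_config=types.ThinkingConfig(
                    thinking_budget=thinking_budget
                ),
            ),
        )
        return response.text

    return generate
```

The bump to `google-genai>=1.11.0` in `pyproject.toml` below likely exists to pick up this thinking configuration in the SDK.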

pyproject.toml

Lines changed: 5 additions & 3 deletions
@@ -10,14 +10,14 @@ dependencies = [
     "fire==0.5.0",
     "funcy==2.0",
     "funcy-chain==0.2.0",
-    "google-genai>=1.2.0",
+    "google-genai>=1.11.0",
     "groq==0.8.0",
     "hypothesis>=6.98.6",
     "markdown-to-json==2.1.2",
     "matplotlib>=3.8.3",
     "numpy>=1.26.4",
-    "ollama==0.2.1",
-    "openai==1.30.5",
+    "ollama>=0.2.1",
+    "openai==1.75.0",
     "pathos>=0.3.3",
     "pylint>=3.3.6",
     "pytest>=8.0.0",
@@ -35,3 +35,5 @@ dependencies = [
 [tool.pytest.ini_options]
 pythonpath = ["."]
 
+[tool.uv]
+package = true
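
The new `[tool.uv] package = true` setting tells uv to build and install the project itself into the managed environment, rather than treating it as a non-package ("virtual") project.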
ollama_pull.sh → scripts/ollama_pull.sh

File renamed without changes.

scripts/result_token_stat.py

Lines changed: 24 additions & 0 deletions
@@ -0,0 +1,24 @@
+import tiktoken
+import fire
+from dacite import from_dict
+import json
+from funcy_chain import Chain
+import pandas
+
+from src.common import BenchmarkTask, get_prompt
+
+
+def main(input_file="tfb.json"):
+    with open(input_file, "r") as fp:
+        tasks = [from_dict(data_class=BenchmarkTask, data=d) for d in json.load(fp)]
+    # count the max, min, and average token length of the task signatures
+
+    enc = tiktoken.encoding_for_model("gpt-4o")
+    token_counts = [len(enc.encode(task.signature)) for task in tasks]
+    df = pandas.DataFrame(token_counts, columns=["token_count"])
+    print(f"max: {df.token_count.max()}")
+    print(f"min: {df.token_count.min()}")
+    print(f"avg: {df.token_count.mean()}")
+
+if __name__ == "__main__":
+    fire.Fire(main)
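
Because the entry point is wrapped in `fire.Fire`, the script doubles as a small CLI: something like `uv run scripts/result_token_stat.py --input_file tfb.json` prints the max, min, and mean GPT-4o token counts over the benchmark's type signatures.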

src/__init__.py

Lines changed: 4 additions & 0 deletions
@@ -0,0 +1,4 @@
+import logging
+
+logging.getLogger("openai").setLevel(logging.ERROR)
+logging.getLogger("httpx").setLevel(logging.ERROR)

src/common.py

Lines changed: 3 additions & 6 deletions
@@ -8,8 +8,7 @@
 import markdown_to_json
 
 # Default hyper-parameters
-SEED = 0
-TEMPERATURE = 0.0
+MAX_TOKENS = 1024
 
 SYSTEM_PROMPT = """
 Act as a static analysis tool for type inference.
@@ -105,11 +104,9 @@ def remove_return_type(sig: str) -> str:
     return sig
 
 
-def get_prompt(task: BenchmarkTask, full_type: bool = True) -> str:
+def get_prompt(task: BenchmarkTask) -> str:
     """get prompt from a task instance"""
 
-    signature = "" if full_type else remove_return_type(task.signature)
-
     fn_name = extract_function_name(task)
     assert fn_name is not None
 
@@ -125,7 +122,7 @@ def get_prompt(task: BenchmarkTask, full_type: bool = True) -> str:
     \n\n
     {code}
     --complete the following type signature for '{fn_name}'
-    {fn_name} :: {signature}
+    {fn_name} ::
     """
     return prompt
 
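
With `full_type` and the `signature` stub removed, every prompt now ends in a bare `{fn_name} ::`, so the model must produce the whole type signature. For a Prelude-style task such as `id`, the prompt tail would read `id ::` and a correct completion is `a -> a` (an illustrative example, not taken from the benchmark file).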
