diff --git a/README.md b/README.md
index 988eaac2..d5abbcd5 100644
--- a/README.md
+++ b/README.md
@@ -98,3 +98,24 @@ To integrate a new backend, follow these steps:
 This project uses code from [KernelBench](https://github.com/ScalingIntelligence/KernelBench), licensed under the MIT License.
+
+
+## Agent Evaluation (Directory Workflow)
+
+Use the minimal directory workflow in `scripts/agent_eval/`:
+- each task directory initially contains only `reference.py` + `agent_prompt.txt`
+- the agent writes its final output to `final_response.txt` (MultiKernelBench-compatible text format)
+- external scripts handle launching and evaluation
+
+```bash
+# 1) Prepare isolated tasks
+python scripts/agent_eval/prepare_workspaces.py --language cuda --strategy add_shot --categories activation --workspace-root experiment/tasks --clean --readonly
+
+# 2) Launch agent CLI (example)
+python scripts/agent_eval/launch_agent.py --workspace-root experiment/tasks --agent-cmd 'qwen -p "{prompt}"' --timeout 1200 --output-filename final_response.txt --auto-yolo --max-tasks 10
+
+# 3) Externally evaluate generated txt outputs
+python scripts/agent_eval/collect_results.py --workspace-root experiment/tasks --language cuda --output-filename final_response.txt --out-json experiment/outputs/agent_summary.json --out-csv experiment/outputs/agent_summary.csv
+```
+
+See `scripts/agent_eval/README.md` for details.
diff --git a/scripts/agent_eval/README.md b/scripts/agent_eval/README.md
new file mode 100644
index 00000000..ba832bfe
--- /dev/null
+++ b/scripts/agent_eval/README.md
@@ -0,0 +1,72 @@
+# Agent Evaluation Workflow (Minimal Directory Mode)
+
+This workflow is the **minimal agent setting**:
+- each task directory only pre-creates `reference.py` and `agent_prompt.txt`
+- no in-directory self-evaluation scripts are provided to the agent
+- the agent must write a single output text file: `final_response.txt`
+- an external controller evaluates those outputs with the MultiKernelBench evaluator
+
+## 1) Prepare minimal workspaces
+
+```bash
+python scripts/agent_eval/prepare_workspaces.py \
+    --language cuda \
+    --strategy add_shot \
+    --categories activation \
+    --workspace-root experiment/tasks \
+    --clean \
+    --readonly
+```
+
+After preparation, each task folder initially contains only:
+- `reference.py`
+- `agent_prompt.txt`
+
+`agent_prompt.txt` is generated dynamically from `prompt_generators/{language}_{strategy}.py` (aligned with the existing benchmark prompts); only the directory-output requirement (write `final_response.txt`) is appended to it.
+
+## 2) Launch your agent
+
+`launch_agent.py` supports placeholders in `--agent-cmd`:
+- `{prompt_file}`
+- `{prompt}`
+- `{workdir}`
+
+Example:
+
+```bash
+python scripts/agent_eval/launch_agent.py \
+    --workspace-root experiment/tasks \
+    --agent-cmd 'qwen -p "{prompt}"' \
+    --timeout 1200 \
+    --output-filename final_response.txt \
+    --auto-yolo \
+    --max-tasks 10
+```
+
+Expected: the agent writes `final_response.txt` in each task directory.
+
+`--max-tasks` is mainly for quick smoke runs and cost control: it runs only the first N task directories (sorted by task name), so you can debug the workflow on a small subset before a full-scale run.
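+
+The same placeholders also work with agent CLIs that read the prompt from a file instead of taking it as an argument. A hypothetical variant (the `my-agent` binary and its flags are placeholders, not a real tool):
+
+```bash
+python scripts/agent_eval/launch_agent.py \
+    --workspace-root experiment/tasks \
+    --agent-cmd 'my-agent --prompt-file {prompt_file} --workdir {workdir}' \
+    --timeout 1200 \
+    --output-filename final_response.txt
+```
+
+Passing `{prompt_file}` avoids the quote escaping and newline flattening that `launch_agent.py` applies when substituting `{prompt}`, so it is usually the safer choice for long prompts.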
+
+## 3) Evaluate agent outputs externally
+
+```bash
+python scripts/agent_eval/collect_results.py \
+    --workspace-root experiment/tasks \
+    --language cuda \
+    --output-filename final_response.txt \
+    --out-json experiment/outputs/agent_summary.json \
+    --out-csv experiment/outputs/agent_summary.csv
+```
+
+This calls `eval_single_runner.py` for each task output and writes:
+- per-task `eval_result.json`
+- an aggregated JSON/CSV summary
+
+## Notes
+
+- Keep agent output compatible with the MultiKernelBench model output style (plain response text or a fenced code block).
+- If the evaluation environment lacks the required hardware or runtime (e.g., no CUDA), records may show compile/runtime failures that stem from the environment rather than from the generated code.
+- If you use the Qwen CLI in non-interactive mode, `--auto-yolo` is recommended (it appends `-y` automatically when missing).
+
diff --git a/scripts/agent_eval/collect_results.py b/scripts/agent_eval/collect_results.py
new file mode 100644
index 00000000..dd7c1a0a
--- /dev/null
+++ b/scripts/agent_eval/collect_results.py
@@ -0,0 +1,168 @@
+#!/usr/bin/env python3
+"""Collect agent outputs and run external MultiKernelBench evaluation."""
+
+from __future__ import annotations
+
+import argparse
+import csv
+import json
+import subprocess
+import sys
+from pathlib import Path
+
+
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(description="Evaluate agent outputs from workspaces")
+    parser.add_argument("--workspace-root", type=Path, default=Path("experiment/tasks"))
+    parser.add_argument("--language", type=str, required=True, help="Backend language for eval_single_runner")
+    parser.add_argument("--output-filename", type=str, default="final_response.txt", help="Agent output txt filename")
+    parser.add_argument("--out-json", type=Path, default=Path("experiment/outputs/agent_summary.json"))
+    parser.add_argument("--out-csv", type=Path, default=Path("experiment/outputs/agent_summary.csv"))
+    parser.add_argument("--eval-timeout", type=int, default=300, help="Timeout per task evaluation (seconds)")
+    return parser.parse_args()
+
+
+def evaluate_single(task_dir: Path, op: str, language: str, output_filename: str, timeout_sec: int) -> dict:
+    output_path = task_dir / output_filename
+    if not output_path.exists():
+        return {
+            "task": op,
+            "status": "missing_output",
+            "compiled": False,
+            "correctness": False,
+            "performance": None,
+            "returncode": None,
+            "error": f"{output_filename} not found",
+        }
+
+    tmp_result_path = task_dir / "raw_eval_result.json"
+    cmd = [
+        sys.executable,
+        "eval_single_runner.py",
+        str(output_path),
+        op,
+        language,
+        str(tmp_result_path),
+    ]
+
+    try:
+        proc = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout_sec)
+    except subprocess.TimeoutExpired:
+        return {
+            "task": op,
+            "status": "eval_timeout",
+            "compiled": False,
+            "correctness": False,
+            "performance": None,
+            "returncode": None,
+            "error": "evaluation timeout",
+        }
+
+    if tmp_result_path.exists():
+        raw = json.loads(tmp_result_path.read_text())
+        tmp_result_path.unlink(missing_ok=True)
+    else:
+        raw = {
+            "compiled": False,
+            "correctness": False,
+            "performance": None,
+            "compile_info": "raw eval result missing",
+        }
+
+    compiled = bool(raw.get("compiled", False))
+    correctness = bool(raw.get("correctness", False))
+
+    if not compiled:
+        status = "compile_error"
+    elif compiled and not correctness:
+        status = "wrong_output"
+    else:
+        status = "passed"
+
+    record = {
+        "task": op,
+        "status": status,
+        "compiled": compiled,
+        "correctness": correctness,
"performance": raw.get("performance"), + "returncode": proc.returncode, + "stdout_tail": (proc.stdout or "")[-2000:], + "stderr_tail": (proc.stderr or "")[-2000:], + "raw": raw, + "error": None, + } + + eval_result_path = task_dir / "eval_result.json" + eval_result_path.write_text(json.dumps(record, indent=2, ensure_ascii=False), encoding="utf-8") + return record + + +def main() -> None: + args = parse_args() + task_dirs = sorted([d for d in args.workspace_root.iterdir() if d.is_dir()], key=lambda p: p.name) + + rows = [] + for task_dir in task_dirs: + row = evaluate_single( + task_dir=task_dir, + op=task_dir.name, + language=args.language, + output_filename=args.output_filename, + timeout_sec=args.eval_timeout, + ) + perf = row.get("performance") or {} + row_csv = { + "task": row["task"], + "status": row["status"], + "compiled": row["compiled"], + "correctness": row["correctness"], + "perf_mean": perf.get("mean"), + "perf_std": perf.get("std"), + "returncode": row["returncode"], + "error": row.get("error"), + } + rows.append((row, row_csv)) + + total = len(rows) + compiled_cnt = sum(1 for r, _ in rows if r.get("compiled") is True) + correct_cnt = sum(1 for r, _ in rows if r.get("correctness") is True) + + summary = { + "workspace_root": str(args.workspace_root), + "language": args.language, + "output_filename": args.output_filename, + "total_tasks": total, + "compiled_tasks": compiled_cnt, + "correct_tasks": correct_cnt, + "compile_rate": compiled_cnt / total if total else 0.0, + "correct_rate": correct_cnt / total if total else 0.0, + "rows": [r for r, _ in rows], + } + + args.out_json.parent.mkdir(parents=True, exist_ok=True) + args.out_csv.parent.mkdir(parents=True, exist_ok=True) + args.out_json.write_text(json.dumps(summary, indent=2, ensure_ascii=False), encoding="utf-8") + + fieldnames = [ + "task", + "status", + "compiled", + "correctness", + "perf_mean", + "perf_std", + "returncode", + "error", + ] + with args.out_csv.open("w", newline="", encoding="utf-8") as f: + writer = csv.DictWriter(f, fieldnames=fieldnames) + writer.writeheader() + for _, row_csv in rows: + writer.writerow(row_csv) + + print(f"[INFO] Wrote JSON summary to {args.out_json}") + print(f"[INFO] Wrote CSV summary to {args.out_csv}") + print(f"[INFO] compile_rate={summary['compile_rate']:.3f}, correct_rate={summary['correct_rate']:.3f}") + + +if __name__ == "__main__": + main() diff --git a/scripts/agent_eval/launch_agent.py b/scripts/agent_eval/launch_agent.py new file mode 100644 index 00000000..4f19bb8e --- /dev/null +++ b/scripts/agent_eval/launch_agent.py @@ -0,0 +1,173 @@ +#!/usr/bin/env python3 +"""Launch directory-style coding agents on prepared minimal workspaces.""" + +from __future__ import annotations + +import argparse +import json +import shlex +import subprocess +import time +from pathlib import Path + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Run agent command for each workspace") + parser.add_argument("--workspace-root", type=Path, default=Path("experiment/tasks")) + parser.add_argument( + "--agent-cmd", + type=str, + required=True, + help=( + "Agent command template. Use {prompt_file}, {prompt}, {workdir}. 
" + "Example: qwen -p \"{prompt}\"" + ), + ) + parser.add_argument("--timeout", type=int, default=1200, help="Per-task timeout seconds") + parser.add_argument("--output-filename", type=str, default="final_response.txt", help="Expected agent output txt filename") + parser.add_argument("--max-tasks", type=int, default=None, help="Optional cap on number of tasks (useful for smoke tests / cost control)") + parser.add_argument("--task-filter", nargs="*", default=None, help="Optional explicit task ids") + parser.add_argument( + "--auto-yolo", + action="store_true", + help="If command is qwen and -y is missing, append -y automatically.", + ) + return parser.parse_args() + + +def load_manifest(workspace_root: Path) -> dict: + manifest_path = workspace_root / "manifest.json" + if manifest_path.exists(): + return json.loads(manifest_path.read_text()) + return {} + + +def pick_task_dirs(workspace_root: Path, task_filter: list[str] | None, max_tasks: int | None) -> list[Path]: + task_dirs = [d for d in workspace_root.iterdir() if d.is_dir()] + task_dirs.sort(key=lambda p: p.name) + if task_filter: + allow = set(task_filter) + task_dirs = [d for d in task_dirs if d.name in allow] + if max_tasks is not None: + task_dirs = task_dirs[:max_tasks] + return task_dirs + + +def maybe_append_yolo(argv: list[str], auto_yolo: bool) -> list[str]: + if not auto_yolo: + return argv + if not argv: + return argv + + exe = Path(argv[0]).name + if exe == "qwen" and "-y" not in argv: + return [*argv, "-y"] + return argv + + +def run_single_task( + task_dir: Path, + command_template: str, + timeout: int, + output_filename: str, + auto_yolo: bool, +) -> dict: + prompt_file = task_dir / "agent_prompt.txt" + prompt_text = prompt_file.read_text(encoding="utf-8") if prompt_file.exists() else "" + + command = command_template.format( + prompt_file=str(prompt_file), + prompt=prompt_text.replace('"', '\\"').replace("\n", " "), + workdir=str(task_dir), + ) + argv = maybe_append_yolo(shlex.split(command), auto_yolo) + + started_at = time.time() + status = "finished" + error = None + + try: + proc = subprocess.run( + argv, + cwd=task_dir, + capture_output=True, + text=True, + timeout=timeout, + ) + returncode = proc.returncode + stdout_tail = (proc.stdout or "")[-2000:] + stderr_tail = (proc.stderr or "")[-2000:] + except subprocess.TimeoutExpired as e: + status = "timeout" + returncode = None + stdout_tail = ((e.stdout or "") if isinstance(e.stdout, str) else "")[-2000:] + stderr_tail = ((e.stderr or "") if isinstance(e.stderr, str) else "")[-2000:] + proc = None + except Exception as e: # noqa: BLE001 + status = "launch_error" + returncode = None + stdout_tail = "" + stderr_tail = "" + error = str(e) + proc = None + + output_path = task_dir / output_filename + output_exists = output_path.exists() + + + if "requires user approval" in stderr_tail and "-y" in stderr_tail and not auto_yolo: + status = "needs_yolo" + + finished_at = time.time() + return { + "task": task_dir.name, + "status": status, + "returncode": returncode, + "duration_sec": round(finished_at - started_at, 3), + "command": " ".join(shlex.quote(x) for x in argv), + "output_filename": output_filename, + "output_exists": output_exists, + "stdout_tail": stdout_tail, + "stderr_tail": stderr_tail, + "error": error, + } + + +def main() -> None: + args = parse_args() + manifest = load_manifest(args.workspace_root) + task_dirs = pick_task_dirs(args.workspace_root, args.task_filter, args.max_tasks) + + if not task_dirs: + raise RuntimeError(f"No tasks found under 
{args.workspace_root}") + + run_records = [] + for task_dir in task_dirs: + print(f"[INFO] Running agent on {task_dir.name}") + record = run_single_task( + task_dir=task_dir, + command_template=args.agent_cmd, + timeout=args.timeout, + output_filename=args.output_filename, + auto_yolo=args.auto_yolo, + ) + run_records.append(record) + + run_record_path = task_dir / "agent_run.json" + run_record_path.write_text(json.dumps(record, indent=2, ensure_ascii=False), encoding="utf-8") + + summary = { + "workspace_root": str(args.workspace_root), + "manifest": manifest, + "output_filename": args.output_filename, + "auto_yolo": args.auto_yolo, + "num_tasks": len(task_dirs), + "runs": run_records, + } + out_path = args.workspace_root / "agent_launch_summary.json" + out_path.write_text(json.dumps(summary, indent=2, ensure_ascii=False), encoding="utf-8") + print(f"[INFO] Agent launch summary: {out_path}") + + +if __name__ == "__main__": + main() diff --git a/scripts/agent_eval/prepare_workspaces.py b/scripts/agent_eval/prepare_workspaces.py new file mode 100644 index 00000000..62774ab3 --- /dev/null +++ b/scripts/agent_eval/prepare_workspaces.py @@ -0,0 +1,118 @@ +#!/usr/bin/env python3 +"""Prepare minimal per-op workspaces for directory-style agent generation.""" + +from __future__ import annotations + +import argparse +import importlib +import json +import os +import shutil +import sys +from pathlib import Path + +REPO_ROOT = Path(__file__).resolve().parents[2] +if str(REPO_ROOT) not in sys.path: + sys.path.insert(0, str(REPO_ROOT)) + +from dataset import dataset +from prompt_generators.prompt_registry import PROMPT_REGISTRY +from utils.utils import get_ref_src_path + +OUTPUT_REQUIREMENT_SUFFIX = ''' + +# Additional execution constraints for directory workflow +- You are running in an isolated task directory. +- Do NOT modify any file except writing your final answer. +- Write your final answer to `final_response.txt` in this directory. +- Keep the same output style required above (code-only if requested, no extra commentary if requested). +- Do not self-evaluate or access other directories. +''' + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Prepare isolated workspaces for agent generation") + parser.add_argument("--language", type=str, default="cuda", help="Prompt language/backend (must exist in prompt registry)") + parser.add_argument("--strategy", type=str, default="add_shot", help="Prompt strategy (e.g. 
add_shot / selected_shot)") + parser.add_argument("--categories", nargs="+", default=["activation"], help="Dataset categories or all") + parser.add_argument("--ops", nargs="*", default=None, help="Optional explicit op list") + parser.add_argument("--workspace-root", type=Path, default=Path("experiment/tasks"), help="Workspace root") + parser.add_argument("--clean", action="store_true", help="Delete workspace root before prepare") + parser.add_argument("--readonly", action="store_true", help="Mark reference.py and agent_prompt.txt as read-only") + return parser.parse_args() + + +def pick_ops(categories: list[str], explicit_ops: list[str] | None) -> list[str]: + if explicit_ops: + unknown = [op for op in explicit_ops if op not in dataset] + if unknown: + raise ValueError(f"Unknown ops: {unknown}") + return explicit_ops + + ops = list(dataset.keys()) + if categories != ["all"]: + ops = [op for op in ops if dataset[op]["category"] in categories] + return ops + + +def generate_prompt(language: str, strategy_name: str, op: str) -> str: + if language not in PROMPT_REGISTRY or strategy_name not in PROMPT_REGISTRY[language]: + try: + importlib.import_module(f"prompt_generators.{language}_{strategy_name}") + except ImportError as e: + raise ValueError( + f"Unsupported prompt config: language={language}, strategy={strategy_name}" + ) from e + + strategy = PROMPT_REGISTRY[language][strategy_name] + return strategy.generate(op) + + +def build_agent_prompt(language: str, strategy: str, op: str) -> str: + base_prompt = generate_prompt(language, strategy, op) + return base_prompt.rstrip() + "\n" + OUTPUT_REQUIREMENT_SUFFIX + + +def write_workspace(root: Path, op: str, language: str, strategy: str, readonly: bool) -> None: + task_dir = root / op + task_dir.mkdir(parents=True, exist_ok=True) + + ref_src_path = Path(get_ref_src_path(op)) + shutil.copyfile(ref_src_path, task_dir / "reference.py") + + agent_prompt = build_agent_prompt(language, strategy, op) + (task_dir / "agent_prompt.txt").write_text(agent_prompt, encoding="utf-8") + + if readonly: + os.chmod(task_dir / "reference.py", 0o444) + os.chmod(task_dir / "agent_prompt.txt", 0o444) + + +def main() -> None: + args = parse_args() + ops = pick_ops(args.categories, args.ops) + + workspace_root = args.workspace_root + if args.clean and workspace_root.exists(): + shutil.rmtree(workspace_root) + + workspace_root.mkdir(parents=True, exist_ok=True) + + for op in ops: + write_workspace(workspace_root, op, args.language, args.strategy, args.readonly) + + manifest = { + "language": args.language, + "strategy": args.strategy, + "categories": args.categories, + "ops": ops, + "workspace_root": str(workspace_root), + "expected_agent_output_file": "final_response.txt", + "notes": "Only reference.py + agent_prompt.txt are pre-created in each task directory.", + } + (workspace_root / "manifest.json").write_text(json.dumps(manifest, indent=2, ensure_ascii=False), encoding="utf-8") + print(f"[INFO] Prepared {len(ops)} minimal workspaces at {workspace_root}") + + +if __name__ == "__main__": + main()