diff --git a/README.md b/README.md
index 988eaac2..d5abbcd5 100644
--- a/README.md
+++ b/README.md
@@ -98,3 +98,24 @@ To integrate a new backend, follow these steps:
 This project uses code from [KernelBench](https://github.com/ScalingIntelligence/KernelBench), licensed under the MIT License.
+
+
+## Agent Evaluation (Directory Workflow)
+
+Use the minimal directory workflow in `scripts/agent_eval/`:
+- each task directory initially contains only `reference.py` + `agent_prompt.txt`
+- the agent writes its final output to `final_response.txt` (MultiKernelBench-compatible text format)
+- external scripts handle launching and evaluation
+
+```bash
+# 1) Prepare isolated tasks
+python scripts/agent_eval/prepare_workspaces.py --language cuda --strategy add_shot --categories activation --workspace-root experiment/tasks --clean --readonly
+
+# 2) Launch agent CLI (example)
+python scripts/agent_eval/launch_agent.py --workspace-root experiment/tasks --agent-cmd 'qwen -p "{prompt}"' --timeout 1200 --output-filename final_response.txt --auto-yolo --max-tasks 10
+
+# 3) Externally evaluate generated txt outputs
+python scripts/agent_eval/collect_results.py --workspace-root experiment/tasks --language cuda --output-filename final_response.txt --out-json experiment/outputs/agent_summary.json --out-csv experiment/outputs/agent_summary.csv
+```
+
+See `scripts/agent_eval/README.md` for details.
diff --git a/scripts/agent_eval/README.md b/scripts/agent_eval/README.md
new file mode 100644
index 00000000..ba832bfe
--- /dev/null
+++ b/scripts/agent_eval/README.md
@@ -0,0 +1,72 @@
+# Agent Evaluation Workflow (Minimal Directory Mode)
+
+This workflow is the **minimal agent setting**:
+- each task directory only pre-creates `reference.py` and `agent_prompt.txt`
+- no in-directory self-evaluation scripts are provided to the agent
+- the agent must write a single output text file: `final_response.txt`
+- an external controller evaluates those outputs with the MultiKernelBench evaluator
+
+## 1) Prepare minimal workspaces
+
+```bash
+python scripts/agent_eval/prepare_workspaces.py \
+    --language cuda \
+    --strategy add_shot \
+    --categories activation \
+    --workspace-root experiment/tasks \
+    --clean \
+    --readonly
+```
+
+After preparation, each task folder initially contains only:
+- `reference.py`
+- `agent_prompt.txt`
+
+`agent_prompt.txt` is generated dynamically from `prompt_generators/{language}_{strategy}.py` (aligned with the existing benchmark prompts); only the directory-output requirement (write `final_response.txt`) is appended to it.
+
+## 2) Launch your agent
+
+`launch_agent.py` supports placeholders in `--agent-cmd`:
+- `{prompt_file}`
+- `{prompt}`
+- `{workdir}`
+
+Example:
+
+```bash
+python scripts/agent_eval/launch_agent.py \
+    --workspace-root experiment/tasks \
+    --agent-cmd 'qwen -p "{prompt}"' \
+    --timeout 1200 \
+    --output-filename final_response.txt \
+    --auto-yolo \
+    --max-tasks 10
+```
+
+Expected: the agent writes `final_response.txt` in each task directory.
+
+`--max-tasks` is mainly for quick smoke runs and cost control: it runs only the first N task directories (sorted by task name), so you can debug the workflow on a small subset before a full-scale run.
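+
+The same placeholders also work with agent CLIs that read the prompt from a file instead of taking it as an argument. A hypothetical variant (the `my-agent` binary and its flags are placeholders, not a real tool):
+
+```bash
+python scripts/agent_eval/launch_agent.py \
+    --workspace-root experiment/tasks \
+    --agent-cmd 'my-agent --prompt-file {prompt_file} --workdir {workdir}' \
+    --timeout 1200 \
+    --output-filename final_response.txt
+```
+
+Passing `{prompt_file}` avoids the quote escaping and newline flattening that `launch_agent.py` applies when substituting `{prompt}`, so it is usually the safer choice for long prompts.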
+
+## 3) Evaluate agent outputs externally
+
+```bash
+python scripts/agent_eval/collect_results.py \
+    --workspace-root experiment/tasks \
+    --language cuda \
+    --output-filename final_response.txt \
+    --out-json experiment/outputs/agent_summary.json \
+    --out-csv experiment/outputs/agent_summary.csv
+```
+
+This calls `eval_single_runner.py` for each task output and writes:
+- per-task `eval_result.json`
+- an aggregated JSON/CSV summary
+
+## Notes
+
+- Keep agent output compatible with the MultiKernelBench model output style (plain response text or a fenced code block).
+- If the evaluation environment lacks the required hardware or runtime (e.g., no CUDA), records may show compile/runtime failures that stem from the environment rather than from the generated code.
+- If you use the Qwen CLI in non-interactive mode, `--auto-yolo` is recommended (it appends `-y` automatically when missing).
+
diff --git a/scripts/agent_eval/collect_results.py b/scripts/agent_eval/collect_results.py
new file mode 100644
index 00000000..dd7c1a0a
--- /dev/null
+++ b/scripts/agent_eval/collect_results.py
@@ -0,0 +1,168 @@
+#!/usr/bin/env python3
+"""Collect agent outputs and run external MultiKernelBench evaluation."""
+
+from __future__ import annotations
+
+import argparse
+import csv
+import json
+import subprocess
+import sys
+from pathlib import Path
+
+
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(description="Evaluate agent outputs from workspaces")
+    parser.add_argument("--workspace-root", type=Path, default=Path("experiment/tasks"))
+    parser.add_argument("--language", type=str, required=True, help="Backend language for eval_single_runner")
+    parser.add_argument("--output-filename", type=str, default="final_response.txt", help="Agent output txt filename")
+    parser.add_argument("--out-json", type=Path, default=Path("experiment/outputs/agent_summary.json"))
+    parser.add_argument("--out-csv", type=Path, default=Path("experiment/outputs/agent_summary.csv"))
+    parser.add_argument("--eval-timeout", type=int, default=300, help="Timeout per task evaluation (seconds)")
+    return parser.parse_args()
+
+
+def evaluate_single(task_dir: Path, op: str, language: str, output_filename: str, timeout_sec: int) -> dict:
+    output_path = task_dir / output_filename
+    if not output_path.exists():
+        return {
+            "task": op,
+            "status": "missing_output",
+            "compiled": False,
+            "correctness": False,
+            "performance": None,
+            "returncode": None,
+            "error": f"{output_filename} not found",
+        }
+
+    tmp_result_path = task_dir / "raw_eval_result.json"
+    cmd = [
+        sys.executable,
+        "eval_single_runner.py",
+        str(output_path),
+        op,
+        language,
+        str(tmp_result_path),
+    ]
+
+    try:
+        proc = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout_sec)
+    except subprocess.TimeoutExpired:
+        return {
+            "task": op,
+            "status": "eval_timeout",
+            "compiled": False,
+            "correctness": False,
+            "performance": None,
+            "returncode": None,
+            "error": "evaluation timeout",
+        }
+
+    if tmp_result_path.exists():
+        raw = json.loads(tmp_result_path.read_text())
+        tmp_result_path.unlink(missing_ok=True)
+    else:
+        raw = {
+            "compiled": False,
+            "correctness": False,
+            "performance": None,
+            "compile_info": "raw eval result missing",
+        }
+
+    compiled = bool(raw.get("compiled", False))
+    correctness = bool(raw.get("correctness", False))
+
+    if not compiled:
+        status = "compile_error"
+    elif compiled and not correctness:
+        status = "wrong_output"
+    else:
+        status = "passed"
+
+    record = {
+        "task": op,
+        "status": status,
+        "compiled": compiled,
+        "correctness": correctness,
"performance": raw.get("performance"), + "returncode": proc.returncode, + "stdout_tail": (proc.stdout or "")[-2000:], + "stderr_tail": (proc.stderr or "")[-2000:], + "raw": raw, + "error": None, + } + + eval_result_path = task_dir / "eval_result.json" + eval_result_path.write_text(json.dumps(record, indent=2, ensure_ascii=False), encoding="utf-8") + return record + + +def main() -> None: + args = parse_args() + task_dirs = sorted([d for d in args.workspace_root.iterdir() if d.is_dir()], key=lambda p: p.name) + + rows = [] + for task_dir in task_dirs: + row = evaluate_single( + task_dir=task_dir, + op=task_dir.name, + language=args.language, + output_filename=args.output_filename, + timeout_sec=args.eval_timeout, + ) + perf = row.get("performance") or {} + row_csv = { + "task": row["task"], + "status": row["status"], + "compiled": row["compiled"], + "correctness": row["correctness"], + "perf_mean": perf.get("mean"), + "perf_std": perf.get("std"), + "returncode": row["returncode"], + "error": row.get("error"), + } + rows.append((row, row_csv)) + + total = len(rows) + compiled_cnt = sum(1 for r, _ in rows if r.get("compiled") is True) + correct_cnt = sum(1 for r, _ in rows if r.get("correctness") is True) + + summary = { + "workspace_root": str(args.workspace_root), + "language": args.language, + "output_filename": args.output_filename, + "total_tasks": total, + "compiled_tasks": compiled_cnt, + "correct_tasks": correct_cnt, + "compile_rate": compiled_cnt / total if total else 0.0, + "correct_rate": correct_cnt / total if total else 0.0, + "rows": [r for r, _ in rows], + } + + args.out_json.parent.mkdir(parents=True, exist_ok=True) + args.out_csv.parent.mkdir(parents=True, exist_ok=True) + args.out_json.write_text(json.dumps(summary, indent=2, ensure_ascii=False), encoding="utf-8") + + fieldnames = [ + "task", + "status", + "compiled", + "correctness", + "perf_mean", + "perf_std", + "returncode", + "error", + ] + with args.out_csv.open("w", newline="", encoding="utf-8") as f: + writer = csv.DictWriter(f, fieldnames=fieldnames) + writer.writeheader() + for _, row_csv in rows: + writer.writerow(row_csv) + + print(f"[INFO] Wrote JSON summary to {args.out_json}") + print(f"[INFO] Wrote CSV summary to {args.out_csv}") + print(f"[INFO] compile_rate={summary['compile_rate']:.3f}, correct_rate={summary['correct_rate']:.3f}") + + +if __name__ == "__main__": + main() diff --git a/scripts/agent_eval/launch_agent.py b/scripts/agent_eval/launch_agent.py new file mode 100644 index 00000000..4f19bb8e --- /dev/null +++ b/scripts/agent_eval/launch_agent.py @@ -0,0 +1,173 @@ +#!/usr/bin/env python3 +"""Launch directory-style coding agents on prepared minimal workspaces.""" + +from __future__ import annotations + +import argparse +import json +import shlex +import subprocess +import time +from pathlib import Path + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Run agent command for each workspace") + parser.add_argument("--workspace-root", type=Path, default=Path("experiment/tasks")) + parser.add_argument( + "--agent-cmd", + type=str, + required=True, + help=( + "Agent command template. Use {prompt_file}, {prompt}, {workdir}. 
" + "Example: qwen -p \"{prompt}\"" + ), + ) + parser.add_argument("--timeout", type=int, default=1200, help="Per-task timeout seconds") + parser.add_argument("--output-filename", type=str, default="final_response.txt", help="Expected agent output txt filename") + parser.add_argument("--max-tasks", type=int, default=None, help="Optional cap on number of tasks (useful for smoke tests / cost control)") + parser.add_argument("--task-filter", nargs="*", default=None, help="Optional explicit task ids") + parser.add_argument( + "--auto-yolo", + action="store_true", + help="If command is qwen and -y is missing, append -y automatically.", + ) + return parser.parse_args() + + +def load_manifest(workspace_root: Path) -> dict: + manifest_path = workspace_root / "manifest.json" + if manifest_path.exists(): + return json.loads(manifest_path.read_text()) + return {} + + +def pick_task_dirs(workspace_root: Path, task_filter: list[str] | None, max_tasks: int | None) -> list[Path]: + task_dirs = [d for d in workspace_root.iterdir() if d.is_dir()] + task_dirs.sort(key=lambda p: p.name) + if task_filter: + allow = set(task_filter) + task_dirs = [d for d in task_dirs if d.name in allow] + if max_tasks is not None: + task_dirs = task_dirs[:max_tasks] + return task_dirs + + +def maybe_append_yolo(argv: list[str], auto_yolo: bool) -> list[str]: + if not auto_yolo: + return argv + if not argv: + return argv + + exe = Path(argv[0]).name + if exe == "qwen" and "-y" not in argv: + return [*argv, "-y"] + return argv + + +def run_single_task( + task_dir: Path, + command_template: str, + timeout: int, + output_filename: str, + auto_yolo: bool, +) -> dict: + prompt_file = task_dir / "agent_prompt.txt" + prompt_text = prompt_file.read_text(encoding="utf-8") if prompt_file.exists() else "" + + command = command_template.format( + prompt_file=str(prompt_file), + prompt=prompt_text.replace('"', '\\"').replace("\n", " "), + workdir=str(task_dir), + ) + argv = maybe_append_yolo(shlex.split(command), auto_yolo) + + started_at = time.time() + status = "finished" + error = None + + try: + proc = subprocess.run( + argv, + cwd=task_dir, + capture_output=True, + text=True, + timeout=timeout, + ) + returncode = proc.returncode + stdout_tail = (proc.stdout or "")[-2000:] + stderr_tail = (proc.stderr or "")[-2000:] + except subprocess.TimeoutExpired as e: + status = "timeout" + returncode = None + stdout_tail = ((e.stdout or "") if isinstance(e.stdout, str) else "")[-2000:] + stderr_tail = ((e.stderr or "") if isinstance(e.stderr, str) else "")[-2000:] + proc = None + except Exception as e: # noqa: BLE001 + status = "launch_error" + returncode = None + stdout_tail = "" + stderr_tail = "" + error = str(e) + proc = None + + output_path = task_dir / output_filename + output_exists = output_path.exists() + + + if "requires user approval" in stderr_tail and "-y" in stderr_tail and not auto_yolo: + status = "needs_yolo" + + finished_at = time.time() + return { + "task": task_dir.name, + "status": status, + "returncode": returncode, + "duration_sec": round(finished_at - started_at, 3), + "command": " ".join(shlex.quote(x) for x in argv), + "output_filename": output_filename, + "output_exists": output_exists, + "stdout_tail": stdout_tail, + "stderr_tail": stderr_tail, + "error": error, + } + + +def main() -> None: + args = parse_args() + manifest = load_manifest(args.workspace_root) + task_dirs = pick_task_dirs(args.workspace_root, args.task_filter, args.max_tasks) + + if not task_dirs: + raise RuntimeError(f"No tasks found under 
{args.workspace_root}") + + run_records = [] + for task_dir in task_dirs: + print(f"[INFO] Running agent on {task_dir.name}") + record = run_single_task( + task_dir=task_dir, + command_template=args.agent_cmd, + timeout=args.timeout, + output_filename=args.output_filename, + auto_yolo=args.auto_yolo, + ) + run_records.append(record) + + run_record_path = task_dir / "agent_run.json" + run_record_path.write_text(json.dumps(record, indent=2, ensure_ascii=False), encoding="utf-8") + + summary = { + "workspace_root": str(args.workspace_root), + "manifest": manifest, + "output_filename": args.output_filename, + "auto_yolo": args.auto_yolo, + "num_tasks": len(task_dirs), + "runs": run_records, + } + out_path = args.workspace_root / "agent_launch_summary.json" + out_path.write_text(json.dumps(summary, indent=2, ensure_ascii=False), encoding="utf-8") + print(f"[INFO] Agent launch summary: {out_path}") + + +if __name__ == "__main__": + main() diff --git a/scripts/agent_eval/prepare_workspaces.py b/scripts/agent_eval/prepare_workspaces.py new file mode 100644 index 00000000..62774ab3 --- /dev/null +++ b/scripts/agent_eval/prepare_workspaces.py @@ -0,0 +1,118 @@ +#!/usr/bin/env python3 +"""Prepare minimal per-op workspaces for directory-style agent generation.""" + +from __future__ import annotations + +import argparse +import importlib +import json +import os +import shutil +import sys +from pathlib import Path + +REPO_ROOT = Path(__file__).resolve().parents[2] +if str(REPO_ROOT) not in sys.path: + sys.path.insert(0, str(REPO_ROOT)) + +from dataset import dataset +from prompt_generators.prompt_registry import PROMPT_REGISTRY +from utils.utils import get_ref_src_path + +OUTPUT_REQUIREMENT_SUFFIX = ''' + +# Additional execution constraints for directory workflow +- You are running in an isolated task directory. +- Do NOT modify any file except writing your final answer. +- Write your final answer to `final_response.txt` in this directory. +- Keep the same output style required above (code-only if requested, no extra commentary if requested). +- Do not self-evaluate or access other directories. +''' + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Prepare isolated workspaces for agent generation") + parser.add_argument("--language", type=str, default="cuda", help="Prompt language/backend (must exist in prompt registry)") + parser.add_argument("--strategy", type=str, default="add_shot", help="Prompt strategy (e.g. 
add_shot / selected_shot)") + parser.add_argument("--categories", nargs="+", default=["activation"], help="Dataset categories or all") + parser.add_argument("--ops", nargs="*", default=None, help="Optional explicit op list") + parser.add_argument("--workspace-root", type=Path, default=Path("experiment/tasks"), help="Workspace root") + parser.add_argument("--clean", action="store_true", help="Delete workspace root before prepare") + parser.add_argument("--readonly", action="store_true", help="Mark reference.py and agent_prompt.txt as read-only") + return parser.parse_args() + + +def pick_ops(categories: list[str], explicit_ops: list[str] | None) -> list[str]: + if explicit_ops: + unknown = [op for op in explicit_ops if op not in dataset] + if unknown: + raise ValueError(f"Unknown ops: {unknown}") + return explicit_ops + + ops = list(dataset.keys()) + if categories != ["all"]: + ops = [op for op in ops if dataset[op]["category"] in categories] + return ops + + +def generate_prompt(language: str, strategy_name: str, op: str) -> str: + if language not in PROMPT_REGISTRY or strategy_name not in PROMPT_REGISTRY[language]: + try: + importlib.import_module(f"prompt_generators.{language}_{strategy_name}") + except ImportError as e: + raise ValueError( + f"Unsupported prompt config: language={language}, strategy={strategy_name}" + ) from e + + strategy = PROMPT_REGISTRY[language][strategy_name] + return strategy.generate(op) + + +def build_agent_prompt(language: str, strategy: str, op: str) -> str: + base_prompt = generate_prompt(language, strategy, op) + return base_prompt.rstrip() + "\n" + OUTPUT_REQUIREMENT_SUFFIX + + +def write_workspace(root: Path, op: str, language: str, strategy: str, readonly: bool) -> None: + task_dir = root / op + task_dir.mkdir(parents=True, exist_ok=True) + + ref_src_path = Path(get_ref_src_path(op)) + shutil.copyfile(ref_src_path, task_dir / "reference.py") + + agent_prompt = build_agent_prompt(language, strategy, op) + (task_dir / "agent_prompt.txt").write_text(agent_prompt, encoding="utf-8") + + if readonly: + os.chmod(task_dir / "reference.py", 0o444) + os.chmod(task_dir / "agent_prompt.txt", 0o444) + + +def main() -> None: + args = parse_args() + ops = pick_ops(args.categories, args.ops) + + workspace_root = args.workspace_root + if args.clean and workspace_root.exists(): + shutil.rmtree(workspace_root) + + workspace_root.mkdir(parents=True, exist_ok=True) + + for op in ops: + write_workspace(workspace_root, op, args.language, args.strategy, args.readonly) + + manifest = { + "language": args.language, + "strategy": args.strategy, + "categories": args.categories, + "ops": ops, + "workspace_root": str(workspace_root), + "expected_agent_output_file": "final_response.txt", + "notes": "Only reference.py + agent_prompt.txt are pre-created in each task directory.", + } + (workspace_root / "manifest.json").write_text(json.dumps(manifest, indent=2, ensure_ascii=False), encoding="utf-8") + print(f"[INFO] Prepared {len(ops)} minimal workspaces at {workspace_root}") + + +if __name__ == "__main__": + main()