"""run_miniwob.py -- command-line entry point to run MiniWob benchmark experiments."""
import argparse
import logging
import os
from bgym import DEFAULT_BENCHMARKS
from dotenv import load_dotenv
from agentlab.agents.generic_agent.agent_configs import GPT5_MINI_FLAGS
from agentlab.agents.generic_agent.generic_agent import GenericAgentArgs
from agentlab.agents.react_toolcall_agent import AgentConfig, LLMArgs, ReactToolCallAgentArgs
from agentlab.backends.browser.mcp_playwright import MCPPlaywright
from agentlab.backends.browser.playwright import SyncPlaywright
from agentlab.benchmarks.miniwob import MiniWobBenchmark
from agentlab.experiments.study import make_study
from agentlab.llm.llm_configs import CHAT_MODEL_ARGS_DICT
# Log record layout: timestamp, level, logger name with line number, function, message.
fmt = "%(asctime)s - %(levelname)s - %(name)s:%(lineno)d - %(funcName)s() - %(message)s"

# force=True replaces any handlers already attached to the root logger, so
# this configuration wins even if an imported module configured logging first.
logging.basicConfig(
    level=logging.INFO,
    format=fmt,
    handlers=[logging.StreamHandler()],
    force=True,
)
logger = logging.getLogger(__name__)

# Load variables from a .env file (if any) into the process environment.
load_dotenv()
def parse_args(argv=None) -> argparse.Namespace:
    """Parse the command-line options of the MiniWob runner.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) the process command line is used, which is the
            behavior callers relied on before this parameter existed.

    Returns:
        A namespace with three string attributes:
        ``backend`` (one of "playwright", "mcp", "bgym"),
        ``agent`` (one of "tape", "generic", "react") and
        ``config`` (free-form name, default "miniwob").
    """
    parser = argparse.ArgumentParser(description="Run MiniWob benchmark experiments")
    parser.add_argument(
        "--backend",
        choices=["playwright", "mcp", "bgym"],
        default="playwright",
        help="Browser backend to use (default: playwright)",
    )
    parser.add_argument(
        "--agent",
        choices=["tape", "generic", "react"],
        default="react",
        help="Agent type to use (default: react)",
    )
    parser.add_argument(
        "--config",
        type=str,
        default="miniwob",
        help="Hydra config name to load (default: miniwob)",
    )
    # Passing None makes argparse fall back to sys.argv[1:].
    return parser.parse_args(argv)
def _build_benchmark(backend):
    """Return the MiniWob benchmark object for the given ``--backend`` value.

    Raises:
        ValueError: if ``backend`` is not "bgym", "playwright" or "mcp".
    """
    if backend == "bgym":
        return DEFAULT_BENCHMARKS["miniwob"](n_repeats=1)
    if backend == "playwright":
        return MiniWobBenchmark(backend_cls=SyncPlaywright)
    if backend == "mcp":
        return MiniWobBenchmark(backend_cls=MCPPlaywright)
    raise ValueError(f"Unknown backend: {backend}")


def _build_agent_args(agent):
    """Return the agent arguments object for the given ``--agent`` value.

    "generic" yields ``GenericAgentArgs``; every other value yields
    ``ReactToolCallAgentArgs``. A value other than "react" (e.g. "tape",
    which the argument parser accepts) is reported with a warning because
    this script has no dedicated configuration for it.
    """
    if agent == "generic":
        return GenericAgentArgs(
            chat_model_args=CHAT_MODEL_ARGS_DICT["azure/gpt-5-mini-2025-08-07"],
            flags=GPT5_MINI_FLAGS,
        )
    if agent != "react":
        # Previously this fallback was silent, so results could be attributed
        # to the wrong agent. The fallback is kept; it is now visible.
        logger.warning(
            "Agent type %r has no dedicated configuration in this script; "
            "falling back to the react agent.",
            agent,
        )
    return ReactToolCallAgentArgs(
        llm_args=LLMArgs(
            model_name="azure/gpt-5-mini", temperature=1.0, max_total_tokens=128000
        ),
        config=AgentConfig(),
    )


def main():
    """Build the benchmark and agent from the command line and run the study."""
    args = parse_args()
    # NOTE(review): args.config is parsed but not read anywhere in this
    # script -- confirm whether it should select a configuration.
    benchmark = _build_benchmark(args.backend)
    agent_args = _build_agent_args(args.agent)

    study = make_study(
        benchmark=benchmark,
        agent_args=agent_args,
        logging_level=logging.INFO,
        logging_level_stdout=logging.INFO,
    )

    if os.environ.get("AGENTLAB_DEBUG"):
        # Debug mode: keep only entries 23..26 of the experiment list and run
        # them one after another in this process.
        # NOTE(review): the reason for this particular slice is not visible here.
        study.exp_args_list = study.exp_args_list[23:27]
        study.run(n_jobs=1, n_relaunch=1, parallel_backend="sequential")
    else:
        study.run(n_jobs=8, n_relaunch=1, parallel_backend="ray")


if __name__ == "__main__":
    main()