Skip to content

Commit 1722276

Browse files
feat: update evaluation scenarios and add performance and qualitative scorers to run configuration
1 parent d4f796a commit 1722276

2 files changed

Lines changed: 58 additions & 27 deletions

File tree

evals/dataset.json

Lines changed: 45 additions & 27 deletions
Original file line number | Diff line number | Diff line change
@@ -1,55 +1,73 @@
11
{
22
"scenarios": [
33
{
4-
"id": "cloud-sql-list-instances",
5-
"starting_prompt": "Show me all the Cloud SQL instances in this project.",
6-
"conversation_plan": "Ask the agent to list the Cloud SQL instances in the current project.",
4+
"id": "cloud-sql-debug-instance",
5+
"starting_prompt": "Check on my databases in project ext-test-cloud-sql-postgres.",
6+
"conversation_plan": "Ask the agent to list all Cloud SQL instances in the project. Once all instances are listed, if 'daily-ci-evals-db' exists, get its details and validate it is RUNNABLE.",
77
"expected_trajectory": [
8-
"list_instances"
8+
"list_instances",
9+
"get_instance"
910
],
10-
"kind": "tool",
11-
"max_turns": 5
11+
"env": {
12+
"GOOGLE_CLOUD_PROJECT": "ext-test-cloud-sql-postgres"
13+
},
14+
"kind": "tools",
15+
"max_turns": 4
1216
},
1317
{
14-
"id": "cloud-sql-data-explore",
15-
"starting_prompt": "What schemas and tables do we have in this database? Please list them.",
16-
"conversation_plan": "Ask the agent to list the schemas in the database. Then ask to list the tables.",
18+
"id": "cloud-sql-schema-tables-explore",
19+
"starting_prompt": "I want to understand the structure of my database.",
20+
"conversation_plan": "First, ask the agent to list the schemas in the database. After the agent provides the schemas, ask it to list the tables specifically for the 'public' schema.",
1721
"expected_trajectory": [
1822
"list_schemas",
1923
"list_tables"
2024
],
21-
"kind": "tool",
22-
"max_turns": 5
25+
"env": {
26+
"GOOGLE_CLOUD_PROJECT": "ext-test-cloud-sql-postgres"
27+
},
28+
"kind": "tools",
29+
"max_turns": 6
2330
},
2431
{
25-
"id": "cloud-sql-perf-troubleshoot",
26-
"starting_prompt": "The database is running slow. Are there any active queries running for more than 10 seconds or any locks?",
27-
"conversation_plan": "Ask the agent to check for active queries running longer than 10 seconds. Then ask to check for locks.",
32+
"id": "cloud-sql-performance-check",
33+
"starting_prompt": "Our database performance seems degraded.",
34+
"conversation_plan": "Start by asking the agent to check for any active queries that are running for a long time (e.g., more than 10 seconds). After the agent responds, follow up by asking if there are any database locks that might be causing issues.",
2835
"expected_trajectory": [
2936
"list_active_queries",
3037
"list_locks"
3138
],
32-
"kind": "tool",
33-
"max_turns": 5
39+
"env": {
40+
"GOOGLE_CLOUD_PROJECT": "ext-test-cloud-sql-postgres"
41+
},
42+
"kind": "tools",
43+
"max_turns": 6
3444
},
3545
{
36-
"id": "cloud-sql-metrics-cpu",
37-
"starting_prompt": "Can you show me the CPU utilization for instance 'daily-ci-evals-db' in project 'ext-test-cloud-sql-postgres' for the last 5 minutes?",
38-
"conversation_plan": "Ask the agent to query the CPU utilization metric for the specified instance and project using PromQL.",
46+
"id": "cloud-sql-metrics-cpu-investigation",
47+
"starting_prompt": "I'm worried about the database load for daily-ci-evals-db.",
48+
"conversation_plan": "First, ask the agent to check the CPU utilization for the instance 'daily-ci-evals-db' for the last 5 minutes. After the agent provides the CPU data, ask it to check the overall database stats to see connection counts or transaction volume.",
3949
"expected_trajectory": [
40-
"get_system_metrics"
50+
"get_system_metrics",
51+
"list_database_stats"
4152
],
42-
"kind": "tool",
43-
"max_turns": 4
53+
"env": {
54+
"GOOGLE_CLOUD_PROJECT": "ext-test-cloud-sql-postgres"
55+
},
56+
"kind": "tools",
57+
"max_turns": 6
4458
},
4559
{
46-
"id": "cloud-sql-unused-indexes",
47-
"starting_prompt": "Are there any unused indexes in the database that we can clean up?",
48-
"conversation_plan": "Ask the agent to list unused indexes in the database.",
60+
"id": "cloud-sql-instance-not-found",
61+
"starting_prompt": "Get details for the instance 'missing-db-123'.",
62+
"conversation_plan": "The user asks for details of an instance named 'missing-db-123' that doesn't exist. The agent should try to get it, fail, and inform the user. The user will then ask to list instances to find the correct name.",
4963
"expected_trajectory": [
50-
"list_indexes"
64+
"get_instance",
65+
"list_instances"
5166
],
52-
"kind": "tool",
67+
"env": {
68+
"GOOGLE_CLOUD_PROJECT": "ext-test-cloud-sql-postgres"
69+
},
70+
"kind": "tools",
5371
"max_turns": 4
5472
}
5573
]

evals/run_config.yaml

Lines changed: 13 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -23,9 +23,22 @@ model_config: /workspace/evals/model_config.yaml
2323
simulated_user_model_config: /workspace/evals/gemini_2.5_pro_model.yaml
2424

2525
scorers:
26+
# Structural
2627
trajectory_matcher: {}
28+
29+
# Qualitative (Judge-based)
2730
goal_completion:
2831
model_config: /workspace/evals/gemini_2.5_pro_model.yaml
32+
behavioral_metrics:
33+
model_config: /workspace/evals/gemini_2.5_pro_model.yaml
34+
parameter_analysis:
35+
model_config: /workspace/evals/gemini_2.5_pro_model.yaml
36+
37+
# Performance
38+
turn_count: {}
39+
end_to_end_latency: {}
40+
tool_call_latency: {}
41+
token_consumption: {}
2942

3043
reporting:
3144
bigquery:

0 commit comments

Comments
 (0)