Skip to content

Commit 16862db

Browse files
ci: daily Evals CI for Extensions/Skills on github using Evalbench (#152)
1 parent e3f0d60 commit 16862db

6 files changed

Lines changed: 227 additions & 0 deletions

File tree

cloudbuild.yaml

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
# Copyright 2025 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
options:
16+
logging: CLOUD_LOGGING_ONLY
17+
18+
steps:
19+
20+
# --- Evaluation Step ---
21+
- name: 'us-central1-docker.pkg.dev/cloud-db-nl2sql/evalbench/eval_server:latest'
22+
entrypoint: 'bash'
23+
# Exposes the Secret Manager secret declared under availableSecrets to this step as the DB_PASSWORD environment variable
24+
secretEnv: ['DB_PASSWORD']
25+
args:
26+
- '-c'
27+
- |
28+
# Abort on the first failing command, on use of an unset variable, and on a
# failure anywhere in a pipeline.
# NOTE: Cloud Build applies its substitutions to this whole script, comments
# included, so keep dollar signs out of comments.
set -euo pipefail

# Workaround for evalbench bug: settings are only applied if path basename matches extension ID.
# -sfn replaces an already-existing link instead of failing the step.
ln -sfn /workspace /workspace/cloud-sql-postgresql

# The relative PYTHONPATH entries and the evalbench.py path used below are
# resolved from this directory, so change into it once, up front.
cd /evalbench

# Values on the right-hand side are filled in by Cloud Build substitutions
# before bash runs; they are quoted so a value containing spaces stays intact.
export EVAL_GCP_PROJECT_ID="$PROJECT_ID"
export EVAL_GCP_PROJECT_REGION=us-central1
export GOOGLE_CLOUD_PROJECT="$PROJECT_ID"
export CLOUD_SQL_POSTGRES_PROJECT="$PROJECT_ID"
export CLOUD_SQL_POSTGRES_INSTANCE="$_CLOUD_SQL_INSTANCE"
export CLOUD_SQL_POSTGRES_REGION="$_CLOUD_SQL_REGION"
export CLOUD_SQL_POSTGRES_DATABASE="$_CLOUD_SQL_DATABASE"
export CLOUD_SQL_POSTGRES_USER="$_CLOUD_SQL_USER"
export CLOUD_SQL_POSTGRES_IP_TYPE="$_CLOUD_SQL_IP_TYPE"

# Maps the DB_PASSWORD secret to the exact variable expected by gemini_cli and extension skills.
# The doubled dollar sign makes Cloud Build leave the reference for bash to expand at run time.
export CLOUD_SQL_POSTGRES_PASSWORD="$$DB_PASSWORD"

# Substitute environment variables in model_config.yaml
python3 /workspace/evals/substitute_env.py

export PYTHONPATH=./evalbench:./evalbench/evalproto
export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python

echo "Launching Standalone Evaluation..."
python3 evalbench/evalbench.py --experiment_config=/workspace/evals/run_config.yaml
55+
56+
57+
availableSecrets:
58+
secretManager:
59+
- versionName: projects/$PROJECT_ID/secrets/daily-ci-evals-db-password/versions/latest
60+
env: 'DB_PASSWORD'

evals/dataset.json

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
{
2+
"scenarios": [
3+
{
4+
"id": "cloud-sql-debug-instance",
5+
"starting_prompt": "Check on my databases in project ext-test-cloud-sql-postgres.",
6+
"conversation_plan": "Ask the agent to list all Cloud SQL instances in the project. Once all instances are listed, if 'daily-ci-evals-db' exists, get its details and validate it is RUNNABLE.",
7+
"expected_trajectory": [
8+
"list_instances",
9+
"get_instance"
10+
],
11+
"env": {
12+
"GOOGLE_CLOUD_PROJECT": "ext-test-cloud-sql-postgres"
13+
},
14+
"kind": "tools",
15+
"max_turns": 3
16+
},
17+
{
18+
"id": "cloud-sql-schema-tables-explore",
19+
"starting_prompt": "I want to understand the structure of my database.",
20+
"conversation_plan": "First, ask the agent to list the schemas in the database. After the agent provides the schemas, ask it to list the tables specifically for the 'public' schema.",
21+
"expected_trajectory": [
22+
"list_schemas",
23+
"list_tables"
24+
],
25+
"env": {
26+
"GOOGLE_CLOUD_PROJECT": "ext-test-cloud-sql-postgres"
27+
},
28+
"kind": "tools",
29+
"max_turns": 3
30+
},
31+
{
32+
"id": "cloud-sql-performance-check",
33+
"starting_prompt": "Our database performance seems degraded.",
34+
"conversation_plan": "Start by asking the agent to check for any active queries that are running for a long time (e.g., more than 10 seconds). After the agent responds, follow up by asking if there are any database locks that might be causing issues.",
35+
"expected_trajectory": [
36+
"list_active_queries",
37+
"list_locks"
38+
],
39+
"env": {
40+
"GOOGLE_CLOUD_PROJECT": "ext-test-cloud-sql-postgres"
41+
},
42+
"kind": "tools",
43+
"max_turns": 3
44+
},
45+
{
46+
"id": "cloud-sql-metrics-cpu-investigation",
47+
"starting_prompt": "I'm worried about the database load for daily-ci-evals-db.",
48+
"conversation_plan": "First, ask the agent to check the CPU utilization for the instance 'daily-ci-evals-db' for the last 5 minutes. After the agent provides the CPU data, ask it to check the overall database stats to see connection counts or transaction volume.",
49+
"expected_trajectory": [
50+
"get_system_metrics",
51+
"list_database_stats"
52+
],
53+
"env": {
54+
"GOOGLE_CLOUD_PROJECT": "ext-test-cloud-sql-postgres"
55+
},
56+
"kind": "tools",
57+
"max_turns": 3
58+
}
59+
]
60+
}

evals/gemini_2.5_pro_model.yaml

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
# Copyright 2025 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
generator: gcp_vertex_gemini
16+
vertex_model: gemini-2.5-pro
17+
base_prompt: ""
18+
execs_per_minute: 5

evals/model_config.yaml

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
# Copyright 2025 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
gemini_cli_version: "@google/gemini-cli@0.38.1"
16+
generator: gemini_cli
17+
env:
18+
GOOGLE_CLOUD_PROJECT: "${GOOGLE_CLOUD_PROJECT}"
19+
GOOGLE_CLOUD_LOCATION: "global"
20+
GOOGLE_GENAI_USE_VERTEXAI: "true"
21+
setup:
22+
extensions:
23+
# Points to the symlink created in cloudbuild.yaml to match the extension ID
24+
"/workspace/cloud-sql-postgresql":
25+
settings:
26+
CLOUD_SQL_POSTGRES_PROJECT: "${CLOUD_SQL_POSTGRES_PROJECT}"
27+
CLOUD_SQL_POSTGRES_INSTANCE: "${CLOUD_SQL_POSTGRES_INSTANCE}"
28+
CLOUD_SQL_POSTGRES_REGION: "${CLOUD_SQL_POSTGRES_REGION}"
29+
CLOUD_SQL_POSTGRES_DATABASE: "${CLOUD_SQL_POSTGRES_DATABASE}"
30+
CLOUD_SQL_POSTGRES_USER: "${CLOUD_SQL_POSTGRES_USER}"
31+
CLOUD_SQL_POSTGRES_PASSWORD: '${CLOUD_SQL_POSTGRES_PASSWORD}'
32+
CLOUD_SQL_POSTGRES_IP_TYPE: "${CLOUD_SQL_POSTGRES_IP_TYPE}"

evals/run_config.yaml

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
# Copyright 2025 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
extension_id: cloud-sql-postgresql
16+
17+
dataset_config: /workspace/evals/dataset.json
18+
dataset_format: gemini-cli-format
19+
20+
orchestrator: geminicli
21+
model_config: /workspace/evals/model_config.yaml
22+
# You can reference default simulated user models provided by the evalbench repo:
23+
simulated_user_model_config: /workspace/evals/gemini_2.5_pro_model.yaml
24+
25+
scorers:
26+
# Qualitative (Judge-based)
27+
goal_completion:
28+
model_config: /workspace/evals/gemini_2.5_pro_model.yaml
29+
behavioral_metrics:
30+
model_config: /workspace/evals/gemini_2.5_pro_model.yaml
31+
32+
# Performance
33+
turn_count: {}
34+
end_to_end_latency: {}
35+
tool_call_latency: {}
36+
token_consumption: {}
37+
38+
reporting:
39+
bigquery:
40+
gcp_project_id: cloud-db-nl2sql

evals/substitute_env.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
import os
2+
import re
3+
4+
def main(yaml_path='/workspace/evals/model_config.yaml'):
    """Replace ``${NAME}`` placeholders in a YAML file with environment values.

    The file is read, every ``${NAME}`` occurrence whose ``NAME`` is present
    in ``os.environ`` is replaced by that variable's value, and the result is
    written back to the same path. Placeholders without a matching variable
    are left untouched and their names are reported on stdout.

    NOTE: values are inserted verbatim, with no YAML escaping. A value that
    contains the quote character surrounding its placeholder will produce
    invalid YAML.

    Args:
        yaml_path: Path of the file to rewrite in place. Defaults to the
            model config location used by the CI build.
    """
    # Guard clause: a missing file is reported, not treated as an error.
    if not os.path.exists(yaml_path):
        print(f"File not found: {yaml_path}")
        return

    with open(yaml_path, 'r', encoding='utf-8') as f:
        content = f.read()

    unresolved = []

    def _resolve(match):
        # Keep the placeholder text as-is when the variable is not set, and
        # remember its name so the caller can see what was skipped.
        name = match.group(1)
        if name in os.environ:
            return os.environ[name]
        unresolved.append(name)
        return match.group(0)

    content = re.sub(r'\$\{(\w+)\}', _resolve, content)

    with open(yaml_path, 'w', encoding='utf-8') as f:
        f.write(content)

    if unresolved:
        names = ', '.join(sorted(set(unresolved)))
        print(f"Warning: no environment value for: {names}")
    print(f"Successfully substituted environment variables in {yaml_path}")


if __name__ == '__main__':
    main()

0 commit comments

Comments
 (0)