Skip to content

Commit 1b1f1af

Browse files
feat: implement automated evaluation pipeline using Evalbench with custom dataset, model configurations, and CI integration.
1 parent e2d4d64 commit 1b1f1af

7 files changed

Lines changed: 243 additions & 0 deletions

File tree

cloudbuild.yaml

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
# Copyright 2025 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
options:
16+
logging: CLOUD_LOGGING_ONLY
17+
18+
steps:
19+
20+
# --- Evaluation Step ---
21+
- name: 'us-central1-docker.pkg.dev/cloud-db-nl2sql/evalbench/eval_server:latest'
22+
entrypoint: 'bash'
23+
# Decrypts the secret from Secret Manager into the GITHUB_TOKEN environment variable
24+
secretEnv: ['GITHUB_TOKEN']
25+
args:
26+
- '-c'
27+
- |
28+
set -e
29+
30+
# Only run on release-please branches (branch name starts with "release-please-")
31+
if [[ "$_HEAD_BRANCH" != release-please-* ]]; then
32+
echo "Not a release-please branch. Exiting."
33+
exit 0
34+
fi
35+
echo "Release branch detected. Fetching PR data from GitHub API..."
36+
37+
# Fetch PR data and status code
38+
HTTP_STATUS=$(curl -s -o pr_data.json -w "%{http_code}" -H "Authorization: token $$GITHUB_TOKEN" \
39+
"https://api.github.com/repos/$REPO_FULL_NAME/pulls/$_PR_NUMBER")
40+
41+
if [ "$$HTTP_STATUS" -ne 200 ]; then
42+
echo "Error fetching PR data: HTTP $$HTTP_STATUS"
43+
cat pr_data.json
44+
exit 1
45+
fi
46+
47+
PR_DATA=$(cat pr_data.json)
48+
49+
# Extract labels and title from PR data (Use $$ to escape bash variables)
50+
PR_LABELS=$(echo "$$PR_DATA" | jq -r '[.labels[].name] | join(",")')
51+
PR_TITLE=$(echo "$$PR_DATA" | jq -r '.title')
52+
53+
# Determine Release Version (Use double quotes and $$ for bash variables)
54+
if [[ "$$PR_LABELS" == *"autorelease: triggered"* ]]; then
55+
if [[ "$$PR_TITLE" =~ release\ ([0-9]+\.[0-9]+\.[0-9]+) ]]; then
56+
export RELEASE_VERSION="$${BASH_REMATCH[1]}"
57+
else
58+
export RELEASE_VERSION="unknown"
59+
fi
60+
else
61+
export RELEASE_VERSION="unknown"
62+
fi
63+
64+
# Workaround for evalbench bug: settings are only applied if path basename matches extension ID
65+
ln -s /workspace /workspace/bigquery-data-analytics
66+
cd /evalbench
67+
68+
export EVAL_GCP_PROJECT_ID=$PROJECT_ID
69+
export GOOGLE_CLOUD_PROJECT=$PROJECT_ID
70+
71+
# Combine CI metadata with run config
72+
cat /workspace/evals/ci_metadata.yaml >> /workspace/evals/run_config.yaml
73+
74+
# Substitute environment variables in model_config.yaml
75+
python3 /workspace/evals/substitute_env.py
76+
77+
cd /evalbench
78+
export PYTHONPATH=./evalbench:./evalbench/evalproto
79+
export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
80+
81+
echo "Launching Standalone Evaluation..."
82+
python3 evalbench/evalbench.py --experiment_config=/workspace/evals/run_config.yaml
83+
84+
availableSecrets:
85+
secretManager:
86+
- versionName: projects/$PROJECT_ID/secrets/GITHUB_TOKEN/versions/latest
87+
env: 'GITHUB_TOKEN'

evals/ci_metadata.yaml

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
# Copyright 2025 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
############################################################
16+
### CI Metadata (Repository Specific)
17+
### Note: These fields are used for version tracking in BigQuery
18+
### and are not part of the core Evalbench schema.
19+
############################################################
20+
21+
extension_id: bigquery-data-analytics
22+
release_version: ${RELEASE_VERSION}

evals/dataset.json

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
{
2+
"scenarios": [
3+
{
4+
"id": "bq-search-catalog",
5+
"starting_prompt": "Search for tables related to sales in project ext-test-bigquery-analytics.",
6+
"conversation_plan": "Ask the agent to search for tables with 'sales' in the prompt.",
7+
"expected_trajectory": [
8+
"search_catalog"
9+
],
10+
"env": {
11+
"GOOGLE_CLOUD_PROJECT": "ext-test-bigquery-analytics"
12+
},
13+
"kind": "tools",
14+
"max_turns": 3
15+
},
16+
{
17+
"id": "bq-ask-insights",
18+
"starting_prompt": "What are the top products by sales in the table 'sales_data' in dataset 'sales' and project 'ext-test-bigquery-analytics'?",
19+
"conversation_plan": "Ask the agent to get insights about the sales table.",
20+
"expected_trajectory": [
21+
"ask_data_insights"
22+
],
23+
"env": {
24+
"GOOGLE_CLOUD_PROJECT": "ext-test-bigquery-analytics"
25+
},
26+
"kind": "tools",
27+
"max_turns": 3
28+
}
29+
]
30+
}

evals/gemini_2.5_pro_model.yaml

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
# Copyright 2025 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
generator: gcp_vertex_gemini
16+
vertex_model: gemini-2.5-pro
17+
base_prompt: ""
18+
execs_per_minute: 5

evals/model_config.yaml

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
# Copyright 2025 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
gemini_cli_version: "@google/gemini-cli@0.38.1"
16+
generator: gemini_cli
17+
env:
18+
GOOGLE_CLOUD_PROJECT: "${GOOGLE_CLOUD_PROJECT}"
19+
GOOGLE_CLOUD_LOCATION: "global"
20+
GOOGLE_GENAI_USE_VERTEXAI: "true"
21+
setup:
22+
extensions:
23+
# Points to the symlink created in cloudbuild.yaml to match the extension ID
24+
"/workspace/bigquery-data-analytics":
25+
settings: {}

evals/run_config.yaml

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
# Copyright 2025 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
dataset_config: /workspace/evals/dataset.json
16+
dataset_format: gemini-cli-format
17+
18+
orchestrator: geminicli
19+
model_config: /workspace/evals/model_config.yaml
20+
simulated_user_model_config: /workspace/evals/gemini_2.5_pro_model.yaml
21+
22+
scorers:
23+
# Qualitative (Judge-based)
24+
goal_completion:
25+
model_config: /workspace/evals/gemini_2.5_pro_model.yaml
26+
behavioral_metrics:
27+
model_config: /workspace/evals/gemini_2.5_pro_model.yaml
28+
29+
# Performance
30+
turn_count: {}
31+
end_to_end_latency: {}
32+
tool_call_latency: {}
33+
token_consumption: {}
34+
35+
reporting:
36+
bigquery:
37+
gcp_project_id: ext-test-bigquery-analytics

evals/substitute_env.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
import os
2+
import re
3+
4+
# Matches ${NAME} placeholders; both braces are escaped so the pattern does
# not depend on the regex engine treating a stray "{" as a literal.
_PLACEHOLDER = re.compile(r'\$\{(\w+)\}')


def _substitute(text, env):
    """Replace ${NAME} placeholders in *text* with values looked up in *env*.

    Returns a ``(new_text, missing)`` tuple, where ``missing`` lists (in
    first-seen order, without duplicates) the placeholder names that have no
    entry in *env*. Those placeholders are left in the text unchanged.
    Values are inserted verbatim; no YAML quoting or escaping is applied.
    """
    missing = []

    def _resolve(match):
        name = match.group(1)
        if name in env:
            return env[name]
        if name not in missing:
            missing.append(name)
        return match.group(0)

    return _PLACEHOLDER.sub(_resolve, text), missing


def main():
    """Expand ${NAME} environment placeholders in the eval YAML configs.

    Rewrites ``evals/model_config.yaml`` and ``evals/run_config.yaml`` in
    place under the directory named by the ``EVAL_WORKSPACE`` environment
    variable (default ``/workspace``). A file that does not exist is reported
    and skipped. Placeholders whose variable is not set are kept as-is and
    reported, so a missing variable is visible in the build log.
    """
    # Use EVAL_WORKSPACE env var if set, otherwise default to /workspace (CI default)
    workspace = os.environ.get('EVAL_WORKSPACE', '/workspace')
    yaml_paths = [
        os.path.join(workspace, 'evals/model_config.yaml'),
        os.path.join(workspace, 'evals/run_config.yaml'),
    ]

    for yaml_path in yaml_paths:
        # Attempt the read directly instead of checking existence first, so
        # there is no window between the check and the open.
        try:
            with open(yaml_path, 'r', encoding='utf-8') as f:
                original = f.read()
        except FileNotFoundError:
            print(f"File not found: {yaml_path}")
            continue

        content, missing = _substitute(original, os.environ)

        # Only touch the file when a substitution actually happened.
        if content != original:
            with open(yaml_path, 'w', encoding='utf-8') as f:
                f.write(content)
        print(f"Successfully substituted environment variables in {yaml_path}")

        if missing:
            print(
                f"Warning: unresolved placeholders left in {yaml_path}: "
                + ", ".join(missing)
            )


if __name__ == '__main__':
    main()

0 commit comments

Comments
 (0)