Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
88 changes: 88 additions & 0 deletions cloudbuild.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

options:
  logging: CLOUD_LOGGING_ONLY

steps:
  # --- Evaluation Step ---
  # Runs the evalbench suite from the eval_server image. The script exits
  # early (successfully) unless the head branch is a release-please branch.
  - name: 'us-central1-docker.pkg.dev/cloud-db-nl2sql/evalbench/eval_server:latest'
    entrypoint: 'bash'
    # Exposes the Secret Manager secret declared under availableSecrets as the
    # GITHUB_TOKEN environment variable of this step.
    secretEnv: ['GITHUB_TOKEN']
    # Substitution note for the script below: a single dollar sign marks a
    # Cloud Build substitution that is resolved before bash starts, while a
    # doubled dollar sign is an escaped literal dollar, i.e. a bash variable
    # that is evaluated at run time.
    args:
      - '-c'
      - |
        set -e

        # Only run on release branches
        if [[ "$_HEAD_BRANCH" != release-please-* ]]; then
          echo "Not a release-please branch. Exiting."
          exit 0
        fi
        echo "Release branch detected. Fetching PR data from GitHub API..."

        # Fetch PR data: the response body is saved to pr_data.json and the
        # HTTP status code is captured from stdout. -S keeps transport errors
        # visible even though -s silences the progress meter.
        HTTP_STATUS=$(curl -sS -o pr_data.json -w "%{http_code}" \
          -H "Authorization: token $$GITHUB_TOKEN" \
          "https://api.github.com/repos/$REPO_FULL_NAME/pulls/$_PR_NUMBER")

        if [ "$$HTTP_STATUS" -ne 200 ]; then
          echo "Error fetching PR data: HTTP $$HTTP_STATUS"
          cat pr_data.json
          exit 1
        fi

        # Extract labels and title straight from the saved response
        PR_LABELS=$(jq -r '[.labels[].name] | join(",")' pr_data.json)
        PR_TITLE=$(jq -r '.title' pr_data.json)

        # Determine the release version. It stays "unknown" unless the PR
        # carries the "autorelease: triggered" label and its title contains
        # "release X.Y.Z".
        export RELEASE_VERSION="unknown"
        if [[ "$$PR_LABELS" == *"autorelease: triggered"* ]]; then
          if [[ "$$PR_TITLE" =~ release\ ([0-9]+\.[0-9]+\.[0-9]+) ]]; then
            export RELEASE_VERSION="$${BASH_REMATCH[1]}"
          fi
        fi

        # Workaround for evalbench bug: settings are only applied if path basename matches extension ID.
        # -sfn replaces an existing link instead of failing, so a re-run in the same workspace still works.
        ln -sfn /workspace /workspace/bigquery-data-analytics

        export EVAL_GCP_PROJECT_ID="$PROJECT_ID"
        export GOOGLE_CLOUD_PROJECT="$PROJECT_ID"
        export EVAL_GCP_PROJECT_REGION="$_EVAL_REGION"

        # Combine CI metadata with run config
        cat /workspace/evals/ci_metadata.yaml >> /workspace/evals/run_config.yaml

        # Substitute environment variables in model_config.yaml and run_config.yaml
        python3 /workspace/evals/substitute_env.py

        # The PYTHONPATH entries and the evalbench.py path below are relative to /evalbench
        cd /evalbench
        export PYTHONPATH=./evalbench:./evalbench/evalproto
        export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python

        echo "Launching Standalone Evaluation..."
        python3 evalbench/evalbench.py --experiment_config=/workspace/evals/run_config.yaml

# Secret Manager secret made available to steps that list it in secretEnv.
availableSecrets:
  secretManager:
    - versionName: projects/$PROJECT_ID/secrets/GITHUB_TOKEN/versions/latest
      env: 'GITHUB_TOKEN'
22 changes: 22 additions & 0 deletions evals/ci_metadata.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

############################################################
### CI Metadata (Repository Specific)
### Note: These fields are used for version tracking in BQ
### and are not part of the core Evalbench schema.
###
### cloudbuild.yaml appends this file to evals/run_config.yaml,
### and evals/substitute_env.py then replaces the placeholder
### below with the value of the RELEASE_VERSION environment
### variable exported by the build step.
############################################################

extension_id: bigquery-data-analytics
# Quoted so the substituted value is always parsed as a string.
release_version: "${RELEASE_VERSION}"
32 changes: 32 additions & 0 deletions evals/dataset.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
{
  "scenarios": [
    {
      "id": "bq-search-and-insight",
      "starting_prompt": "Find tables related to sales in project ext-test-bigquery-analytics.",
      "conversation_plan": "First, ask the agent to find tables related to sales. Once it lists the tables (which should include 'sales_data' in 'evalbench_ci'), ask it to identify the top product by sales in that table.",
      "expected_trajectory": ["search_catalog", "ask_data_insights"],
      "env": {"GOOGLE_CLOUD_PROJECT": "ext-test-bigquery-analytics"},
      "kind": "tools",
      "max_turns": 4
    },
    {
      "id": "bq-insight-and-forecast",
      "starting_prompt": "What are the top products by sales in the table 'sales_data' in dataset 'evalbench_ci' and project 'ext-test-bigquery-analytics'?",
      "conversation_plan": "First, ask the agent to find the top products by sales in the sales_data table. After it identifies the top products, ask it to forecast the sales for the top product for the next 5 steps.",
      "expected_trajectory": ["ask_data_insights", "forecast"],
      "env": {"GOOGLE_CLOUD_PROJECT": "ext-test-bigquery-analytics"},
      "kind": "tools",
      "max_turns": 4
    }
  ]
}
18 changes: 18 additions & 0 deletions evals/gemini_2.5_pro_model.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Model settings referenced from evals/run_config.yaml, both as the
# simulated-user model and as the model for the judge-based scorers.
generator: 'gcp_vertex_gemini'
vertex_model: 'gemini-2.5-pro'
# Intentionally empty base prompt.
base_prompt: ''
# NOTE(review): presumably a rate limit on model calls per minute; confirm
# against the evalbench generator implementation.
execs_per_minute: 5
25 changes: 25 additions & 0 deletions evals/model_config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Configuration for the Gemini CLI generator; evals/run_config.yaml
# references this file as its model_config.
gemini_cli_version: '@google/gemini-cli@0.38.1'
generator: 'gemini_cli'

env:
  # Placeholder replaced by evals/substitute_env.py with the value of the
  # GOOGLE_CLOUD_PROJECT environment variable exported in cloudbuild.yaml.
  GOOGLE_CLOUD_PROJECT: '${GOOGLE_CLOUD_PROJECT}'
  GOOGLE_CLOUD_LOCATION: 'global'
  # Kept as a quoted string rather than a YAML boolean.
  GOOGLE_GENAI_USE_VERTEXAI: 'true'

setup:
  extensions:
    # Points to the symlink created in cloudbuild.yaml to match the extension ID
    '/workspace/bigquery-data-analytics':
      settings: {}
37 changes: 37 additions & 0 deletions evals/run_config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Scenario dataset and its format identifier.
dataset_config: '/workspace/evals/dataset.json'
dataset_format: 'gemini-cli-format'

# Orchestrator plus the model configs for the agent under test and for the
# simulated user.
orchestrator: 'geminicli'
model_config: '/workspace/evals/model_config.yaml'
simulated_user_model_config: '/workspace/evals/gemini_2.5_pro_model.yaml'

scorers:
  # Qualitative (Judge-based)
  goal_completion:
    model_config: '/workspace/evals/gemini_2.5_pro_model.yaml'
  behavioral_metrics:
    model_config: '/workspace/evals/gemini_2.5_pro_model.yaml'

  # Performance
  turn_count: {}
  end_to_end_latency: {}
  tool_call_latency: {}
  token_consumption: {}

# Destination project for BigQuery reporting.
reporting:
  bigquery:
    gcp_project_id: 'cloud-db-nl2sql'
24 changes: 24 additions & 0 deletions evals/substitute_env.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
import os
import re

def _substitute_placeholders(text, env):
    """Replace every ``${NAME}`` in ``text`` with ``env[NAME]``.

    Args:
        text: The text to process.
        env: Mapping of variable names to string values (e.g. ``os.environ``).

    Returns:
        A ``(new_text, missing)`` tuple. ``missing`` lists, in order of first
        appearance and without duplicates, the names that were not found in
        ``env``; their placeholders are left in the text unchanged.
    """
    missing = []

    def _lookup(match):
        name = match.group(1)
        if name in env:
            return env[name]
        if name not in missing:
            missing.append(name)
        # Keep the placeholder verbatim when the variable is not set.
        return match.group(0)

    # Braces are escaped explicitly so the pattern reads unambiguously;
    # only names made of word characters are treated as placeholders.
    return re.sub(r'\$\{(\w+)\}', _lookup, text), missing


def main():
    """Expand ``${NAME}`` placeholders in the eval YAML configs, in place.

    Processes ``evals/model_config.yaml`` and ``evals/run_config.yaml`` under
    the workspace directory. Placeholders are replaced with the value of the
    matching environment variable. A placeholder whose variable is not set is
    left untouched and reported in a warning line, so an incomplete
    substitution is visible in the build log. Missing files are reported and
    skipped.
    """
    # Use EVAL_WORKSPACE env var if set, otherwise default to /workspace (CI default)
    workspace = os.environ.get('EVAL_WORKSPACE', '/workspace')
    yaml_paths = [
        os.path.join(workspace, 'evals/model_config.yaml'),
        os.path.join(workspace, 'evals/run_config.yaml')
    ]

    for yaml_path in yaml_paths:
        if not os.path.exists(yaml_path):
            print(f"File not found: {yaml_path}")
            continue

        # Explicit UTF-8 so the result does not depend on the container locale.
        with open(yaml_path, 'r', encoding='utf-8') as f:
            content = f.read()

        content, missing = _substitute_placeholders(content, os.environ)

        with open(yaml_path, 'w', encoding='utf-8') as f:
            f.write(content)
        print(f"Successfully substituted environment variables in {yaml_path}")

        if missing:
            print(
                f"Warning: unresolved placeholders left unchanged in {yaml_path}: "
                f"{', '.join(missing)}"
            )


if __name__ == '__main__':
    main()
Loading