From 1b1f1afccde2f5aaf16a43dc0c696ec19331fde9 Mon Sep 17 00:00:00 2001 From: Omkar Gaikwad Date: Tue, 28 Apr 2026 06:13:29 +0000 Subject: [PATCH 1/5] feat: implement automated evaluation pipeline using Evalbench with custom dataset, model configurations, and CI integration. --- cloudbuild.yaml | 87 +++++++++++++++++++++++++++++++++ evals/ci_metadata.yaml | 22 +++++++++ evals/dataset.json | 30 ++++++++++++ evals/gemini_2.5_pro_model.yaml | 18 +++++++ evals/model_config.yaml | 25 ++++++++++ evals/run_config.yaml | 37 ++++++++++++++ evals/substitute_env.py | 24 +++++++++ 7 files changed, 243 insertions(+) create mode 100644 cloudbuild.yaml create mode 100644 evals/ci_metadata.yaml create mode 100644 evals/dataset.json create mode 100644 evals/gemini_2.5_pro_model.yaml create mode 100644 evals/model_config.yaml create mode 100644 evals/run_config.yaml create mode 100644 evals/substitute_env.py diff --git a/cloudbuild.yaml b/cloudbuild.yaml new file mode 100644 index 0000000..72af40a --- /dev/null +++ b/cloudbuild.yaml @@ -0,0 +1,87 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+options:
+  logging: CLOUD_LOGGING_ONLY
+
+steps:
+
+  # --- Evaluation Step ---
+  - name: 'us-central1-docker.pkg.dev/cloud-db-nl2sql/evalbench/eval_server:latest'
+    entrypoint: 'bash'
+    # Decrypts the secret from Secret Manager into the GITHUB_TOKEN environment variable
+    secretEnv: ['GITHUB_TOKEN']
+    args:
+      - '-c'
+      - |
+        set -euo pipefail
+
+        # Only run on release-please branches; any other branch exits successfully as a no-op
+        if [[ "$_HEAD_BRANCH" != release-please-* ]]; then
+          echo "Not a release-please branch. Exiting."
+          exit 0
+        fi
+        echo "Release branch detected. Fetching PR data from GitHub API..."
+
+        # Fetch PR data and status code (-S keeps transport errors visible despite -s)
+        HTTP_STATUS=$(curl -sS -o pr_data.json -w "%{http_code}" -H "Authorization: token $$GITHUB_TOKEN" \
+          "https://api.github.com/repos/$REPO_FULL_NAME/pulls/$_PR_NUMBER")
+
+        if [[ "$$HTTP_STATUS" != "200" ]]; then
+          echo "Error fetching PR data: HTTP $$HTTP_STATUS"
+          cat pr_data.json
+          exit 1
+        fi
+
+        PR_DATA=$(cat pr_data.json)
+
+        # Extract labels and title from PR data (Use $$ to escape bash variables)
+        PR_LABELS=$(jq -r '[.labels[].name] | join(",")' <<< "$$PR_DATA")
+        PR_TITLE=$(jq -r '.title' <<< "$$PR_DATA")
+
+        # Determine Release Version (Use double quotes and $$ for bash variables)
+        if [[ "$$PR_LABELS" == *"autorelease: triggered"* ]]; then
+          if [[ "$$PR_TITLE" =~ release\ ([0-9]+\.[0-9]+\.[0-9]+) ]]; then
+            export RELEASE_VERSION="$${BASH_REMATCH[1]}"
+          else
+            export RELEASE_VERSION="unknown"
+          fi
+        else
+          export RELEASE_VERSION="unknown"
+        fi
+
+        # Workaround for evalbench bug: settings are only applied if path basename matches extension ID
+        ln -sfn /workspace /workspace/bigquery-data-analytics
+        cd /evalbench
+
+        export EVAL_GCP_PROJECT_ID=$PROJECT_ID
+        export GOOGLE_CLOUD_PROJECT=$PROJECT_ID
+
+        # Combine CI metadata with run config
+        cat /workspace/evals/ci_metadata.yaml >> /workspace/evals/run_config.yaml
+
+        # Substitute environment variables in model_config.yaml and run_config.yaml
+        python3 /workspace/evals/substitute_env.py
+
+        cd /evalbench
+        export PYTHONPATH=./evalbench:./evalbench/evalproto
+        export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
+
+        echo "Launching Standalone Evaluation..."
+        python3 evalbench/evalbench.py --experiment_config=/workspace/evals/run_config.yaml
+
+availableSecrets:
+  secretManager:
+    - versionName: projects/$PROJECT_ID/secrets/GITHUB_TOKEN/versions/latest
+      env: 'GITHUB_TOKEN'
diff --git a/evals/ci_metadata.yaml b/evals/ci_metadata.yaml
new file mode 100644
index 0000000..43b487f
--- /dev/null
+++ b/evals/ci_metadata.yaml
@@ -0,0 +1,22 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+############################################################
+### CI Metadata (Repository Specific)
+### Note: These fields are used for version tracking in BQ
+### and are not part of the core Evalbench schema.
+############################################################ + +extension_id: bigquery-data-analytics +release_version: ${RELEASE_VERSION} diff --git a/evals/dataset.json b/evals/dataset.json new file mode 100644 index 0000000..17ce24b --- /dev/null +++ b/evals/dataset.json @@ -0,0 +1,30 @@ +{ + "scenarios": [ + { + "id": "bq-search-catalog", + "starting_prompt": "Search for tables related to sales in project ext-test-bigquery-analytics.", + "conversation_plan": "Ask the agent to search for tables with 'sales' in the prompt.", + "expected_trajectory": [ + "search_catalog" + ], + "env": { + "GOOGLE_CLOUD_PROJECT": "ext-test-bigquery-analytics" + }, + "kind": "tools", + "max_turns": 3 + }, + { + "id": "bq-ask-insights", + "starting_prompt": "What are the top products by sales in the table 'sales_data' in dataset 'sales' and project 'ext-test-bigquery-analytics'?", + "conversation_plan": "Ask the agent to get insights about the sales table.", + "expected_trajectory": [ + "ask_data_insights" + ], + "env": { + "GOOGLE_CLOUD_PROJECT": "ext-test-bigquery-analytics" + }, + "kind": "tools", + "max_turns": 3 + } + ] +} diff --git a/evals/gemini_2.5_pro_model.yaml b/evals/gemini_2.5_pro_model.yaml new file mode 100644 index 0000000..7154ec3 --- /dev/null +++ b/evals/gemini_2.5_pro_model.yaml @@ -0,0 +1,18 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +generator: gcp_vertex_gemini +vertex_model: gemini-2.5-pro +base_prompt: "" +execs_per_minute: 5 diff --git a/evals/model_config.yaml b/evals/model_config.yaml new file mode 100644 index 0000000..6f99dc1 --- /dev/null +++ b/evals/model_config.yaml @@ -0,0 +1,25 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +gemini_cli_version: "@google/gemini-cli@0.38.1" +generator: gemini_cli +env: + GOOGLE_CLOUD_PROJECT: "${GOOGLE_CLOUD_PROJECT}" + GOOGLE_CLOUD_LOCATION: "global" + GOOGLE_GENAI_USE_VERTEXAI: "true" +setup: + extensions: + # Points to the symlink created in cloudbuild.yaml to match the extension ID + "/workspace/bigquery-data-analytics": + settings: {} diff --git a/evals/run_config.yaml b/evals/run_config.yaml new file mode 100644 index 0000000..7cba9a1 --- /dev/null +++ b/evals/run_config.yaml @@ -0,0 +1,37 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+dataset_config: /workspace/evals/dataset.json
+dataset_format: gemini-cli-format
+
+orchestrator: geminicli
+model_config: /workspace/evals/model_config.yaml
+simulated_user_model_config: /workspace/evals/gemini_2.5_pro_model.yaml
+
+scorers:
+  # Qualitative (Judge-based)
+  goal_completion:
+    model_config: /workspace/evals/gemini_2.5_pro_model.yaml
+  behavioral_metrics:
+    model_config: /workspace/evals/gemini_2.5_pro_model.yaml
+
+  # Performance
+  turn_count: {}
+  end_to_end_latency: {}
+  tool_call_latency: {}
+  token_consumption: {}
+
+reporting:
+  bigquery:
+    gcp_project_id: ext-test-bigquery-analytics
diff --git a/evals/substitute_env.py b/evals/substitute_env.py
new file mode 100644
index 0000000..ded7c37
--- /dev/null
+++ b/evals/substitute_env.py
@@ -0,0 +1,24 @@
+import os
+import re
+
+def main():
+    """Expand ${VAR} placeholders in the eval YAML configs, rewriting each file in place."""
+    # Use EVAL_WORKSPACE env var if set, otherwise default to /workspace (CI default)
+    workspace = os.environ.get('EVAL_WORKSPACE', '/workspace')
+    config_names = ('model_config.yaml', 'run_config.yaml')
+    yaml_paths = [os.path.join(workspace, 'evals', name) for name in config_names]
+
+    for yaml_path in yaml_paths:
+        if os.path.exists(yaml_path):
+            with open(yaml_path, 'r', encoding='utf-8') as f:
+                content = f.read()
+            # A placeholder whose variable is not set in the environment is left untouched.
+            content = re.sub(r'\$\{(\w+)\}', lambda m: os.environ.get(m.group(1), m.group(0)), content)
+            with open(yaml_path, 'w', encoding='utf-8') as f:
+                f.write(content)
+            print(f"Successfully substituted environment variables in {yaml_path}")
+        else:
+            print(f"File not found: {yaml_path}")
+
+if __name__ == '__main__':
+    main()

From 77c5479f88ab327c6ec20b1256bbd0453ed1eeef Mon Sep 17 00:00:00 2001
From: Omkar Gaikwad
Date: Tue, 28 Apr 2026 07:08:48 +0000
Subject: [PATCH 2/5] refactor: consolidate dataset scenarios by updating bq-search-and-insight and adding bq-insight-and-forecast

---
 evals/dataset.json | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/evals/dataset.json b/evals/dataset.json
index 17ce24b..dcd8414 100644
--- a/evals/dataset.json
+++
b/evals/dataset.json @@ -1,30 +1,32 @@ { "scenarios": [ { - "id": "bq-search-catalog", - "starting_prompt": "Search for tables related to sales in project ext-test-bigquery-analytics.", - "conversation_plan": "Ask the agent to search for tables with 'sales' in the prompt.", + "id": "bq-search-and-insight", + "starting_prompt": "Find tables related to sales in project ext-test-bigquery-analytics.", + "conversation_plan": "First, ask the agent to find tables related to sales. Once it lists the tables (which should include 'sales_data' in 'evalbench_ci'), ask it to identify the top product by sales in that table.", "expected_trajectory": [ - "search_catalog" + "search_catalog", + "ask_data_insights" ], "env": { "GOOGLE_CLOUD_PROJECT": "ext-test-bigquery-analytics" }, "kind": "tools", - "max_turns": 3 + "max_turns": 4 }, { - "id": "bq-ask-insights", - "starting_prompt": "What are the top products by sales in the table 'sales_data' in dataset 'sales' and project 'ext-test-bigquery-analytics'?", - "conversation_plan": "Ask the agent to get insights about the sales table.", + "id": "bq-insight-and-forecast", + "starting_prompt": "What are the top products by sales in the table 'sales_data' in dataset 'evalbench_ci' and project 'ext-test-bigquery-analytics'?", + "conversation_plan": "First, ask the agent to find the top products by sales in the sales_data table. 
After it identifies the top products, ask it to forecast the sales for the top product for the next 5 steps.", "expected_trajectory": [ - "ask_data_insights" + "ask_data_insights", + "forecast" ], "env": { "GOOGLE_CLOUD_PROJECT": "ext-test-bigquery-analytics" }, "kind": "tools", - "max_turns": 3 + "max_turns": 4 } ] } From 09ac96c4fd2ea12ee4788de2626deffb7bb81b8d Mon Sep 17 00:00:00 2001 From: Omkar Gaikwad Date: Tue, 28 Apr 2026 07:24:51 +0000 Subject: [PATCH 3/5] chore: add region environment variable for evaluation runs --- cloudbuild.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/cloudbuild.yaml b/cloudbuild.yaml index 72af40a..65eee10 100644 --- a/cloudbuild.yaml +++ b/cloudbuild.yaml @@ -67,6 +67,7 @@ steps: export EVAL_GCP_PROJECT_ID=$PROJECT_ID export GOOGLE_CLOUD_PROJECT=$PROJECT_ID + export EVAL_GCP_PROJECT_REGION=$_EVAL_REGION # Combine CI metadata with run config cat /workspace/evals/ci_metadata.yaml >> /workspace/evals/run_config.yaml From 96b47f9ca818d7be9cdc6e8b687b969a594f5775 Mon Sep 17 00:00:00 2001 From: Omkar Gaikwad Date: Tue, 28 Apr 2026 07:25:00 +0000 Subject: [PATCH 4/5] chore: update bigquery gcp_project_id in run configuration --- evals/run_config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/evals/run_config.yaml b/evals/run_config.yaml index 7cba9a1..0f45e6e 100644 --- a/evals/run_config.yaml +++ b/evals/run_config.yaml @@ -34,4 +34,4 @@ scorers: reporting: bigquery: - gcp_project_id: ext-test-bigquery-analytics + gcp_project_id: cloud-db-nl2sql From 2aa9c3c15c2664810e31b1518047041d7fbe78d8 Mon Sep 17 00:00:00 2001 From: Omkar Gaikwad Date: Tue, 28 Apr 2026 08:41:47 +0000 Subject: [PATCH 5/5] chore: remove trailing newline from dataset.json --- evals/dataset.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/evals/dataset.json b/evals/dataset.json index dcd8414..ea7327a 100644 --- a/evals/dataset.json +++ b/evals/dataset.json @@ -29,4 +29,4 @@
"max_turns": 4 } ] -} +} \ No newline at end of file