From 1b1f1afccde2f5aaf16a43dc0c696ec19331fde9 Mon Sep 17 00:00:00 2001
From: Omkar Gaikwad <omkargaikwad@google.com>
Date: Tue, 28 Apr 2026 06:13:29 +0000
Subject: [PATCH 1/5] feat: implement automated evaluation pipeline using
 Evalbench with custom dataset, model configurations, and CI integration.

---
 cloudbuild.yaml                 | 87 +++++++++++++++++++++++++++++++++
 evals/ci_metadata.yaml          | 22 +++++++++
 evals/dataset.json              | 30 ++++++++++++
 evals/gemini_2.5_pro_model.yaml | 18 +++++++
 evals/model_config.yaml         | 25 ++++++++++
 evals/run_config.yaml           | 37 ++++++++++++++
 evals/substitute_env.py         | 24 +++++++++
 7 files changed, 243 insertions(+)
 create mode 100644 cloudbuild.yaml
 create mode 100644 evals/ci_metadata.yaml
 create mode 100644 evals/dataset.json
 create mode 100644 evals/gemini_2.5_pro_model.yaml
 create mode 100644 evals/model_config.yaml
 create mode 100644 evals/run_config.yaml
 create mode 100644 evals/substitute_env.py

diff --git a/cloudbuild.yaml b/cloudbuild.yaml
new file mode 100644
index 0000000..72af40a
--- /dev/null
+++ b/cloudbuild.yaml
@@ -0,0 +1,87 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+options:
+  logging: CLOUD_LOGGING_ONLY
+
+steps:
+
+  # --- Evaluation Step ---
+  - name: 'us-central1-docker.pkg.dev/cloud-db-nl2sql/evalbench/eval_server:latest'
+    entrypoint: 'bash'
+    # Decrypts the secret from Secret Manager into the GITHUB_TOKEN environment variable
+    secretEnv: ['GITHUB_TOKEN']
+    args:
+      - '-c'
+      - |
+        set -e
+
+        # Only run on release-please branches; any other branch exits successfully without evaluating
+        if [[ "$_HEAD_BRANCH" != release-please-* ]]; then
+          echo "Not a release-please branch. Exiting."
+          exit 0
+        fi
+        echo "Release branch detected. Fetching PR data from GitHub API..."
+
+        # Fetch PR data and status code (-sS: no progress meter, but still print transport errors)
+        HTTP_STATUS=$(curl -sS -o pr_data.json -w "%{http_code}" -H "Authorization: token $$GITHUB_TOKEN" \
+          "https://api.github.com/repos/$REPO_FULL_NAME/pulls/$_PR_NUMBER")
+
+        if [ "$$HTTP_STATUS" -ne 200 ]; then
+          echo "Error fetching PR data: HTTP $$HTTP_STATUS"
+          cat pr_data.json
+          exit 1
+        fi
+
+        PR_DATA=$(cat pr_data.json)
+
+        # Extract labels and title from PR data (Use $$ to escape bash variables)
+        PR_LABELS=$(echo "$$PR_DATA" | jq -r '[.labels[].name] | join(",")')
+        PR_TITLE=$(echo "$$PR_DATA" | jq -r '.title')
+
+        # Determine Release Version (Use double quotes and $$ for bash variables)
+        if [[ "$$PR_LABELS" == *"autorelease: triggered"* ]]; then
+          if [[ "$$PR_TITLE" =~ release\ ([0-9]+\.[0-9]+\.[0-9]+) ]]; then
+            export RELEASE_VERSION="$${BASH_REMATCH[1]}"
+          else
+            export RELEASE_VERSION="unknown"
+          fi
+        else
+          export RELEASE_VERSION="unknown"
+        fi
+
+        # Workaround for evalbench bug: settings are only applied if path basename matches extension ID
+        ln -sfn /workspace /workspace/bigquery-data-analytics
+        cd /evalbench
+
+        export EVAL_GCP_PROJECT_ID=$PROJECT_ID
+        export GOOGLE_CLOUD_PROJECT=$PROJECT_ID
+
+        # Combine CI metadata with run config
+        cat /workspace/evals/ci_metadata.yaml >> /workspace/evals/run_config.yaml
+
+        # Substitute environment variables in model_config.yaml and run_config.yaml
+        python3 /workspace/evals/substitute_env.py
+
+        # The relative PYTHONPATH below resolves against /evalbench, which is already the working directory
+        export PYTHONPATH=./evalbench:./evalbench/evalproto
+        export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
+
+        echo "Launching Standalone Evaluation..."
+        python3 evalbench/evalbench.py --experiment_config=/workspace/evals/run_config.yaml
+
+availableSecrets:
+  secretManager:
+    - versionName: projects/$PROJECT_ID/secrets/GITHUB_TOKEN/versions/latest
+      env: 'GITHUB_TOKEN'
diff --git a/evals/ci_metadata.yaml b/evals/ci_metadata.yaml
new file mode 100644
index 0000000..43b487f
--- /dev/null
+++ b/evals/ci_metadata.yaml
@@ -0,0 +1,22 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+############################################################
+### CI Metadata (Repository Specific)
+### Note: These fields are used for version tracking in BQ
+### and are not part of the core Evalbench schema.
+############################################################
+
+extension_id: bigquery-data-analytics
+release_version: "${RELEASE_VERSION}"
diff --git a/evals/dataset.json b/evals/dataset.json
new file mode 100644
index 0000000..17ce24b
--- /dev/null
+++ b/evals/dataset.json
@@ -0,0 +1,30 @@
+{
+  "scenarios": [
+    {
+      "id": "bq-search-catalog",
+      "starting_prompt": "Search for tables related to sales in project ext-test-bigquery-analytics.",
+      "conversation_plan": "Ask the agent to search for tables with 'sales' in the prompt.",
+      "expected_trajectory": [
+        "search_catalog"
+      ],
+      "env": {
+        "GOOGLE_CLOUD_PROJECT": "ext-test-bigquery-analytics"
+      },
+      "kind": "tools",
+      "max_turns": 3
+    },
+    {
+      "id": "bq-ask-insights",
+      "starting_prompt": "What are the top products by sales in the table 'sales_data' in dataset 'sales' and project 'ext-test-bigquery-analytics'?",
+      "conversation_plan": "Ask the agent to get insights about the sales table.",
+      "expected_trajectory": [
+        "ask_data_insights"
+      ],
+      "env": {
+        "GOOGLE_CLOUD_PROJECT": "ext-test-bigquery-analytics"
+      },
+      "kind": "tools",
+      "max_turns": 3
+    }
+  ]
+}
diff --git a/evals/gemini_2.5_pro_model.yaml b/evals/gemini_2.5_pro_model.yaml
new file mode 100644
index 0000000..7154ec3
--- /dev/null
+++ b/evals/gemini_2.5_pro_model.yaml
@@ -0,0 +1,18 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+generator: gcp_vertex_gemini  # run_config.yaml uses this file for the simulated user and the judge-based scorers
+vertex_model: gemini-2.5-pro
+base_prompt: ""
+execs_per_minute: 5
diff --git a/evals/model_config.yaml b/evals/model_config.yaml
new file mode 100644
index 0000000..6f99dc1
--- /dev/null
+++ b/evals/model_config.yaml
@@ -0,0 +1,25 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+gemini_cli_version: '@google/gemini-cli@0.38.1'
+generator: gemini_cli
+env:
+  GOOGLE_CLOUD_PROJECT: '${GOOGLE_CLOUD_PROJECT}'
+  GOOGLE_CLOUD_LOCATION: 'global'
+  GOOGLE_GENAI_USE_VERTEXAI: 'true'
+setup:
+  extensions:
+    # This path is the symlink that cloudbuild.yaml creates so the basename equals the extension ID
+    '/workspace/bigquery-data-analytics':
+      settings: {}
diff --git a/evals/run_config.yaml b/evals/run_config.yaml
new file mode 100644
index 0000000..7cba9a1
--- /dev/null
+++ b/evals/run_config.yaml
@@ -0,0 +1,37 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+dataset_config: /workspace/evals/dataset.json
+dataset_format: gemini-cli-format
+
+orchestrator: geminicli
+model_config: /workspace/evals/model_config.yaml
+simulated_user_model_config: /workspace/evals/gemini_2.5_pro_model.yaml
+
+scorers:
+  # Qualitative scorers (judge-based); both point at the Gemini 2.5 Pro model config
+  goal_completion:
+    model_config: /workspace/evals/gemini_2.5_pro_model.yaml
+  behavioral_metrics:
+    model_config: /workspace/evals/gemini_2.5_pro_model.yaml
+
+  # Performance scorers; none of them is given any configuration here
+  turn_count: {}
+  end_to_end_latency: {}
+  tool_call_latency: {}
+  token_consumption: {}
+
+reporting:
+  bigquery:
+    gcp_project_id: ext-test-bigquery-analytics
diff --git a/evals/substitute_env.py b/evals/substitute_env.py
new file mode 100644
index 0000000..ded7c37
--- /dev/null
+++ b/evals/substitute_env.py
@@ -0,0 +1,24 @@
+import os
+import re
+
+def main():
+    """Expand ${VAR} placeholders in the eval YAML configs in place; unset variables are left as-is."""
+    # Use EVAL_WORKSPACE env var if set, otherwise default to /workspace (CI default)
+    workspace = os.environ.get('EVAL_WORKSPACE', '/workspace')
+    yaml_paths = [
+        os.path.join(workspace, 'evals/model_config.yaml'),
+        os.path.join(workspace, 'evals/run_config.yaml'),
+    ]
+    for yaml_path in yaml_paths:
+        if not os.path.exists(yaml_path):
+            print(f"File not found: {yaml_path}")
+            continue
+        with open(yaml_path, 'r', encoding='utf-8') as f:
+            content = f.read()
+        content = re.sub(r'\$\{(\w+)\}', lambda m: os.environ.get(m.group(1), m.group(0)), content)
+        with open(yaml_path, 'w', encoding='utf-8') as f:
+            f.write(content)
+        print(f"Successfully substituted environment variables in {yaml_path}")
+
+if __name__ == '__main__':
+    main()

From 77c5479f88ab327c6ec20b1256bbd0453ed1eeef Mon Sep 17 00:00:00 2001
From: Omkar Gaikwad <omkargaikwad@google.com>
Date: Tue, 28 Apr 2026 07:08:48 +0000
Subject: [PATCH 2/5] refactor: consolidate dataset scenarios by updating
 bq-search-and-insight and adding bq-insight-and-forecast

---
 evals/dataset.json | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/evals/dataset.json b/evals/dataset.json
index 17ce24b..dcd8414 100644
--- a/evals/dataset.json
+++ b/evals/dataset.json
@@ -1,30 +1,32 @@
 {
   "scenarios": [
     {
-      "id": "bq-search-catalog",
-      "starting_prompt": "Search for tables related to sales in project ext-test-bigquery-analytics.",
-      "conversation_plan": "Ask the agent to search for tables with 'sales' in the prompt.",
+      "id": "bq-search-and-insight",
+      "starting_prompt": "Find tables related to sales in project ext-test-bigquery-analytics.",
+      "conversation_plan": "First, ask the agent to find tables related to sales. Once it lists the tables (which should include 'sales_data' in 'evalbench_ci'), ask it to identify the top product by sales in that table.",
       "expected_trajectory": [
-        "search_catalog"
+        "search_catalog",
+        "ask_data_insights"
       ],
       "env": {
         "GOOGLE_CLOUD_PROJECT": "ext-test-bigquery-analytics"
       },
       "kind": "tools",
-      "max_turns": 3
+      "max_turns": 4
     },
     {
-      "id": "bq-ask-insights",
-      "starting_prompt": "What are the top products by sales in the table 'sales_data' in dataset 'sales' and project 'ext-test-bigquery-analytics'?",
-      "conversation_plan": "Ask the agent to get insights about the sales table.",
+      "id": "bq-insight-and-forecast",
+      "starting_prompt": "What are the top products by sales in the table 'sales_data' in dataset 'evalbench_ci' and project 'ext-test-bigquery-analytics'?",
+      "conversation_plan": "First, ask the agent to find the top products by sales in the sales_data table. After it identifies the top products, ask it to forecast the sales for the top product for the next 5 steps.",
       "expected_trajectory": [
-        "ask_data_insights"
+        "ask_data_insights",
+        "forecast"
       ],
       "env": {
         "GOOGLE_CLOUD_PROJECT": "ext-test-bigquery-analytics"
       },
       "kind": "tools",
-      "max_turns": 3
+      "max_turns": 4
     }
   ]
 }

From 09ac96c4fd2ea12ee4788de2626deffb7bb81b8d Mon Sep 17 00:00:00 2001
From: Omkar Gaikwad <omkargaikwad@google.com>
Date: Tue, 28 Apr 2026 07:24:51 +0000
Subject: [PATCH 3/5] chore: add EVAL_GCP_PROJECT_REGION environment variable
 for evaluation runs

---
 cloudbuild.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/cloudbuild.yaml b/cloudbuild.yaml
index 72af40a..65eee10 100644
--- a/cloudbuild.yaml
+++ b/cloudbuild.yaml
@@ -67,6 +67,7 @@ steps:
 
         export EVAL_GCP_PROJECT_ID=$PROJECT_ID
         export GOOGLE_CLOUD_PROJECT=$PROJECT_ID
+        export EVAL_GCP_PROJECT_REGION=$_EVAL_REGION
 
         # Combine CI metadata with run config
         cat /workspace/evals/ci_metadata.yaml >> /workspace/evals/run_config.yaml

From 96b47f9ca818d7be9cdc6e8b687b969a594f5775 Mon Sep 17 00:00:00 2001
From: Omkar Gaikwad <omkargaikwad@google.com>
Date: Tue, 28 Apr 2026 07:25:00 +0000
Subject: [PATCH 4/5] chore: update bigquery gcp_project_id in run
 configuration

---
 evals/run_config.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/evals/run_config.yaml b/evals/run_config.yaml
index 7cba9a1..0f45e6e 100644
--- a/evals/run_config.yaml
+++ b/evals/run_config.yaml
@@ -34,4 +34,4 @@ scorers:
 
 reporting:
   bigquery:
-    gcp_project_id: ext-test-bigquery-analytics
+    gcp_project_id: cloud-db-nl2sql

From 2aa9c3c15c2664810e31b1518047041d7fbe78d8 Mon Sep 17 00:00:00 2001
From: Omkar Gaikwad <omkargaikwad@google.com>
Date: Tue, 28 Apr 2026 08:41:47 +0000
Subject: [PATCH 5/5] chore: remove trailing newline at end of
 evals/dataset.json

---
 evals/dataset.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/evals/dataset.json b/evals/dataset.json
index dcd8414..ea7327a 100644
--- a/evals/dataset.json
+++ b/evals/dataset.json
@@ -29,4 +29,4 @@
       "max_turns": 4
     }
   ]
-}
+}
\ No newline at end of file