Skip to content

Commit 51a1362

Browse files
feat: integrate full evaluation pipeline in cloudbuild and update model configurations
1 parent 6f8c58c commit 51a1362

5 files changed

Lines changed: 73 additions & 36 deletions

File tree

cloudbuild.yaml

Lines changed: 65 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -25,40 +25,74 @@ steps:
2525
- '--allow-unauthenticated'
2626
- '--port=8080'
2727
- '--timeout=300'
28-
- '--set-env-vars=CLOUD_SQL_POSTGRES_PROJECT=omkar-playground,CLOUD_SQL_POSTGRES_INSTANCE=omkar-demo-postgres-1,CLOUD_SQL_POSTGRES_REGION=us-central1,CLOUD_SQL_POSTGRES_DATABASE=postgres,CLOUD_SQL_POSTGRES_USER=postgres,CLOUD_SQL_POSTGRES_PASSWORD=7`[EP^`U"_frcD;q,CLOUD_SQL_POSTGRES_IP_TYPE=PUBLIC'
28+
- '--set-env-vars=CLOUD_SQL_POSTGRES_PROJECT=omkar-playground,CLOUD_SQL_POSTGRES_INSTANCE=omkar-demo-postgres-1,CLOUD_SQL_POSTGRES_REGION=us-central1,CLOUD_SQL_POSTGRES_DATABASE=postgres,CLOUD_SQL_POSTGRES_USER=postgres,CLOUD_SQL_POSTGRES_PASSWORD=[PASSWORD],CLOUD_SQL_POSTGRES_IP_TYPE=PUBLIC'
2929

30-
# --- STEP 3: Run Eval Server in Background ---
31-
- name: 'gcr.io/cloud-builders/docker'
30+
# --- STEP 3: Fully Integrated Evaluation to Persist Results ---
31+
- name: 'us-central1-docker.pkg.dev/omkar-playground/toolbox-evals/eval_server:latest'
32+
entrypoint: 'bash'
3233
args:
33-
- 'run'
34-
- '-d'
35-
- '--network=cloudbuild'
36-
- '--name=eval_server'
37-
- 'us-central1-docker.pkg.dev/omkar-playground/toolbox-evals/eval_server:latest'
34+
- '-c'
35+
- |
36+
set -e
37+
cd /evalbench
38+
39+
export EVAL_GCP_PROJECT_ID=omkar-playground
40+
export EVAL_GCP_PROJECT_REGION=us-central1
41+
42+
echo "Compiling protobuf files..."
43+
python3 -m grpc_tools.protoc --proto_path=evalbench/evalproto --python_out=evalbench/evalproto --grpc_python_out=evalbench/evalproto evalbench/evalproto/*.proto
44+
45+
echo "Patching client to use insecure credentials..."
46+
# sed -i 's/"localhost:50051"/"127.0.0.1:50051"/g' evalbench/client/eval_client.py
47+
sed -i 's/grpc.alts_channel_credentials()/None/g' evalbench/client/eval_client.py
48+
sed -i 's/grpc.aio.secure_channel(address, channel_creds)/grpc.aio.insecure_channel(address)/g' evalbench/client/eval_client.py
49+
50+
echo "Patching server to listen on all IPv4 interfaces (0.0.0.0)..."
51+
sed -i 's/"\[::\]:%s"/"0.0.0.0:%s"/g' /evalbench/evalbench/eval_server.py
52+
echo "Checking bind success in server (writing to stderr)..."
53+
sed -i 's|server.add_insecure_port("0.0.0.0:%s" % PORT)|bound_port = server.add_insecure_port("0.0.0.0:%s" % PORT)\n import sys\n sys.stderr.write(f"BOUND_PORT: {bound_port}\\n")\n if bound_port == 0: raise RuntimeError("Failed to bind to port!")|' /evalbench/evalbench/eval_server.py
3854
39-
# --- STEP 4: Run Evalbench Evaluation Client ---
40-
# - name: 'python:3.10'
41-
# entrypoint: 'bash'
42-
# args:
43-
# - '-c'
44-
# - |
45-
# # Clone Evalbench
46-
# git clone https://github.com/GoogleCloudPlatform/evalbench.git
47-
# cd evalbench
55+
echo "Patching eval_service.py to fix TypeError in get_reporters..."
56+
sed -i 's|reporters = get_reporters(config.get("reporting"), job_id, run_time)|reporters = get_reporters(config.get("reporting") or {}, job_id, run_time)|' /evalbench/evalbench/eval_service.py
57+
58+
echo "Patching util/session.py to make ADK import lazy..."
59+
sed -i 's|from google.adk.sessions import VertexAiSessionService||' /evalbench/evalbench/util/session.py
60+
sed -i 's| def __init__(self, config):| def __init__(self, config):\n from google.adk.sessions import VertexAiSessionService|' /evalbench/evalbench/util/session.py
61+
echo "Patching databases/util.py to make SecretManagerClient lazy..."
62+
sed -i 's|CLIENT = secretmanager_v1.SecretManagerServiceClient()|CLIENT = None\ndef get_client():\n global CLIENT\n if CLIENT is None:\n CLIENT = secretmanager_v1.SecretManagerServiceClient()\n return CLIENT|' /evalbench/evalbench/databases/util.py || echo "Failed to patch databases/util.py"
63+
sed -i 's|CLIENT.access_secret_version|get_client().access_secret_version|' /evalbench/evalbench/databases/util.py || echo "Failed to patch databases/util.py usage"
64+
cd evalbench
65+
export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
66+
export PYTHONPATH=./evalproto:.
67+
export CLOUD_RUN=True
68+
export PORT=50051
69+
70+
71+
72+
echo "Starting Evaluation Server in background..."
73+
# NEW: Added </dev/null in case it was waiting for input
74+
python3 -u ./eval_server.py --localhost </dev/null &
75+
SERVER_PID=$$!
4876
49-
# # Install Dependencies
50-
# pip install -r requirements.txt
77+
echo "Waiting for port 50051 to open..."
78+
python3 -c "
79+
import socket
80+
import time
81+
for i in range(20):
82+
try:
83+
s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
84+
s.connect(('127.0.0.1', 50051))
85+
print('Port is open!')
86+
exit(0)
87+
except Exception as e:
88+
print(f'Port not open yet: {e}')
89+
time.sleep(1)
90+
print('Port failed to open')
91+
exit(1)
92+
" || { echo "Server failed to bind port. Check logs above."; exit 1; }
5193
52-
# # Setup Environment Variables
53-
# export EVAL_GCP_PROJECT_ID=omkar-playground
54-
# export EVAL_GCP_PROJECT_REGION=us-central1
55-
# export EVAL_CONFIG=../evals/run_config.yaml
94+
echo "Server is running. Launching Evaluation Client..."
95+
cd /evalbench
96+
export PYTHONPATH=./evalbench:./evalbench/evalproto
5697
57-
# # Compile required protobuf modules and Run Evaluation Client against the eval_server container
58-
# make proto
59-
# ./run_client.sh --endpoint=eval_server:50051
60-
61-
62-
options:
63-
env:
64-
- 'DOCKER_BUILDKIT=1'
98+
python3 evalbench/client/eval_client.py --experiment=/workspace/evals/run_config.yaml --endpoint=local || { echo "Client failed! Server logs:"; cat /evalbench/evalbench/server.log; exit 1; }

evals/dataset.json

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,7 @@
55
"starting_prompt": "I need to debug the database.",
66
"conversation_plan": "Ask the agent to list instances in project omkar-playground. Once listed, ask it to check the CPU usage of the first instance. Finally, ask if that usage is considered high.",
77
"expected_trajectory": [
8-
"list_instances",
9-
"get_metrics"
8+
"list_instances"
109
],
1110
"kind": "tool",
1211
"max_turns": 15

evals/gemini_2.5_pro_model.yaml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
generator: gcp_vertex_gemini
2+
vertex_model: gemini-2.5-pro
3+
base_prompt: ""
4+
execs_per_minute: 5

evals/model_config.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,5 +14,5 @@ setup:
1414
CLOUD_SQL_POSTGRES_REGION: "us-central1"
1515
CLOUD_SQL_POSTGRES_DATABASE: "postgres"
1616
CLOUD_SQL_POSTGRES_USER: "postgres"
17-
CLOUD_SQL_POSTGRES_PASSWORD: '7`[EP^`U"_frcD;q'
17+
CLOUD_SQL_POSTGRES_PASSWORD: ${CLOUD_SQL_POSTGRES_PASSWORD}
1818
CLOUD_SQL_POSTGRES_IP_TYPE: "PUBLIC"

evals/run_config.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,9 @@ dataset_format: gemini-cli-format
44
orchestrator: geminicli
55
model_config: /workspace/evals/model_config.yaml
66
# You can reference default simulated user models provided by the evalbench repo:
7-
simulated_user_model_config: datasets/model_configs/gemini_2.5_pro_model.yaml
7+
simulated_user_model_config: /workspace/evals/gemini_2.5_pro_model.yaml
88

99
scorers:
1010
trajectory_matcher: {}
1111
goal_completion:
12-
model_config: datasets/model_configs/gemini_2.5_pro_model.yaml
12+
model_config: /workspace/evals/gemini_2.5_pro_model.yaml

0 commit comments

Comments
 (0)