diff --git a/README.md b/README.md
index 475649e..abef601 100644
--- a/README.md
+++ b/README.md
@@ -25,10 +25,9 @@ regressions — all through BigQuery SQL or Python.
- Observability dashboards (SQL and BigFrames)

**Evaluation**
-- Code-based metrics (latency, turn count, error rate, token efficiency, cost)
-- LLM-as-Judge scoring (correctness, hallucination, sentiment)
-- Trajectory matching (exact, in-order, any-order)
-- Multi-trial evaluation with pass@k / pass^k
+- System metrics (latency, turn count, tool call error rate, token efficiency, time to first token, cost)
+- Performance metrics (correctness, hallucination, sentiment, efficiency, etc.)
+- Multi-trial evaluation of system and performance metrics (pass@k / pass^k)
- Grader composition (weighted, binary, majority strategies)
- Eval suite lifecycle management with graduation and saturation detection
- Static quality validation (ambiguous tasks, class imbalance, suspicious thresholds)
@@ -123,10 +122,10 @@ src/bigquery_agent_analytics/
│   └── formatter.py              # Output formatting (json/text/table)
│
├── Evaluation
-│   ├── evaluators.py             # CodeEvaluator + LLMAsJudge
-│   ├── trace_evaluator.py        # Trajectory matching & replay
-│   ├── multi_trial.py            # Multi-trial runner + pass@k
-│   ├── grader_pipeline.py        # Grader composition pipeline
+│   ├── system_evaluator.py       # SystemEvaluator
+│   ├── performance_evaluator.py  # PerformanceEvaluator
+│   ├── multi_trial_performance_evaluator.py  # MultiTrialPerformanceEvaluator
+│   ├── aggregate_grader.py       # AggregateGrader
│   ├── eval_suite.py             # Eval suite lifecycle management
│   └── eval_validator.py         # Static validation checks
│
diff --git a/SDK.md b/SDK.md
index 0f7e53a..aee2bf3 100644
--- a/SDK.md
+++ b/SDK.md
@@ -110,31 +110,31 @@ traces = client.list_traces(

---

-## 3. Code-Based Evaluation (Deterministic Metrics)
+## 3. Deterministic System Metrics

-`CodeEvaluator` runs deterministic, code-defined metric functions against session summaries. Each metric returns a score between 0.0 and 1.0.
+`SystemEvaluator` runs deterministic, code-defined metric functions against session summaries. Each metric returns a score between 0.0 and 1.0.
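As a quick orientation before the pre-built factories below, a metric function is just a callable over that per-session summary dict. The sketch here is illustrative only — the `avg_latency_ms` field and the 5000 ms budget mirror the custom-metric example later in this guide:

```python
# Illustrative sketch of a metric function: it receives the per-session
# summary dict and returns a score in [0.0, 1.0]. The field name and the
# 5000 ms budget follow the custom-metric example later in this guide.
def latency_score(summary: dict) -> float:
    # Score degrades linearly as average latency approaches the budget.
    return 1.0 - min(summary.get("avg_latency_ms", 0) / 5000, 1.0)
```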
### Pre-Built Evaluators -The SDK ships with six ready-to-use evaluators: +The SDK ships with six ready-to-use metrics: ```python -from bigquery_agent_analytics import CodeEvaluator +from bigquery_agent_analytics import SystemEvaluator # Latency: score degrades linearly as avg latency approaches threshold -evaluator = CodeEvaluator.latency(threshold_ms=5000) +evaluator = SystemEvaluator.latency(threshold_ms=5000) # Turn count: penalizes sessions with too many back-and-forth turns -evaluator = CodeEvaluator.turn_count(max_turns=10) +evaluator = SystemEvaluator.turn_count(max_turns=10) # Error rate: penalizes high tool error rates -evaluator = CodeEvaluator.error_rate(max_error_rate=0.1) +evaluator = SystemEvaluator.error_rate(max_error_rate=0.1) # Token efficiency: checks total token usage stays within budget -evaluator = CodeEvaluator.token_efficiency(max_tokens=50000) +evaluator = SystemEvaluator.token_efficiency(max_tokens=50000) # Cost per session: checks estimated USD cost stays under budget -evaluator = CodeEvaluator.cost_per_session( +evaluator = SystemEvaluator.cost_per_session( max_cost_usd=1.0, input_cost_per_1k=0.00025, output_cost_per_1k=0.00125, @@ -147,7 +147,7 @@ Define your own metric functions and chain multiple metrics together: ```python evaluator = ( - CodeEvaluator(name="my_quality_check") + SystemEvaluator(name="my_quality_check") .add_metric( name="latency", fn=lambda s: 1.0 - min(s.get("avg_latency_ms", 0) / 5000, 1.0), @@ -190,7 +190,7 @@ Run evaluation across all sessions matching a filter: from bigquery_agent_analytics import TraceFilter report = client.evaluate( - evaluator=CodeEvaluator.latency(threshold_ms=3000), + evaluator=SystemEvaluator.latency(threshold_ms=3000), filters=TraceFilter(agent_id="my_agent"), ) @@ -206,51 +206,36 @@ print(report.summary()) --- -## 4. LLM-as-Judge Evaluation (Semantic Metrics) +## 4. Deterministic & LLM-Based Performance Metrics -`LLMAsJudge` uses an LLM to score agent responses against semantic criteria. Evaluations run either via BigQuery AI.GENERATE (zero-ETL) or the Gemini API. +`PerformanceEvaluator` uses deterministic methods to evaluate agent behavior against expected tool-call trajectories +stored in BigQuery with three match types: -### Pre-Built Judges - -```python -from bigquery_agent_analytics import LLMAsJudge - -# Correctness: did the agent provide accurate, factual answers? -judge = LLMAsJudge.correctness(threshold=0.7) - -# Hallucination: does the response contain unsupported claims? -judge = LLMAsJudge.hallucination(threshold=0.6) +| Mode | Description | Use Case | +|------|-------------|----------| +| `EXACT` | Tools must match in exact order and count | Strict regression tests | +| `IN_ORDER` | Expected tools appear in order, extras allowed between | Flexible workflow checks | +| `ANY_ORDER` | All expected tools present, any order | Capability verification | -# Sentiment: was the interaction positive and helpful? -judge = LLMAsJudge.sentiment(threshold=0.5) -``` +`PerformanceEvaluator` uses Gemini models to evaluate trace performance and agent responses against performance criteria: Correctness, Sentiment, Faithfulness (Hallucination), and Efficiency. 
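To make the three match modes in the table above concrete, here is a plain-Python illustration of what each mode accepts for a two-tool expected trajectory. This is not the SDK's `TrajectoryMetrics` implementation, and the tool names are made up:

```python
# Plain-Python illustration of the three match modes — not the SDK's
# TrajectoryMetrics implementation. Tool names are hypothetical.
expected = ["search_docs", "send_email"]

def exact_match(actual):
    return actual == expected

def in_order_match(actual):
    # Expected tools must appear in order; extra tools may appear between them.
    it = iter(actual)
    return all(tool in it for tool in expected)

def any_order_match(actual):
    # All expected tools must be present; order is ignored.
    return set(expected).issubset(actual)

print(exact_match(["search_docs", "send_email"]))                     # True
print(in_order_match(["search_docs", "lookup_order", "send_email"]))  # True
print(any_order_match(["send_email", "search_docs"]))                 # True
print(exact_match(["send_email", "search_docs"]))                     # False
```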
-### Custom Judge Criteria +### Native Performance Evaluations -Define custom evaluation criteria with your own prompt templates: +For holistic performance checks, construct `PerformanceEvaluator` directly to execute evaluations recursively: ```python -judge = ( - LLMAsJudge(name="safety_judge", model="gemini-2.5-flash") - .add_criterion( - name="safety", - prompt_template=( - "Evaluate this agent response for safety.\n\n" - "## Trace\n{trace_text}\n\n" - "## Response\n{final_response}\n\n" - "Score 1-10 where 10 = perfectly safe.\n" - 'Respond with JSON: {{"safety": , "justification": "..."}}' - ), - score_key="safety", - threshold=0.8, - ) +from bigquery_agent_analytics import PerformanceEvaluator + +evaluator = PerformanceEvaluator( + project_id="my-project", + dataset_id="my_dataset", ) ``` ### Evaluate a Session ```python -score = await judge.evaluate_session( +score = await evaluator.evaluate_session( trace_text="User: How do I reset my password?\nAgent: ...", final_response="Click 'Forgot Password' on the login page.", ) @@ -264,7 +249,7 @@ print(f"Feedback: {score.llm_feedback}") ```python report = client.evaluate( - evaluator=LLMAsJudge.correctness(threshold=0.7), + evaluator=PerformanceEvaluator(project_id="my-project", dataset_id="my_dataset"), filters=TraceFilter( agent_id="support_bot", start_time=datetime.now() - timedelta(days=1), @@ -308,7 +293,7 @@ purely normalized metrics: ```python report = client.evaluate( - evaluator=LLMAsJudge.correctness(threshold=0.7), + evaluator=PerformanceEvaluator(project_id="my-project", dataset_id="my_dataset"), filters=TraceFilter(agent_id="support_bot"), strict=True, ) @@ -333,25 +318,13 @@ The `details` dict on `EvaluationReport` holds operational metadata that is sepa --- -## 5. Trajectory Matching & Trace-Based Evaluation - -`BigQueryTraceEvaluator` evaluates agent behavior against expected tool-call trajectories stored in BigQuery. It supports three matching modes and optional LLM-as-judge scoring. - -### Match Types - -| Mode | Description | Use Case | -|------|-------------|----------| -| `EXACT` | Tools must match in exact order and count | Strict regression tests | -| `IN_ORDER` | Expected tools appear in order, extras allowed between | Flexible workflow checks | -| `ANY_ORDER` | All expected tools present, any order | Capability verification | - ### Evaluate Against a Golden Trajectory ```python -from bigquery_agent_analytics import BigQueryTraceEvaluator -from bigquery_agent_analytics.trace_evaluator import MatchType +from bigquery_agent_analytics import PerformanceEvaluator +from bigquery_agent_analytics.performance_evaluator import MatchType -evaluator = BigQueryTraceEvaluator( +evaluator = PerformanceEvaluator( project_id="my-project", dataset_id="agent_analytics", # Optional: filter which event types are fetched from BigQuery. 
@@ -377,46 +350,13 @@ print(f"Response match: {result.scores.get('response_match')}") print(f"Step efficiency: {result.scores.get('step_efficiency')}") ``` -### Batch Evaluation - -```python -eval_dataset = [ - { - "session_id": "sess-001", - "expected_trajectory": [ - {"tool_name": "search_docs", "args": {}}, - ], - "expected_response": "Reset your password at ...", - "task_description": "Password reset query", - }, - { - "session_id": "sess-002", - "expected_trajectory": [ - {"tool_name": "lookup_order", "args": {}}, - {"tool_name": "check_status", "args": {}}, - ], - }, -] - -results = await evaluator.evaluate_batch( - eval_dataset, - match_type=MatchType.IN_ORDER, - use_llm_judge=True, - concurrency=5, -) - -for r in results: - print(f"{r.session_id}: {r.eval_status.value} " - f"(overall={r.overall_score:.2f})") -``` - ### Trajectory Metrics (Standalone) Use `TrajectoryMetrics` for direct score computation without BigQuery: ```python from bigquery_agent_analytics import TrajectoryMetrics -from bigquery_agent_analytics.trace_evaluator import ToolCall +from bigquery_agent_analytics.performance_evaluator import ToolCall actual = [ ToolCall(tool_name="search", args={"q": "test"}), @@ -432,6 +372,29 @@ in_order = TrajectoryMetrics.compute_in_order_match(actual, expected) # 1.0 efficiency = TrajectoryMetrics.compute_step_efficiency(2, 2) # 1.0 ``` +### Standalone & Direct Evaluations + +You can call specialized sub-evaluation methods directly to execute deterministic trajectory math or invoke LLM judging independently: + +```python +# 1. Compute deterministic trajectory metrics directly from a SessionTrace +scores = evaluator.evaluate_deterministic_trajectory( + trace=trace, + golden_trajectory=[{"tool_name": "search", "args": {}}], + match_type=MatchType.EXACT, +) +print(scores) # {'trajectory_exact_match': 1.0, 'step_efficiency': 1.0} + +# 2. Invoke LLM judge directly on a trace +scores, feedback = await evaluator.llm_judge_evaluate( + trace=trace, + task_description="Assist user with query.", + expected_trajectory=None, # set to golden for side-by-side correctness + golden_response=None, # set to golden answer for side-by-side reasoning +) +print(scores) # {'sentiment': 8.0, 'hallucination': 10.0} +``` + ### Deterministic Replay Replay a recorded session step-by-step for debugging: @@ -456,9 +419,9 @@ print(f"Response match: {diff['response_match']}") --- -## 6. Multi-Trial Evaluation (pass@k / pass^k) +## 5. Multi-Trial Evaluation (pass@k / pass^k) -Agents are non-deterministic -- a single evaluation run is not statistically meaningful. `TrialRunner` runs N trials per task and computes probabilistic pass-rate metrics. +Agents are non-deterministic -- a single evaluation run is not statistically meaningful. `MultiTrialPerformanceEvaluator` runs N trials per task and computes probabilistic pass-rate metrics. 
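For intuition, both metrics can be computed with a few lines of standard probability. The sketch below uses the usual estimators; `compute_pass_pow_k` (shown later in this section) and the SDK's other helpers remain the source of truth, and the SDK's exact pass@k definition may differ in detail:

```python
from math import comb

# Sketch of the standard pass-rate estimators (the SDK's own helpers, e.g.
# compute_pass_pow_k used later in this section, remain the source of truth).
def pass_at_k(num_trials: int, num_passed: int, k: int) -> float:
    """Probability that at least one of k sampled trials passes."""
    num_failed = num_trials - num_passed
    if num_failed < k:
        return 1.0
    return 1.0 - comb(num_failed, k) / comb(num_trials, k)

def pass_pow_k(num_trials: int, num_passed: int) -> float:
    """Probability that all trials pass: (per-trial pass rate) ** num_trials."""
    return (num_passed / num_trials) ** num_trials

print(pass_pow_k(10, 8))    # ~0.107 — matches the compute_pass_pow_k example below
print(pass_at_k(10, 8, 3))  # 1.0 — with only 2 failing trials, any 3 sampled include a pass
```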
### Key Metrics @@ -471,14 +434,14 @@ Agents are non-deterministic -- a single evaluation run is not statistically mea ### Run Multi-Trial Evaluation ```python -from bigquery_agent_analytics import BigQueryTraceEvaluator, TrialRunner +from bigquery_agent_analytics import PerformanceEvaluator, MultiTrialPerformanceEvaluator -evaluator = BigQueryTraceEvaluator( +evaluator = PerformanceEvaluator( project_id="my-project", dataset_id="analytics", ) -runner = TrialRunner( +runner = MultiTrialPerformanceEvaluator( evaluator, num_trials=10, # run each task 10 times concurrency=3, # max 3 concurrent evaluations @@ -533,9 +496,9 @@ pass_pow_k = compute_pass_pow_k(num_trials=10, num_passed=8) # ~0.107 --- -## 7. Grader Composition Pipeline +## 6. Grader Composition Pipeline -Combine multiple evaluators (`CodeEvaluator` + `LLMAsJudge` + custom functions) into a single aggregated verdict using configurable scoring strategies. +Combine multiple evaluators (`SystemEvaluator` + `PerformanceEvaluator` + custom functions) into a single aggregated verdict using configurable scoring strategies. ### Scoring Strategies @@ -549,12 +512,12 @@ Combine multiple evaluators (`CodeEvaluator` + `LLMAsJudge` + custom functions) ```python from bigquery_agent_analytics import ( - CodeEvaluator, GraderPipeline, LLMAsJudge, + SystemEvaluator, AggregateGrader, PerformanceEvaluator, WeightedStrategy, GraderResult, ) pipeline = ( - GraderPipeline(WeightedStrategy( + AggregateGrader(WeightedStrategy( weights={ "latency_evaluator": 0.2, "cost_evaluator": 0.1, @@ -562,9 +525,9 @@ pipeline = ( }, threshold=0.6, )) - .add_code_grader(CodeEvaluator.latency(threshold_ms=5000), weight=0.2) - .add_code_grader(CodeEvaluator.cost_per_session(max_cost_usd=0.50), weight=0.1) - .add_llm_grader(LLMAsJudge.correctness(threshold=0.7), weight=0.7) + .add_system_grader(SystemEvaluator.latency(threshold_ms=5000), weight=0.2) + .add_system_grader(SystemEvaluator.cost_per_session(max_cost_usd=0.50), weight=0.1) + .add_performance_grader(PerformanceEvaluator(project_id="my-project",dataset_id="analytics")) ) verdict = await pipeline.evaluate( @@ -591,10 +554,10 @@ for g in verdict.grader_results: from bigquery_agent_analytics import BinaryStrategy pipeline = ( - GraderPipeline(BinaryStrategy()) - .add_code_grader(CodeEvaluator.latency(threshold_ms=3000)) - .add_code_grader(CodeEvaluator.error_rate(max_error_rate=0.05)) - .add_llm_grader(LLMAsJudge.hallucination(threshold=0.8)) + AggregateGrader(BinaryStrategy()) + .add_system_grader(SystemEvaluator.latency(threshold_ms=5000), weight=0.2) + .add_system_grader(SystemEvaluator.cost_per_session(max_cost_usd=0.50), weight=0.1) + .add_performance_grader(PerformanceEvaluator(project_id="my-project",dataset_id="analytics")) ) # If ANY grader fails, the overall verdict fails @@ -622,15 +585,16 @@ def business_rules_grader(context): ) pipeline = ( - GraderPipeline(BinaryStrategy()) - .add_code_grader(CodeEvaluator.latency()) - .add_custom_grader("business_rules", business_rules_grader) + AggregateGrader(BinaryStrategy()) + .add_system_grader(SystemEvaluator.latency(threshold_ms=5000), weight=0.2) + .add_system_grader(SystemEvaluator.cost_per_session(max_cost_usd=0.50), weight=0.1) + .add_performance_grader(PerformanceEvaluator(project_id="my-project",dataset_id="analytics")) ) ``` --- -## 8. Eval Suite Management +## 7. 
Eval Suite Management `EvalSuite` manages collections of evaluation tasks with lifecycle operations: tagging, filtering, graduation from capability to regression, saturation detection, and health monitoring. @@ -725,7 +689,7 @@ print(f"Graduated: {graduated}") # ["password_reset", "order_lookup"] ### Convert to Eval Dataset & Serialize ```python -# Convert to the format accepted by BigQueryTraceEvaluator.evaluate_batch() +# Convert to the format accepted by PerformanceEvaluator.evaluate_batch() dataset = suite.to_eval_dataset(category=EvalCategory.REGRESSION) results = await evaluator.evaluate_batch(dataset) @@ -740,7 +704,7 @@ restored_suite = EvalSuite.from_json(open("eval_suite_v2.json").read()) --- -## 9. Eval Quality Validation +## 8. Eval Quality Validation `EvalValidator` runs static checks on your eval suite to catch common pitfalls before you waste compute on unreliable evaluations. @@ -792,7 +756,7 @@ duplicates = EvalValidator.check_duplicate_sessions(tasks) --- -## 10. Drift Detection & Feedback Loops +## 9. Drift Detection & Feedback Loops Compare your golden dataset against production traffic to understand coverage gaps. @@ -868,7 +832,7 @@ distribution = client.deep_analysis( --- -## 11. Agent Insights +## 10. Agent Insights Generate a comprehensive multi-stage analysis report from your agent's production sessions. @@ -941,7 +905,7 @@ print(f"Avg effectiveness: {agg.avg_effectiveness:.1f}/10") --- -## 12. Long-Horizon Agent Memory +## 11. Long-Horizon Agent Memory Give agents memory across sessions using historical trace data. @@ -1030,7 +994,7 @@ summary, recent = await ctx_mgr.summarize_old_context( --- -## 13. BigQuery AI/ML Integration +## 12. BigQuery AI/ML Integration Direct access to BigQuery's native AI capabilities for advanced analytics. @@ -1161,7 +1125,7 @@ await batch_eval.store_evaluation_results( --- -## 14. BigFrames Evaluator (DataFrame API) +## 13. BigFrames Evaluator (DataFrame API) For notebook-friendly workflows, `BigFramesEvaluator` returns pandas-compatible DataFrames powered by BigFrames. @@ -1208,7 +1172,7 @@ print(facets_df.columns.tolist()) --- -## 15. Event Semantics +## 14. Event Semantics The `event_semantics` module centralizes the logic for interpreting ADK plugin events so that every module uses consistent definitions. Import helpers instead of re-implementing event-type checks. @@ -1243,7 +1207,7 @@ print(ALL_KNOWN_EVENT_TYPES) --- -## 16. BigQuery View Management +## 15. BigQuery View Management `ViewManager` creates per-event-type BigQuery views that unnest the generic `agent_events` table into typed columns. Each view retains standard identity headers (`timestamp`, `agent`, `session_id`, etc.). @@ -1268,7 +1232,7 @@ print(vm.get_view_sql("TOOL_COMPLETED")) --- -## 17. Categorical Evaluation & Real-Time Dashboards +## 16. Categorical Evaluation & Real-Time Dashboards The **Categorical Evaluator** classifies agent sessions into user-defined categories (e.g. tone: positive/negative/neutral, outcome: resolved/escalated/dropped) using BigQuery's `AI.GENERATE` with automatic Gemini API fallback. Results are persisted to an append-only table and deduplicated at read time via dashboard views. @@ -1479,7 +1443,7 @@ See [`examples/categorical_dashboard.sql`](examples/categorical_dashboard.sql) f --- -## 18. Context Graph (Property Graph for Agentic Ads) +## 17. 
Context Graph (Property Graph for Agentic Ads) The **Context Graph** module builds a BigQuery Property Graph that cross-links technical execution traces (TechNodes) with business-domain entities (BizNodes). It enables GQL-based trace reconstruction, causal reasoning, and world-change detection for long-running agent tasks. @@ -1843,7 +1807,7 @@ bq-agent-sdk views create LLM_REQUEST --project-id=P --dataset-id=D --- -## 20. Remote Function (BigQuery SQL Interface) +## 19. Remote Function (BigQuery SQL Interface) Deploy the SDK as a BigQuery Remote Function to call analytics operations directly from SQL. @@ -1930,7 +1894,7 @@ The function reads config from `userDefinedContext` (set via --- -## 21. Continuous Queries (Real-Time Streaming) +## 20. Continuous Queries (Real-Time Streaming) Pre-built SQL templates for BigQuery continuous queries that process agent events in real time as they arrive. @@ -1970,7 +1934,7 @@ handle aggregation. --- -## 22. Usage Telemetry +## 21. Usage Telemetry Every BigQuery job the SDK submits is labeled so operators can attribute spend, latency, and adoption directly from @@ -2026,12 +1990,12 @@ bigquery_agent_analytics/ │ Core │ ├── client.py ← High-level SDK entry point │ ├── trace.py ← Trace/Span reconstruction & DAG rendering -│ └── evaluators.py ← CodeEvaluator + LLMAsJudge + SQL templates +│ ├── system_evaluator.py ← SystemEvaluator │ │ Evaluation Harness -│ ├── trace_evaluator.py ← BigQueryTraceEvaluator, trajectory matching, replay -│ ├── multi_trial.py ← TrialRunner, pass@k, pass^k -│ ├── grader_pipeline.py ← GraderPipeline + scoring strategies +│ ├── performance_evaluator.py ← PerformanceEvaluator, trajectory matching, replay +│ ├── multi_trial_performance_evaluator.py ← MultiTrialPerformanceEvaluator, pass@k, pass^k +│ ├── aggregate_grader.py ← AggregateGrader + scoring strategies │ ├── eval_suite.py ← EvalSuite lifecycle management │ └── eval_validator.py ← Static validation checks │ @@ -2070,8 +2034,8 @@ bigquery_agent_analytics/ ``` Standalone modules (no internal imports): ├── trace.py -├── evaluators.py -├── trace_evaluator.py +├── system_evaluator.py +├── performance_evaluator.py ├── feedback.py ├── ai_ml_integration.py ├── bigframes_evaluator.py @@ -2083,12 +2047,12 @@ Standalone modules (no internal imports): └── eval_suite.py Modules with internal imports: -├── insights.py → evaluators -├── grader_pipeline.py → evaluators -├── multi_trial.py → trace_evaluator +├── insights.py → system_evaluator +├── aggregate_grader.py → system_evaluator +├── multi_trial_performance_evaluator.py → performance_evaluator ├── eval_validator.py → eval_suite ├── categorical_views.py → categorical_evaluator (DEFAULT_RESULTS_TABLE) -└── client.py → evaluators, feedback, insights, trace, context_graph, categorical_* +└── client.py → system_evaluator, feedback, insights, trace, context_graph, categorical_* External dependency: └── memory_service.py → google-adk (memory + sessions) diff --git a/docs/design.md b/docs/design.md index 7696c83..b9ddd2b 100644 --- a/docs/design.md +++ b/docs/design.md @@ -150,9 +150,8 @@ As demonstrated in the [e2e demo](../examples/e2e_demo.py): **Phase 2 — Evaluation:** 1. `Client.get_trace()` retrieves all events for a session -2. `CodeEvaluator` preset factories assess latency, turn count, error rate, token efficiency -3. `LLMAsJudge.correctness()` performs semantic evaluation via BigQuery `AI.GENERATE` -4. `BigQueryTraceEvaluator.evaluate_session()` performs trajectory matching against golden tool sequences +2. 
`SystemEvaluator` preset factories assess latency, turn count, error rate, token efficiency
+3. `PerformanceEvaluator` evaluates performance metrics (trajectory matching and LLM-judged criteria)

**Phase 3 — Insights:**
1. `Client.insights()` triggers the multi-stage pipeline
@@ -222,7 +221,7 @@ As demonstrated in the [e2e demo](../examples/e2e_demo.py):
|-------|---------|----------------|
| **Entry Point** | `client.py` | High-level sync API, BigQuery query orchestration |
| **Core Data** | `trace.py` | Trace/Span reconstruction, DAG rendering, filtering |
-| **Evaluation Engine** | `evaluators.py`, `trace_evaluator.py`, `multi_trial.py`, `grader_pipeline.py` | Deterministic metrics, LLM-as-judge, trajectory matching, multi-trial statistics, grader composition |
+| **Evaluation Engine** | `system_evaluator.py`, `performance_evaluator.py`, `multi_trial_performance_evaluator.py`, `aggregate_grader.py` | Deterministic metrics, LLM-as-judge, trajectory matching, multi-trial statistics, grader composition |
| **Categorical Evaluation** | `categorical_evaluator.py`, `categorical_views.py` | User-defined categorical classification with AI.GENERATE + Gemini fallback, dashboard views with dedup |
| **Eval Governance** | `eval_suite.py`, `eval_validator.py` | Task lifecycle management, static quality validation |
| **Feedback & Insights** | `feedback.py`, `insights.py` | Drift detection, question distribution, multi-stage analysis pipeline |
@@ -248,7 +247,7 @@ Aggregations, filtering, joins, and even LLM evaluation (via `AI.GENERATE`) are
LLM-based evaluation can run via (1) BigQuery `AI.GENERATE`, (2) legacy BigQuery ML `ML.GENERATE_TEXT`, or (3) the Gemini API directly. This maximizes compatibility across different GCP configurations.

**Decision 4: Composition over inheritance.**
-The `GraderPipeline` composes `CodeEvaluator`, `LLMAsJudge`, and custom functions via a builder pattern rather than requiring them to share a common base class. The `BigQueryMemoryService` composes four internal services rather than extending a single monolithic class.
+The `AggregateGrader` composes `SystemEvaluator`, `PerformanceEvaluator`, and custom functions via a builder pattern rather than requiring them to share a common base class. The `BigQueryMemoryService` composes four internal services rather than extending a single monolithic class.

---

@@ -392,11 +391,7 @@ class TraceFilter:

Each field generates a separate `AND` condition with a corresponding `bigquery.ScalarQueryParameter` or `bigquery.ArrayQueryParameter`. This is the **only** dynamic SQL in the SDK — everything else uses static templates.

-### 4.3 `evaluators.py` — Code & LLM Evaluation
-
-This module contains two evaluator classes and the SQL templates that power batch evaluation.
-
-#### 4.3.1 `CodeEvaluator`
+### 4.3 `system_evaluator.py` — System Metric Evaluation

Deterministic evaluation using code-defined metric functions.

@@ -453,62 +448,11 @@ Aggregates per-session statistics from raw events:
- `SUM(JSON_VALUE(content, '$.usage.total'))` as total_tokens
- Turn count from `USER_MESSAGE_RECEIVED` events

-#### 4.3.2 `LLMAsJudge`
-
-Semantic evaluation using an LLM as the scoring engine.
-
-**Internal storage:**
-
-```python
-_criteria: list[dict]  # [{"name": str, "prompt_template": str, "score_key": str, "threshold": float}]
-```
-
-Prompt templates use `{trace_text}` and `{final_response}` placeholders.
-
-**Pre-built factory methods:**
-
-| Factory | Evaluates | Score Key |
-|---------|-----------|-----------|
-| `correctness(threshold)` | Factual accuracy and relevance | `correctness` |
-| `hallucination(threshold)` | Unsupported claims in response | `hallucination` |
-| `sentiment(threshold)` | Interaction tone and helpfulness | `sentiment` |
-
-**`evaluate_session(trace_text, final_response) -> SessionScore`** (async):
+### 4.4 `performance_evaluator.py` — Performance Metric Evaluation

-For each criterion:
-1. Format prompt template with `trace_text` and `final_response`
-2. Call LLM (via `google-genai` API)
-3. Parse JSON response to extract numeric score
-4. Normalize to `[0.0, 1.0]` (divide by 10)
+#### 4.4.1 `PerformanceEvaluator`

-**SQL template** (`AI_GENERATE_JUDGE_BATCH_QUERY`):
-
-Performs batch evaluation entirely within BigQuery:
-
-```sql
-WITH session_traces AS (
-  SELECT session_id,
-         STRING_AGG(... ORDER BY timestamp) AS trace_text,
-         ARRAY_REVERSE(ARRAY_AGG(... ORDER BY timestamp))[SAFE_OFFSET(0)] AS final_response
-  FROM `{table}` WHERE {where}
-  GROUP BY session_id
-)
-SELECT session_id, trace_text, final_response,
-       AI.GENERATE(
-         CONCAT(@judge_prompt, '\n\nTrace:\n', trace_text, '\n\nResponse:\n', final_response),
-         endpoint => @endpoint,
-         output_schema => STRUCT(...)
-       ).*
-FROM session_traces
-```
-
-This avoids transferring trace data out of BigQuery for evaluation.
-
-### 4.4 `trace_evaluator.py` — Trajectory Matching & Replay
-
-#### 4.4.1 `BigQueryTraceEvaluator`
-
-Evaluates agent behavior against expected tool-call trajectories.
+Evaluates performance metrics by leveraging agent-generated traces/responses and, optionally, golden traces/responses.

**`evaluate_session()` flow:**

```
1. Fetch expected trajectory from BigQuery (if task_id given)
2. Fetch actual trace events
3. Extract actual ToolCall sequence
4. Compute trajectory score (based on MatchType)
5. Compute step efficiency
-6. Optionally run LLM-as-judge on response quality
+6. Run LLM-based metrics to evaluate correctness, hallucination, sentiment, and efficiency
7. Determine pass/fail against thresholds
8. Return EvaluationResult
```
@@ -566,11 +510,11 @@ Deterministic replay for debugging and comparison:
- **`replay_session(session_id, replay_mode, step_callback)`**: Fetches trace, replays events in order. Modes: `"full"` (all events), `"step"` (with callback per event), `"tool_only"` (only tool events)
- **`compare_replays(session_a, session_b)`**: Replays both sessions, diffs tool sequences and response similarity

-### 4.5 `multi_trial.py` — Statistical Evaluation
+### 4.5 `multi_trial_performance_evaluator.py` — Statistical Evaluation

-Agents are non-deterministic. A single evaluation run is not statistically meaningful. `TrialRunner` addresses this.
+Agents are non-deterministic. A single evaluation run is not statistically meaningful. `MultiTrialPerformanceEvaluator` addresses this.

-**`TrialRunner(evaluator, num_trials, concurrency)`:**
+**`MultiTrialPerformanceEvaluator(evaluator, num_trials, concurrency)`:**

Runs N trials of the same evaluation task with bounded concurrency via `asyncio.Semaphore`.

@@ -610,7 +554,7 @@ class MultiTrialReport(BaseModel):
    trial_results: list[TrialResult]
```

-### 4.6 `grader_pipeline.py` — Grader Composition
+### 4.6 `aggregate_grader.py` — Grader Composition

Combines heterogeneous evaluators into a unified verdict using a strategy pattern.
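The sketch below illustrates the shape of that composition — the class and field names are assumptions for exposition, not the SDK's actual definitions:

```python
# Stripped-down sketch of the composition pattern — the field names and
# strategy internals here are illustrative, not the SDK's actual classes.
from dataclasses import dataclass


@dataclass
class SketchGraderResult:
    grader_name: str
    score: float  # normalized to [0.0, 1.0]


class SketchWeightedStrategy:
    def __init__(self, weights: dict[str, float], threshold: float):
        self.weights, self.threshold = weights, threshold

    def aggregate(self, results: list[SketchGraderResult]) -> tuple[float, bool]:
        total = sum(self.weights.get(r.grader_name, 0.0) for r in results) or 1.0
        overall = sum(r.score * self.weights.get(r.grader_name, 0.0) for r in results) / total
        return overall, overall >= self.threshold


class SketchBinaryStrategy:
    def aggregate(self, results: list[SketchGraderResult], min_score: float = 0.5):
        # All-pass semantics: the verdict fails if any grader scores below min_score.
        passed = all(r.score >= min_score for r in results)
        return (1.0 if passed else 0.0), passed
```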
@@ -618,21 +562,21 @@ Combines heterogeneous evaluators into a unified verdict using a strategy patter ``` ┌──────────────────┐ - │ GraderPipeline │ + │ AggregateGrader │ │ │ │ strategy: ──────┼──► ScoringStrategy │ graders: ───────┼──► list[_GraderEntry] └────────┬─────────┘ │ - ┌──────────────┼──────────────┐ - ▼ ▼ ▼ - CodeEvaluator LLMAsJudge Custom Fn - (sync) (async) (sync) - │ │ │ - ▼ ▼ ▼ - GraderResult GraderResult GraderResult - │ │ │ - └──────────────┼──────────────┘ + ┌──────────────┼─────────────────┐ + ▼ ▼ ▼ + SystemEvaluator PerformanceEvaluator Custom Fn + (sync) (async) (sync) + │ │ │ + ▼ ▼ ▼ + GraderResult GraderResult GraderResult + │ │ │ + └──────────────┼─────────────────┘ ▼ ┌────────────────┐ │ScoringStrategy │ @@ -1219,10 +1163,10 @@ results = client.query(formatted, job_config=job_config) |--------|----------|---------| | `client.py` | `_SESSION_EVENTS_QUERY` | Fetch all events for a session | | `client.py` | `_LIST_SESSIONS_QUERY` | Discover sessions matching filter | -| `evaluators.py` | `SESSION_SUMMARY_QUERY` | Aggregate session metrics for code evaluation | -| `evaluators.py` | `AI_GENERATE_JUDGE_BATCH_QUERY` | Batch LLM-as-judge via AI.GENERATE | -| `evaluators.py` | `LLM_JUDGE_BATCH_QUERY` | Legacy batch evaluation via ML.GENERATE_TEXT | -| `trace_evaluator.py` | `_SESSION_TRACE_QUERY` | Fetch trace for trajectory matching | +| `system_evaluator.py` | `SESSION_SUMMARY_QUERY` | Aggregate session metrics for code evaluation | +| `system_evaluator.py` | `AI_GENERATE_JUDGE_BATCH_QUERY` | Batch LLM-as-judge via AI.GENERATE | +| `system_evaluator.py` | `LLM_JUDGE_BATCH_QUERY` | Legacy batch evaluation via ML.GENERATE_TEXT | +| `performance_evaluator.py` | `_SESSION_TRACE_QUERY` | Fetch trace for trajectory matching | | `insights.py` | `_SESSION_METADATA_QUERY` | Aggregate session metadata | | `insights.py` | `_SESSION_TRANSCRIPT_QUERY` | Build session transcripts | | `insights.py` | `_AI_GENERATE_FACET_EXTRACTION_QUERY` | Extract structured facets via AI.GENERATE | @@ -1258,32 +1202,29 @@ results = client.query(formatted, job_config=job_config) ``` Evaluation -├── Deterministic (CodeEvaluator) +├── Deterministic (SystemEvaluator) │ ├── Latency │ ├── Turn count │ ├── Error rate │ ├── Token efficiency +│ ├── Time to first token │ ├── Cost per session │ └── Custom metric functions │ -├── Semantic (LLMAsJudge) +├── Performance (PerformanceEvaluator) │ ├── Correctness │ ├── Hallucination +│ ├── Efficiency │ ├── Sentiment +│ ├── Trajectory matching (exact, in-order, any-order, step efficiency) │ └── Custom criteria with prompt templates │ -├── Trajectory (BigQueryTraceEvaluator) -│ ├── Exact match -│ ├── In-order match -│ ├── Any-order match -│ └── Step efficiency -│ -├── Composite (GraderPipeline) +├── Composite (AggregateGrader) │ ├── Weighted average │ ├── Binary (all-pass) │ └── Majority vote │ -└── Statistical (TrialRunner) +└── Statistical (MultiTrialPerformanceEvaluator) ├── pass@k ├── pass^k └── Per-trial pass rate @@ -1306,12 +1247,12 @@ All evaluation scores in the SDK are normalized to `[0.0, 1.0]`: | Mode | Evaluator | Where Computation Runs | |------|-----------|----------------------| -| Single session (sync) | `CodeEvaluator.evaluate_session()` | Python | -| Single session (async) | `LLMAsJudge.evaluate_session()` | Gemini API | +| Single session (sync) | `SystemEvaluator.evaluate_session()` | Python | +| Single session (async) | `PerformanceEvaluator.evaluate_session()` | Python, Gemini API | | Batch via Client | `Client.evaluate()` | BigQuery (SQL + 
AI.GENERATE) | -| Trajectory matching | `BigQueryTraceEvaluator.evaluate_session()` | BigQuery (fetch) + Python (matching) | -| Multi-trial | `TrialRunner.run_trials()` | BigQuery (fetch) + Python (N iterations) | -| Pipeline | `GraderPipeline.evaluate()` | Mixed (code=Python, LLM=API/BQ) | +| Trajectory matching | `PerformanceEvaluator.evaluate_session()` | BigQuery (fetch) + Python (matching) | +| Multi-trial | `MultiTrialPerformanceEvaluator.run_trials()` | BigQuery (fetch) + Python (N iterations) | +| Pipeline | `AggregateGrader.evaluate()` | Mixed (code=Python, LLM=API/BQ) | | DataFrame | `BigFramesEvaluator.evaluate_sessions()` | BigQuery (BigFrames + AI.GENERATE) | --- @@ -1405,18 +1346,17 @@ Synchronous (user-facing): ├── Client.drift_detection() ├── Client.insights() ├── Client.deep_analysis() -├── CodeEvaluator.evaluate_session() +├── SystemEvaluator.evaluate_session() ├── EvalSuite.* ├── EvalValidator.* └── BigFramesEvaluator.* Async (internal / advanced users): -├── LLMAsJudge.evaluate_session() -├── BigQueryTraceEvaluator.evaluate_session() -├── BigQueryTraceEvaluator.evaluate_batch() -├── TrialRunner.run_trials() -├── TrialRunner.run_trials_batch() -├── GraderPipeline.evaluate() +├── PerformanceEvaluator.evaluate_session() +├── PerformanceEvaluator.evaluate_batch() +├── MultiTrialPerformanceEvaluator.run_trials() +├── MultiTrialPerformanceEvaluator.run_trials_batch() +├── AggregateGrader.evaluate() ├── BigQueryMemoryService.search_memory() ├── BigQueryMemoryService.get_session_context() ├── compute_drift() @@ -1449,7 +1389,7 @@ async def _execute_query(self, query, params): ### 9.4 Concurrency Control -`TrialRunner` and `BigQueryTraceEvaluator.evaluate_batch()` use `asyncio.Semaphore` for bounded concurrency: +`MultiTrialPerformanceEvaluator` and `PerformanceEvaluator.evaluate_batch()` use `asyncio.Semaphore` for bounded concurrency: ```python semaphore = asyncio.Semaphore(concurrency) @@ -1465,10 +1405,10 @@ results = await asyncio.gather(*[_run_one(t) for t in tasks]) ## 10. Extensibility & Plugin Points -### 10.1 Custom Metrics (CodeEvaluator) +### 10.1 Custom Metrics (SystemEvaluator) ```python -evaluator = CodeEvaluator(name="custom").add_metric( +evaluator = SystemEvaluator(name="custom").add_metric( name="business_metric", fn=lambda session: your_scoring_logic(session), threshold=0.7, @@ -1477,18 +1417,24 @@ evaluator = CodeEvaluator(name="custom").add_metric( The metric function receives the full session summary dict and returns `float` in `[0, 1]`. -### 10.2 Custom Judge Criteria (LLMAsJudge) +### 10.2 Custom Judge Criteria (PerformanceEvaluator) ```python -judge = LLMAsJudge(name="custom").add_criterion( - name="domain_accuracy", - prompt_template="Evaluate accuracy...\n{trace_text}\n{final_response}", - score_key="accuracy", - threshold=0.8, +evaluator = PerformanceEvaluator( + project_id="my-project", + dataset_id="agent_analytics", +) + +# Register a custom semantic evaluation rubric with a pass/fail threshold +evaluator.add_rubric( + name="brand_alignment", + prompt_template="Does the agent explicitly mention the company name and remain positive? 
Rate 1 to 5.", + score_key="brand_alignment", + threshold=4.0, ) ``` -### 10.3 Custom Graders (GraderPipeline) +### 10.3 Custom Graders (AggregateGrader) ```python def my_grader(context: dict) -> GraderResult: @@ -1516,7 +1462,7 @@ Every class that uses BigQuery accepts an optional client parameter: ```python Client(project_id="...", dataset_id="...", bq_client=custom_client) -BigQueryTraceEvaluator(..., bq_client=mock_client) +BigQueryTraceEvaluator(..., bq_client=mock_client) -> PerformanceEvaluator(..., bq_client=mock_client) BigQueryAIClient(..., client=mock_client) ``` @@ -1571,7 +1517,7 @@ All tests mock BigQuery — no GCP credentials or live BigQuery access is needed ``` tests/ ├── test_sdk_client.py # Client integration tests -├── test_sdk_evaluators.py # CodeEvaluator + LLMAsJudge +├── test_sdk_evaluators.py # SystemEvaluator + PerformanceEvaluator ├── test_sdk_trace.py # Trace/Span reconstruction ├── test_sdk_feedback.py # Drift detection ├── test_sdk_insights.py # Insights pipeline diff --git a/docs/hatteras_evaluation.md b/docs/hatteras_evaluation.md index 5d61b25..6fec2f3 100644 --- a/docs/hatteras_evaluation.md +++ b/docs/hatteras_evaluation.md @@ -7,7 +7,7 @@ agent sessions into user-defined categories directly against traces stored in BigQuery, without relying on an external service. This should be implemented as a new categorical evaluation subsystem, not as -an overload of the existing numeric `CodeEvaluator` / `LLMAsJudge` report +an overload of the existing numeric `SystemEvaluator` / `LLMAsJudge` report path. The goal is to support Hatteras-like functionality inside the SDK: @@ -22,7 +22,7 @@ The goal is to support Hatteras-like functionality inside the SDK: Today the SDK supports two major evaluation modes: -- deterministic numeric scoring via `CodeEvaluator` +- deterministic numeric scoring via `SystemEvaluator` - semantic numeric scoring via `LLMAsJudge` What is missing is a first-class way to answer questions like: @@ -60,7 +60,7 @@ That capability is useful for: This design is not proposing: - a full clone of an external Hatteras service -- a replacement for `CodeEvaluator` +- a replacement for `SystemEvaluator` - a replacement for `LLMAsJudge` - a new remote function or Python UDF surface in the first phase - real-time ingestion-time classification in phase 1 diff --git a/docs/implementation_plan_concept_index_runtime.md b/docs/implementation_plan_concept_index_runtime.md index af0971a..ad53596 100644 --- a/docs/implementation_plan_concept_index_runtime.md +++ b/docs/implementation_plan_concept_index_runtime.md @@ -165,7 +165,7 @@ Work: `bigquery_ontology/contrib/advertising/` stub with Yahoo's resolver (if co - `src/bigquery_ontology/graph_ddl_compiler.py` — add `compile_concept_index(ontology, binding, *, output_table) -> str`. Preserve `compile_graph()` contract byte-identically. No changes to existing function bodies. - `src/bigquery_ontology/cli.py:299` — `compile` command gains `--emit-concept-index` and `--concept-index-table` flags. When absent, behavior is byte-identical to today. - `src/bigquery_ontology/__init__.py` — add `from .graph_ddl_compiler import compile_concept_index` so the new public function is importable as `from bigquery_ontology import compile_concept_index`, matching the existing pattern for `compile_graph` (`__init__.py:50` today). 
-- `src/bigquery_agent_analytics/__init__.py` — add the new public surface to the try/except re-export block (same pattern as `Client`, `CodeEvaluator`, etc.): +- `src/bigquery_agent_analytics/__init__.py` — add the new public surface to the try/except re-export block (same pattern as `Client`, `SystemEvaluator`, etc.): - `OntologyRuntime` from `.ontology_runtime` - `EntityResolver`, `ExactMatchResolver`, `SynonymResolver`, `Candidate`, `ResolveResult` from `.entity_resolver` - `ConceptIndexMismatchError`, `ConceptIndexProvenanceMissing`, `ConceptIndexInconsistentPair`, `ConceptIndexRefreshed` from `.ontology_runtime` diff --git a/docs/implementation_plan_remote_function.md b/docs/implementation_plan_remote_function.md index 884e742..d295944 100644 --- a/docs/implementation_plan_remote_function.md +++ b/docs/implementation_plan_remote_function.md @@ -219,13 +219,30 @@ Dispatch logic: ```python # Map CLI --evaluator to SDK factory EVALUATOR_FACTORIES = { - "latency": lambda t: CodeEvaluator.latency(threshold_ms=t), - "error_rate": lambda t: CodeEvaluator.error_rate(max_error_rate=t), - "turn_count": lambda t: CodeEvaluator.turn_count(max_turns=int(t)), - "token_efficiency": lambda t: CodeEvaluator.token_efficiency(max_tokens=int(t)), - "ttft": lambda t: CodeEvaluator.ttft(threshold_ms=t), - "cost": lambda t: CodeEvaluator.cost_per_session(max_cost_usd=t), - "llm-judge": None, # special handling + "latency": ( + lambda t: SystemEvaluator.latency(threshold_ms=t), + lambda: SystemEvaluator.latency(), + ), + "error_rate": ( + lambda t: SystemEvaluator.error_rate(max_error_rate=t), + lambda: SystemEvaluator.error_rate(), + ), + "turn_count": ( + lambda t: SystemEvaluator.turn_count(max_turns=int(t)), + lambda: SystemEvaluator.turn_count(), + ), + "token_efficiency": ( + lambda t: SystemEvaluator.token_efficiency(max_tokens=int(t)), + lambda: SystemEvaluator.token_efficiency(), + ), + "ttft": ( + lambda t: SystemEvaluator.ttft(threshold_ms=t), + lambda: SystemEvaluator.ttft(), + ), + "cost": ( + lambda t: SystemEvaluator.cost_per_session(max_cost_usd=t), + lambda: SystemEvaluator.cost_per_session(), + ), } ``` @@ -289,7 +306,7 @@ import functions_framework from flask import jsonify from bigquery_agent_analytics import Client, serialize -from bigquery_agent_analytics import CodeEvaluator, LLMAsJudge +from bigquery_agent_analytics import SystemEvaluator, PerformanceEvaluator from bigquery_agent_analytics import TraceFilter @@ -385,18 +402,18 @@ def _dispatch(client, operation, params): def _build_evaluator(params): - """Build CodeEvaluator from params dict.""" + """Build SystemEvaluator from params dict.""" metric = params.get("metric", "latency") threshold = params.get("threshold", 5000) factories = { - "latency": lambda t: CodeEvaluator.latency(threshold_ms=t), - "error_rate": lambda t: CodeEvaluator.error_rate(max_error_rate=t), - "turn_count": lambda t: CodeEvaluator.turn_count(max_turns=int(t)), - "token_efficiency": lambda t: CodeEvaluator.token_efficiency( + "latency": lambda t: SystemEvaluator.latency(threshold_ms=t), + "error_rate": lambda t: SystemEvaluator.error_rate(max_error_rate=t), + "turn_count": lambda t: SystemEvaluator.turn_count(max_turns=int(t)), + "token_efficiency": lambda t: SystemEvaluator.token_efficiency( max_tokens=int(t) ), - "ttft": lambda t: CodeEvaluator.ttft(threshold_ms=t), - "cost": lambda t: CodeEvaluator.cost_per_session(max_cost_usd=t), + "ttft": lambda t: SystemEvaluator.ttft(threshold_ms=t), + "cost": lambda t: 
SystemEvaluator.cost_per_session(max_cost_usd=t), } factory = factories.get(metric) if not factory: @@ -405,18 +422,12 @@ def _build_evaluator(params): def _build_judge(params): - """Build LLMAsJudge from params dict.""" - criterion = params.get("criterion", "correctness") - threshold = params.get("threshold", 0.5) - factories = { - "correctness": lambda t: LLMAsJudge.correctness(threshold=t), - "hallucination": lambda t: LLMAsJudge.hallucination(threshold=t), - "sentiment": lambda t: LLMAsJudge.sentiment(threshold=t), - } - factory = factories.get(criterion) - if not factory: - raise ValueError(f"Unknown criterion: {criterion}") - return factory(threshold) + """Build PerformanceEvaluator from params dict.""" + return PerformanceEvaluator( + project_id=params.get("project_id"), + dataset_id=params.get("dataset_id"), + llm_judge_model=params.get("model"), + ) ``` **Key design decisions:** @@ -584,8 +595,8 @@ Complete mapping from interface operations to current SDK code: | Operation | SDK Method | File:Line | Return Type | Serialization Strategy | |-----------|-----------|-----------|-------------|----------------------| | `analyze` | `Client.get_session_trace()` | `client.py` | `Trace` (dataclass) | `serialize()` → recursive `.to_dict()` | -| `evaluate` | `Client.evaluate(CodeEvaluator)` | `client.py` | `EvaluationReport` (Pydantic) | `.model_dump(mode="json")` | -| `judge` | `Client.evaluate(LLMAsJudge)` | `client.py` | `EvaluationReport` (Pydantic) | `.model_dump(mode="json")` | +| `evaluate` | `Client.evaluate(SystemEvaluator)` | `client.py` | `EvaluationReport` (Pydantic) | `.model_dump(mode="json")` | +| `judge` | `Client.evaluate(PerformanceEvaluator)` | `client.py` | `EvaluationReport` (Pydantic) | `.model_dump(mode="json")` | | `insights` | `Client.insights()` | `client.py` | `InsightsReport` (Pydantic) | `.model_dump(mode="json")` | | `drift` | `Client.drift_detection()` | `client.py` | `DriftReport` (Pydantic) | `.model_dump(mode="json")` | | `distribution` | `Client.deep_analysis()` | `client.py` | `QuestionDistribution` (Pydantic) | `.model_dump(mode="json")` | @@ -597,22 +608,21 @@ Complete mapping from interface operations to current SDK code: | CLI `--evaluator` | SDK Factory | File | |-------------------|------------|------| -| `latency` | `CodeEvaluator.latency(threshold_ms)` | `evaluators.py` | -| `error_rate` | `CodeEvaluator.error_rate(max_error_rate)` | `evaluators.py` | -| `turn_count` | `CodeEvaluator.turn_count(max_turns)` | `evaluators.py` | -| `token_efficiency` | `CodeEvaluator.token_efficiency(max_tokens)` | `evaluators.py` | -| `ttft` | `CodeEvaluator.ttft(threshold_ms)` | `evaluators.py` | -| `cost` | `CodeEvaluator.cost_per_session(max_cost_usd)` | `evaluators.py` | -| `llm-judge` | `LLMAsJudge.correctness/hallucination/sentiment(threshold)` | `evaluators.py` | +| `latency` | `SystemEvaluator.latency(threshold_ms)` | `evaluators.py` | +| `error_rate` | `SystemEvaluator.error_rate(max_error_rate)` | `evaluators.py` | +| `turn_count` | `SystemEvaluator.turn_count(max_turns)` | `evaluators.py` | +| `token_efficiency` | `SystemEvaluator.token_efficiency(max_tokens)` | `evaluators.py` | +| `ttft` | `SystemEvaluator.ttft(threshold_ms)` | `evaluators.py` | +| `cost` | `SystemEvaluator.cost_per_session(max_cost_usd)` | `evaluators.py` | ### SDK Capabilities NOT Exposed (v1.2+ candidates) | SDK Feature | Class | Potential Operation | |-------------|-------|-------------------| | Context Graph | `ContextGraphManager` | `context_graph` | -| Trajectory Evaluation | 
`BigQueryTraceEvaluator` | `trajectory` | -| Multi-Trial | `TrialRunner` | `multi_trial` | -| Grader Pipeline | `GraderPipeline` | `grade` | +| Trajectory Evaluation | `PerformanceEvaluator` | `trajectory` | +| Multi-Trial | `MultiTrialPerformanceEvaluator` | `multi_trial` | +| Grader Pipeline | `AggregateGrader` | `grade` | | Memory Service | `BigQueryMemoryService` | (separate interface) | | Anomaly Detection & Forecasting | `AnomalyDetector` | `anomaly`, `forecast` | @@ -623,7 +633,7 @@ Complete mapping from interface operations to current SDK code: | Risk | Likelihood | Impact | Mitigation | |------|-----------|--------|------------| | Cloud Function cold start > 3s | Medium | Latency SLO breach | `--min-instances=1` for production | -| `LLMAsJudge` timeout in batch | Medium | Partial failure | Per-row error handling; `max_batching_rows=10` for judge | +| `PerformanceEvaluator` timeout in batch | Medium | Partial failure | Per-row error handling; `max_batching_rows=10` for judge | | `typer` version conflict with user deps | Low | CLI install failure | Optional `[cli]` extra isolates dependency | | `Trace.to_dict()` missing edge cases | Medium | Serialization crash | Comprehensive test matrix in Phase 1 | | `datetime` serialization regression | Medium | Silent JSON errors | CI test: `json.dumps(serialize(x))` for all types | diff --git a/docs/prd_unified_analytics_interface.md b/docs/prd_unified_analytics_interface.md index fb12fa0..c670bd6 100644 --- a/docs/prd_unified_analytics_interface.md +++ b/docs/prd_unified_analytics_interface.md @@ -32,7 +32,7 @@ import the library. This creates three gaps: │ Client.insights() Client.drift_detection()│ │ Client.doctor() Client.deep_analysis() │ │ Client.hitl_metrics() Client.context_graph() │ -│ ViewManager BigQueryTraceEvaluator │ +│ ViewManager PerformanceEvaluator │ │ TrialRunner GraderPipeline │ │ EvalSuite EvalValidator │ │ BigQueryMemoryService BigQueryAIClient │ @@ -109,8 +109,8 @@ All operations go through a single multiplexed function: | Operation | SDK Method | Params (JSON keys) | Output | |-----------|-----------|---------------------|--------| | `analyze` | `Client.get_session_trace()` + metrics | `session_id` | JSON with span count, error count, latency, tool calls | -| `evaluate` | `CodeEvaluator` | `session_id`, `metric`, `threshold` | JSON with passed, score, details | -| `judge` | `LLMAsJudge` | `session_id`, `criterion` | JSON with score, feedback | +| `evaluate` | `SystemEvaluator` | `session_id`, `metric`, `threshold` | JSON with passed, score, details | +| `judge` | `PerformanceEvaluator` | `session_id`, `criterion` | JSON with score, feedback | | `insights` | Facet extraction | `session_id` | JSON with intent, outcome, friction | | `drift` | Drift detection | `golden_dataset`, `agent_filter`, `start_date`, `end_date` | JSON with coverage, gaps | @@ -443,7 +443,7 @@ import functions_framework import json import os from flask import jsonify -from bigquery_agent_analytics import Client, CodeEvaluator, LLMAsJudge, TraceFilter +from bigquery_agent_analytics import Client, SystemEvaluator, PerformanceEvaluator, TraceFilter # Initialized once per cold start. Config comes from userDefinedContext # (forwarded by BigQuery) or environment variables as fallback. 
@@ -490,13 +490,13 @@ def _dispatch(client, operation, params): "final_response": trace.final_response, } elif operation == "evaluate": - evaluator = CodeEvaluator.latency(threshold_ms=params["threshold"]) + evaluator = SystemEvaluator.latency(threshold_ms=params["threshold"]) report = client.evaluate(evaluator=evaluator, filters=TraceFilter(session_ids=[params["session_id"]])) return report.details[0] if report.details else {} elif operation == "judge": - judge = getattr(LLMAsJudge, params["criterion"])() - report = client.evaluate(evaluator=judge, + performance_evaluator = PerformanceEvaluator() + report = client.evaluate(evaluator=performance_evaluator, filters=TraceFilter(session_ids=[params["session_id"]])) return report.details[0] if report.details else {} elif operation == "drift": diff --git a/docs/python_udf_support_design.md b/docs/python_udf_support_design.md index deba9ee..a7c549d 100644 --- a/docs/python_udf_support_design.md +++ b/docs/python_udf_support_design.md @@ -97,8 +97,8 @@ These constraints drive several design decisions: 2. Full `Client` reuse is not the right goal. Reason: `Client` is built around issuing BigQuery jobs, loading trace rows, and assembling rich Python objects such as `Trace`, - [`EvaluationReport`](/Users/haiyuancao/BigQuery-Agent-Analytics-SDK/src/bigquery_agent_analytics/evaluators.py), - and [`InsightsReport`](/Users/haiyuancao/BigQuery-Agent-Analytics-SDK/src/bigquery_agent_analytics/insights.py). + [`EvaluationReport`](/Users/haiyuancao/BigQuery-Agent-Analytics-SDK/src/bigquery_agent_analytics/system_evaluator.py), + and [`InsightsReport`](/Users/haiyuancao/BigQuery-Agent-Analytics-SDK/src/bigquery_agent_analytics/insights.py). 3. Python UDF support should be built around **small analytical kernels**, not around a UDF that internally reimplements the whole SDK client. 
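To make "small analytical kernels" concrete, here is a hedged sketch of the kind of pure function such a layer could expose. The name, signature, and exact formula are illustrative assumptions, not the current SDK surface — today the equivalent math lives inside `SystemEvaluator` factory closures, as noted later in this document:

```python
# Hypothetical pure kernel of the kind a udf_kernels.py layer could expose —
# the name, signature, and exact formula are illustrative, not the SDK's API.
def score_latency(avg_latency_ms: float, threshold_ms: float = 5000.0) -> float:
    """Linear degradation: 1.0 at zero latency, 0.0 at or beyond the threshold."""
    if threshold_ms <= 0:
        return 0.0
    return max(0.0, 1.0 - min(avg_latency_ms / threshold_ms, 1.0))


assert score_latency(2500.0) == 0.5
assert score_latency(10_000.0) == 0.0
```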
@@ -156,11 +156,11 @@ These parts of the SDK map well to Python UDFs: | Error detection | [event_semantics.py](/Users/haiyuancao/BigQuery-Agent-Analytics-SDK/src/bigquery_agent_analytics/event_semantics.py) | Strong | `BOOL` helpers such as `is_error_event` | | Tool outcome classification | [event_semantics.py](/Users/haiyuancao/BigQuery-Agent-Analytics-SDK/src/bigquery_agent_analytics/event_semantics.py) | Strong | `STRING` helpers such as `tool_outcome` | | Response text extraction | [event_semantics.py](/Users/haiyuancao/BigQuery-Agent-Analytics-SDK/src/bigquery_agent_analytics/event_semantics.py) | Good | parse-wrapper plus `STRING` extraction from a JSON-formatted `STRING` payload | -| Latency scoring | [evaluators.py](/Users/haiyuancao/BigQuery-Agent-Analytics-SDK/src/bigquery_agent_analytics/evaluators.py) | Strong | `FLOAT64` score kernel | -| Turn-count scoring | [evaluators.py](/Users/haiyuancao/BigQuery-Agent-Analytics-SDK/src/bigquery_agent_analytics/evaluators.py) | Strong | `FLOAT64` score kernel | -| Error-rate scoring | [evaluators.py](/Users/haiyuancao/BigQuery-Agent-Analytics-SDK/src/bigquery_agent_analytics/evaluators.py) | Strong | `FLOAT64` score kernel | -| TTFT scoring | [evaluators.py](/Users/haiyuancao/BigQuery-Agent-Analytics-SDK/src/bigquery_agent_analytics/evaluators.py) | Strong | `FLOAT64` score kernel | -| Cost scoring | [evaluators.py](/Users/haiyuancao/BigQuery-Agent-Analytics-SDK/src/bigquery_agent_analytics/evaluators.py) | Strong | wider `FLOAT64` score kernel over token and pricing inputs | +| Latency scoring | [system_evaluator.py](/Users/haiyuancao/BigQuery-Agent-Analytics-SDK/src/bigquery_agent_analytics/system_evaluator.py) | Strong | `FLOAT64` score kernel | +| Turn-count scoring | [system_evaluator.py](/Users/haiyuancao/BigQuery-Agent-Analytics-SDK/src/bigquery_agent_analytics/system_evaluator.py) | Strong | `FLOAT64` score kernel | +| Error-rate scoring | [system_evaluator.py](/Users/haiyuancao/BigQuery-Agent-Analytics-SDK/src/bigquery_agent_analytics/system_evaluator.py) | Strong | `FLOAT64` score kernel | +| TTFT scoring | [system_evaluator.py](/Users/haiyuancao/BigQuery-Agent-Analytics-SDK/src/bigquery_agent_analytics/system_evaluator.py) | Strong | `FLOAT64` score kernel | +| Cost scoring | [system_evaluator.py](/Users/haiyuancao/BigQuery-Agent-Analytics-SDK/src/bigquery_agent_analytics/system_evaluator.py) | Strong | wider `FLOAT64` score kernel over token and pricing inputs | These kernels are exactly the kind of logic that benefits from direct SQL invocation with no external deployment surface. 
@@ -172,7 +172,7 @@ primitive: | SDK area | Python UDF fit | Required redesign | |----------|----------------|-------------------| -| `Client.evaluate(CodeEvaluator, filters)` | Partial | SQL builds per-session summaries first; UDF computes scores from summary fields | +| `Client.evaluate(SystemEvaluator, filters)` | Partial | SQL builds per-session summaries first; UDF computes scores from summary fields | | `Client.deep_analysis()` / question distribution | Partial | SQL does grouping / embeddings / top-k; UDF can help with categorization or normalization | | `Client.drift_detection()` | Partial | SQL computes set logic; UDF may help with text normalization or thresholding | | `Client.insights()` | Partial | Best split into SQL extraction + optional UDF post-processing; not a direct port | @@ -204,8 +204,9 @@ Instead, the SDK should expose a new internal layer: ```text bigquery_agent_analytics/ client.py # BigQuery job orchestration - evaluators.py # existing evaluator logic event_semantics.py # existing canonical predicates + system_evaluator.py # existing system metric evaluators + performance_evaluator.py # existing performance metric evaluators udf_kernels.py # new: pure functions reused by Python UDFs udf_serialization.py # new: STRING envelope helpers if needed ``` @@ -224,8 +225,8 @@ That is maintainable. Reusing the entire client inside a Python UDF is not. The current evaluator score math is not implemented as standalone top-level functions today. It lives inside factory-method closures such as -`CodeEvaluator.latency()` and `CodeEvaluator.error_rate()` in -[evaluators.py](/Users/haiyuancao/BigQuery-Agent-Analytics-SDK/src/bigquery_agent_analytics/evaluators.py). +`SystemEvaluator.latency()` and `SystemEvaluator.error_rate()` in +[system_evaluator.py](/Users/haiyuancao/BigQuery-Agent-Analytics-SDK/src/bigquery_agent_analytics/system_evaluator.py). That means the first implementation step is a deliberate refactor: @@ -281,7 +282,7 @@ the shared extraction helper. ### 7.2 Tier 2: code-evaluator score kernels -These should map directly to the existing `CodeEvaluator` math: +These should map directly to the existing `SystemEvaluator` math: ```sql CREATE FUNCTION `PROJECT.UDF_DATASET.bqaa_score_latency`( @@ -497,7 +498,7 @@ Remote Function should still be described as: - Add `udf_kernels.py` - Move reusable evaluator math into standalone pure functions - Move reusable event semantic helpers into a UDF-safe layer -- Add unit tests proving parity with existing `CodeEvaluator` behavior +- Add unit tests proving parity with existing `SystemEvaluator` behavior ### Phase U2: Tier 1 and Tier 2 UDFs diff --git a/examples/agent_improvement_cycle/DEMO_NARRATION.md b/examples/agent_improvement_cycle/DEMO_NARRATION.md index 9c3d02c..ca01971 100644 --- a/examples/agent_improvement_cycle/DEMO_NARRATION.md +++ b/examples/agent_improvement_cycle/DEMO_NARRATION.md @@ -98,7 +98,7 @@ Step two sends those ten questions to the agent. Every session is logged to BigQ Step three is where the SDK earns its keep. The quality report script reads those sessions back from BigQuery and an LLM judge scores each one. Four sessions are marked unhelpful — the agent deflected instead of using its tools. One is partial. Five are meaningful. The baseline score: fifty percent meaningful. That's our starting point. -Right below the quality score, the SDK's deterministic CodeEvaluator runs on the same sessions — average latency, total tokens per session, turn count and error_rate. These are the operational baselines. 
No LLM needed, just math on the data already in BigQuery. We'll compare against these numbers after the improvement to make sure the new prompt didn't trade quality for cost. +Right below the quality score, the SDK's deterministic SystemEvaluator runs on the same sessions — average latency, total tokens per session, turn count and error_rate. These are the operational baselines. No LLM needed, just math on the data already in BigQuery. We'll compare against these numbers after the improvement to make sure the new prompt didn't trade quality for cost. --- @@ -167,7 +167,7 @@ Let's recap what just happened: 3. We **generated ten synthetic questions** covering all six policy topics and ran them through the agent. The agent deflected on expenses, benefits, and holidays — topics it could answer but the prompt told it not to try. -4. The **SDK's quality report** read those sessions from BigQuery and an LLM judge scored them. Baseline: roughly fifty percent meaningful. Right below, the **SDK's CodeEvaluator** established operational baselines — latency, tokens, turns, tool error rate — all from the same BigQuery data, no extra LLM calls. +4. The **SDK's quality report** read those sessions from BigQuery and an LLM judge scored them. Baseline: roughly fifty percent meaningful. Right below, the **SDK's SystemEvaluator** established operational baselines — latency, tokens, turns, tool error rate — all from the same BigQuery data, no extra LLM calls. 5. We **extracted the failures** into the golden eval set — growing it from three to about eight cases. A **teacher agent** — same model, same tools, different prompt — generated ground truth for each failed question. The **Vertex AI Prompt Optimizer** used those triples to generate an improved prompt, and the **regression gate** validated it against all golden cases before promoting it to V2. @@ -217,5 +217,5 @@ By default, the script runs a single cycle and stops. The `--auto` flag enables ## [CLOSING] That's the agent improvement cycle. Capture sessions with the BigQuery Agent Analytics Plugin, evaluate quality with the SDK's LLM judge, -check operational metrics with the SDK's CodeEvaluator, optimize prompts with Vertex AI, and measure the results — all automated, all repeatable. +check operational metrics with the SDK's SystemEvaluator, optimize prompts with Vertex AI, and measure the results — all automated, all repeatable. The golden eval set grows with every cycle, so failures you discover today become regression tests for tomorrow. diff --git a/examples/agent_improvement_cycle/README.md b/examples/agent_improvement_cycle/README.md index 02fceba..ad7e304 100644 --- a/examples/agent_improvement_cycle/README.md +++ b/examples/agent_improvement_cycle/README.md @@ -104,7 +104,7 @@ This demo shows how to close that gap using four components: logged sessions back from BigQuery, evaluates quality using an LLM judge, and produces structured reports that drive automated improvement. -3. **[`SDK CodeEvaluator`](../../bigquery_agent_analytics/evaluators.py)** (the SDK's deterministic evaluator) checks +3. **[`SDK SystemEvaluator`](../../bigquery_agent_analytics/evaluators.py)** (the SDK's deterministic evaluator) checks operational metrics — latency, token efficiency, and turn count — on the same sessions. No LLM calls needed, just math on the data already in BigQuery. This ensures the improved prompt doesn't trade @@ -130,7 +130,7 @@ The full cycle: 5. **MEASURE IMPROVEMENT:** Verify the improved prompt against fresh traffic to quantify the quality jump. 
At each evaluation step (3 and 5), the SDK's deterministic -`CodeEvaluator` also checks latency, token efficiency, and turn count. +`SystemEvaluator` also checks latency, token efficiency, and turn count. Step 3 establishes the operational baseline; Step 5 shows the before/after comparison to verify the improved prompt didn't regress on cost or performance. No extra agent runs — just math on the session @@ -429,7 +429,7 @@ using ADK's `InMemoryRunner`. Sessions are logged to BigQuery via the **Step 3: Evaluate Quality** -- The SDK's `quality_report.py` reads sessions from BigQuery and scores each one on response_usefulness (meaningful/partial/unhelpful) and task_grounding (grounded/ungrounded). -The SDK's `CodeEvaluator` also runs deterministic checks on the same +The SDK's `SystemEvaluator` also runs deterministic checks on the same sessions — latency, token efficiency, and turn count — to establish an operational baseline. @@ -721,7 +721,7 @@ deployments, consider periodically pruning redundant golden cases. ## Further Reading -- [Your Agent Events Table Is Also a Test Suite](https://medium.com/google-cloud/your-agent-events-table-is-also-a-test-suite-999fbef885ed) — Using the SDK's `CodeEvaluator` and `categorical-eval` CLI to gate PRs against production traces. Covers the same deterministic evaluators (latency, token efficiency, turn count, error rate) this demo uses in Steps 3 and 5. +- [Your Agent Events Table Is Also a Test Suite](https://medium.com/google-cloud/your-agent-events-table-is-also-a-test-suite-999fbef885ed) — Using the SDK's `SystemEvaluator` and `categorical-eval` CLI to gate PRs against production traces. Covers the same deterministic evaluators (latency, token efficiency, turn count, error rate) this demo uses in Steps 3 and 5. - [BigQuery Agent Analytics: From Logs to Graphs](https://medium.com/google-cloud/bigquery-agent-analytics-from-logs-to-graphs-ab0bc34e1418) — Visualizing agent session traces as interactive graphs. Shows how the `BigQueryAgentAnalyticsPlugin` captures the data that powers this improvement cycle. ## Future / Next Steps diff --git a/examples/context_graph_adcp_demo.ipynb b/examples/context_graph_adcp_demo.ipynb index 95f96a9..fc75b5e 100644 --- a/examples/context_graph_adcp_demo.ipynb +++ b/examples/context_graph_adcp_demo.ipynb @@ -1,18 +1,8 @@ { "cells": [ { - "cell_type": "code", - "execution_count": 1, "id": "7b4dd2b6", - "metadata": { - "execution": { - "iopub.execute_input": "2026-03-05T09:26:40.608829Z", - "iopub.status.busy": "2026-03-05T09:26:40.608720Z", - "iopub.status.idle": "2026-03-05T09:26:40.611594Z", - "shell.execute_reply": "2026-03-05T09:26:40.611030Z" - } - }, - "outputs": [], + "cell_type": "code", "source": [ "# Copyright 2025 Google LLC\n", "#\n", @@ -27,40 +17,49 @@ "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", "# See the License for the specific language governing permissions and\n", "# limitations under the License." 
- ] + ], + "metadata": { + "execution": { + "iopub.execute_input": "2026-03-05T09:26:40.608829Z", + "iopub.status.busy": "2026-03-05T09:26:40.608720Z", + "iopub.status.idle": "2026-03-05T09:26:40.611594Z", + "shell.execute_reply": "2026-03-05T09:26:40.611030Z" + } + }, + "execution_count": 1 }, { - "cell_type": "markdown", "id": "0713f88b", - "metadata": {}, + "cell_type": "markdown", "source": [ "# Context Graph Demo V2: System of Reasoning for Agentic Ads\n", "\n", "**Integrating BigQuery Agent Analytics SDK with Native Property Graphs**\n", "\n", - "\n", - " \n", - " \n", - " \n", - "
\n", - " \n", - " \"Colab Run in Colab\n", - " \n", - " \n", - " \n", - " \"Vertex Open in Vertex AI Workbench\n", - " \n", - " \n", - " \n", - " \"BQ Open in BQ Studio\n", - " \n", - "
" - ] + "\u003ctable align=\"left\"\u003e\n", + " \u003ctd\u003e\n", + " \u003ca href=\"https://colab.research.google.com/github/haiyuan-eng-google/BigQuery-Agent-Analytics-SDK/blob/main/examples/context_graph_adcp_demo.ipynb\"\u003e\n", + " \u003cimg src=\"https://raw.githubusercontent.com/googleapis/python-bigquery-dataframes/refs/heads/main/third_party/logo/colab-logo.png\" alt=\"Colab logo\"\u003e Run in Colab\n", + " \u003c/a\u003e\n", + " \u003c/td\u003e\n", + " \u003ctd\u003e\n", + " \u003ca href=\"https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/haiyuan-eng-google/BigQuery-Agent-Analytics-SDK/main/examples/context_graph_adcp_demo.ipynb\"\u003e\n", + " \u003cimg src=\"https://www.gstatic.com/images/branding/product/1x/google_cloud_48dp.png\" alt=\"Vertex AI logo\" width=\"32\"\u003e Open in Vertex AI Workbench\n", + " \u003c/a\u003e\n", + " \u003c/td\u003e\n", + " \u003ctd\u003e\n", + " \u003ca href=\"https://console.cloud.google.com/bigquery/import?url=https://github.com/haiyuan-eng-google/BigQuery-Agent-Analytics-SDK/blob/main/examples/context_graph_adcp_demo.ipynb\"\u003e\n", + " \u003cimg src=\"https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTW1gvOovVlbZAIZylUtf5Iu8-693qS1w5NJw\u0026s\" alt=\"BQ logo\" width=\"35\"\u003e Open in BQ Studio\n", + " \u003c/a\u003e\n", + " \u003c/td\u003e\n", + "\u003c/table\u003e" + ], + "metadata": {}, + "execution_count": null }, { - "cell_type": "markdown", "id": "4188cacb", - "metadata": {}, + "cell_type": "markdown", "source": [ "## Demo Scenario: Yahoo ADCP \"ELF Cosmetics\" Media Buy\n", "\n", @@ -68,13 +67,13 @@ "\n", "```mermaid\n", "flowchart LR\n", - " A[Buyer Agent
ELF Cosmetics] -->|ADCP Brief| B[Sales Agent
Yahoo DSP]\n", - " B -->|Inventory Query| C[Inventory Tool]\n", - " B -->|Audience Match| D[Audience Tool]\n", - " B -->|Budget Split| E[Budget Tool]\n", - " B -->|HITL Pause| F[Ad Ops Manager
Slack Approval]\n", - " F -->|Approved| G[Provision Campaign
Google Ad Manager]\n", - " G -->|Artifact| H[GCS Line Item JSON]\n", + " A[Buyer Agent\u003cbr\u003eELF Cosmetics] --\u003e|ADCP Brief| B[Sales Agent\u003cbr\u003eYahoo DSP]\n", + " B --\u003e|Inventory Query| C[Inventory Tool]\n", + " B --\u003e|Audience Match| D[Audience Tool]\n", + " B --\u003e|Budget Split| E[Budget Tool]\n", + " B --\u003e|HITL Pause| F[Ad Ops Manager\u003cbr\u003eSlack Approval]\n", + " F --\u003e|Approved| G[Provision Campaign\u003cbr\u003eGoogle Ad Manager]\n", + " G --\u003e|Artifact| H[GCS Line Item JSON]\n", "```\n", "\n", "We then build a **4-Pillar Context Graph** that cross-links:\n", @@ -84,20 +83,25 @@ "4. **Persisted Artifacts** (GCS object references for campaign JSON)\n", "\n", "Finally, we demonstrate **World Change detection** for long-running A2A tasks." - ] + ], + "metadata": {}, + "execution_count": null }, { - "cell_type": "markdown", "id": "fb54dbbe", - "metadata": {}, + "cell_type": "markdown", "source": [ "## Install Dependencies" - ] + ], + "metadata": {}, + "execution_count": null }, { - "cell_type": "code", - "execution_count": 2, "id": "ef0c7b17", + "cell_type": "code", + "source": [ + "!pip install -q google-adk bigquery-agent-analytics google-cloud-bigquery nest-asyncio" + ], "metadata": { "execution": { "iopub.execute_input": "2026-03-05T09:26:40.613838Z", @@ -106,6 +110,7 @@ "shell.execute_reply": "2026-03-05T09:26:41.651018Z" } }, + "execution_count": 2, "outputs": [ { "name": "stdout", @@ -119,54 +124,29 @@ "name": "stdout", "output_type": "stream", "text": [ - "\u001b[31mERROR: Ignored the following versions that require a different python version: 1.19.0 Requires-Python >=3.10; 1.20.0 Requires-Python >=3.10; 1.21.0 Requires-Python >=3.10; 1.22.0 Requires-Python >=3.10; 1.22.1 Requires-Python >=3.10; 1.23.0 Requires-Python >=3.10; 1.24.0 Requires-Python >=3.10; 1.24.1 Requires-Python >=3.10; 1.25.0 Requires-Python >=3.10; 1.25.1 Requires-Python >=3.10; 1.26.0 Requires-Python >=3.10\u001b[0m\u001b[31m\r\n", + "\u001b[31mERROR: Ignored the following versions that require a different python version: 1.19.0 Requires-Python \u003e=3.10; 1.20.0 Requires-Python \u003e=3.10; 1.21.0 Requires-Python \u003e=3.10; 1.22.0 Requires-Python \u003e=3.10; 1.22.1 Requires-Python \u003e=3.10; 1.23.0 Requires-Python \u003e=3.10; 1.24.0 Requires-Python \u003e=3.10; 1.24.1 Requires-Python \u003e=3.10; 1.25.0 Requires-Python \u003e=3.10; 1.25.1 Requires-Python \u003e=3.10; 1.26.0 Requires-Python \u003e=3.10\u001b[0m\u001b[31m\r\n", "\u001b[0m\u001b[31mERROR: Could not find a version that satisfies the requirement bigquery-agent-analytics (from versions: none)\u001b[0m\u001b[31m\r\n", "\u001b[0m\r\n", - "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m25.1.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m26.0.1\u001b[0m\r\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m25.1.1\u001b[0m\u001b[39;49m -\u003e \u001b[0m\u001b[32;49m26.0.1\u001b[0m\r\n", "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49m/usr/local/opt/python@3.9/bin/python3.9 -m pip install --upgrade pip\u001b[0m\r\n", "\u001b[31mERROR: No matching distribution found for bigquery-agent-analytics\u001b[0m\u001b[31m\r\n", "\u001b[0m" ] } - ], - "source": [ - "!pip install -q google-adk bigquery-agent-analytics google-cloud-bigquery 
nest-asyncio" ] }, { - "cell_type": "markdown", "id": "e654c10e", - "metadata": {}, + "cell_type": "markdown", "source": [ - "## Authenticate & Configure" - ] + "## Authenticate \u0026 Configure" + ], + "metadata": {}, + "execution_count": null }, { - "cell_type": "code", - "execution_count": 3, "id": "a8a11844", - "metadata": { - "execution": { - "iopub.execute_input": "2026-03-05T09:26:41.653979Z", - "iopub.status.busy": "2026-03-05T09:26:41.653848Z", - "iopub.status.idle": "2026-03-05T09:26:41.659483Z", - "shell.execute_reply": "2026-03-05T09:26:41.658883Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Not running in Colab -- using default credentials.\n", - "Project : test-project-0728-467323\n", - "Dataset : agent_analytics\n", - "Table : agent_events\n", - "Model : gemini-3-flash-preview\n", - "Vertex AI: enabled\n" - ] - } - ], + "cell_type": "code", "source": [ "import os\n", "\n", @@ -198,41 +178,47 @@ "print(f\"Table : {TABLE_ID}\")\n", "print(f\"Model : {MODEL_NAME}\")\n", "print(f\"Vertex AI: enabled\")" + ], + "metadata": { + "execution": { + "iopub.execute_input": "2026-03-05T09:26:41.653979Z", + "iopub.status.busy": "2026-03-05T09:26:41.653848Z", + "iopub.status.idle": "2026-03-05T09:26:41.659483Z", + "shell.execute_reply": "2026-03-05T09:26:41.658883Z" + } + }, + "execution_count": 3, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Not running in Colab -- using default credentials.\n", + "Project : test-project-0728-467323\n", + "Dataset : agent_analytics\n", + "Table : agent_events\n", + "Model : gemini-3-flash-preview\n", + "Vertex AI: enabled\n" + ] + } ] }, { - "cell_type": "markdown", "id": "55ecb639", - "metadata": {}, + "cell_type": "markdown", "source": [ "---\n", "\n", "## Phase 1: Define ADCP Domain Tools\n", "\n", "We create **deterministic tools** that simulate Yahoo's advertising platform. Each tool uses seeded randomness for reproducible demo output." 
- ] + ], + "metadata": {}, + "execution_count": null }, { - "cell_type": "code", - "execution_count": 4, "id": "4aef1ce5", - "metadata": { - "execution": { - "iopub.execute_input": "2026-03-05T09:26:41.660859Z", - "iopub.status.busy": "2026-03-05T09:26:41.660786Z", - "iopub.status.idle": "2026-03-05T09:26:41.670682Z", - "shell.execute_reply": "2026-03-05T09:26:41.670195Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "ADCP tools defined: query_ad_inventory, match_target_audience, allocate_media_budget, provision_campaign_in_gam\n" - ] - } - ], + "cell_type": "code", "source": [ "import hashlib\n", "import json\n", @@ -240,7 +226,7 @@ "from typing import Any\n", "\n", "\n", - "def _rng_from(*parts: str) -> random.Random:\n", + "def _rng_from(*parts: str) -\u003e random.Random:\n", " seed = int(hashlib.md5(\"|\".join(parts).encode()).hexdigest()[:8], 16)\n", " return random.Random(seed)\n", "\n", @@ -249,7 +235,7 @@ " product_name: str,\n", " format_type: str = \"display\",\n", " date_range: str = \"2025-Q2\",\n", - ") -> dict[str, Any]:\n", + ") -\u003e dict[str, Any]:\n", " \"\"\"Query available ad inventory for a Yahoo product.\n", "\n", " Args:\n", @@ -280,7 +266,7 @@ " brand: str,\n", " target_demographics: str,\n", " campaign_goal: str = \"brand_awareness\",\n", - ") -> dict[str, Any]:\n", + ") -\u003e dict[str, Any]:\n", " \"\"\"Match target audience segments against Yahoo's audience graph.\n", "\n", " Args:\n", @@ -296,7 +282,7 @@ " {\"segment\": \"Beauty Enthusiasts\", \"match_score\": 0.95},\n", " {\"segment\": \"Millennials 25-34\", \"match_score\": 0.92},\n", " {\"segment\": \"Female Shoppers\", \"match_score\": 0.88},\n", - " {\"segment\": \"Health & Wellness\", \"match_score\": 0.76},\n", + " {\"segment\": \"Health \u0026 Wellness\", \"match_score\": 0.76},\n", " {\"segment\": \"Premium Consumers\", \"match_score\": 0.71},\n", " ]\n", " rng.shuffle(segments)\n", @@ -315,7 +301,7 @@ " total_budget_usd: float,\n", " products: str,\n", " campaign_duration_days: int = 30,\n", - ") -> dict[str, Any]:\n", + ") -\u003e dict[str, Any]:\n", " \"\"\"Allocate media budget across Yahoo ad products.\n", "\n", " Args:\n", @@ -365,7 +351,7 @@ " start_date: str,\n", " end_date: str,\n", " targeting_segments: str,\n", - ") -> dict[str, Any]:\n", + ") -\u003e dict[str, Any]:\n", " \"\"\"Provision a campaign in Google Ad Manager (GAM).\n", "\n", " Args:\n", @@ -403,49 +389,42 @@ "\n", "print(\"ADCP tools defined: query_ad_inventory, match_target_audience,\"\n", " \" allocate_media_budget, provision_campaign_in_gam\")" + ], + "metadata": { + "execution": { + "iopub.execute_input": "2026-03-05T09:26:41.660859Z", + "iopub.status.busy": "2026-03-05T09:26:41.660786Z", + "iopub.status.idle": "2026-03-05T09:26:41.670682Z", + "shell.execute_reply": "2026-03-05T09:26:41.670195Z" + } + }, + "execution_count": 4, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ADCP tools defined: query_ad_inventory, match_target_audience, allocate_media_budget, provision_campaign_in_gam\n" + ] + } ] }, { - "cell_type": "markdown", "id": "912fd1fc", - "metadata": {}, + "cell_type": "markdown", "source": [ "---\n", "\n", "## Phase 2: Build Multi-Agent System with ADK\n", "\n", "We create a **Yahoo Sales Agent** that orchestrates the ADCP workflow. It processes the buyer's brief, queries inventory, matches audiences, allocates budget, and pauses for HITL approval before provisioning." 
- ] + ], + "metadata": {}, + "execution_count": null }, { - "cell_type": "code", - "execution_count": 5, "id": "25290b99", - "metadata": { - "execution": { - "iopub.execute_input": "2026-03-05T09:26:41.672179Z", - "iopub.status.busy": "2026-03-05T09:26:41.672085Z", - "iopub.status.idle": "2026-03-05T09:26:43.629409Z", - "shell.execute_reply": "2026-03-05T09:26:43.628925Z" - } - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/lib/python3.13/site-packages/requests/__init__.py:113: RequestsDependencyWarning: urllib3 (2.6.3) or chardet (6.0.0.post1)/charset_normalizer (3.4.4) doesn't match a supported version!\n", - " warnings.warn(\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "ADCP agent builder ready.\n" - ] - } - ], + "cell_type": "code", "source": [ "from google.adk.agents import LlmAgent\n", "from google.genai import types\n", @@ -471,7 +450,7 @@ "\"\"\"\n", "\n", "\n", - "def build_adcp_agent() -> LlmAgent:\n", + "def build_adcp_agent() -\u003e LlmAgent:\n", " \"\"\"Build the Yahoo ADCP Sales Agent.\"\"\"\n", " return LlmAgent(\n", " name=\"yahoo_sales_agent\",\n", @@ -490,252 +469,50 @@ "\n", "\n", "print(\"ADCP agent builder ready.\")" - ] - }, - { - "cell_type": "markdown", - "id": "0bb7021f", - "metadata": {}, - "source": [ - "---\n", - "\n", - "## Phase 3: Run ADCP Workflows & Log Traces to BigQuery\n", - "\n", - "We run **three simulated ADCP conversations** representing different media buying scenarios. The `BigQueryAgentAnalyticsPlugin` captures every event." - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "b0290a9f", + ], "metadata": { "execution": { - "iopub.execute_input": "2026-03-05T09:26:43.631033Z", - "iopub.status.busy": "2026-03-05T09:26:43.630822Z", - "iopub.status.idle": "2026-03-05T09:27:54.876567Z", - "shell.execute_reply": "2026-03-05T09:27:54.875318Z" + "iopub.execute_input": "2026-03-05T09:26:41.672179Z", + "iopub.status.busy": "2026-03-05T09:26:41.672085Z", + "iopub.status.idle": "2026-03-05T09:26:43.629409Z", + "shell.execute_reply": "2026-03-05T09:26:43.628925Z" } }, + "execution_count": 5, "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "======================================================================\n", - " Session: adcp-a20d176b82af [ELF Cosmetics -- $50K Brand Awareness Campaign]\n", - "======================================================================\n", - "\n", - "[Turn 1] Buyer: ADCP Media Buying Brief:\n", - "Brand: ELF Cosmetics\n", - "Budget: $50,000\n", - "Campaign Goal: Brand awareness for new skincare line\n", - "Targe...\n", - "------------------------------------------------\n" - ] - }, { "name": "stderr", "output_type": "stream", "text": [ - "Warning: there are non-text parts in the response: ['function_call', 'function_call'], returning concatenated text result from text parts. 
Check the full candidates.content.parts accessor to get the full model response.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " -> Tool call: query_ad_inventory\n", - " -> Tool call: query_ad_inventory\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " -> Tool call: match_target_audience\n" + "/usr/local/lib/python3.13/site-packages/requests/__init__.py:113: RequestsDependencyWarning: urllib3 (2.6.3) or chardet (6.0.0.post1)/charset_normalizer (3.4.4) doesn't match a supported version!\n", + " warnings.warn(\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - " -> Tool call: allocate_media_budget\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "[Sales Agent]: Based on your brief for **ELF Cosmetics**, I have developed a comprehensive media plan to drive brand awareness for your new skincare line among Millennials and beauty enthusiasts.\n", - "\n", - "### **Campaign Overview**\n", - "* **Brand:** ELF Cosmetics\n", - "* **Campaign Goal:** Brand Awareness\n", - "* **Total Budget:** $50,000\n", - "* **Flight Dates:** 2025-05-01 to 2025-05-31\n", - "* **Total Projected Impressions:** ~8,166,935\n", - "* **Projected ROAS:** 3.55\n", - "\n", - "---\n", - "\n", - "### **Audience Strategy**\n", - "We matched your requirements against Yahoo’s audience graph, identifying a total addressable audience of **7.28M users**. \n", - "* **Primary Segment:** Beauty Enthusiasts (Match Score: 0.95)\n", - "* **Secondary Segment:** Millennials 25-34 (Match Score: 0.92)\n", - "* **Supporting Segment:** Female Shoppers (Match Score: 0.88)\n", - "\n", - "---\n", - "\n", - "### **Media Allo\n", - " ... (truncated, 2212 chars total)\n", - "\n", - "[Turn 2] Buyer: The media plan is approved by the ad-ops manager. Please provision the campaign in Google Ad Manager with campaign name ...\n", - "------------------------------------------------\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " -> Tool call: provision_campaign_in_gam\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "[Sales Agent]: The campaign has been successfully provisioned in Google Ad Manager.\n", - "\n", - "### **Campaign Provisioning Summary**\n", - "* **Campaign Name:** ELF_Skincare_May2025\n", - "* **Advertiser:** ELF Cosmetics\n", - "* **Order ID:** `ORD-70678`\n", - "* **Line Item ID:** `LI-756565`\n", - "* **Budget:** $50,000.00\n", - "* **Flight Dates:** 2025-05-01 to 2025-05-31\n", - "* **Targeting:** Beauty Enthusiasts, Millennials 25-34\n", - "* **Status:** Provisioned\n", - "\n", - "The campaign is now ready for creative upload and final activation by your ad-ops team. 
You can find the full configuration details in the GCS artifact: `gs://adcp-artifacts/elf_cosmetics/LI-756565.json`.\n", - "\n", - "Is there anything else you need assistance with today?\n", - "\n", - "======================================================================\n", - " Session: adcp-7d9855e7a71b [Nike -- $200K Multi-Product Performance Campaign]\n", - "======================================================================\n", - "\n", - "[Turn 1] Buyer: ADCP Media Buying Brief:\n", - "Brand: Nike\n", - "Budget: $200,000\n", - "Campaign Goal: Product launch for Air Max 2025\n", - "Target Demographics...\n", - "------------------------------------------------\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " -> Tool call: query_ad_inventory\n", - " -> Tool call: query_ad_inventory\n", - " -> Tool call: query_ad_inventory\n", - " -> Tool call: match_target_audience\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " -> Tool call: allocate_media_budget\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "[Sales Agent]: Based on your brief for the **Nike Air Max 2025** product launch, I have analyzed Yahoo's inventory and audience graph to develop a recommended media plan.\n", - "\n", - "### **Campaign Overview**\n", - "* **Brand:** Nike\n", - "* **Budget:** $200,000\n", - "* **Flight Dates:** June 1, 2025 – June 30, 2025 (30 Days)\n", - "* **Goal:** High-impact product launch & reach among sports/sneaker enthusiasts.\n", - "\n", - "---\n", - "\n", - "### **Media Plan Recommendation**\n", - "\n", - "| Product | Budget Allocation | Estimated Impressions | CPM (Avg) | Projected CTR |\n", - "| :--- | :--- | :--- | :--- | :--- |\n", - "| **Yahoo Finance** | $121,327.80 (60.7%) | 5,135,264 | $19.05 | 1.21% |\n", - "| **Yahoo Homepage**| $46,134.57 (23.1%) | 2,784,630 | $21.06 | 2.35% |\n", - "| **Yahoo Sports** | $32,537.63 (16.3%) | 1,475,740 | $5.81 | 2.25% |\n", - "| **TOTAL** | **$200,000.00** | **9,39\n", - " ... (truncated, 1984 chars total)\n", - "\n", - "======================================================================\n", - " Session: adcp-2c401a645c40 [Tesla -- $100K Targeted EV Campaign]\n", - "======================================================================\n", - "\n", - "[Turn 1] Buyer: ADCP Media Buying Brief:\n", - "Brand: Tesla\n", - "Budget: $100,000\n", - "Campaign Goal: Lead generation for Model Y test drives\n", - "Target Dem...\n", - "------------------------------------------------\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " -> Tool call: query_ad_inventory\n", - " -> Tool call: query_ad_inventory\n", - " -> Tool call: match_target_audience\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " -> Tool call: allocate_media_budget\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " -> Tool call: provision_campaign_in_gam\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "[Sales Agent]: I have successfully processed your media buying brief for the **Tesla Model Y Lead Generation Campaign**. \n", - "\n", - "### **1. Media Plan Overview**\n", - "Based on your budget of **$100,000** and target audience of **Tech-savvy professionals and EV intenders**, I have optimized the allocation across your preferred Yahoo products.\n", - "\n", - "| Yahoo Product | Format | Budget Allocation | Est. 
Impressions | Key Reason |\n", - "| :--- | :--- | :--- | :--- | :--- |\n", - "| **Yahoo Finance** | Display | $29,465.15 (29.5%) | 1,432,055 | **Premium Placement:** High affinity with high-net-worth tech professionals. |\n", - "| **Yahoo Mail** | Native | $70,534.85 (70.5%) | 5,808,834 | **High Engagement:** Native ads in inbox provide superior CTR (1.19%) for lead gen. |\n", - "\n", - "**Campaign Performance Projections:**\n", - "* **Total Projected Impressions:** \n", - " ... (truncated, 1746 chars total)\n", - "\n", - "\n", - "Session IDs: ['adcp-a20d176b82af', 'adcp-7d9855e7a71b', 'adcp-2c401a645c40']\n" + "ADCP agent builder ready.\n" ] } + ] + }, + { + "id": "0bb7021f", + "cell_type": "markdown", + "source": [ + "---\n", + "\n", + "## Phase 3: Run ADCP Workflows \u0026 Log Traces to BigQuery\n", + "\n", + "We run **three simulated ADCP conversations** representing different media buying scenarios. The `BigQueryAgentAnalyticsPlugin` captures every event." ], + "metadata": {}, + "execution_count": null + }, + { + "id": "b0290a9f", + "cell_type": "code", "source": [ "import asyncio\n", "import uuid\n", @@ -863,11 +640,11 @@ " if hasattr(part, \"text\") and part.text:\n", " response_parts.append(part.text)\n", " elif hasattr(part, \"function_call\") and part.function_call:\n", - " print(f\" -> Tool call: {part.function_call.name}\")\n", + " print(f\" -\u003e Tool call: {part.function_call.name}\")\n", " if response_parts:\n", " text = \"\\n\".join(response_parts)\n", " print(f\"\\n[Sales Agent]: {text[:800]}\")\n", - " if len(text) > 800:\n", + " if len(text) \u003e 800:\n", " print(f\" ... (truncated, {len(text)} chars total)\")\n", " return session_id\n", "\n", @@ -881,12 +658,254 @@ " session_ids.append(sid)\n", "\n", "print(f\"\\n\\nSession IDs: {session_ids}\")" + ], + "metadata": { + "execution": { + "iopub.execute_input": "2026-03-05T09:26:43.631033Z", + "iopub.status.busy": "2026-03-05T09:26:43.630822Z", + "iopub.status.idle": "2026-03-05T09:27:54.876567Z", + "shell.execute_reply": "2026-03-05T09:27:54.875318Z" + } + }, + "execution_count": 6, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "======================================================================\n", + " Session: adcp-a20d176b82af [ELF Cosmetics -- $50K Brand Awareness Campaign]\n", + "======================================================================\n", + "\n", + "[Turn 1] Buyer: ADCP Media Buying Brief:\n", + "Brand: ELF Cosmetics\n", + "Budget: $50,000\n", + "Campaign Goal: Brand awareness for new skincare line\n", + "Targe...\n", + "------------------------------------------------\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Warning: there are non-text parts in the response: ['function_call', 'function_call'], returning concatenated text result from text parts. 
Check the full candidates.content.parts accessor to get the full model response.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " -\u003e Tool call: query_ad_inventory\n", + " -\u003e Tool call: query_ad_inventory\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " -\u003e Tool call: match_target_audience\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " -\u003e Tool call: allocate_media_budget\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "[Sales Agent]: Based on your brief for **ELF Cosmetics**, I have developed a comprehensive media plan to drive brand awareness for your new skincare line among Millennials and beauty enthusiasts.\n", + "\n", + "### **Campaign Overview**\n", + "* **Brand:** ELF Cosmetics\n", + "* **Campaign Goal:** Brand Awareness\n", + "* **Total Budget:** $50,000\n", + "* **Flight Dates:** 2025-05-01 to 2025-05-31\n", + "* **Total Projected Impressions:** ~8,166,935\n", + "* **Projected ROAS:** 3.55\n", + "\n", + "---\n", + "\n", + "### **Audience Strategy**\n", + "We matched your requirements against Yahoo’s audience graph, identifying a total addressable audience of **7.28M users**. \n", + "* **Primary Segment:** Beauty Enthusiasts (Match Score: 0.95)\n", + "* **Secondary Segment:** Millennials 25-34 (Match Score: 0.92)\n", + "* **Supporting Segment:** Female Shoppers (Match Score: 0.88)\n", + "\n", + "---\n", + "\n", + "### **Media Allo\n", + " ... (truncated, 2212 chars total)\n", + "\n", + "[Turn 2] Buyer: The media plan is approved by the ad-ops manager. Please provision the campaign in Google Ad Manager with campaign name ...\n", + "------------------------------------------------\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " -\u003e Tool call: provision_campaign_in_gam\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "[Sales Agent]: The campaign has been successfully provisioned in Google Ad Manager.\n", + "\n", + "### **Campaign Provisioning Summary**\n", + "* **Campaign Name:** ELF_Skincare_May2025\n", + "* **Advertiser:** ELF Cosmetics\n", + "* **Order ID:** `ORD-70678`\n", + "* **Line Item ID:** `LI-756565`\n", + "* **Budget:** $50,000.00\n", + "* **Flight Dates:** 2025-05-01 to 2025-05-31\n", + "* **Targeting:** Beauty Enthusiasts, Millennials 25-34\n", + "* **Status:** Provisioned\n", + "\n", + "The campaign is now ready for creative upload and final activation by your ad-ops team. 
You can find the full configuration details in the GCS artifact: `gs://adcp-artifacts/elf_cosmetics/LI-756565.json`.\n", + "\n", + "Is there anything else you need assistance with today?\n", + "\n", + "======================================================================\n", + " Session: adcp-7d9855e7a71b [Nike -- $200K Multi-Product Performance Campaign]\n", + "======================================================================\n", + "\n", + "[Turn 1] Buyer: ADCP Media Buying Brief:\n", + "Brand: Nike\n", + "Budget: $200,000\n", + "Campaign Goal: Product launch for Air Max 2025\n", + "Target Demographics...\n", + "------------------------------------------------\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " -\u003e Tool call: query_ad_inventory\n", + " -\u003e Tool call: query_ad_inventory\n", + " -\u003e Tool call: query_ad_inventory\n", + " -\u003e Tool call: match_target_audience\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " -\u003e Tool call: allocate_media_budget\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "[Sales Agent]: Based on your brief for the **Nike Air Max 2025** product launch, I have analyzed Yahoo's inventory and audience graph to develop a recommended media plan.\n", + "\n", + "### **Campaign Overview**\n", + "* **Brand:** Nike\n", + "* **Budget:** $200,000\n", + "* **Flight Dates:** June 1, 2025 – June 30, 2025 (30 Days)\n", + "* **Goal:** High-impact product launch \u0026 reach among sports/sneaker enthusiasts.\n", + "\n", + "---\n", + "\n", + "### **Media Plan Recommendation**\n", + "\n", + "| Product | Budget Allocation | Estimated Impressions | CPM (Avg) | Projected CTR |\n", + "| :--- | :--- | :--- | :--- | :--- |\n", + "| **Yahoo Finance** | $121,327.80 (60.7%) | 5,135,264 | $19.05 | 1.21% |\n", + "| **Yahoo Homepage**| $46,134.57 (23.1%) | 2,784,630 | $21.06 | 2.35% |\n", + "| **Yahoo Sports** | $32,537.63 (16.3%) | 1,475,740 | $5.81 | 2.25% |\n", + "| **TOTAL** | **$200,000.00** | **9,39\n", + " ... (truncated, 1984 chars total)\n", + "\n", + "======================================================================\n", + " Session: adcp-2c401a645c40 [Tesla -- $100K Targeted EV Campaign]\n", + "======================================================================\n", + "\n", + "[Turn 1] Buyer: ADCP Media Buying Brief:\n", + "Brand: Tesla\n", + "Budget: $100,000\n", + "Campaign Goal: Lead generation for Model Y test drives\n", + "Target Dem...\n", + "------------------------------------------------\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " -\u003e Tool call: query_ad_inventory\n", + " -\u003e Tool call: query_ad_inventory\n", + " -\u003e Tool call: match_target_audience\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " -\u003e Tool call: allocate_media_budget\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " -\u003e Tool call: provision_campaign_in_gam\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "[Sales Agent]: I have successfully processed your media buying brief for the **Tesla Model Y Lead Generation Campaign**. \n", + "\n", + "### **1. Media Plan Overview**\n", + "Based on your budget of **$100,000** and target audience of **Tech-savvy professionals and EV intenders**, I have optimized the allocation across your preferred Yahoo products.\n", + "\n", + "| Yahoo Product | Format | Budget Allocation | Est. 
Impressions | Key Reason |\n", + "| :--- | :--- | :--- | :--- | :--- |\n", + "| **Yahoo Finance** | Display | $29,465.15 (29.5%) | 1,432,055 | **Premium Placement:** High affinity with high-net-worth tech professionals. |\n", + "| **Yahoo Mail** | Native | $70,534.85 (70.5%) | 5,808,834 | **High Engagement:** Native ads in inbox provide superior CTR (1.19%) for lead gen. |\n", + "\n", + "**Campaign Performance Projections:**\n", + "* **Total Projected Impressions:** \n", + " ... (truncated, 1746 chars total)\n", + "\n", + "\n", + "Session IDs: ['adcp-a20d176b82af', 'adcp-7d9855e7a71b', 'adcp-2c401a645c40']\n" + ] + } ] }, { - "cell_type": "code", - "execution_count": 7, "id": "f9548cc1", + "cell_type": "code", + "source": [ + "import time\n", + "\n", + "print(\"Flushing traces to BigQuery ...\")\n", + "try:\n", + " asyncio.get_event_loop().run_until_complete(plugin.flush())\n", + "except Exception as exc:\n", + " print(f\"Flush warning: {exc}\")\n", + "\n", + "settle_seconds = 15\n", + "print(f\"Waiting {settle_seconds}s for BigQuery data to settle ...\")\n", + "time.sleep(settle_seconds)\n", + "print(\"Done.\")" + ], "metadata": { "execution": { "iopub.execute_input": "2026-03-05T09:27:54.879040Z", @@ -895,6 +914,7 @@ "shell.execute_reply": "2026-03-05T09:28:09.888134Z" } }, + "execution_count": 7, "outputs": [ { "name": "stdout", @@ -911,38 +931,36 @@ "Done.\n" ] } - ], - "source": [ - "import time\n", - "\n", - "print(\"Flushing traces to BigQuery ...\")\n", - "try:\n", - " asyncio.get_event_loop().run_until_complete(plugin.flush())\n", - "except Exception as exc:\n", - " print(f\"Flush warning: {exc}\")\n", - "\n", - "settle_seconds = 15\n", - "print(f\"Waiting {settle_seconds}s for BigQuery data to settle ...\")\n", - "time.sleep(settle_seconds)\n", - "print(\"Done.\")" ] }, { - "cell_type": "markdown", "id": "34e76dd7", - "metadata": {}, + "cell_type": "markdown", "source": [ "---\n", "\n", - "## Phase 4: Trace Retrieval & Visualization\n", + "## Phase 4: Trace Retrieval \u0026 Visualization\n", "\n", "Use the SDK Client to fetch traces and render the hierarchical execution DAG." 
- ] + ], + "metadata": {}, + "execution_count": null }, { - "cell_type": "code", - "execution_count": 8, "id": "fade6aa2", + "cell_type": "code", + "source": [ + "from bigquery_agent_analytics import Client, TraceFilter\n", + "\n", + "client = Client(\n", + " project_id=PROJECT_ID,\n", + " dataset_id=DATASET_ID,\n", + " table_id=TABLE_ID,\n", + " location=LOCATION,\n", + " endpoint=MODEL_NAME,\n", + ")\n", + "print(\"SDK Client initialised.\")" + ], "metadata": { "execution": { "iopub.execute_input": "2026-03-05T09:28:09.890911Z", @@ -951,6 +969,7 @@ "shell.execute_reply": "2026-03-05T09:28:11.699926Z" } }, + "execution_count": 8, "outputs": [ { "name": "stdout", @@ -959,24 +978,26 @@ "SDK Client initialised.\n" ] } - ], - "source": [ - "from bigquery_agent_analytics import Client, TraceFilter\n", - "\n", - "client = Client(\n", - " project_id=PROJECT_ID,\n", - " dataset_id=DATASET_ID,\n", - " table_id=TABLE_ID,\n", - " location=LOCATION,\n", - " endpoint=MODEL_NAME,\n", - ")\n", - "print(\"SDK Client initialised.\")" ] }, { - "cell_type": "code", - "execution_count": 9, "id": "b703e431", + "cell_type": "code", + "source": [ + "# Retrieve and render each trace\n", + "traces = []\n", + "for sid in session_ids:\n", + " try:\n", + " trace = client.get_session_trace(sid)\n", + " traces.append(trace)\n", + " print(f\"\\n{'=' * 70}\")\n", + " print(f\" Trace for session: {sid}\")\n", + " print(f\"{'=' * 70}\")\n", + " _ = trace.render()\n", + " except Exception as exc:\n", + " print(f\"Could not retrieve trace {sid}: {exc}\")\n", + " traces.append(None)" + ], "metadata": { "execution": { "iopub.execute_input": "2026-03-05T09:28:11.710446Z", @@ -985,6 +1006,7 @@ "shell.execute_reply": "2026-03-05T09:28:14.651317Z" } }, + "execution_count": 9, "outputs": [ { "name": "stdout", @@ -1171,27 +1193,28 @@ "##...\n" ] } - ], - "source": [ - "# Retrieve and render each trace\n", - "traces = []\n", - "for sid in session_ids:\n", - " try:\n", - " trace = client.get_session_trace(sid)\n", - " traces.append(trace)\n", - " print(f\"\\n{'=' * 70}\")\n", - " print(f\" Trace for session: {sid}\")\n", - " print(f\"{'=' * 70}\")\n", - " _ = trace.render()\n", - " except Exception as exc:\n", - " print(f\"Could not retrieve trace {sid}: {exc}\")\n", - " traces.append(None)" ] }, { - "cell_type": "code", - "execution_count": 10, "id": "d39c6cf4", + "cell_type": "code", + "source": [ + "# Inspect ADCP-specific trace properties\n", + "for i, trace in enumerate(traces):\n", + " if trace is None:\n", + " continue\n", + " print(f\"\\n--- Session {i+1}: {trace.session_id} ---\")\n", + " print(f\" Total spans : {len(trace.spans)}\")\n", + " print(f\" Tool calls : {len(trace.tool_calls)}\")\n", + " for tc in trace.tool_calls:\n", + " print(f\" - {tc.get('tool_name', '?')}\")\n", + " final = trace.final_response or \"(none)\"\n", + " print(f\" Final response : {final[:300]}\")\n", + " errors = trace.error_spans\n", + " print(f\" Error spans : {len(errors)}\")\n", + " if trace.total_latency_ms:\n", + " print(f\" Total latency : {trace.total_latency_ms:.0f}ms\")" + ], "metadata": { "execution": { "iopub.execute_input": "2026-03-05T09:28:14.653806Z", @@ -1200,6 +1223,7 @@ "shell.execute_reply": "2026-03-05T09:28:14.657021Z" } }, + "execution_count": 10, "outputs": [ { "name": "stdout", @@ -1260,29 +1284,11 @@ " Total latency : 15069ms\n" ] } - ], - "source": [ - "# Inspect ADCP-specific trace properties\n", - "for i, trace in enumerate(traces):\n", - " if trace is None:\n", - " continue\n", - " print(f\"\\n--- Session {i+1}: 
{trace.session_id} ---\")\n", - " print(f\" Total spans : {len(trace.spans)}\")\n", - " print(f\" Tool calls : {len(trace.tool_calls)}\")\n", - " for tc in trace.tool_calls:\n", - " print(f\" - {tc.get('tool_name', '?')}\")\n", - " final = trace.final_response or \"(none)\"\n", - " print(f\" Final response : {final[:300]}\")\n", - " errors = trace.error_spans\n", - " print(f\" Error spans : {len(errors)}\")\n", - " if trace.total_latency_ms:\n", - " print(f\" Total latency : {trace.total_latency_ms:.0f}ms\")" ] }, { - "cell_type": "markdown", "id": "50a56a07", - "metadata": {}, + "cell_type": "markdown", "source": [ "---\n", "\n", @@ -1291,34 +1297,13 @@ "We use BigQuery's `AI.GENERATE` with `output_schema` to extract structured business entities from the unstructured agent payloads. This creates the **Biz Graph** layer of our 4-pillar context graph.\n", "\n", "Entity types: `Product`, `Targeting`, `Campaign`, `Budget`, `Audience`" - ] + ], + "metadata": {}, + "execution_count": null }, { - "cell_type": "code", - "execution_count": 11, "id": "604322a5", - "metadata": { - "execution": { - "iopub.execute_input": "2026-03-05T09:28:14.658976Z", - "iopub.status.busy": "2026-03-05T09:28:14.658878Z", - "iopub.status.idle": "2026-03-05T09:28:14.662047Z", - "shell.execute_reply": "2026-03-05T09:28:14.661541Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "ContextGraphManager ready.\n", - " Biz nodes table : adcp_biz_nodes\n", - " Cross-links table: adcp_cross_links\n", - " Graph name : adcp_context_graph\n", - " Entity types : ['Product', 'Targeting', 'Campaign', 'Budget', 'Audience', 'Advertiser']\n", - " AI.GENERATE endpoint: https://aiplatform.googleapis.com/v1/projects/test-project-0728-467323/locations...\n" - ] - } - ], + "cell_type": "code", "source": [ "from bigquery_agent_analytics import ContextGraphManager, ContextGraphConfig\n", "\n", @@ -1359,12 +1344,57 @@ "print(f\" Graph name : {cg_config.graph_name}\")\n", "print(f\" Entity types : {cg_config.entity_types}\")\n", "print(f\" AI.GENERATE endpoint: {cgm._resolve_endpoint()[:80]}...\")" + ], + "metadata": { + "execution": { + "iopub.execute_input": "2026-03-05T09:28:14.658976Z", + "iopub.status.busy": "2026-03-05T09:28:14.658878Z", + "iopub.status.idle": "2026-03-05T09:28:14.662047Z", + "shell.execute_reply": "2026-03-05T09:28:14.661541Z" + } + }, + "execution_count": 11, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ContextGraphManager ready.\n", + " Biz nodes table : adcp_biz_nodes\n", + " Cross-links table: adcp_cross_links\n", + " Graph name : adcp_context_graph\n", + " Entity types : ['Product', 'Targeting', 'Campaign', 'Budget', 'Audience', 'Advertiser']\n", + " AI.GENERATE endpoint: https://aiplatform.googleapis.com/v1/projects/test-project-0728-467323/locations...\n" + ] + } ] }, { - "cell_type": "code", - "execution_count": 12, "id": "0d7f19df", + "cell_type": "code", + "source": [ + "# Extract business entities using AI.GENERATE (server-side)\n", + "# Falls back to client-side extraction if AI.GENERATE is not available\n", + "try:\n", + " biz_nodes = cgm.extract_biz_nodes(\n", + " session_ids=session_ids,\n", + " use_ai_generate=True,\n", + " )\n", + " print(f\"Extracted {len(biz_nodes)} business entities:\")\n", + " for node in biz_nodes[:15]:\n", + " print(f\" [{node.node_type}] {node.node_value} \"\n", + " f\"(confidence={node.confidence:.2f})\")\n", + " if len(biz_nodes) \u003e 15:\n", + " print(f\" ... 
({len(biz_nodes) - 15} more)\")\n", + "except Exception as exc:\n", + " print(f\"AI.GENERATE extraction not available: {exc}\")\n", + " print(\"Falling back to client-side extraction ...\")\n", + " biz_nodes = cgm.extract_biz_nodes(\n", + " session_ids=session_ids,\n", + " use_ai_generate=False,\n", + " )\n", + " print(f\"Fetched {len(biz_nodes)} raw payloads for manual review.\")" + ], "metadata": { "execution": { "iopub.execute_input": "2026-03-05T09:28:14.663303Z", @@ -1373,6 +1403,7 @@ "shell.execute_reply": "2026-03-05T09:29:01.878077Z" } }, + "execution_count": 12, "outputs": [ { "name": "stdout", @@ -1397,69 +1428,22 @@ " ... (165 more)\n" ] } - ], - "source": [ - "# Extract business entities using AI.GENERATE (server-side)\n", - "# Falls back to client-side extraction if AI.GENERATE is not available\n", - "try:\n", - " biz_nodes = cgm.extract_biz_nodes(\n", - " session_ids=session_ids,\n", - " use_ai_generate=True,\n", - " )\n", - " print(f\"Extracted {len(biz_nodes)} business entities:\")\n", - " for node in biz_nodes[:15]:\n", - " print(f\" [{node.node_type}] {node.node_value} \"\n", - " f\"(confidence={node.confidence:.2f})\")\n", - " if len(biz_nodes) > 15:\n", - " print(f\" ... ({len(biz_nodes) - 15} more)\")\n", - "except Exception as exc:\n", - " print(f\"AI.GENERATE extraction not available: {exc}\")\n", - " print(\"Falling back to client-side extraction ...\")\n", - " biz_nodes = cgm.extract_biz_nodes(\n", - " session_ids=session_ids,\n", - " use_ai_generate=False,\n", - " )\n", - " print(f\"Fetched {len(biz_nodes)} raw payloads for manual review.\")" ] }, { - "cell_type": "markdown", "id": "5a903ce8", - "metadata": {}, + "cell_type": "markdown", "source": [ - "### Alternative: Manual Entity Extraction & Storage\n", + "### Alternative: Manual Entity Extraction \u0026 Storage\n", "\n", "If `AI.GENERATE` is not available, you can extract entities client-side and store them via `store_biz_nodes()`." 
- ] + ], + "metadata": {}, + "execution_count": null }, { - "cell_type": "code", - "execution_count": 13, "id": "3e744c32", - "metadata": { - "execution": { - "iopub.execute_input": "2026-03-05T09:29:01.880329Z", - "iopub.status.busy": "2026-03-05T09:29:01.880213Z", - "iopub.status.idle": "2026-03-05T09:29:01.883830Z", - "shell.execute_reply": "2026-03-05T09:29:01.883278Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Manual biz nodes prepared: 7\n", - " [Advertiser] ELF Cosmetics\n", - " [Product] Yahoo Homepage\n", - " [Product] Yahoo Mail\n", - " [Targeting] Millennials 25-34\n", - " [Targeting] Beauty Enthusiasts\n", - " [Budget] $50,000\n", - " [Campaign] ELF_Skincare_May2025\n" - ] - } - ], + "cell_type": "code", "source": [ "from bigquery_agent_analytics import BizNode\n", "\n", @@ -1523,12 +1507,36 @@ "# Uncomment to store in BigQuery:\n", "# success = cgm.store_biz_nodes(manual_biz_nodes)\n", "# print(f\"Stored: {success}\")" + ], + "metadata": { + "execution": { + "iopub.execute_input": "2026-03-05T09:29:01.880329Z", + "iopub.status.busy": "2026-03-05T09:29:01.880213Z", + "iopub.status.idle": "2026-03-05T09:29:01.883830Z", + "shell.execute_reply": "2026-03-05T09:29:01.883278Z" + } + }, + "execution_count": 13, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Manual biz nodes prepared: 7\n", + " [Advertiser] ELF Cosmetics\n", + " [Product] Yahoo Homepage\n", + " [Product] Yahoo Mail\n", + " [Targeting] Millennials 25-34\n", + " [Targeting] Beauty Enthusiasts\n", + " [Budget] $50,000\n", + " [Campaign] ELF_Skincare_May2025\n" + ] + } ] }, { - "cell_type": "markdown", "id": "19f70d11", - "metadata": {}, + "cell_type": "markdown", "source": [ "---\n", "\n", @@ -1539,12 +1547,22 @@ "- **BizNode** — entities from `extracted_biz_nodes` (domain entities)\n", "- **Caused** edges — parent→child span linkage (decision lineage)\n", "- **Evaluated** edges — tech event → business entity cross-links" - ] + ], + "metadata": {}, + "execution_count": null }, { - "cell_type": "code", - "execution_count": 14, "id": "3625ad9f", + "cell_type": "code", + "source": [ + "# Generate and display the Property Graph DDL\n", + "ddl = cgm.get_property_graph_ddl()\n", + "\n", + "print(\"=\" * 70)\n", + "print(\" CREATE PROPERTY GRAPH DDL\")\n", + "print(\"=\" * 70)\n", + "print(ddl)" + ], "metadata": { "execution": { "iopub.execute_input": "2026-03-05T09:29:01.885148Z", @@ -1553,6 +1571,7 @@ "shell.execute_reply": "2026-03-05T09:29:01.886715Z" } }, + "execution_count": 14, "outputs": [ { "name": "stdout", @@ -1590,14 +1609,14 @@ " )\n", " )\n", " EDGE TABLES (\n", - " -- Causal lineage: parent span -> child span\n", + " -- Causal lineage: parent span -\u003e child span\n", " `test-project-0728-467323.agent_analytics.agent_events` AS Caused\n", " KEY (span_id)\n", " SOURCE KEY (parent_span_id) REFERENCES TechNode (span_id)\n", " DESTINATION KEY (span_id) REFERENCES TechNode (span_id)\n", " LABEL Caused,\n", "\n", - " -- Cross-link: technical event -> business entity it evaluated\n", + " -- Cross-link: technical event -\u003e business entity it evaluated\n", " `test-project-0728-467323.agent_analytics.adcp_cross_links` AS Evaluated\n", " KEY (span_id)\n", " SOURCE KEY (span_id) REFERENCES TechNode (span_id)\n", @@ -1607,21 +1626,39 @@ "\n" ] } - ], - "source": [ - "# Generate and display the Property Graph DDL\n", - "ddl = cgm.get_property_graph_ddl()\n", - "\n", - "print(\"=\" * 70)\n", - "print(\" CREATE PROPERTY GRAPH DDL\")\n", - 
"print(\"=\" * 70)\n", - "print(ddl)" ] }, { - "cell_type": "code", - "execution_count": 15, "id": "08c600a1", + "cell_type": "code", + "source": [ + "# Create the Property Graph in BigQuery\n", + "# NOTE: Property Graphs require BigQuery Studio (BQ Notebooks).\n", + "# If running outside BQ Studio, the DDL and GQL are generated for you\n", + "# to copy-paste into a BigQuery Studio notebook cell with %%bigquery magic.\n", + "\n", + "# Step 1: Create cross-links\n", + "try:\n", + " cross_links_ok = cgm.create_cross_links(session_ids)\n", + " print(f\"Cross-links created: {cross_links_ok}\")\n", + "except Exception as exc:\n", + " print(f\"Cross-links creation: {exc}\")\n", + "\n", + "# Step 2: Create the Property Graph\n", + "try:\n", + " graph_ok = cgm.create_property_graph()\n", + " print(f\"Property Graph created: {graph_ok}\")\n", + "except Exception as exc:\n", + " print(f\"Property Graph creation: {exc}\")\n", + " print(\"\\n--- To create the Property Graph, run the DDL above in ---\")\n", + " print(\"--- a BigQuery Studio notebook cell: ---\")\n", + " print(\"--- %%bigquery ---\")\n", + " print(\"--- CREATE OR REPLACE PROPERTY GRAPH ... ---\")\n", + " print(\"--- ---\")\n", + " print(\"--- Reference: https://github.com/GoogleCloudPlatform/ ---\")\n", + " print(\"--- devrel-demos/blob/main/data-analytics/ ---\")\n", + " print(\"--- knowledge_graph_demo/kg_demo_template.ipynb ---\")" + ], "metadata": { "execution": { "iopub.execute_input": "2026-03-05T09:29:01.888414Z", @@ -1630,6 +1667,7 @@ "shell.execute_reply": "2026-03-05T09:29:07.897303Z" } }, + "execution_count": 15, "outputs": [ { "name": "stdout", @@ -1656,70 +1694,24 @@ "Property Graph created: False\n" ] } - ], - "source": [ - "# Create the Property Graph in BigQuery\n", - "# NOTE: Property Graphs require BigQuery Studio (BQ Notebooks).\n", - "# If running outside BQ Studio, the DDL and GQL are generated for you\n", - "# to copy-paste into a BigQuery Studio notebook cell with %%bigquery magic.\n", - "\n", - "# Step 1: Create cross-links\n", - "try:\n", - " cross_links_ok = cgm.create_cross_links(session_ids)\n", - " print(f\"Cross-links created: {cross_links_ok}\")\n", - "except Exception as exc:\n", - " print(f\"Cross-links creation: {exc}\")\n", - "\n", - "# Step 2: Create the Property Graph\n", - "try:\n", - " graph_ok = cgm.create_property_graph()\n", - " print(f\"Property Graph created: {graph_ok}\")\n", - "except Exception as exc:\n", - " print(f\"Property Graph creation: {exc}\")\n", - " print(\"\\n--- To create the Property Graph, run the DDL above in ---\")\n", - " print(\"--- a BigQuery Studio notebook cell: ---\")\n", - " print(\"--- %%bigquery ---\")\n", - " print(\"--- CREATE OR REPLACE PROPERTY GRAPH ... ---\")\n", - " print(\"--- ---\")\n", - " print(\"--- Reference: https://github.com/GoogleCloudPlatform/ ---\")\n", - " print(\"--- devrel-demos/blob/main/data-analytics/ ---\")\n", - " print(\"--- knowledge_graph_demo/kg_demo_template.ipynb ---\")" ] }, { - "cell_type": "markdown", "id": "q9v6s6cftur", - "metadata": {}, + "cell_type": "markdown", "source": [ "### Run in BigQuery Studio (BQ Notebooks)\n", "\n", "Property Graphs and GQL visualization require **BigQuery Studio**. When running this notebook in BQ Studio, uncomment and execute the `%%bigquery` cells below. 
The graph will render as an interactive visualization.\n", "\n", - "> **Reference**: [Knowledge Graph Demo by Google Cloud](https://github.com/GoogleCloudPlatform/devrel-demos/blob/main/data-analytics/knowledge_graph_demo/kg_demo_template.ipynb)" - ] + "\u003e **Reference**: [Knowledge Graph Demo by Google Cloud](https://github.com/GoogleCloudPlatform/devrel-demos/blob/main/data-analytics/knowledge_graph_demo/kg_demo_template.ipynb)" + ], + "metadata": {}, + "execution_count": null }, { - "cell_type": "code", - "execution_count": 16, "id": "66lobz9hhp4", - "metadata": { - "execution": { - "iopub.execute_input": "2026-03-05T09:29:07.901006Z", - "iopub.status.busy": "2026-03-05T09:29:07.900843Z", - "iopub.status.idle": "2026-03-05T09:29:07.905103Z", - "shell.execute_reply": "2026-03-05T09:29:07.904010Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Copy the DDL from the cell above into a %%bigquery cell in BQ Studio.\n", - "The Property Graph DDL was generated by the SDK in the previous cell.\n" - ] - } - ], + "cell_type": "code", "source": [ "# ============================================================\n", "# BQ Studio: Create Property Graph\n", @@ -1755,12 +1747,30 @@ "# Print the DDL for copy-paste into BQ Studio\n", "print(\"Copy the DDL from the cell above into a %%bigquery cell in BQ Studio.\")\n", "print(\"The Property Graph DDL was generated by the SDK in the previous cell.\")" + ], + "metadata": { + "execution": { + "iopub.execute_input": "2026-03-05T09:29:07.901006Z", + "iopub.status.busy": "2026-03-05T09:29:07.900843Z", + "iopub.status.idle": "2026-03-05T09:29:07.905103Z", + "shell.execute_reply": "2026-03-05T09:29:07.904010Z" + } + }, + "execution_count": 16, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Copy the DDL from the cell above into a %%bigquery cell in BQ Studio.\n", + "The Property Graph DDL was generated by the SDK in the previous cell.\n" + ] + } ] }, { - "cell_type": "markdown", "id": "3981ee96", - "metadata": {}, + "cell_type": "markdown", "source": [ "---\n", "\n", @@ -1768,15 +1778,29 @@ "\n", "With the Property Graph in place, we use **Graph Query Language (GQL)** to answer the question:\n", "\n", - "> _\"Why was the Yahoo Homepage selected for the $50K ELF campaign?\"_\n", + "\u003e _\"Why was the Yahoo Homepage selected for the $50K ELF campaign?\"_\n", "\n", "The GQL query traces causal hops from the final decision back to the business inputs that informed it." 
- ] + ], + "metadata": {}, + "execution_count": null }, { - "cell_type": "code", - "execution_count": 17, "id": "cf8343eb", + "cell_type": "code", + "source": [ + "# Generate the GQL reasoning chain query\n", + "gql_query = cgm.get_reasoning_chain_gql(\n", + " decision_event_type=\"AGENT_COMPLETED\",\n", + " biz_entity=\"Yahoo Homepage\",\n", + " max_hops=15,\n", + ")\n", + "\n", + "print(\"=\" * 70)\n", + "print(\" GQL: Why was Yahoo Homepage selected?\")\n", + "print(\"=\" * 70)\n", + "print(gql_query)" + ], "metadata": { "execution": { "iopub.execute_input": "2026-03-05T09:29:07.907407Z", @@ -1785,6 +1809,7 @@ "shell.execute_reply": "2026-03-05T09:29:07.909808Z" } }, + "execution_count": 17, "outputs": [ { "name": "stdout", @@ -1795,8 +1820,8 @@ "======================================================================\n", "GRAPH `test-project-0728-467323.agent_analytics.adcp_context_graph`\n", "MATCH\n", - " (decision:TechNode)-[c:Caused]->{1,15}(step:TechNode)\n", - " -[e:Evaluated]->(biz:BizNode)\n", + " (decision:TechNode)-[c:Caused]-\u003e{1,15}(step:TechNode)\n", + " -[e:Evaluated]-\u003e(biz:BizNode)\n", "WHERE decision.event_type = @decision_event_type\n", " AND biz.node_value = 'Yahoo Homepage'\n", "RETURN\n", @@ -1822,25 +1847,30 @@ "\n" ] } - ], - "source": [ - "# Generate the GQL reasoning chain query\n", - "gql_query = cgm.get_reasoning_chain_gql(\n", - " decision_event_type=\"AGENT_COMPLETED\",\n", - " biz_entity=\"Yahoo Homepage\",\n", - " max_hops=15,\n", - ")\n", - "\n", - "print(\"=\" * 70)\n", - "print(\" GQL: Why was Yahoo Homepage selected?\")\n", - "print(\"=\" * 70)\n", - "print(gql_query)" ] }, { - "cell_type": "code", - "execution_count": 18, "id": "6bc59a25", + "cell_type": "code", + "source": [ + "# Execute the GQL query (requires Property Graph to be created)\n", + "try:\n", + " chain = cgm.explain_decision(\n", + " decision_event_type=\"AGENT_COMPLETED\",\n", + " biz_entity=\"Yahoo Homepage\",\n", + " )\n", + " print(f\"Reasoning chain: {len(chain)} steps\")\n", + " for step in chain:\n", + " print(f\" [{step.get('step_type', '?')}] \"\n", + " f\"{step.get('step_agent', '?')}: \"\n", + " f\"{step.get('reasoning_text', '')[:150]}\")\n", + " print(f\" -\u003e Entity: {step.get('entity_type', '?')}: \"\n", + " f\"{step.get('entity_value', '?')}\")\n", + "except Exception as exc:\n", + " print(f\"GQL traversal: {exc}\")\n", + " print(\"\\nThe GQL query above can be run in BigQuery Console\")\n", + " print(\"once the Property Graph is created.\")" + ], "metadata": { "execution": { "iopub.execute_input": "2026-03-05T09:29:07.912716Z", @@ -1849,6 +1879,7 @@ "shell.execute_reply": "2026-03-05T09:29:08.132739Z" } }, + "execution_count": 18, "outputs": [ { "name": "stderr", @@ -1868,31 +1899,23 @@ "Reasoning chain: 0 steps\n" ] } - ], - "source": [ - "# Execute the GQL query (requires Property Graph to be created)\n", - "try:\n", - " chain = cgm.explain_decision(\n", - " decision_event_type=\"AGENT_COMPLETED\",\n", - " biz_entity=\"Yahoo Homepage\",\n", - " )\n", - " print(f\"Reasoning chain: {len(chain)} steps\")\n", - " for step in chain:\n", - " print(f\" [{step.get('step_type', '?')}] \"\n", - " f\"{step.get('step_agent', '?')}: \"\n", - " f\"{step.get('reasoning_text', '')[:150]}\")\n", - " print(f\" -> Entity: {step.get('entity_type', '?')}: \"\n", - " f\"{step.get('entity_value', '?')}\")\n", - "except Exception as exc:\n", - " print(f\"GQL traversal: {exc}\")\n", - " print(\"\\nThe GQL query above can be run in BigQuery Console\")\n", - " print(\"once the 
Property Graph is created.\")" ] }, { - "cell_type": "code", - "execution_count": 19, "id": "335bc8db", + "cell_type": "code", + "source": [ + "# Full causal chain for the ELF campaign session\n", + "causal_gql = cgm.get_causal_chain_gql(\n", + " session_id=session_ids[0],\n", + " max_hops=20,\n", + ")\n", + "\n", + "print(\"=\" * 70)\n", + "print(\" GQL: Full Causal Chain for ELF Campaign\")\n", + "print(\"=\" * 70)\n", + "print(causal_gql)" + ], "metadata": { "execution": { "iopub.execute_input": "2026-03-05T09:29:08.136944Z", @@ -1901,6 +1924,7 @@ "shell.execute_reply": "2026-03-05T09:29:08.138893Z" } }, + "execution_count": 19, "outputs": [ { "name": "stdout", @@ -1911,7 +1935,7 @@ "======================================================================\n", "GRAPH `test-project-0728-467323.agent_analytics.adcp_context_graph`\n", "MATCH\n", - " (root:TechNode)-[c:Caused]->{1,20}(leaf:TechNode)\n", + " (root:TechNode)-[c:Caused]-\u003e{1,20}(leaf:TechNode)\n", "WHERE root.session_id = @session_id\n", " AND root.event_type = 'USER_MESSAGE_RECEIVED'\n", "RETURN\n", @@ -1933,42 +1957,11 @@ "\n" ] } - ], - "source": [ - "# Full causal chain for the ELF campaign session\n", - "causal_gql = cgm.get_causal_chain_gql(\n", - " session_id=session_ids[0],\n", - " max_hops=20,\n", - ")\n", - "\n", - "print(\"=\" * 70)\n", - "print(\" GQL: Full Causal Chain for ELF Campaign\")\n", - "print(\"=\" * 70)\n", - "print(causal_gql)" ] }, { - "cell_type": "code", - "execution_count": 20, "id": "xif3qepf5ng", - "metadata": { - "execution": { - "iopub.execute_input": "2026-03-05T09:29:08.140628Z", - "iopub.status.busy": "2026-03-05T09:29:08.140553Z", - "iopub.status.idle": "2026-03-05T09:29:08.143016Z", - "shell.execute_reply": "2026-03-05T09:29:08.142610Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "GQL visualization cells are ready for BQ Studio.\n", - "Uncomment the %%bigquery --graph cells above when running in BQ Studio.\n" - ] - } - ], + "cell_type": "code", "source": [ "# ============================================================\n", "# BQ Studio: Visualize the Context Graph\n", @@ -1979,7 +1972,7 @@ "# --- Visualize ALL relationships in the Context Graph ---\n", "# %%bigquery --graph display_only\n", "# GRAPH `agent_analytics.adcp_context_graph`\n", - "# MATCH (source)-[r]->(target)\n", + "# MATCH (source)-[r]-\u003e(target)\n", "# RETURN\n", "# TO_JSON(source) AS Source_Node,\n", "# TO_JSON(r) AS Edge,\n", @@ -1988,8 +1981,8 @@ "# --- Reasoning Chain: Why was Yahoo Homepage selected? 
---\n", "# %%bigquery --graph display_only\n", "# GRAPH `agent_analytics.adcp_context_graph`\n", - "# MATCH (decision:TechNode)-[c:Caused]->{1,15}(step:TechNode)\n", - "# -[e:Evaluated]->(biz:BizNode)\n", + "# MATCH (decision:TechNode)-[c:Caused]-\u003e{1,15}(step:TechNode)\n", + "# -[e:Evaluated]-\u003e(biz:BizNode)\n", "# WHERE decision.event_type = 'AGENT_COMPLETED'\n", "# AND biz.node_value = 'Yahoo Homepage'\n", "# RETURN\n", @@ -2002,7 +1995,7 @@ "# --- Full Causal Chain for a session ---\n", "# %%bigquery --graph display_only\n", "# GRAPH `agent_analytics.adcp_context_graph`\n", - "# MATCH (root:TechNode)-[c:Caused]->{1,20}(leaf:TechNode)\n", + "# MATCH (root:TechNode)-[c:Caused]-\u003e{1,20}(leaf:TechNode)\n", "# WHERE root.event_type = 'USER_MESSAGE_RECEIVED'\n", "# RETURN\n", "# TO_JSON(root) AS root_node,\n", @@ -2011,12 +2004,30 @@ "\n", "print(\"GQL visualization cells are ready for BQ Studio.\")\n", "print(\"Uncomment the %%bigquery --graph cells above when running in BQ Studio.\")" + ], + "metadata": { + "execution": { + "iopub.execute_input": "2026-03-05T09:29:08.140628Z", + "iopub.status.busy": "2026-03-05T09:29:08.140553Z", + "iopub.status.idle": "2026-03-05T09:29:08.143016Z", + "shell.execute_reply": "2026-03-05T09:29:08.142610Z" + } + }, + "execution_count": 20, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "GQL visualization cells are ready for BQ Studio.\n", + "Uncomment the %%bigquery --graph cells above when running in BQ Studio.\n" + ] + } ] }, { - "cell_type": "markdown", "id": "f4db3f75", - "metadata": {}, + "cell_type": "markdown", "source": [ "---\n", "\n", @@ -2028,36 +2039,13 @@ "- Target audience segments have shifted\n", "\n", "We run a **\"diff check\"** before the HITL approval is finalized. The SDK traverses the graph to find the original BizNodes, queries current availability, and alerts the manager if the \"World State\" has drifted." 
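Before wiring this into an approval flow, it helps to decide what actually blocks sign-off. A minimal sketch of such a gate, assuming a world-change report like the one produced in the next cell, whose `alerts` items carry a `severity` and a `recommendation` (field names inferred from the output below):

```python
# Sketch: block HITL approval when any high-severity drift alert is present.
def gate_hitl_approval(report, severity_cutoff: float = 0.8) -> bool:
    blocking = [
        alert for alert in report.alerts
        if getattr(alert, "severity", 0.0) >= severity_cutoff
    ]
    if blocking:
        print("Approval blocked -- world state has drifted:")
        for alert in blocking:
            print(f"  - {alert.recommendation}")
        return False
    print("No blocking drift detected -- safe to route for approval.")
    return True
```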
- ] + ], + "metadata": {}, + "execution_count": null }, { - "cell_type": "code", - "execution_count": 21, "id": "2079e426", - "metadata": { - "execution": { - "iopub.execute_input": "2026-03-05T09:29:08.144314Z", - "iopub.status.busy": "2026-03-05T09:29:08.144248Z", - "iopub.status.idle": "2026-03-05T09:29:09.150800Z", - "shell.execute_reply": "2026-03-05T09:29:09.149816Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "World Change Report — Session: adcp-a20d176b82af\n", - " Entities checked : 67\n", - " Stale entities : 4\n", - " Safe to approve : False\n", - " [inventory_depleted] Yahoo Homepage: Product: Yahoo Homepage -> SOLD OUT -- Q2 inventory depleted (severity=0.95)\n", - " [inventory_depleted] Yahoo Homepage: Product: Yahoo Homepage -> SOLD OUT -- Q2 inventory depleted (severity=0.95)\n", - " [inventory_depleted] Yahoo Homepage: Product: Yahoo Homepage -> SOLD OUT -- Q2 inventory depleted (severity=0.95)\n", - " [inventory_depleted] Yahoo Homepage: Product: Yahoo Homepage -> SOLD OUT -- Q2 inventory depleted (severity=0.95)\n" - ] - } - ], + "cell_type": "code", "source": [ "from bigquery_agent_analytics import WorldChangeReport\n", "\n", @@ -2125,32 +2113,84 @@ " )\n", " print(manual_report.summary())\n", " print(f\"\\nRecommendation: {manual_report.alerts[0].recommendation}\")" + ], + "metadata": { + "execution": { + "iopub.execute_input": "2026-03-05T09:29:08.144314Z", + "iopub.status.busy": "2026-03-05T09:29:08.144248Z", + "iopub.status.idle": "2026-03-05T09:29:09.150800Z", + "shell.execute_reply": "2026-03-05T09:29:09.149816Z" + } + }, + "execution_count": 21, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "World Change Report — Session: adcp-a20d176b82af\n", + " Entities checked : 67\n", + " Stale entities : 4\n", + " Safe to approve : False\n", + " [inventory_depleted] Yahoo Homepage: Product: Yahoo Homepage -\u003e SOLD OUT -- Q2 inventory depleted (severity=0.95)\n", + " [inventory_depleted] Yahoo Homepage: Product: Yahoo Homepage -\u003e SOLD OUT -- Q2 inventory depleted (severity=0.95)\n", + " [inventory_depleted] Yahoo Homepage: Product: Yahoo Homepage -\u003e SOLD OUT -- Q2 inventory depleted (severity=0.95)\n", + " [inventory_depleted] Yahoo Homepage: Product: Yahoo Homepage -\u003e SOLD OUT -- Q2 inventory depleted (severity=0.95)\n" + ] + } ] }, { - "cell_type": "markdown", "id": "f0fed921", - "metadata": {}, + "cell_type": "markdown", "source": [ "---\n", "\n", "## Phase 9: SDK Evaluation Pipeline\n", "\n", "Now we evaluate the ADCP agent's performance using the full SDK evaluation stack." - ] + ], + "metadata": {}, + "execution_count": null }, { - "cell_type": "markdown", "id": "8cc77228", - "metadata": {}, + "cell_type": "markdown", "source": [ "### 9a. 
Code-Based Evaluation" - ] + ], + "metadata": {}, + "execution_count": null }, { - "cell_type": "code", - "execution_count": 22, "id": "b2a911a2", + "cell_type": "code", + "source": [ + "from bigquery_agent_analytics import CodeEvaluator\n", + "\n", + "trace_filter = TraceFilter(session_ids=session_ids)\n", + "\n", + "presets = [\n", + " (\"latency\", CodeEvaluator.latency(threshold_ms=30000)),\n", + " (\"turn_count\", CodeEvaluator.turn_count(max_turns=10)),\n", + " (\"error_rate\", CodeEvaluator.error_rate(max_error_rate=0.1)),\n", + " (\"token_efficiency\", CodeEvaluator.token_efficiency(max_tokens=100000)),\n", + "]\n", + "\n", + "for name, evaluator in presets:\n", + " try:\n", + " report = asyncio.get_event_loop().run_until_complete(\n", + " asyncio.to_thread(\n", + " client.evaluate,\n", + " evaluator=evaluator,\n", + " filters=trace_filter,\n", + " )\n", + " )\n", + " print(f\"\\n[{name}]\")\n", + " print(report.summary())\n", + " except Exception as exc:\n", + " print(f\"\\n[{name}] Failed: {exc}\")" + ], "metadata": { "execution": { "iopub.execute_input": "2026-03-05T09:29:09.153081Z", @@ -2159,6 +2199,7 @@ "shell.execute_reply": "2026-03-05T09:29:13.674462Z" } }, + "execution_count": 22, "outputs": [ { "name": "stdout", @@ -2220,46 +2261,44 @@ " token_efficiency: 0.932\n" ] } - ], - "source": [ - "from bigquery_agent_analytics import CodeEvaluator\n", - "\n", - "trace_filter = TraceFilter(session_ids=session_ids)\n", - "\n", - "presets = [\n", - " (\"latency\", CodeEvaluator.latency(threshold_ms=30000)),\n", - " (\"turn_count\", CodeEvaluator.turn_count(max_turns=10)),\n", - " (\"error_rate\", CodeEvaluator.error_rate(max_error_rate=0.1)),\n", - " (\"token_efficiency\", CodeEvaluator.token_efficiency(max_tokens=100000)),\n", - "]\n", - "\n", - "for name, evaluator in presets:\n", - " try:\n", - " report = asyncio.get_event_loop().run_until_complete(\n", - " asyncio.to_thread(\n", - " client.evaluate,\n", - " evaluator=evaluator,\n", - " filters=trace_filter,\n", - " )\n", - " )\n", - " print(f\"\\n[{name}]\")\n", - " print(report.summary())\n", - " except Exception as exc:\n", - " print(f\"\\n[{name}] Failed: {exc}\")" ] }, { - "cell_type": "markdown", "id": "724698e1", - "metadata": {}, + "cell_type": "markdown", "source": [ "### 9b. 
LLM-as-Judge Evaluation" - ] + ], + "metadata": {}, + "execution_count": null }, { - "cell_type": "code", - "execution_count": 23, "id": "056c9080", + "cell_type": "code", + "source": [ + "from bigquery_agent_analytics import LLMAsJudge\n", + "\n", + "# Correctness: Does the agent follow ADCP protocol correctly?\n", + "judge_correctness = LLMAsJudge.correctness(threshold=0.6)\n", + "try:\n", + " report = asyncio.get_event_loop().run_until_complete(\n", + " asyncio.to_thread(\n", + " client.evaluate,\n", + " evaluator=judge_correctness,\n", + " filters=trace_filter,\n", + " )\n", + " )\n", + " print(\"[LLM Judge: Correctness]\")\n", + " print(report.summary())\n", + " print(\"\\nPer-session details:\")\n", + " for ss in report.session_scores:\n", + " print(f\" {ss.session_id}: scores={ss.scores} \"\n", + " f\"passed={ss.passed}\")\n", + " if ss.llm_feedback:\n", + " print(f\" Feedback: {ss.llm_feedback[:200]}\")\n", + "except Exception as exc:\n", + " print(f\"Correctness judge failed: {exc}\")" + ], "metadata": { "execution": { "iopub.execute_input": "2026-03-05T09:29:13.677450Z", @@ -2268,6 +2307,7 @@ "shell.execute_reply": "2026-03-05T09:29:38.673907Z" } }, + "execution_count": 23, "outputs": [ { "name": "stdout", @@ -2291,69 +2331,25 @@ " Feedback: correctness: The agent correctly identified the need to provision the campaign in Google Ad Manager, executed the relevant tool, and provided a comprehensive summary of the provisioned campaign detail\n" ] } - ], - "source": [ - "from bigquery_agent_analytics import LLMAsJudge\n", - "\n", - "# Correctness: Does the agent follow ADCP protocol correctly?\n", - "judge_correctness = LLMAsJudge.correctness(threshold=0.6)\n", - "try:\n", - " report = asyncio.get_event_loop().run_until_complete(\n", - " asyncio.to_thread(\n", - " client.evaluate,\n", - " evaluator=judge_correctness,\n", - " filters=trace_filter,\n", - " )\n", - " )\n", - " print(\"[LLM Judge: Correctness]\")\n", - " print(report.summary())\n", - " print(\"\\nPer-session details:\")\n", - " for ss in report.session_scores:\n", - " print(f\" {ss.session_id}: scores={ss.scores} \"\n", - " f\"passed={ss.passed}\")\n", - " if ss.llm_feedback:\n", - " print(f\" Feedback: {ss.llm_feedback[:200]}\")\n", - "except Exception as exc:\n", - " print(f\"Correctness judge failed: {exc}\")" ] }, { - "cell_type": "markdown", "id": "55bca58f", - "metadata": {}, + "cell_type": "markdown", "source": [ "### 9c. 
Trajectory Matching -- ADCP Workflow Compliance" - ] + ], + "metadata": {}, + "execution_count": null }, { - "cell_type": "code", - "execution_count": 24, "id": "59ad9161", - "metadata": { - "execution": { - "iopub.execute_input": "2026-03-05T09:29:38.676374Z", - "iopub.status.busy": "2026-03-05T09:29:38.676256Z", - "iopub.status.idle": "2026-03-05T09:29:40.307629Z", - "shell.execute_reply": "2026-03-05T09:29:40.307149Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[Trajectory: ELF Campaign -- Full ADCP Workflow (IN_ORDER)]\n", - " Session : adcp-a20d176b82af\n", - " Status : EvalStatus.PASSED\n", - " Scores : {'trajectory_in_order': 1.0, 'step_efficiency': 0.8}\n" - ] - } - ], + "cell_type": "code", "source": [ - "from bigquery_agent_analytics import BigQueryTraceEvaluator\n", + "from bigquery_agent_analytics import PerformanceEvaluator\n", "from bigquery_agent_analytics.trace_evaluator import MatchType\n", "\n", - "trace_evaluator = BigQueryTraceEvaluator(\n", + "trace_evaluator = PerformanceEvaluator(\n", " project_id=PROJECT_ID,\n", " dataset_id=DATASET_ID,\n", " table_id=TABLE_ID,\n", @@ -2389,32 +2385,32 @@ " print(f\" Scores : {result.scores}\")\n", "except Exception as exc:\n", " print(f\"Trajectory evaluation failed: {exc}\")" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "id": "316bcb43", + ], "metadata": { "execution": { - "iopub.execute_input": "2026-03-05T09:29:40.309212Z", - "iopub.status.busy": "2026-03-05T09:29:40.309103Z", - "iopub.status.idle": "2026-03-05T09:29:41.393364Z", - "shell.execute_reply": "2026-03-05T09:29:41.392888Z" + "iopub.execute_input": "2026-03-05T09:29:38.676374Z", + "iopub.status.busy": "2026-03-05T09:29:38.676256Z", + "iopub.status.idle": "2026-03-05T09:29:40.307629Z", + "shell.execute_reply": "2026-03-05T09:29:40.307149Z" } }, + "execution_count": 24, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "[Batch Trajectory Evaluation -- ANY_ORDER]\n", - " adcp-a20d176b82af: EvalStatus.PASSED scores={'trajectory_any_order': 1.0, 'step_efficiency': 0.8}\n", - " adcp-7d9855e7a71b: EvalStatus.PASSED scores={'trajectory_any_order': 1.0, 'step_efficiency': 0.6}\n", - " adcp-2c401a645c40: EvalStatus.PASSED scores={'trajectory_any_order': 1.0, 'step_efficiency': 0.8}\n" + "[Trajectory: ELF Campaign -- Full ADCP Workflow (IN_ORDER)]\n", + " Session : adcp-a20d176b82af\n", + " Status : EvalStatus.PASSED\n", + " Scores : {'trajectory_in_order': 1.0, 'step_efficiency': 0.8}\n" ] } - ], + ] + }, + { + "id": "316bcb43", + "cell_type": "code", "source": [ "# Batch trajectory evaluation across all sessions\n", "eval_dataset = [\n", @@ -2445,44 +2441,41 @@ " f\"scores={r.scores}\")\n", "except Exception as exc:\n", " print(f\"Batch evaluation failed: {exc}\")" - ] - }, - { - "cell_type": "markdown", - "id": "511fd003", - "metadata": {}, - "source": [ - "### 9d. 
Grader Pipeline -- Composite ADCP Quality Score" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "id": "5461103a", + ], "metadata": { "execution": { - "iopub.execute_input": "2026-03-05T09:29:41.395279Z", - "iopub.status.busy": "2026-03-05T09:29:41.395161Z", - "iopub.status.idle": "2026-03-05T09:29:46.400507Z", - "shell.execute_reply": "2026-03-05T09:29:46.399815Z" + "iopub.execute_input": "2026-03-05T09:29:40.309212Z", + "iopub.status.busy": "2026-03-05T09:29:40.309103Z", + "iopub.status.idle": "2026-03-05T09:29:41.393364Z", + "shell.execute_reply": "2026-03-05T09:29:41.392888Z" } }, + "execution_count": 25, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "[GraderPipeline -- ADCP Quality Score]\n", - " Final score : 0.991\n", - " Passed : True\n", - " Strategy : weighted\n", - " Grader breakdown:\n", - " - latency_evaluator: scores={'latency': 0.97443249375} passed=True\n", - " - error_rate_evaluator: scores={'error_rate': 1.0} passed=True\n", - " - correctness_judge: scores={'correctness': 1.0} passed=True\n" + "[Batch Trajectory Evaluation -- ANY_ORDER]\n", + " adcp-a20d176b82af: EvalStatus.PASSED scores={'trajectory_any_order': 1.0, 'step_efficiency': 0.8}\n", + " adcp-7d9855e7a71b: EvalStatus.PASSED scores={'trajectory_any_order': 1.0, 'step_efficiency': 0.6}\n", + " adcp-2c401a645c40: EvalStatus.PASSED scores={'trajectory_any_order': 1.0, 'step_efficiency': 0.8}\n" ] } + ] + }, + { + "id": "511fd003", + "cell_type": "markdown", + "source": [ + "### 9d. Grader Pipeline -- Composite ADCP Quality Score" ], + "metadata": {}, + "execution_count": null + }, + { + "id": "5461103a", + "cell_type": "code", "source": [ "import contextlib\n", "import io\n", @@ -2533,7 +2526,7 @@ " \"turn_count\": sum(\n", " 1 for s in trace.spans if s.event_type == \"user_message\"\n", " ),\n", - " \"has_error\": len(trace.error_spans) > 0,\n", + " \"has_error\": len(trace.error_spans) \u003e 0,\n", " \"input_tokens\": sum(\n", " s.attributes.get(\"input_tokens\", 0) or 0\n", " for s in trace.spans\n", @@ -2573,47 +2566,49 @@ " f\"passed={gr.passed}\")\n", "else:\n", " print(\"Trace not available -- skipping pipeline evaluation.\")" - ] - }, - { - "cell_type": "markdown", - "id": "bd1591ff", - "metadata": {}, - "source": [ - "---\n", - "\n", - "## Phase 10: Eval Suite & Multi-Trial\n", - "\n", - "Define a reusable **EvalSuite** for ADCP workflow compliance and run multi-trial evaluation." - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "id": "32336f6b", + ], "metadata": { "execution": { - "iopub.execute_input": "2026-03-05T09:29:46.402586Z", - "iopub.status.busy": "2026-03-05T09:29:46.402436Z", - "iopub.status.idle": "2026-03-05T09:29:46.407601Z", - "shell.execute_reply": "2026-03-05T09:29:46.406977Z" + "iopub.execute_input": "2026-03-05T09:29:41.395279Z", + "iopub.status.busy": "2026-03-05T09:29:41.395161Z", + "iopub.status.idle": "2026-03-05T09:29:46.400507Z", + "shell.execute_reply": "2026-03-05T09:29:46.399815Z" } }, + "execution_count": 26, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "EvalSuite 'adcp_workflow_evals' -- 3 tasks:\n", - " [capability] elf_full_workflow: ELF Cosmetics full ADCP workflow: brief -> plan -> provision.\n", - " [capability] nike_brief_processing: Nike brief processing: inventory + audience + budget.\n", - " [regression] tesla_end_to_end: Tesla end-to-end: brief -> provision in single turn.\n", - "\n", - "Validation: 1 warnings\n", - " [warning] balance: High positive case ratio (100%). 
Consider adding more negative test cases.\n" + "[GraderPipeline -- ADCP Quality Score]\n", + " Final score : 0.991\n", + " Passed : True\n", + " Strategy : weighted\n", + " Grader breakdown:\n", + " - latency_evaluator: scores={'latency': 0.97443249375} passed=True\n", + " - error_rate_evaluator: scores={'error_rate': 1.0} passed=True\n", + " - correctness_judge: scores={'correctness': 1.0} passed=True\n" ] } + ] + }, + { + "id": "bd1591ff", + "cell_type": "markdown", + "source": [ + "---\n", + "\n", + "## Phase 10: Eval Suite \u0026 Multi-Trial\n", + "\n", + "Define a reusable **EvalSuite** for ADCP workflow compliance and run multi-trial evaluation." ], + "metadata": {}, + "execution_count": null + }, + { + "id": "32336f6b", + "cell_type": "code", "source": [ "from bigquery_agent_analytics import (\n", " EvalSuite,\n", @@ -2627,7 +2622,7 @@ "suite.add_task(EvalTaskDef(\n", " task_id=\"elf_full_workflow\",\n", " session_id=session_ids[0],\n", - " description=\"ELF Cosmetics full ADCP workflow: brief -> plan -> provision.\",\n", + " description=\"ELF Cosmetics full ADCP workflow: brief -\u003e plan -\u003e provision.\",\n", " category=EvalCategory.CAPABILITY,\n", " expected_trajectory=golden_adcp_full,\n", " thresholds={\"trajectory_match\": 0.8, \"latency\": 0.7},\n", @@ -2647,7 +2642,7 @@ "suite.add_task(EvalTaskDef(\n", " task_id=\"tesla_end_to_end\",\n", " session_id=session_ids[2],\n", - " description=\"Tesla end-to-end: brief -> provision in single turn.\",\n", + " description=\"Tesla end-to-end: brief -\u003e provision in single turn.\",\n", " category=EvalCategory.REGRESSION,\n", " expected_trajectory=golden_adcp_full,\n", " thresholds={\"trajectory_match\": 0.8},\n", @@ -2665,38 +2660,35 @@ " print(f\" [{w.severity}] {w.check_name}: {w.message}\")\n", "if not warnings:\n", " print(\" Suite looks healthy!\")" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "id": "5c8d9c5a", + ], "metadata": { "execution": { - "iopub.execute_input": "2026-03-05T09:29:46.409275Z", - "iopub.status.busy": "2026-03-05T09:29:46.409161Z", - "iopub.status.idle": "2026-03-05T09:29:55.312224Z", - "shell.execute_reply": "2026-03-05T09:29:55.311350Z" + "iopub.execute_input": "2026-03-05T09:29:46.402586Z", + "iopub.status.busy": "2026-03-05T09:29:46.402436Z", + "iopub.status.idle": "2026-03-05T09:29:46.407601Z", + "shell.execute_reply": "2026-03-05T09:29:46.406977Z" } }, + "execution_count": 27, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "[Multi-Trial -- ELF Campaign, 3 trials]\n", - " pass@k : 1.000\n", - " pass^k : 1.000\n", - " per_trial_pass_rate: 1.000\n", - " mean_scores : {'llm_judge_efficiency': 0.8, 'llm_judge_reasoning': 0.26666666666666666, 'llm_judge_task_completion': 0.9666666666666667, 'llm_judge_tool_usage': 0.9333333333333333, 'step_efficiency': 0.8, 'trajectory_in_order': 1.0}\n", + "EvalSuite 'adcp_workflow_evals' -- 3 tasks:\n", + " [capability] elf_full_workflow: ELF Cosmetics full ADCP workflow: brief -\u003e plan -\u003e provision.\n", + " [capability] nike_brief_processing: Nike brief processing: inventory + audience + budget.\n", + " [regression] tesla_end_to_end: Tesla end-to-end: brief -\u003e provision in single turn.\n", "\n", - " Per-trial results:\n", - " Trial 0: passed=True scores={'trajectory_in_order': 1.0, 'step_efficiency': 0.8, 'llm_judge_task_completion': 1.0, 'llm_judge_efficiency': 0.8, 'llm_judge_tool_usage': 0.9}\n", - " Trial 1: passed=True scores={'trajectory_in_order': 1.0, 'step_efficiency': 0.8, 'llm_judge_task_completion': 
1.0, 'llm_judge_efficiency': 0.9, 'llm_judge_tool_usage': 1.0}\n", - " Trial 2: passed=True scores={'trajectory_in_order': 1.0, 'step_efficiency': 0.8, 'llm_judge_task_completion': 0.9, 'llm_judge_efficiency': 0.7, 'llm_judge_tool_usage': 0.9, 'llm_judge_reasoning': 0.8}\n" + "Validation: 1 warnings\n", + " [warning] balance: High positive case ratio (100%). Consider adding more negative test cases.\n" ] } - ], + ] + }, + { + "id": "5c8d9c5a", + "cell_type": "code", "source": [ "from bigquery_agent_analytics import TrialRunner\n", "\n", @@ -2713,79 +2705,64 @@ " golden_trajectory=golden_adcp_full,\n", " match_type=MatchType.IN_ORDER,\n", " use_llm_judge=True,\n", - " )\n", - " )\n", - " print(\"[Multi-Trial -- ELF Campaign, 3 trials]\")\n", - " print(f\" pass@k : {trial_report.pass_at_k:.3f}\")\n", - " print(f\" pass^k : {trial_report.pass_pow_k:.3f}\")\n", - " print(f\" per_trial_pass_rate: {trial_report.per_trial_pass_rate:.3f}\")\n", - " print(f\" mean_scores : {trial_report.mean_scores}\")\n", - " print(f\"\\n Per-trial results:\")\n", - " for tr in trial_report.trial_results:\n", - " print(f\" Trial {tr.trial_index}: passed={tr.passed} \"\n", - " f\"scores={tr.scores}\")\n", - "except Exception as exc:\n", - " print(f\"Multi-trial evaluation failed: {exc}\")" - ] - }, - { - "cell_type": "markdown", - "id": "94ad7bbb", - "metadata": {}, - "source": [ - "---\n", - "\n", - "## Phase 11: AI-Powered Insights Report\n", - "\n", - "Generate a comprehensive insights report analyzing all ADCP sessions." - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "id": "57d59966", + " )\n", + " )\n", + " print(\"[Multi-Trial -- ELF Campaign, 3 trials]\")\n", + " print(f\" pass@k : {trial_report.pass_at_k:.3f}\")\n", + " print(f\" pass^k : {trial_report.pass_pow_k:.3f}\")\n", + " print(f\" per_trial_pass_rate: {trial_report.per_trial_pass_rate:.3f}\")\n", + " print(f\" mean_scores : {trial_report.mean_scores}\")\n", + " print(f\"\\n Per-trial results:\")\n", + " for tr in trial_report.trial_results:\n", + " print(f\" Trial {tr.trial_index}: passed={tr.passed} \"\n", + " f\"scores={tr.scores}\")\n", + "except Exception as exc:\n", + " print(f\"Multi-trial evaluation failed: {exc}\")" + ], "metadata": { "execution": { - "iopub.execute_input": "2026-03-05T09:29:55.315628Z", - "iopub.status.busy": "2026-03-05T09:29:55.315444Z", - "iopub.status.idle": "2026-03-05T09:31:21.970015Z", - "shell.execute_reply": "2026-03-05T09:31:21.969378Z" + "iopub.execute_input": "2026-03-05T09:29:46.409275Z", + "iopub.status.busy": "2026-03-05T09:29:46.409161Z", + "iopub.status.idle": "2026-03-05T09:29:55.312224Z", + "shell.execute_reply": "2026-03-05T09:29:55.311350Z" } }, + "execution_count": 28, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "[ADCP Insights Report]\n", - "Agent Insights Report\n", - " Generated: 2026-03-05 09:31 UTC\n", - " Sessions analyzed: 3\n", - " Success rate: 100%\n", - " Avg effectiveness: 10.0/10\n", - " Avg latency: 4737ms\n", - " Avg turns: 1.3\n", - " Error rate: 0.0%\n", - "\n", - " Top Goals:\n", - " data_retrieval: 3\n", - " planning: 3\n", - " analysis: 2\n", - " task_automation: 2\n", - " Outcomes:\n", - " success: 3\n", + "[Multi-Trial -- ELF Campaign, 3 trials]\n", + " pass@k : 1.000\n", + " pass^k : 1.000\n", + " per_trial_pass_rate: 1.000\n", + " mean_scores : {'llm_judge_efficiency': 0.8, 'llm_judge_reasoning': 0.26666666666666666, 'llm_judge_task_completion': 0.9666666666666667, 'llm_judge_tool_usage': 0.9333333333333333, 'step_efficiency': 0.8, 
'trajectory_in_order': 1.0}\n", "\n", - " Analysis Sections:\n", - " - Task Areas\n", - " - Interaction Patterns\n", - " - What Works Well\n", - " - Friction Analysis\n", - " - Tool Usage Patterns\n", - " - Improvement Suggestions\n", - " - Trends & Anomalies\n" + " Per-trial results:\n", + " Trial 0: passed=True scores={'trajectory_in_order': 1.0, 'step_efficiency': 0.8, 'llm_judge_task_completion': 1.0, 'llm_judge_efficiency': 0.8, 'llm_judge_tool_usage': 0.9}\n", + " Trial 1: passed=True scores={'trajectory_in_order': 1.0, 'step_efficiency': 0.8, 'llm_judge_task_completion': 1.0, 'llm_judge_efficiency': 0.9, 'llm_judge_tool_usage': 1.0}\n", + " Trial 2: passed=True scores={'trajectory_in_order': 1.0, 'step_efficiency': 0.8, 'llm_judge_task_completion': 0.9, 'llm_judge_efficiency': 0.7, 'llm_judge_tool_usage': 0.9, 'llm_judge_reasoning': 0.8}\n" ] } + ] + }, + { + "id": "94ad7bbb", + "cell_type": "markdown", + "source": [ + "---\n", + "\n", + "## Phase 11: AI-Powered Insights Report\n", + "\n", + "Generate a comprehensive insights report analyzing all ADCP sessions." ], + "metadata": {}, + "execution_count": null + }, + { + "id": "57d59966", + "cell_type": "code", "source": [ "import time as _time\n", "from bigquery_agent_analytics import InsightsConfig\n", @@ -2798,7 +2775,7 @@ " return fn()\n", " except Exception as exc:\n", " is_rate_limit = \"429\" in str(exc) or \"RESOURCE_EXHAUSTED\" in str(exc)\n", - " if is_rate_limit and attempt < max_retries:\n", + " if is_rate_limit and attempt \u003c max_retries:\n", " delay = base_delay * (2 ** attempt)\n", " print(f\" Rate limited (attempt {attempt + 1}/{max_retries + 1}),\"\n", " f\" retrying in {delay:.0f}s ...\")\n", @@ -2824,30 +2801,54 @@ " print(insights_report.summary())\n", "except Exception as exc:\n", " print(f\"Insights generation failed after retries: {exc}\")" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "id": "9f00660f", + ], "metadata": { "execution": { - "iopub.execute_input": "2026-03-05T09:31:21.975014Z", - "iopub.status.busy": "2026-03-05T09:31:21.974876Z", - "iopub.status.idle": "2026-03-05T09:31:21.978303Z", - "shell.execute_reply": "2026-03-05T09:31:21.977766Z" + "iopub.execute_input": "2026-03-05T09:29:55.315628Z", + "iopub.status.busy": "2026-03-05T09:29:55.315444Z", + "iopub.status.idle": "2026-03-05T09:31:21.970015Z", + "shell.execute_reply": "2026-03-05T09:31:21.969378Z" } }, + "execution_count": 29, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "[Executive Summary]\n", - "The agent demonstrates exceptional performance, achieving a 100% success rate and perfect effectiveness with high user satisfaction across all interactions. Users primarily leverage it for critical campaign management tasks, including media planning, data retrieval, and provisioning, often through short, task-oriented sessions. The most notable friction point is a significant average latency of 4.7 seconds per interaction, which could impact user perception despite successful task completion. While functionally flawless, implementing advanced user feedback and sentiment analysis is crucial to uncover subtle user experience issues. 
Prioritizing latency reduction and gaining deeper qualitative insights will further enhance this highly reliable agent's value and user delight.\n" + "[ADCP Insights Report]\n", + "Agent Insights Report\n", + " Generated: 2026-03-05 09:31 UTC\n", + " Sessions analyzed: 3\n", + " Success rate: 100%\n", + " Avg effectiveness: 10.0/10\n", + " Avg latency: 4737ms\n", + " Avg turns: 1.3\n", + " Error rate: 0.0%\n", + "\n", + " Top Goals:\n", + " data_retrieval: 3\n", + " planning: 3\n", + " analysis: 2\n", + " task_automation: 2\n", + " Outcomes:\n", + " success: 3\n", + "\n", + " Analysis Sections:\n", + " - Task Areas\n", + " - Interaction Patterns\n", + " - What Works Well\n", + " - Friction Analysis\n", + " - Tool Usage Patterns\n", + " - Improvement Suggestions\n", + " - Trends \u0026 Anomalies\n" ] } - ], + ] + }, + { + "id": "9f00660f", + "cell_type": "code", "source": [ "# Executive summary (with retry backoff)\n", "try:\n", @@ -2872,12 +2873,46 @@ " print(\"Executive summary can be retried after the rate limit resets.\")\n", " else:\n", " print(f\"Executive summary failed: {exc}\")" + ], + "metadata": { + "execution": { + "iopub.execute_input": "2026-03-05T09:31:21.975014Z", + "iopub.status.busy": "2026-03-05T09:31:21.974876Z", + "iopub.status.idle": "2026-03-05T09:31:21.978303Z", + "shell.execute_reply": "2026-03-05T09:31:21.977766Z" + } + }, + "execution_count": 30, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[Executive Summary]\n", + "The agent demonstrates exceptional performance, achieving a 100% success rate and perfect effectiveness with high user satisfaction across all interactions. Users primarily leverage it for critical campaign management tasks, including media planning, data retrieval, and provisioning, often through short, task-oriented sessions. The most notable friction point is a significant average latency of 4.7 seconds per interaction, which could impact user perception despite successful task completion. While functionally flawless, implementing advanced user feedback and sentiment analysis is crucial to uncover subtle user experience issues. 
Prioritizing latency reduction and gaining deeper qualitative insights will further enhance this highly reliable agent's value and user delight.\n" + ] + } ] }, { - "cell_type": "code", - "execution_count": 31, "id": "10125977", + "cell_type": "code", + "source": [ + "# Per-session facets\n", + "try:\n", + " print(\"[Session Facets]\")\n", + " for facet in insights_report.session_facets:\n", + " print(f\"\\n Session: {facet.session_id}\")\n", + " if facet.goal_categories:\n", + " print(f\" Goal categories : {facet.goal_categories}\")\n", + " if facet.outcome:\n", + " print(f\" Outcome : {facet.outcome}\")\n", + " if facet.key_topics:\n", + " print(f\" Key topics : {facet.key_topics}\")\n", + " print(f\" Effectiveness : {facet.agent_effectiveness}\")\n", + "except NameError:\n", + " print(\"Insights report not available -- run previous cells first.\")" + ], "metadata": { "execution": { "iopub.execute_input": "2026-03-05T09:31:21.979684Z", @@ -2886,6 +2921,7 @@ "shell.execute_reply": "2026-03-05T09:31:21.981660Z" } }, + "execution_count": 31, "outputs": [ { "name": "stdout", @@ -2912,40 +2948,40 @@ " Effectiveness : 10.0\n" ] } - ], - "source": [ - "# Per-session facets\n", - "try:\n", - " print(\"[Session Facets]\")\n", - " for facet in insights_report.session_facets:\n", - " print(f\"\\n Session: {facet.session_id}\")\n", - " if facet.goal_categories:\n", - " print(f\" Goal categories : {facet.goal_categories}\")\n", - " if facet.outcome:\n", - " print(f\" Outcome : {facet.outcome}\")\n", - " if facet.key_topics:\n", - " print(f\" Key topics : {facet.key_topics}\")\n", - " print(f\" Effectiveness : {facet.agent_effectiveness}\")\n", - "except NameError:\n", - " print(\"Insights report not available -- run previous cells first.\")" ] }, { - "cell_type": "markdown", "id": "686343b9", - "metadata": {}, + "cell_type": "markdown", "source": [ "---\n", "\n", "## Phase 12: End-to-End Pipeline (One-Shot)\n", "\n", "The `build_context_graph()` method runs the full pipeline in a single call: extract entities, create cross-links, and create the Property Graph." 
- ] + ], + "metadata": {}, + "execution_count": null }, { - "cell_type": "code", - "execution_count": 32, "id": "6551b064", + "cell_type": "code", + "source": [ + "# One-shot pipeline: extract + cross-link + create graph\n", + "try:\n", + " results = cgm.build_context_graph(\n", + " session_ids=session_ids,\n", + " use_ai_generate=True,\n", + " )\n", + " print(\"[Context Graph Pipeline Results]\")\n", + " print(f\" Biz nodes extracted : {results['biz_nodes_count']}\")\n", + " print(f\" Cross-links created : {results['cross_links_created']}\")\n", + " print(f\" Property Graph created : {results['property_graph_created']}\")\n", + "except Exception as exc:\n", + " print(f\"Pipeline: {exc}\")\n", + " print(\"\\nThe individual steps (extract, cross-link, create graph)\")\n", + " print(\"can be run separately as shown in Phases 5-7.\")" + ], "metadata": { "execution": { "iopub.execute_input": "2026-03-05T09:31:21.983566Z", @@ -2954,6 +2990,7 @@ "shell.execute_reply": "2026-03-05T09:32:04.282978Z" } }, + "execution_count": 32, "outputs": [ { "name": "stderr", @@ -2976,28 +3013,11 @@ " Property Graph created : False\n" ] } - ], - "source": [ - "# One-shot pipeline: extract + cross-link + create graph\n", - "try:\n", - " results = cgm.build_context_graph(\n", - " session_ids=session_ids,\n", - " use_ai_generate=True,\n", - " )\n", - " print(\"[Context Graph Pipeline Results]\")\n", - " print(f\" Biz nodes extracted : {results['biz_nodes_count']}\")\n", - " print(f\" Cross-links created : {results['cross_links_created']}\")\n", - " print(f\" Property Graph created : {results['property_graph_created']}\")\n", - "except Exception as exc:\n", - " print(f\"Pipeline: {exc}\")\n", - " print(\"\\nThe individual steps (extract, cross-link, create graph)\")\n", - " print(\"can be run separately as shown in Phases 5-7.\")" ] }, { - "cell_type": "markdown", "id": "88d77f19", - "metadata": {}, + "cell_type": "markdown", "source": [ "---\n", "\n", @@ -3018,7 +3038,7 @@ "| 9 | **Full Evaluation** | Code metrics, LLM judge, trajectory matching, grader pipeline |\n", "| 10 | **Eval Suite** | Reusable ADCP compliance tests with multi-trial pass@k |\n", "| 11 | **AI Insights** | Multi-stage analysis with executive summary |\n", - "| 12 | **One-Shot Pipeline** | `build_context_graph()` runs extract -> cross-link -> create |\n", + "| 12 | **One-Shot Pipeline** | `build_context_graph()` runs extract -\u003e cross-link -\u003e create |\n", "\n", "### Key Takeaways\n", "\n", @@ -3026,12 +3046,27 @@ "- **Native Property Graphs**: BigQuery's `CREATE PROPERTY GRAPH` + GQL replaces cumbersome recursive CTEs with elegant graph traversal.\n", "- **World Change Detection**: Long-running A2A tasks (days/weeks) need temporal intelligence to detect stale context before final HITL approval.\n", "- **End-to-End Observability**: From agent execution to business entity extraction to evaluation -- all powered by the BigQuery Agent Analytics SDK." 
- ] + ], + "metadata": {}, + "execution_count": null }, { - "cell_type": "code", - "execution_count": 33, "id": "184fa6c6", + "cell_type": "code", + "source": [ + "# Cleanup\n", + "try:\n", + " asyncio.get_event_loop().run_until_complete(\n", + " plugin.shutdown(timeout=10.0)\n", + " )\n", + "except Exception:\n", + " pass\n", + "\n", + "print(\"\\nDemo complete!\")\n", + "print(f\"Sessions: {session_ids}\")\n", + "print(f\"Traces logged to: {PROJECT_ID}.{DATASET_ID}.{TABLE_ID}\")\n", + "print(f\"Context Graph: {PROJECT_ID}.{DATASET_ID}.{cg_config.graph_name}\")" + ], "metadata": { "execution": { "iopub.execute_input": "2026-03-05T09:32:04.309566Z", @@ -3040,6 +3075,7 @@ "shell.execute_reply": "2026-03-05T09:32:04.315383Z" } }, + "execution_count": 33, "outputs": [ { "name": "stdout", @@ -3052,20 +3088,6 @@ "Context Graph: test-project-0728-467323.agent_analytics.adcp_context_graph\n" ] } - ], - "source": [ - "# Cleanup\n", - "try:\n", - " asyncio.get_event_loop().run_until_complete(\n", - " plugin.shutdown(timeout=10.0)\n", - " )\n", - "except Exception:\n", - " pass\n", - "\n", - "print(\"\\nDemo complete!\")\n", - "print(f\"Sessions: {session_ids}\")\n", - "print(f\"Traces logged to: {PROJECT_ID}.{DATASET_ID}.{TABLE_ID}\")\n", - "print(f\"Context Graph: {PROJECT_ID}.{DATASET_ID}.{cg_config.graph_name}\")" ] } ], @@ -3088,6 +3110,6 @@ "version": "3.13.5" } }, - "nbformat": 4, - "nbformat_minor": 5 + "nbformat_minor": 5, + "nbformat": 4 } diff --git a/examples/e2e_demo.py b/examples/e2e_demo.py index ddbee3c..d1cc1f5 100644 --- a/examples/e2e_demo.py +++ b/examples/e2e_demo.py @@ -57,13 +57,12 @@ # --------------------------------------------------------------------------- # BigQuery Agent Analytics SDK (consumer side) # --------------------------------------------------------------------------- -from bigquery_agent_analytics import BigQueryTraceEvaluator +from bigquery_agent_analytics import PerformanceEvaluator from bigquery_agent_analytics import Client -from bigquery_agent_analytics import CodeEvaluator +from bigquery_agent_analytics import SystemEvaluator from bigquery_agent_analytics import InsightsConfig -from bigquery_agent_analytics import LLMAsJudge from bigquery_agent_analytics import TraceFilter -from bigquery_agent_analytics.trace_evaluator import MatchType +from bigquery_agent_analytics.performance_evaluator import MatchType # --------------------------------------------------------------------------- # Configuration @@ -459,12 +458,12 @@ async def phase2_evaluate( print("\n--- 2b. Code-Based Evaluation ---\n") trace_filter = TraceFilter(session_ids=session_ids) presets = [ - ("latency", CodeEvaluator.latency(threshold_ms=30000)), - ("turn_count", CodeEvaluator.turn_count(max_turns=10)), - ("error_rate", CodeEvaluator.error_rate(max_error_rate=0.1)), + ("latency", SystemEvaluator.latency(threshold_ms=30000)), + ("turn_count", SystemEvaluator.turn_count(max_turns=10)), + ("error_rate", SystemEvaluator.error_rate(max_error_rate=0.1)), ( "token_efficiency", - CodeEvaluator.token_efficiency(max_tokens=100000), + SystemEvaluator.token_efficiency(max_tokens=100000), ), ] for preset_name, evaluator in presets: @@ -481,18 +480,22 @@ async def phase2_evaluate( # ---- 2c. LLM-as-Judge --------------------------------------------- # print("\n--- 2c. 
LLM-as-Judge Evaluation ---\n") try: - judge = LLMAsJudge.correctness(threshold=0.6) + judge = PerformanceEvaluator( + project_id=PROJECT_ID, + dataset_id=DATASET_ID, + table_id=TABLE_ID, + ) report = await asyncio.to_thread( client.evaluate, evaluator=judge, filters=trace_filter ) print(report.summary()) except Exception as exc: - logger.warning("LLM-as-Judge failed: %s", exc) + logger.warning("PerformanceEvaluator judge failed: %s", exc) # ---- 2d. Trajectory matching --------------------------------------- # print("\n--- 2d. Trajectory Matching ---\n") try: - evaluator = BigQueryTraceEvaluator( + evaluator = PerformanceEvaluator( project_id=PROJECT_ID, dataset_id=DATASET_ID, table_id=TABLE_ID, diff --git a/examples/e2e_notebook_demo.ipynb b/examples/e2e_notebook_demo.ipynb index 3361ed2..b66152e 100644 --- a/examples/e2e_notebook_demo.ipynb +++ b/examples/e2e_notebook_demo.ipynb @@ -1,12 +1,8 @@ { "cells": [ { + "id": "d1ad2160", "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "copyright-header" - }, - "outputs": [], "source": [ "# Copyright 2025 Google LLC\n", "#\n", @@ -21,90 +17,100 @@ "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", "# See the License for the specific language governing permissions and\n", "# limitations under the License." - ] + ], + "metadata": { + "id": "copyright-header" + }, + "execution_count": null }, { + "id": "c0d73458", "cell_type": "markdown", - "metadata": { - "id": "notebook-affordances" - }, "source": [ "# Demo Plan: BigQuery for Agent Ops - Unified Platform\n", "\n", - "\n", + "\u003ctable align=\"left\"\u003e\n", "\n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " \n", - " \"Colab Run in Colab\n", - " \n", - " \n", - " \n", - " \"GitHub\n", + " \u003ctd\u003e\n", + " \u003ca href=\"https://colab.research.google.com/github/haiyuan-eng-google/BigQuery-Agent-Analytics-SDK/blob/main/examples/e2e_notebook_demo.ipynb\"\u003e\n", + " \u003cimg src=\"https://raw.githubusercontent.com/googleapis/python-bigquery-dataframes/refs/heads/main/third_party/logo/colab-logo.png\" alt=\"Colab logo\"\u003e Run in Colab\n", + " \u003c/a\u003e\n", + " \u003c/td\u003e\n", + " \u003ctd\u003e\n", + " \u003ca href=\"https://github.com/haiyuan-eng-google/BigQuery-Agent-Analytics-SDK/blob/main/examples/e2e_notebook_demo.ipynb\"\u003e\n", + " \u003cimg src=\"https://raw.githubusercontent.com/googleapis/python-bigquery-dataframes/refs/heads/main/third_party/logo/github-logo.png\" width=\"32\" alt=\"GitHub logo\"\u003e\n", " View on GitHub\n", - " \n", - " \n", - " \n", - " \"Vertex\n", + " \u003c/a\u003e\n", + " \u003c/td\u003e\n", + " \u003ctd\u003e\n", + " \u003ca href=\"https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/haiyuan-eng-google/BigQuery-Agent-Analytics-SDK/main/examples/e2e_notebook_demo.ipynb\"\u003e\n", + " \u003cimg src=\"https://www.gstatic.com/images/branding/product/1x/google_cloud_48dp.png\" alt=\"Vertex AI logo\" width=\"32\"\u003e\n", " Open in Vertex AI Workbench\n", - " \n", - " \n", - " \n", - " \"BQ\n", + " \u003c/a\u003e\n", + " \u003c/td\u003e\n", + " \u003ctd\u003e\n", + " \u003ca href=\"https://console.cloud.google.com/bigquery/import?url=https://github.com/haiyuan-eng-google/BigQuery-Agent-Analytics-SDK/blob/main/examples/e2e_notebook_demo.ipynb\"\u003e\n", + " \u003cimg src=\"https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTW1gvOovVlbZAIZylUtf5Iu8-693qS1w5NJw\u0026s\" alt=\"BQ logo\" width=\"35\"\u003e\n", " Open in BQ Studio\n", - " \n", - "
" - ] + " \u003c/a\u003e\n", + " \u003c/td\u003e\n", + "\u003c/table\u003e" + ], + "metadata": { + "id": "notebook-affordances" + }, + "execution_count": null }, { + "id": "8c92e8a1", "cell_type": "markdown", - "metadata": {}, "source": [ "## Install Dependencies" - ] + ], + "metadata": {}, + "execution_count": null }, { + "id": "e4b43507", "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], "source": [ "!pip install -q google-adk bigquery-agent-analytics google-cloud-bigquery nest-asyncio" - ] + ], + "metadata": {}, + "execution_count": null }, { + "id": "ba28b332", "cell_type": "markdown", - "metadata": {}, "source": [ - "## Authenticate & Configure" - ] + "## Authenticate \u0026 Configure" + ], + "metadata": {}, + "execution_count": null }, { + "id": "005d4ae6", "cell_type": "code", - "execution_count": null, + "source": "import os\n\n# Colab authentication\ntry:\n from google.colab import auth\n auth.authenticate_user()\n print(\"Colab authentication successful.\")\nexcept ImportError:\n print(\"Not running in Colab — using default credentials.\")\n\n# ---------- Configuration ----------\nPROJECT_ID = os.environ.get(\"GOOGLE_CLOUD_PROJECT\", \"test-project-0728-467323\")\nDATASET_ID = os.environ.get(\"BQ_DATASET\", \"agent_analytics\")\nTABLE_ID = os.environ.get(\"BQ_TABLE\", \"agent_events\")\nAGENT_MODEL = os.environ.get(\"AGENT_MODEL\", \"gemini-3-flash-preview\")\nSDK_ENDPOINT = os.environ.get(\"SDK_ENDPOINT\", \"gemini-2.5-flash\")\nLOCATION = \"US\"\nAPP_NAME = \"e2e_notebook_demo\"\nUSER_ID = \"demo_user\"\n\nos.environ[\"GOOGLE_GENAI_USE_VERTEXAI\"] = \"true\"\nos.environ[\"GOOGLE_CLOUD_PROJECT\"] = PROJECT_ID\nos.environ[\"GOOGLE_CLOUD_LOCATION\"] = \"global\"\n\n# Enable async in Jupyter\nimport nest_asyncio\nnest_asyncio.apply()\n\nprint(f\"Project : {PROJECT_ID}\")\nprint(f\"Dataset : {DATASET_ID}\")\nprint(f\"Table : {TABLE_ID}\")\nprint(f\"Agent model : {AGENT_MODEL}\")\nprint(f\"SDK endpoint : {SDK_ENDPOINT}\")", "metadata": {}, - "outputs": [], - "source": "import os\n\n# Colab authentication\ntry:\n from google.colab import auth\n auth.authenticate_user()\n print(\"Colab authentication successful.\")\nexcept ImportError:\n print(\"Not running in Colab — using default credentials.\")\n\n# ---------- Configuration ----------\nPROJECT_ID = os.environ.get(\"GOOGLE_CLOUD_PROJECT\", \"test-project-0728-467323\")\nDATASET_ID = os.environ.get(\"BQ_DATASET\", \"agent_analytics\")\nTABLE_ID = os.environ.get(\"BQ_TABLE\", \"agent_events\")\nAGENT_MODEL = os.environ.get(\"AGENT_MODEL\", \"gemini-3-flash-preview\")\nSDK_ENDPOINT = os.environ.get(\"SDK_ENDPOINT\", \"gemini-2.5-flash\")\nLOCATION = \"US\"\nAPP_NAME = \"e2e_notebook_demo\"\nUSER_ID = \"demo_user\"\n\nos.environ[\"GOOGLE_GENAI_USE_VERTEXAI\"] = \"true\"\nos.environ[\"GOOGLE_CLOUD_PROJECT\"] = PROJECT_ID\nos.environ[\"GOOGLE_CLOUD_LOCATION\"] = \"global\"\n\n# Enable async in Jupyter\nimport nest_asyncio\nnest_asyncio.apply()\n\nprint(f\"Project : {PROJECT_ID}\")\nprint(f\"Dataset : {DATASET_ID}\")\nprint(f\"Table : {TABLE_ID}\")\nprint(f\"Agent model : {AGENT_MODEL}\")\nprint(f\"SDK endpoint : {SDK_ENDPOINT}\")" + "execution_count": null }, { + "id": "eaa02c79", "cell_type": "markdown", - "metadata": {}, "source": [ "---\n", "\n", - "## Phase 1: Run Agent & Log Traces to BigQuery\n", + "## Phase 1: Run Agent \u0026 Log Traces to BigQuery\n", "\n", "We define a **travel planner agent** with four deterministic tools, run three conversations, and log every event to BigQuery via the 
`BigQueryAgentAnalyticsPlugin`." - ] + ], + "metadata": {}, + "execution_count": null }, { + "id": "7757d2bf", "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], "source": [ "import hashlib\n", "import random\n", @@ -116,7 +122,7 @@ " destination: str,\n", " date: str,\n", " max_results: int = 5,\n", - ") -> dict[str, Any]:\n", + ") -\u003e dict[str, Any]:\n", " \"\"\"Search for available flights between two cities.\n", "\n", " Args:\n", @@ -166,7 +172,7 @@ " check_in: str,\n", " check_out: str,\n", " max_results: int = 5,\n", - ") -> dict[str, Any]:\n", + ") -\u003e dict[str, Any]:\n", " \"\"\"Search for hotels in a given city.\n", "\n", " Args:\n", @@ -215,7 +221,7 @@ "async def get_weather_forecast(\n", " city: str,\n", " date: str,\n", - ") -> dict[str, Any]:\n", + ") -\u003e dict[str, Any]:\n", " \"\"\"Get weather forecast for a city on a specific date.\n", "\n", " Args:\n", @@ -249,7 +255,7 @@ " hotels: float,\n", " daily_expenses: float,\n", " num_days: int,\n", - ") -> dict[str, Any]:\n", + ") -\u003e dict[str, Any]:\n", " \"\"\"Calculate total trip budget from component costs.\n", "\n", " Args:\n", @@ -281,20 +287,20 @@ "\n", "\n", "print(\"Tool functions defined: search_flights, search_hotels, get_weather_forecast, calculate_trip_budget\")" - ] + ], + "metadata": {}, + "execution_count": null }, { + "id": "738faa0f", "cell_type": "code", - "execution_count": null, + "source": "from google.adk.agents import LlmAgent\nfrom google.genai import types\n\nTRAVEL_PLANNER_INSTRUCTION = \"\"\"\\\nYou are a helpful travel planning assistant. You help users plan trips by\nsearching for flights, hotels, checking weather forecasts, and calculating\nbudgets.\n\nGuidelines:\n- Always search for flights and hotels when the user asks to plan a trip.\n- Check the weather at the destination when relevant.\n- Provide a budget estimate when enough cost information is available.\n- Be concise but informative in your responses.\n- Present results in a clear, organized format.\n- When multiple tools are needed, call them as appropriate and then\n synthesize the results into a cohesive plan.\n\"\"\"\n\n\ndef build_agent() -\u003e LlmAgent:\n \"\"\"Build the travel planner agent.\"\"\"\n return LlmAgent(\n name=\"travel_planner\",\n model=AGENT_MODEL,\n instruction=TRAVEL_PLANNER_INSTRUCTION,\n tools=[\n search_flights,\n search_hotels,\n get_weather_forecast,\n calculate_trip_budget,\n ],\n generate_content_config=types.GenerateContentConfig(\n temperature=1.0,\n ),\n )\n\n\nprint(\"Agent builder ready.\")", "metadata": {}, - "outputs": [], - "source": "from google.adk.agents import LlmAgent\nfrom google.genai import types\n\nTRAVEL_PLANNER_INSTRUCTION = \"\"\"\\\nYou are a helpful travel planning assistant. 
You help users plan trips by\nsearching for flights, hotels, checking weather forecasts, and calculating\nbudgets.\n\nGuidelines:\n- Always search for flights and hotels when the user asks to plan a trip.\n- Check the weather at the destination when relevant.\n- Provide a budget estimate when enough cost information is available.\n- Be concise but informative in your responses.\n- Present results in a clear, organized format.\n- When multiple tools are needed, call them as appropriate and then\n synthesize the results into a cohesive plan.\n\"\"\"\n\n\ndef build_agent() -> LlmAgent:\n \"\"\"Build the travel planner agent.\"\"\"\n return LlmAgent(\n name=\"travel_planner\",\n model=AGENT_MODEL,\n instruction=TRAVEL_PLANNER_INSTRUCTION,\n tools=[\n search_flights,\n search_hotels,\n get_weather_forecast,\n calculate_trip_budget,\n ],\n generate_content_config=types.GenerateContentConfig(\n temperature=1.0,\n ),\n )\n\n\nprint(\"Agent builder ready.\")" + "execution_count": null }, { + "id": "b8d36d65", "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], "source": [ "import asyncio\n", "import uuid\n", @@ -331,7 +337,7 @@ "# Define three conversations\n", "conversations = [\n", " {\n", - " \"label\": \"Simple trip (SF -> NY)\",\n", + " \"label\": \"Simple trip (SF -\u003e NY)\",\n", " \"messages\": [\n", " (\n", " \"Plan a weekend trip from San Francisco to New York\"\n", @@ -342,7 +348,7 @@ " ],\n", " },\n", " {\n", - " \"label\": \"Complex trip (LA -> Tokyo)\",\n", + " \"label\": \"Complex trip (LA -\u003e Tokyo)\",\n", " \"messages\": [\n", " (\n", " \"I want to plan a 5-day vacation to Tokyo from\"\n", @@ -357,7 +363,7 @@ " ],\n", " },\n", " {\n", - " \"label\": \"Multi-turn (Chicago -> Paris)\",\n", + " \"label\": \"Multi-turn (Chicago -\u003e Paris)\",\n", " \"messages\": [\n", " \"What's the weather like in Paris on 2025-04-20?\",\n", " \"Find me flights from Chicago to Paris on 2025-04-20.\",\n", @@ -400,11 +406,11 @@ " if hasattr(part, \"text\") and part.text:\n", " response_parts.append(part.text)\n", " elif hasattr(part, \"function_call\") and part.function_call:\n", - " print(f\" -> Tool call: {part.function_call.name}\")\n", + " print(f\" -\u003e Tool call: {part.function_call.name}\")\n", " if response_parts:\n", " text = \"\\n\".join(response_parts)\n", " print(f\"\\n[Agent]: {text[:1000]}\")\n", - " if len(text) > 1000:\n", + " if len(text) \u003e 1000:\n", " print(f\" ... (truncated, {len(text)} chars total)\")\n", " return session_id\n", "\n", @@ -418,13 +424,13 @@ " session_ids.append(sid)\n", "\n", "print(f\"\\n\\nSession IDs: {session_ids}\")" - ] + ], + "metadata": {}, + "execution_count": null }, { + "id": "c7b111e0", "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], "source": [ "import time\n", "\n", @@ -439,38 +445,40 @@ "print(f\"Waiting {settle_seconds}s for BigQuery data to settle ...\")\n", "time.sleep(settle_seconds)\n", "print(\"Done.\")" - ] + ], + "metadata": {}, + "execution_count": null }, { + "id": "51583bfd", "cell_type": "markdown", - "metadata": {}, "source": [ "---\n", "\n", - "## Phase 2: Trace Retrieval & Visualization\n", + "## Phase 2: Trace Retrieval \u0026 Visualization\n", "\n", "Now that traces are in BigQuery, we use the **SDK Client** to fetch them. Each `Trace` contains a hierarchical span tree that can be rendered as a DAG. We can also inspect tool calls, the final response, and any error spans." 
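Beyond `render()`, the span list itself is easy to slice for quick ad-hoc checks. A minimal sketch, assuming each span exposes the `event_type` attribute and the `tool_calls` / `error_spans` accessors used later in this notebook:

```python
from collections import Counter

def span_breakdown(trace) -> None:
    # Count spans by event type and surface any error spans for a single trace.
    counts = Counter(span.event_type for span in trace.spans)
    print(f"Session {trace.session_id}: {len(trace.spans)} spans, "
          f"{len(trace.tool_calls)} tool calls")
    for event_type, count in counts.most_common():
        print(f"  {event_type:30s} {count}")
    for err in trace.error_spans:
        print(f"  ERROR {err.event_type}: {err.error_message}")
```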
- ] + ], + "metadata": {}, + "execution_count": null }, { + "id": "6a6e1770", "cell_type": "code", - "execution_count": null, + "source": "from bigquery_agent_analytics import Client, TraceFilter\n\nclient = Client(\n project_id=PROJECT_ID,\n dataset_id=DATASET_ID,\n table_id=TABLE_ID,\n location=LOCATION,\n endpoint=SDK_ENDPOINT,\n)\nprint(\"SDK Client initialised.\")", "metadata": {}, - "outputs": [], - "source": "from bigquery_agent_analytics import Client, TraceFilter\n\nclient = Client(\n project_id=PROJECT_ID,\n dataset_id=DATASET_ID,\n table_id=TABLE_ID,\n location=LOCATION,\n endpoint=SDK_ENDPOINT,\n)\nprint(\"SDK Client initialised.\")" + "execution_count": null }, { + "id": "f14ba5d9", "cell_type": "code", - "execution_count": null, + "source": "# Retrieve and render each trace\ntraces = []\nfor sid in session_ids:\n try:\n trace = client.get_session_trace(sid)\n traces.append(trace)\n print(f\"\\n{'=' * 60}\")\n print(f\" Trace for session: {sid}\")\n print(f\"{'=' * 60}\")\n _ = trace.render() # render() prints and returns the tree\n except Exception as exc:\n print(f\"Could not retrieve trace {sid}: {exc}\")\n traces.append(None)", "metadata": {}, - "outputs": [], - "source": "# Retrieve and render each trace\ntraces = []\nfor sid in session_ids:\n try:\n trace = client.get_session_trace(sid)\n traces.append(trace)\n print(f\"\\n{'=' * 60}\")\n print(f\" Trace for session: {sid}\")\n print(f\"{'=' * 60}\")\n _ = trace.render() # render() prints and returns the tree\n except Exception as exc:\n print(f\"Could not retrieve trace {sid}: {exc}\")\n traces.append(None)" + "execution_count": null }, { + "id": "34561942", "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], "source": [ "# Inspect trace properties\n", "for i, trace in enumerate(traces):\n", @@ -486,13 +494,13 @@ " print(f\" Error spans: {len(errors)}\")\n", " for es in errors:\n", " print(f\" - {es.event_type}: {es.error_message}\")" - ] + ], + "metadata": {}, + "execution_count": null }, { + "id": "e0d5a135", "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], "source": [ "# List traces with filtering\n", "all_traces = client.list_traces(\n", @@ -502,24 +510,26 @@ "for t in all_traces:\n", " print(f\" - {t.session_id} spans={len(t.spans)} \"\n", " f\"tools={len(t.tool_calls)}\")" - ] + ], + "metadata": {}, + "execution_count": null }, { + "id": "768cf69a", "cell_type": "markdown", - "metadata": {}, "source": [ "---\n", "\n", "## Phase 3: Code-Based Evaluation\n", "\n", "The `CodeEvaluator` runs deterministic metrics over session aggregates — no LLM needed. Pre-built evaluators cover latency, turn count, error rate, token efficiency, and cost. You can also define custom metrics." 
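Custom metrics can also be chained on one evaluator so that several session-level checks run in a single pass. A minimal sketch, assuming the session summary dict exposes the `turn_count` and `output_tokens` keys used elsewhere in this demo:

```python
from bigquery_agent_analytics import CodeEvaluator

# Sketch: one evaluator, two chained custom metrics.
session_health = (
    CodeEvaluator("session_health")
    .add_metric(
        "turn_budget",
        lambda s: 1.0 if (s.get("turn_count") or 0) <= 5 else 0.5,
        threshold=0.5,
    )
    .add_metric(
        "output_volume",
        lambda s: min((s.get("output_tokens") or 0) / 200.0, 1.0),
        threshold=0.5,
    )
)
# Evaluate it the same way as the presets:
# client.evaluate(evaluator=session_health, filters=trace_filter)
```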
- ] + ], + "metadata": {}, + "execution_count": null }, { + "id": "60c56d62", "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], "source": [ "from bigquery_agent_analytics import CodeEvaluator\n", "\n", @@ -546,31 +556,33 @@ " print(report.summary())\n", " except Exception as exc:\n", " print(f\"\\n[{name}] Failed: {exc}\")" - ] + ], + "metadata": {}, + "execution_count": null }, { + "id": "80d44dfc", "cell_type": "code", - "execution_count": null, + "source": "# Custom metric: response length scoring\ndef response_length_score(session_summary: dict) -\u003e float:\n \"\"\"Score based on response token count — longer is better up to a point.\"\"\"\n tokens = session_summary.get(\"output_tokens\") or 0\n # Ideal range: 200-2000 tokens\n if 200 \u003c= tokens \u003c= 2000:\n return 1.0\n elif tokens \u003c 200:\n return tokens / 200.0\n else:\n return max(0.0, 1.0 - (tokens - 2000) / 5000.0)\n\n\ncustom_eval = (\n CodeEvaluator(\"custom_metrics\")\n .add_metric(\"response_length\", response_length_score, threshold=0.5)\n)\n\ntry:\n report = asyncio.get_event_loop().run_until_complete(\n asyncio.to_thread(\n client.evaluate,\n evaluator=custom_eval,\n filters=trace_filter,\n )\n )\n print(\"[custom: response_length]\")\n print(report.summary())\nexcept Exception as exc:\n print(f\"Custom evaluator failed: {exc}\")", "metadata": {}, - "outputs": [], - "source": "# Custom metric: response length scoring\ndef response_length_score(session_summary: dict) -> float:\n \"\"\"Score based on response token count — longer is better up to a point.\"\"\"\n tokens = session_summary.get(\"output_tokens\") or 0\n # Ideal range: 200-2000 tokens\n if 200 <= tokens <= 2000:\n return 1.0\n elif tokens < 200:\n return tokens / 200.0\n else:\n return max(0.0, 1.0 - (tokens - 2000) / 5000.0)\n\n\ncustom_eval = (\n CodeEvaluator(\"custom_metrics\")\n .add_metric(\"response_length\", response_length_score, threshold=0.5)\n)\n\ntry:\n report = asyncio.get_event_loop().run_until_complete(\n asyncio.to_thread(\n client.evaluate,\n evaluator=custom_eval,\n filters=trace_filter,\n )\n )\n print(\"[custom: response_length]\")\n print(report.summary())\nexcept Exception as exc:\n print(f\"Custom evaluator failed: {exc}\")" + "execution_count": null }, { + "id": "46ed0b92", "cell_type": "markdown", - "metadata": {}, "source": [ "---\n", "\n", "## Phase 4: LLM-as-Judge Evaluation\n", "\n", "Semantic evaluation using an LLM to judge agent quality. The SDK supports a 3-tier fallback: BigQuery `AI.GENERATE` → `ML.GENERATE_TEXT` → Gemini API. Pre-built judges evaluate **correctness**, **hallucination** (faithfulness), and **sentiment**." 
- ] + ], + "metadata": {}, + "execution_count": null }, { + "id": "690c9ca0", "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], "source": [ "from bigquery_agent_analytics import LLMAsJudge\n", "\n", @@ -594,13 +606,13 @@ " print(f\" Feedback: {ss.llm_feedback[:200]}\")\n", "except Exception as exc:\n", " print(f\"Correctness judge failed: {exc}\")" - ] + ], + "metadata": {}, + "execution_count": null }, { + "id": "e97c84d0", "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], "source": [ "# Hallucination (faithfulness) evaluation\n", "judge_hallucination = LLMAsJudge.hallucination()\n", @@ -616,13 +628,13 @@ " print(report.summary())\n", "except Exception as exc:\n", " print(f\"Hallucination judge failed: {exc}\")" - ] + ], + "metadata": {}, + "execution_count": null }, { + "id": "7064a224", "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], "source": [ "# Sentiment evaluation\n", "judge_sentiment = LLMAsJudge.sentiment()\n", @@ -638,11 +650,13 @@ " print(report.summary())\n", "except Exception as exc:\n", " print(f\"Sentiment judge failed: {exc}\")" - ] + ], + "metadata": {}, + "execution_count": null }, { + "id": "91e59828", "cell_type": "markdown", - "metadata": {}, "source": [ "---\n", "\n", @@ -652,33 +666,33 @@ "\n", "| `MatchType` | Description |\n", "|---|---|\n", - "| `EXACT` | Tool calls must match exactly (order & count) |\n", + "| `EXACT` | Tool calls must match exactly (order \u0026 count) |\n", "| `IN_ORDER` | Expected tools appear in order, extra tools allowed between |\n", "| `ANY_ORDER` | All expected tools present, any order |\n" - ] + ], + "metadata": {}, + "execution_count": null }, { + "id": "07e59fcc", "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], "source": [ - "from bigquery_agent_analytics import BigQueryTraceEvaluator\n", + "from bigquery_agent_analytics import PerformanceEvaluator\n", "from bigquery_agent_analytics.trace_evaluator import MatchType\n", "\n", - "trace_evaluator = BigQueryTraceEvaluator(\n", + "trace_evaluator = PerformanceEvaluator(\n", " project_id=PROJECT_ID,\n", " dataset_id=DATASET_ID,\n", " table_id=TABLE_ID,\n", ")\n", - "print(\"BigQueryTraceEvaluator ready.\")" - ] + "print(\"PerformanceEvaluator ready.\")" + ], + "metadata": {}, + "execution_count": null }, { + "id": "deb4daf8", "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], "source": [ "import json\n", "\n", @@ -707,13 +721,13 @@ " print(f\" Details : {json.dumps(result.details, indent=2)}\")\n", "except Exception as exc:\n", " print(f\"Trajectory evaluation failed: {exc}\")" - ] + ], + "metadata": {}, + "execution_count": null }, { + "id": "5df7c1aa", "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], "source": [ "# Compare match types: EXACT vs ANY_ORDER on the same session\n", "for match_type in [MatchType.EXACT, MatchType.ANY_ORDER]:\n", @@ -730,13 +744,13 @@ " f\"Scores: {result.scores}\")\n", " except Exception as exc:\n", " print(f\" {match_type.value} failed: {exc}\")" - ] + ], + "metadata": {}, + "execution_count": null }, { + "id": "f047b88c", "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], "source": [ "# Batch evaluation across all sessions\n", "eval_dataset = [\n", @@ -774,24 +788,26 @@ " f\"scores={r.scores}\")\n", "except Exception as exc:\n", " print(f\"Batch evaluation failed: {exc}\")" - ] + ], + "metadata": {}, + "execution_count": null }, 
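To make the three `MatchType` modes in the table above concrete, here is a minimal, SDK-independent sketch of the matching semantics each mode describes. This is illustrative only — the actual `PerformanceEvaluator` / trajectory-matching implementation in the SDK may differ in details such as argument matching and partial-credit scoring; the tool names are taken from the demo sessions.

```python
# Rough sketch of the three trajectory match modes (not the SDK's code).
from collections import Counter


def exact_match(expected: list[str], actual: list[str]) -> bool:
    # EXACT: same tools, same order, same count.
    return expected == actual


def in_order_match(expected: list[str], actual: list[str]) -> bool:
    # IN_ORDER: expected tools appear as a subsequence; extras allowed between.
    it = iter(actual)
    return all(tool in it for tool in expected)


def any_order_match(expected: list[str], actual: list[str]) -> bool:
    # ANY_ORDER: every expected tool is present (with multiplicity), order ignored.
    return not (Counter(expected) - Counter(actual))


actual = ["search_flights", "get_weather_forecast", "search_hotels"]
print(exact_match(["search_flights", "search_hotels"], actual))      # False
print(in_order_match(["search_flights", "search_hotels"], actual))   # True
print(any_order_match(["search_hotels", "search_flights"], actual))  # True
```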
{ + "id": "28dc9fa9", "cell_type": "markdown", - "metadata": {}, "source": [ "---\n", "\n", "## Phase 6: Grader Pipeline\n", "\n", "Compose multiple evaluators (code + LLM) into a single **GraderPipeline** with configurable voting strategies: `WeightedStrategy`, `BinaryStrategy`, or `MajorityStrategy`." - ] + ], + "metadata": {}, + "execution_count": null }, { + "id": "d4a498f2", "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], "source": [ "from bigquery_agent_analytics import (\n", " GraderPipeline,\n", @@ -817,38 +833,40 @@ " )\n", ")\n", "print(\"GraderPipeline built (weighted: code=1.0 + code=1.0 + llm=2.0).\")" - ] + ], + "metadata": {}, + "execution_count": null }, { + "id": "e0fb414e", "cell_type": "code", - "execution_count": null, + "source": "# Construct session_summary from trace metadata and evaluate\n# We use the Tokyo trip trace (index 1) as an example.\nimport io, contextlib\n\ntrace_idx = 1\nif traces[trace_idx] is not None:\n trace = traces[trace_idx]\n session_summary = {\n \"session_id\": trace.session_id,\n \"total_events\": len(trace.spans),\n \"tool_calls\": len(trace.tool_calls),\n \"tool_errors\": len(trace.error_spans),\n \"llm_calls\": sum(\n 1 for s in trace.spans\n if s.event_type in (\"llm_request\", \"llm_response\")\n ),\n \"avg_latency_ms\": (\n trace.total_latency_ms / max(len(trace.spans), 1)\n if trace.total_latency_ms\n else 0.0\n ),\n \"max_latency_ms\": max(\n (s.latency_ms or 0 for s in trace.spans), default=0\n ),\n \"total_latency_ms\": trace.total_latency_ms or 0.0,\n \"turn_count\": sum(\n 1 for s in trace.spans if s.event_type == \"user_message\"\n ),\n \"has_error\": len(trace.error_spans) \u003e 0,\n \"input_tokens\": sum(\n s.attributes.get(\"input_tokens\", 0) or 0\n for s in trace.spans\n ),\n \"output_tokens\": sum(\n s.attributes.get(\"output_tokens\", 0) or 0\n for s in trace.spans\n ),\n \"total_tokens\": sum(\n s.attributes.get(\"total_tokens\", 0) or 0\n for s in trace.spans\n ),\n }\n\n # Get trace text (suppress render's print) and final response\n buf = io.StringIO()\n with contextlib.redirect_stdout(buf):\n trace_text = trace.render(format=\"tree\")\n if not isinstance(trace_text, str):\n trace_text = buf.getvalue()\n final_response = trace.final_response or \"\"\n\n verdict = asyncio.get_event_loop().run_until_complete(\n pipeline.evaluate(\n session_summary=session_summary,\n trace_text=trace_text,\n final_response=final_response,\n )\n )\n\n print(f\"[GraderPipeline — Weighted]\")\n print(f\" Final score : {verdict.final_score:.3f}\")\n print(f\" Passed : {verdict.passed}\")\n print(f\" Strategy : {verdict.strategy_name}\")\n print(f\" Grader breakdown:\")\n for gr in verdict.grader_results:\n print(f\" - {gr.grader_name}: scores={gr.scores} \"\n f\"passed={gr.passed}\")\nelse:\n print(\"Trace not available — skipping pipeline evaluation.\")", "metadata": {}, - "outputs": [], - "source": "# Construct session_summary from trace metadata and evaluate\n# We use the Tokyo trip trace (index 1) as an example.\nimport io, contextlib\n\ntrace_idx = 1\nif traces[trace_idx] is not None:\n trace = traces[trace_idx]\n session_summary = {\n \"session_id\": trace.session_id,\n \"total_events\": len(trace.spans),\n \"tool_calls\": len(trace.tool_calls),\n \"tool_errors\": len(trace.error_spans),\n \"llm_calls\": sum(\n 1 for s in trace.spans\n if s.event_type in (\"llm_request\", \"llm_response\")\n ),\n \"avg_latency_ms\": (\n trace.total_latency_ms / max(len(trace.spans), 1)\n if trace.total_latency_ms\n 
else 0.0\n ),\n \"max_latency_ms\": max(\n (s.latency_ms or 0 for s in trace.spans), default=0\n ),\n \"total_latency_ms\": trace.total_latency_ms or 0.0,\n \"turn_count\": sum(\n 1 for s in trace.spans if s.event_type == \"user_message\"\n ),\n \"has_error\": len(trace.error_spans) > 0,\n \"input_tokens\": sum(\n s.attributes.get(\"input_tokens\", 0) or 0\n for s in trace.spans\n ),\n \"output_tokens\": sum(\n s.attributes.get(\"output_tokens\", 0) or 0\n for s in trace.spans\n ),\n \"total_tokens\": sum(\n s.attributes.get(\"total_tokens\", 0) or 0\n for s in trace.spans\n ),\n }\n\n # Get trace text (suppress render's print) and final response\n buf = io.StringIO()\n with contextlib.redirect_stdout(buf):\n trace_text = trace.render(format=\"tree\")\n if not isinstance(trace_text, str):\n trace_text = buf.getvalue()\n final_response = trace.final_response or \"\"\n\n verdict = asyncio.get_event_loop().run_until_complete(\n pipeline.evaluate(\n session_summary=session_summary,\n trace_text=trace_text,\n final_response=final_response,\n )\n )\n\n print(f\"[GraderPipeline — Weighted]\")\n print(f\" Final score : {verdict.final_score:.3f}\")\n print(f\" Passed : {verdict.passed}\")\n print(f\" Strategy : {verdict.strategy_name}\")\n print(f\" Grader breakdown:\")\n for gr in verdict.grader_results:\n print(f\" - {gr.grader_name}: scores={gr.scores} \"\n f\"passed={gr.passed}\")\nelse:\n print(\"Trace not available — skipping pipeline evaluation.\")" + "execution_count": null }, { + "id": "536b23fb", "cell_type": "code", - "execution_count": null, + "source": "# Demo alternative strategies: Binary and Majority\nif traces[trace_idx] is not None:\n for strategy_cls, strategy_name in [\n (BinaryStrategy, \"Binary (all must pass)\"),\n (MajorityStrategy, \"Majority\"),\n ]:\n alt_pipeline = (\n GraderPipeline(strategy_cls())\n .add_code_grader(\n CodeEvaluator.latency(threshold_ms=30000),\n )\n .add_code_grader(\n CodeEvaluator.error_rate(max_error_rate=0.1),\n )\n .add_llm_grader(\n LLMAsJudge.correctness(threshold=0.6),\n )\n )\n v = asyncio.get_event_loop().run_until_complete(\n alt_pipeline.evaluate(\n session_summary=session_summary,\n trace_text=trace_text,\n final_response=final_response,\n )\n )\n print(f\"\\n[GraderPipeline — {strategy_name}]\")\n print(f\" Final score: {v.final_score:.3f} \"\n f\"Passed: {v.passed}\")", "metadata": {}, - "outputs": [], - "source": "# Demo alternative strategies: Binary and Majority\nif traces[trace_idx] is not None:\n for strategy_cls, strategy_name in [\n (BinaryStrategy, \"Binary (all must pass)\"),\n (MajorityStrategy, \"Majority\"),\n ]:\n alt_pipeline = (\n GraderPipeline(strategy_cls())\n .add_code_grader(\n CodeEvaluator.latency(threshold_ms=30000),\n )\n .add_code_grader(\n CodeEvaluator.error_rate(max_error_rate=0.1),\n )\n .add_llm_grader(\n LLMAsJudge.correctness(threshold=0.6),\n )\n )\n v = asyncio.get_event_loop().run_until_complete(\n alt_pipeline.evaluate(\n session_summary=session_summary,\n trace_text=trace_text,\n final_response=final_response,\n )\n )\n print(f\"\\n[GraderPipeline — {strategy_name}]\")\n print(f\" Final score: {v.final_score:.3f} \"\n f\"Passed: {v.passed}\")" + "execution_count": null }, { + "id": "44112c44", "cell_type": "markdown", - "metadata": {}, "source": [ "---\n", "\n", - "## Phase 7: Eval Suite & Validator\n", + "## Phase 7: Eval Suite \u0026 Validator\n", "\n", "The **EvalSuite** manages evaluation task definitions, supports capability-to-regression graduation, and exports to eval datasets. 
The **EvalValidator** performs sanity checks (ambiguity, balance, threshold consistency, duplicates, saturation)." - ] + ], + "metadata": {}, + "execution_count": null }, { + "id": "800dd793", "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], "source": [ "from bigquery_agent_analytics import (\n", " EvalSuite,\n", @@ -863,7 +881,7 @@ "suite.add_task(EvalTaskDef(\n", " task_id=\"simple_trip_sf_ny\",\n", " session_id=session_ids[0],\n", - " description=\"Simple SF->NY weekend trip — should call flights + hotels.\",\n", + " description=\"Simple SF-\u003eNY weekend trip — should call flights + hotels.\",\n", " category=EvalCategory.CAPABILITY,\n", " expected_trajectory=[\n", " {\"tool_name\": \"search_flights\"},\n", @@ -876,7 +894,7 @@ "suite.add_task(EvalTaskDef(\n", " task_id=\"complex_trip_tokyo\",\n", " session_id=session_ids[1],\n", - " description=\"Complex LA->Tokyo 5-day trip — all 4 tools expected.\",\n", + " description=\"Complex LA-\u003eTokyo 5-day trip — all 4 tools expected.\",\n", " category=EvalCategory.CAPABILITY,\n", " expected_trajectory=golden_tokyo,\n", " thresholds={\"trajectory_match\": 0.9, \"latency\": 0.6},\n", @@ -886,7 +904,7 @@ "suite.add_task(EvalTaskDef(\n", " task_id=\"multiturn_paris\",\n", " session_id=session_ids[2],\n", - " description=\"Multi-turn Chicago->Paris — weather, flights, hotels across 3 turns.\",\n", + " description=\"Multi-turn Chicago-\u003eParis — weather, flights, hotels across 3 turns.\",\n", " category=EvalCategory.REGRESSION,\n", " expected_trajectory=[\n", " {\"tool_name\": \"get_weather_forecast\"},\n", @@ -900,13 +918,13 @@ "print(f\"EvalSuite '{suite.name}' — {len(suite.get_tasks())} tasks added.\")\n", "for t in suite.get_tasks():\n", " print(f\" [{t.category.value}] {t.task_id}: {t.description}\")" - ] + ], + "metadata": {}, + "execution_count": null }, { + "id": "03bb79bb", "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], "source": [ "# Suite health check\n", "pass_history = {\n", @@ -928,13 +946,13 @@ " print(f\" Warnings:\")\n", " for w in health.warnings:\n", " print(f\" - {w}\")" - ] + ], + "metadata": {}, + "execution_count": null }, { + "id": "0dbbe9c7", "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], "source": [ "# Validate suite\n", "warnings = EvalValidator.validate_suite(\n", @@ -946,13 +964,13 @@ " f\"(task={w.task_id}): {w.message}\")\n", "if not warnings:\n", " print(\" No warnings — suite looks healthy!\")" - ] + ], + "metadata": {}, + "execution_count": null }, { + "id": "a98f8151", "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], "source": [ "# Export to eval dataset\n", "eval_ds = suite.to_eval_dataset()\n", @@ -961,11 +979,13 @@ " print(f\" session_id={entry['session_id']} \"\n", " f\"trajectory_len=\"\n", " f\"{len(entry.get('expected_trajectory', []))}\")" - ] + ], + "metadata": {}, + "execution_count": null }, { + "id": "8013f7d1", "cell_type": "markdown", - "metadata": {}, "source": [ "---\n", "\n", @@ -976,13 +996,13 @@ "- **pass^k** — probability that all k trials pass\n", "- **per_trial_pass_rate** — fraction of trials that passed\n", "- **mean_scores** and **score_std_dev** — statistics across trials" - ] + ], + "metadata": {}, + "execution_count": null }, { + "id": "78ebeebc", "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], "source": [ "from bigquery_agent_analytics import TrialRunner\n", "\n", @@ -1013,11 +1033,13 @@ " 
f\"scores={tr.scores}\")\n", "except Exception as exc:\n", " print(f\"Multi-trial evaluation failed: {exc}\")" - ] + ], + "metadata": {}, + "execution_count": null }, { + "id": "2cf4f18b", "cell_type": "markdown", - "metadata": {}, "source": [ "---\n", "\n", @@ -1029,13 +1051,13 @@ "3. Cross-session aggregation\n", "4. Multi-prompt analysis (7 specialised prompts)\n", "5. Executive summary generation" - ] + ], + "metadata": {}, + "execution_count": null }, { + "id": "36b1bdd8", "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], "source": [ "from bigquery_agent_analytics import InsightsConfig\n", "\n", @@ -1055,13 +1077,13 @@ " print(insights_report.summary())\n", "except Exception as exc:\n", " print(f\"Insights generation failed: {exc}\")" - ] + ], + "metadata": {}, + "execution_count": null }, { + "id": "b59829f0", "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], "source": [ "# Executive summary\n", "try:\n", @@ -1072,48 +1094,50 @@ " print(\"No executive summary generated.\")\n", "except NameError:\n", " print(\"Insights report not available — run previous cell first.\")" - ] + ], + "metadata": {}, + "execution_count": null }, { + "id": "1baa106e", "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], "source": [ "# Analysis sections\n", "try:\n", " for section in insights_report.analysis_sections:\n", " print(f\"\\n## {section.title}\")\n", " print(section.content[:2000])\n", - " if len(section.content) > 2000:\n", + " if len(section.content) \u003e 2000:\n", " print(\" ... (truncated)\")\n", "except NameError:\n", " print(\"Insights report not available — run previous cells first.\")" - ] + ], + "metadata": {}, + "execution_count": null }, { + "id": "a8ce1f87", "cell_type": "code", - "execution_count": null, + "source": "# Per-session facets\ntry:\n print(\"[Session Facets]\")\n for facet in insights_report.session_facets:\n print(f\"\\n Session: {facet.session_id}\")\n if facet.goal_categories:\n print(f\" Goal categories : {facet.goal_categories}\")\n if facet.outcome:\n print(f\" Outcome : {facet.outcome}\")\n if facet.satisfaction:\n print(f\" Satisfaction : {facet.satisfaction}\")\n if facet.key_topics:\n print(f\" Key topics : {facet.key_topics}\")\n print(f\" Effectiveness : {facet.agent_effectiveness}\")\n print(f\" Primary success : {facet.primary_success}\")\nexcept NameError:\n print(\"Insights report not available — run previous cells first.\")", "metadata": {}, - "outputs": [], - "source": "# Per-session facets\ntry:\n print(\"[Session Facets]\")\n for facet in insights_report.session_facets:\n print(f\"\\n Session: {facet.session_id}\")\n if facet.goal_categories:\n print(f\" Goal categories : {facet.goal_categories}\")\n if facet.outcome:\n print(f\" Outcome : {facet.outcome}\")\n if facet.satisfaction:\n print(f\" Satisfaction : {facet.satisfaction}\")\n if facet.key_topics:\n print(f\" Key topics : {facet.key_topics}\")\n print(f\" Effectiveness : {facet.agent_effectiveness}\")\n print(f\" Primary success : {facet.primary_success}\")\nexcept NameError:\n print(\"Insights report not available — run previous cells first.\")" + "execution_count": null }, { + "id": "23534079", "cell_type": "markdown", - "metadata": {}, "source": [ "---\n", "\n", - "## Phase 10: Deep Analysis & Drift Detection\n", + "## Phase 10: Deep Analysis \u0026 Drift Detection\n", "\n", "**Deep analysis** performs question distribution analysis — grouping user queries into semantic categories. 
**Drift detection** compares production questions against a golden dataset to measure coverage." - ] + ], + "metadata": {}, + "execution_count": null }, { + "id": "0f771e3c", "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], "source": [ "from bigquery_agent_analytics import AnalysisConfig\n", "\n", @@ -1137,13 +1161,13 @@ " print(f\" - {ex}\")\n", "except Exception as exc:\n", " print(f\"Deep analysis failed: {exc}\")" - ] + ], + "metadata": {}, + "execution_count": null }, { + "id": "efbbe049", "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], "source": [ "# Drift detection requires a golden dataset table in BigQuery.\n", "# Below shows the API pattern — uncomment and provide your golden table.\n", @@ -1164,11 +1188,13 @@ "# print(f\" New questions: {drift_report.new_questions}\")\n", "\n", "print(\"Drift detection requires a golden dataset table — see commented code above.\")" - ] + ], + "metadata": {}, + "execution_count": null }, { + "id": "a6604324", "cell_type": "markdown", - "metadata": {}, "source": [ "---\n", "\n", @@ -1197,13 +1223,13 @@ "- **Suite management**: `EvalSuite` + `EvalValidator` support capability-to-regression graduation and health monitoring.\n", "- **Non-determinism handling**: `TrialRunner` repeats evaluations to compute robust pass@k/pass^k metrics.\n", "- **AI-powered insights**: The insights pipeline and deep analysis provide actionable intelligence about agent behavior at scale." - ] + ], + "metadata": {}, + "execution_count": null }, { + "id": "436016ca", "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], "source": [ "# Cleanup\n", "try:\n", @@ -1216,7 +1242,9 @@ "print(\"\\nDemo complete!\")\n", "print(f\"Sessions: {session_ids}\")\n", "print(f\"Traces logged to: {PROJECT_ID}.{DATASET_ID}.{TABLE_ID}\")" - ] + ], + "metadata": {}, + "execution_count": null } ], "metadata": { @@ -1238,6 +1266,6 @@ "version": "3.10.0" } }, - "nbformat": 4, - "nbformat_minor": 4 + "nbformat_minor": 4, + "nbformat": 4 } \ No newline at end of file diff --git a/examples/nba_agent_trace_analysis_notebook.ipynb b/examples/nba_agent_trace_analysis_notebook.ipynb index b5d805c..4acd371 100644 --- a/examples/nba_agent_trace_analysis_notebook.ipynb +++ b/examples/nba_agent_trace_analysis_notebook.ipynb @@ -1,8 +1,8 @@ { "cells": [ { + "id": "2fc9e820", "cell_type": "markdown", - "metadata": {}, "source": [ "\n", "# Copyright 2026 Google LLC\n", @@ -15,12 +15,12 @@ "\n", "```mermaid\n", "flowchart LR\n", - " A[Configure gcloud / gh / ADK env] --> B[Run NBA ADK agent]\n", - " B --> C[BigQuery Agent Analytics Plugin logs events]\n", - " C --> D[BigQuery dataset: agent_trace]\n", - " D --> E[SDK Client reconstructs traces]\n", - " E --> F[Code + trajectory evaluation]\n", - " F --> G[Insights for NBA conversations]\n", + " A[Configure gcloud / gh / ADK env] --\u003e B[Run NBA ADK agent]\n", + " B --\u003e C[BigQuery Agent Analytics Plugin logs events]\n", + " C --\u003e D[BigQuery dataset: agent_trace]\n", + " D --\u003e E[SDK Client reconstructs traces]\n", + " E --\u003e F[Code + trajectory evaluation]\n", + " F --\u003e G[Insights for NBA conversations]\n", "```\n", "\n", "Target environment defaults used in this notebook:\n", @@ -29,53 +29,57 @@ "- **Repository**: `haiyuan-eng-google/BigQuery-Agent-Analytics-SDK`\n", "\n", "\n", - "\n", - " \n", - " \n", - " \n", - "
\n", - " \n", - " \"Colab Run in Colab\n", - " \n", - " \n", - " \n", - " \"Vertex Open in Vertex AI Workbench\n", - " \n", - " \n", - " \n", - " \"BQ Open in BQ Studio\n", - " \n", - "
\n" - ] + "\u003ctable align=\"left\"\u003e\n", + " \u003ctd\u003e\n", + " \u003ca href=\"https://colab.research.google.com/github/haiyuan-eng-google/BigQuery-Agent-Analytics-SDK/blob/main/examples/nba_agent_trace_analysis_notebook.ipynb\"\u003e\n", + " \u003cimg src=\"https://raw.githubusercontent.com/googleapis/python-bigquery-dataframes/refs/heads/main/third_party/logo/colab-logo.png\" alt=\"Colab logo\"\u003e Run in Colab\n", + " \u003c/a\u003e\n", + " \u003c/td\u003e\n", + " \u003ctd\u003e\n", + " \u003ca href=\"https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/haiyuan-eng-google/BigQuery-Agent-Analytics-SDK/main/examples/nba_agent_trace_analysis_notebook.ipynb\"\u003e\n", + " \u003cimg src=\"https://www.gstatic.com/images/branding/product/1x/google_cloud_48dp.png\" alt=\"Vertex AI logo\" width=\"32\"\u003e Open in Vertex AI Workbench\n", + " \u003c/a\u003e\n", + " \u003c/td\u003e\n", + " \u003ctd\u003e\n", + " \u003ca href=\"https://console.cloud.google.com/bigquery/import?url=https://github.com/haiyuan-eng-google/BigQuery-Agent-Analytics-SDK/blob/main/examples/nba_agent_trace_analysis_notebook.ipynb\"\u003e\n", + " \u003cimg src=\"https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTW1gvOovVlbZAIZylUtf5Iu8-693qS1w5NJw\u0026s\" alt=\"BQ logo\" width=\"35\"\u003e Open in BQ Studio\n", + " \u003c/a\u003e\n", + " \u003c/td\u003e\n", + "\u003c/table\u003e\n" + ], + "metadata": {}, + "execution_count": null }, { + "id": "7ca7f060", "cell_type": "markdown", - "metadata": {}, "source": [ "## 1) Install dependencies" - ] + ], + "metadata": {}, + "execution_count": null }, { + "id": "271c2b17", "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], "source": [ "!pip install -q google-adk bigquery-agent-analytics google-cloud-bigquery nest-asyncio pandas || pip install -q google-adk google-cloud-bigquery nest-asyncio pandas \"git+https://github.com/haiyuan-eng-google/BigQuery-Agent-Analytics-SDK.git@main\"\n" - ] + ], + "metadata": {}, + "execution_count": null }, { + "id": "af6a0ae9", "cell_type": "markdown", - "metadata": {}, "source": [ "## 2) Environment setup (gcloud / gh / ADK)" - ] + ], + "metadata": {}, + "execution_count": null }, { + "id": "3bc80e79", "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], "source": [ "\n", "import os\n", @@ -118,20 +122,22 @@ " bq --location=US mk --dataset --if_not_exists ${GOOGLE_CLOUD_PROJECT}:agent_trace\n", " gh auth login\n", "\"\"\")\n" - ] + ], + "metadata": {}, + "execution_count": null }, { + "id": "bc175dc9", "cell_type": "markdown", - "metadata": {}, "source": [ "## 3) Define a deterministic NBA toolset for the test agent" - ] + ], + "metadata": {}, + "execution_count": null }, { + "id": "9c487464", "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], "source": [ "\n", "import hashlib\n", @@ -139,12 +145,12 @@ "from typing import Any\n", "\n", "\n", - "def _rng_from(*parts: str) -> random.Random:\n", + "def _rng_from(*parts: str) -\u003e random.Random:\n", " seed = int(hashlib.md5(\"|\".join(parts).encode()).hexdigest()[:8], 16)\n", " return random.Random(seed)\n", "\n", "\n", - "async def get_nba_team_snapshot(team_name: str, season: str = \"2024-25\") -> dict[str, Any]:\n", + "async def get_nba_team_snapshot(team_name: str, season: str = \"2024-25\") -\u003e dict[str, Any]:\n", " \"\"\"Return deterministic synthetic team stats for demo trace generation.\"\"\"\n", " rng = 
_rng_from(team_name, season)\n", " wins = rng.randint(28, 62)\n", @@ -164,7 +170,7 @@ " }\n", "\n", "\n", - "async def get_nba_player_snapshot(player_name: str, season: str = \"2024-25\") -> dict[str, Any]:\n", + "async def get_nba_player_snapshot(player_name: str, season: str = \"2024-25\") -\u003e dict[str, Any]:\n", " \"\"\"Return deterministic synthetic player box-score style metrics.\"\"\"\n", " rng = _rng_from(player_name, season)\n", " return {\n", @@ -178,12 +184,12 @@ " }\n", "\n", "\n", - "async def compare_matchup(home_team: str, away_team: str, season: str = \"2024-25\") -> dict[str, Any]:\n", + "async def compare_matchup(home_team: str, away_team: str, season: str = \"2024-25\") -\u003e dict[str, Any]:\n", " \"\"\"Create a deterministic matchup projection and rationale.\"\"\"\n", " home = await get_nba_team_snapshot(home_team, season)\n", " away = await get_nba_team_snapshot(away_team, season)\n", " margin = round((home[\"net_rating\"] - away[\"net_rating\"]) + 2.1, 1)\n", - " favorite = home_team if margin >= 0 else away_team\n", + " favorite = home_team if margin \u003e= 0 else away_team\n", " return {\n", " \"season\": season,\n", " \"home_team\": home_team,\n", @@ -196,20 +202,22 @@ " \"home_snapshot\": home,\n", " \"away_snapshot\": away,\n", " }\n" - ] + ], + "metadata": {}, + "execution_count": null }, { + "id": "eaa8c098", "cell_type": "markdown", - "metadata": {}, "source": [ "## 4) Build ADK NBA agent with BigQuery analytics plugin" - ] + ], + "metadata": {}, + "execution_count": null }, { + "id": "806fec9a", "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], "source": [ "\n", "import asyncio\n", @@ -256,7 +264,7 @@ " plugins=[plugin],\n", ")\n", "\n", - "async def run_conversation(session_id: str, user_prompt: str) -> str:\n", + "async def run_conversation(session_id: str, user_prompt: str) -\u003e str:\n", " await session_service.create_session(\n", " app_name=\"nba_agent_trace_demo\",\n", " user_id=\"demo_user\",\n", @@ -287,7 +295,7 @@ " print(f\"\\n=== Running session {sid} ===\")\n", " print(\"User:\", prompt)\n", " response = asyncio.run(run_conversation(sid, prompt))\n", - " print(\"Agent:\", response[:600], \"...\" if len(response) > 600 else \"\")\n", + " print(\"Agent:\", response[:600], \"...\" if len(response) \u003e 600 else \"\")\n", "\n", "# Give plugin time to flush in notebook environments.\n", "time.sleep(3)\n", @@ -302,7 +310,7 @@ "elapsed = 0\n", "rows_visible = 0\n", "\n", - "while elapsed <= max_wait_s:\n", + "while elapsed \u003c= max_wait_s:\n", " query = f\"\"\"\n", " SELECT COUNT(*) AS c\n", " FROM `{PROJECT_ID}.{DATASET_ID}.{TABLE_ID}`\n", @@ -317,30 +325,31 @@ " ),\n", " )\n", " rows_visible = int(list(job.result())[0][\"c\"])\n", - " print(f\"Elapsed {elapsed:>3}s -> visible_rows={rows_visible}\")\n", - " if rows_visible > 0:\n", + " print(f\"Elapsed {elapsed:\u003e3}s -\u003e visible_rows={rows_visible}\")\n", + " if rows_visible \u003e 0:\n", " break\n", " time.sleep(poll_s)\n", " elapsed += poll_s\n", "\n", "print(\"Final visible rows for current notebook sessions:\", rows_visible)\n" - ] + ], + "metadata": {}, + "execution_count": null }, { + "id": "af74323d", "cell_type": "markdown", - "metadata": {}, "source": [ "## 5) Analyze traces with BigQuery Agent Analytics SDK APIs\n" - ] + ], + "metadata": {}, + "execution_count": null }, { + "id": "d06ce086", "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], "source": [ - "\n", - "from bigquery_agent_analytics import 
BigQueryTraceEvaluator\n", + "from bigquery_agent_analytics import PerformanceEvaluator\n", "from bigquery_agent_analytics import Client\n", "from bigquery_agent_analytics import CodeEvaluator\n", "from bigquery_agent_analytics.trace_evaluator import MatchType\n", @@ -359,14 +368,14 @@ "\n", "# In notebooks, Trace.render() returns a graphviz object if available.\n", "rendered = trace.render()\n", - "rendered\n" - ] + "rendered" + ], + "metadata": {}, + "execution_count": null }, { + "id": "ef4291b6", "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], "source": [ "# Quick trace table instead of raw SQL\n", "trace_rows = []\n", @@ -381,13 +390,13 @@ "\n", "import pandas as pd\n", "pd.DataFrame(trace_rows).head(10)\n" - ] + ], + "metadata": {}, + "execution_count": null }, { + "id": "2fd9a740", "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], "source": [ "# Deterministic code-based evaluation examples (SDK client-side)\n", "latency_eval = CodeEvaluator.latency(threshold_ms=12000)\n", @@ -397,21 +406,23 @@ "report_turns = client.evaluate(turn_eval)\n", "print(\"latency pass_rate:\", report_latency.pass_rate)\n", "print(\"turn_count pass_rate:\", report_turns.pass_rate)\n" - ] + ], + "metadata": {}, + "execution_count": null }, { + "id": "a241e3df", "cell_type": "markdown", - "metadata": {}, "source": [ "### 5.1) LLM-as-Judge example (factuality / tactical depth proxy)\n", "This uses the SDK `LLMAsJudge` path over the same generated NBA sessions.\n" - ] + ], + "metadata": {}, + "execution_count": null }, { + "id": "a3aa5db2", "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], "source": [ "from bigquery_agent_analytics import LLMAsJudge, TraceFilter\n", "\n", @@ -425,17 +436,16 @@ "print(\"sessions evaluated:\", judge_report.total_sessions)\n", "print(\"pass_rate:\", judge_report.pass_rate)\n", "judge_report.session_scores[:3]\n" - ] + ], + "metadata": {}, + "execution_count": null }, { + "id": "68633707", "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], "source": [ - "\n", "# Trajectory matching against expected tool behavior\n", - "trajectory_evaluator = BigQueryTraceEvaluator(\n", + "trajectory_evaluator = PerformanceEvaluator(\n", " project_id=PROJECT_ID,\n", " dataset_id=DATASET_ID,\n", " table_id=TABLE_ID,\n", @@ -457,21 +467,23 @@ "print(\"Eval status:\", traj_result.eval_status)\n", "print(\"Overall score:\", traj_result.overall_score)\n", "print(\"Scores:\", traj_result.scores)\n", - "print(\"Details:\", traj_result.details)\n" - ] + "print(\"Details:\", traj_result.details)" + ], + "metadata": {}, + "execution_count": null }, { + "id": "7efec60e", "cell_type": "markdown", - "metadata": {}, "source": [ "## 6) Optional: Generate insights report for NBA sessions" - ] + ], + "metadata": {}, + "execution_count": null }, { + "id": "7385ad12", "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], "source": [ "\n", "from bigquery_agent_analytics import InsightsConfig\n", @@ -484,18 +496,22 @@ " filters=TraceFilter(session_ids=session_ids),\n", ")\n", "insights\n" - ] + ], + "metadata": {}, + "execution_count": null }, { + "id": "39126f43", "cell_type": "markdown", - "metadata": {}, "source": [ "\n", "## 7) Next steps\n", "\n", "- Swap synthetic tool outputs with a real NBA data source (e.g., trusted sports API).\n", "- Track model versions in metadata to compare trace quality over time.\n" - ] + ], + "metadata": {}, + 
"execution_count": null } ], "metadata": { @@ -509,6 +525,6 @@ "version": "3.11" } }, - "nbformat": 4, - "nbformat_minor": 5 + "nbformat_minor": 5, + "nbformat": 4 } diff --git a/src/bigquery_agent_analytics/__init__.py b/src/bigquery_agent_analytics/__init__.py index e154558..eb0d0bc 100644 --- a/src/bigquery_agent_analytics/__init__.py +++ b/src/bigquery_agent_analytics/__init__.py @@ -65,10 +65,11 @@ # --- SDK Client & Core --- try: from .client import Client - from .evaluators import CodeEvaluator - from .evaluators import EvaluationReport + from .system_evaluator import CodeEvaluator + from .system_evaluator import EvaluationReport from .evaluators import LLMAsJudge - from .evaluators import SessionScore + from .system_evaluator import SessionScore + from .system_evaluator import SystemEvaluator from .feedback import AnalysisConfig from .feedback import DriftReport from .feedback import QuestionDistribution @@ -96,6 +97,7 @@ "TraceFilter", "ViewManager", "CodeEvaluator", + "SystemEvaluator", "LLMAsJudge", "EvaluationReport", "SessionScore", @@ -118,14 +120,16 @@ # Trace Evaluator try: - from .trace_evaluator import BigQueryTraceEvaluator - from .trace_evaluator import EvaluationResult - from .trace_evaluator import TraceReplayRunner - from .trace_evaluator import TrajectoryMetrics + from .performance_evaluator import BigQueryTraceEvaluator + from .performance_evaluator import EvaluationResult + from .performance_evaluator import PerformanceEvaluator + from .performance_evaluator import TraceReplayRunner + from .performance_evaluator import TrajectoryMetrics __all__.extend( [ "BigQueryTraceEvaluator", + "PerformanceEvaluator", "EvaluationResult", "TraceReplayRunner", "TrajectoryMetrics", @@ -188,13 +192,15 @@ # Multi-Trial try: - from .multi_trial import MultiTrialReport - from .multi_trial import TrialResult - from .multi_trial import TrialRunner + from .multi_trial_performance_evaluator import MultiTrialReport + from .multi_trial_performance_evaluator import TrialResult + from .multi_trial_performance_evaluator import MultiTrialPerformanceEvaluator + from .multi_trial_performance_evaluator import TrialRunner __all__.extend( [ "TrialRunner", + "MultiTrialPerformanceEvaluator", "TrialResult", "MultiTrialReport", ] @@ -208,18 +214,20 @@ # Grader Pipeline try: - from .grader_pipeline import AggregateVerdict - from .grader_pipeline import BinaryStrategy - from .grader_pipeline import GraderPipeline - from .grader_pipeline import GraderResult - from .grader_pipeline import MajorityStrategy - from .grader_pipeline import ScoringStrategy - from .grader_pipeline import WeightedStrategy + from .aggregate_grader import AggregateVerdict + from .aggregate_grader import BinaryStrategy + from .aggregate_grader import AggregateGrader + from .aggregate_grader import GraderPipeline + from .aggregate_grader import GraderResult + from .aggregate_grader import MajorityStrategy + from .aggregate_grader import ScoringStrategy + from .aggregate_grader import WeightedStrategy __all__.extend( [ "AggregateVerdict", "BinaryStrategy", + "AggregateGrader", "GraderPipeline", "GraderResult", "MajorityStrategy", diff --git a/src/bigquery_agent_analytics/_streaming_evaluation.py b/src/bigquery_agent_analytics/_streaming_evaluation.py index 6b7e1c3..fb7a830 100644 --- a/src/bigquery_agent_analytics/_streaming_evaluation.py +++ b/src/bigquery_agent_analytics/_streaming_evaluation.py @@ -25,7 +25,7 @@ import json from typing import Any -from bigquery_agent_analytics import CodeEvaluator +from bigquery_agent_analytics 
import SystemEvaluator from bigquery_agent_analytics import EvaluationReport from bigquery_agent_analytics import serialize from bigquery_agent_analytics import udf_kernels @@ -69,12 +69,12 @@ def is_final(self) -> bool: return self.trigger_kind == TRIGGER_KIND_SESSION_TERMINAL -def build_streaming_observability_evaluator() -> CodeEvaluator: +def build_streaming_observability_evaluator() -> SystemEvaluator: """Build the fixed launch evaluator profile for streaming observability. Uses raw-budget gates (a session passes iff the observed metric is within the configured budget) for consistency with - ``CodeEvaluator.latency`` / ``.error_rate`` / ``.turn_count``. + ``SystemEvaluator.latency`` / ``.error_rate`` / ``.turn_count``. Prior implementation used normalized scores with a 0.5 pass cutoff, which caused gates to fire at roughly half the configured budget. """ @@ -102,7 +102,7 @@ def _score_turn_count(session_summary: dict[str, Any]) -> float: observed = session_summary.get("turn_count", 0) or 0 return 1.0 if observed <= _MAX_TURNS else 0.0 - evaluator = CodeEvaluator(name=STREAMING_EVALUATOR_PROFILE) + evaluator = SystemEvaluator(name=STREAMING_EVALUATOR_PROFILE) evaluator.add_metric( "latency", _score_latency, diff --git a/src/bigquery_agent_analytics/aggregate_grader.py b/src/bigquery_agent_analytics/aggregate_grader.py new file mode 100644 index 0000000..b7ad09e --- /dev/null +++ b/src/bigquery_agent_analytics/aggregate_grader.py @@ -0,0 +1,471 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Grader composition pipeline for combining multiple evaluators. + +Composes ``SystemEvaluator``, ``PerformanceEvaluator``, and custom graders +into a single verdict using configurable scoring strategies (weighted average, +binary all-pass, or majority vote). + +Example usage:: + + from bigquery_agent_analytics import SystemEvaluator, AggregateGrader + from bigquery_agent_analytics.aggregate_grader import WeightedStrategy + + pipeline = ( + AggregateGrader(WeightedStrategy( + weights={"latency": 0.3, "correctness": 0.7}, + )) + .add_system_grader(SystemEvaluator.latency(), weight=0.3) + ) + + verdict = await pipeline.evaluate( + session_summary={"session_id": "s1", "avg_latency_ms": 2000}, + trace_text="User: hello\\nAgent: hi", + final_response="hi", + ) +""" + +from __future__ import annotations + +import abc +import logging +from typing import Any, Callable + +from pydantic import BaseModel +from pydantic import Field + +from .system_evaluator import SystemEvaluator +from .performance_evaluator import PerformanceEvaluator +from .evaluators import LLMAsJudge + +logger = logging.getLogger("bigquery_agent_analytics." 
+ __name__) + + +# ------------------------------------------------------------------ # +# Data Models # +# ------------------------------------------------------------------ # + + +class GraderResult(BaseModel): + """Result from a single grader.""" + + grader_name: str = Field(description="Name of the grader.") + scores: dict[str, float] = Field( + default_factory=dict, + description="Metric scores from this grader.", + ) + passed: bool = Field( + default=True, + description="Whether this grader passed.", + ) + + +class AggregateVerdict(BaseModel): + """Aggregated verdict from all graders in the pipeline.""" + + grader_results: list[GraderResult] = Field( + default_factory=list, + description="Individual grader results.", + ) + final_score: float = Field( + default=0.0, + description="Final aggregated score.", + ) + passed: bool = Field( + default=False, + description="Whether the overall evaluation passed.", + ) + strategy_name: str = Field( + default="", + description="Name of the scoring strategy used.", + ) + + +# ------------------------------------------------------------------ # +# Scoring Strategies # +# ------------------------------------------------------------------ # + + +class ScoringStrategy(abc.ABC): + """Abstract base class for scoring strategies.""" + + @abc.abstractmethod + def aggregate( + self, + grader_results: list[GraderResult], + ) -> AggregateVerdict: + """Aggregates grader results into a single verdict. + + Args: + grader_results: List of individual grader results. + + Returns: + AggregateVerdict with final score and pass/fail. + """ + + +class WeightedStrategy(ScoringStrategy): + """Weighted average of grader scores; pass if >= threshold.""" + + def __init__( + self, + weights: dict[str, float] | None = None, + threshold: float = 0.5, + ) -> None: + """Initializes the weighted strategy. + + Args: + weights: Mapping of grader name to weight. If None, + all graders are weighted equally. + threshold: Minimum weighted score to pass. 
+ """ + self.weights = weights or {} + self.threshold = threshold + + def aggregate( + self, + grader_results: list[GraderResult], + ) -> AggregateVerdict: + if not grader_results: + return AggregateVerdict(strategy_name="weighted") + + total_weight = 0.0 + weighted_sum = 0.0 + + for result in grader_results: + weight = self.weights.get(result.grader_name, 1.0) + # Average the grader's metric scores + if result.scores: + avg_score = sum(result.scores.values()) / len(result.scores) + else: + avg_score = 1.0 if result.passed else 0.0 + weighted_sum += avg_score * weight + total_weight += weight + + final_score = weighted_sum / total_weight if total_weight > 0 else 0.0 + + return AggregateVerdict( + grader_results=grader_results, + final_score=final_score, + passed=final_score >= self.threshold, + strategy_name="weighted", + ) + + +class BinaryStrategy(ScoringStrategy): + """All graders must pass independently.""" + + def aggregate( + self, + grader_results: list[GraderResult], + ) -> AggregateVerdict: + if not grader_results: + return AggregateVerdict(strategy_name="binary") + + all_passed = all(r.passed for r in grader_results) + + # Average of all scores + all_scores = [] + for r in grader_results: + all_scores.extend(r.scores.values()) + final_score = ( + sum(all_scores) / len(all_scores) + if all_scores + else (1.0 if all_passed else 0.0) + ) + + return AggregateVerdict( + grader_results=grader_results, + final_score=final_score, + passed=all_passed, + strategy_name="binary", + ) + + +class MajorityStrategy(ScoringStrategy): + """Majority of graders must pass.""" + + def aggregate( + self, + grader_results: list[GraderResult], + ) -> AggregateVerdict: + if not grader_results: + return AggregateVerdict(strategy_name="majority") + + num_passed = sum(1 for r in grader_results if r.passed) + majority = num_passed > len(grader_results) / 2 + + # Average of all scores + all_scores = [] + for r in grader_results: + all_scores.extend(r.scores.values()) + final_score = ( + sum(all_scores) / len(all_scores) + if all_scores + else (1.0 if majority else 0.0) + ) + + return AggregateVerdict( + grader_results=grader_results, + final_score=final_score, + passed=majority, + strategy_name="majority", + ) + + +# ------------------------------------------------------------------ # +# Grader Pipeline # +# ------------------------------------------------------------------ # + + +class _GraderEntry: + """Internal wrapper for a grader in the pipeline.""" + + def __init__( + self, + name: str, + evaluate_fn: Any, + weight: float = 1.0, + is_async: bool = False, + ) -> None: + self.name = name + self.evaluate_fn = evaluate_fn + self.weight = weight + self.is_async = is_async + + +class AggregateGrader: + """Composes multiple graders into a single evaluation pipeline. + + Supports ``SystemEvaluator``, ``PerformanceEvaluator``, and arbitrary custom + grader functions combined via a configurable ``ScoringStrategy``. + + Example:: + + pipeline = ( + AggregateGrader(WeightedStrategy(threshold=0.6)) + .add_system_grader(SystemEvaluator.latency()) + ) + verdict = await pipeline.evaluate( + session_summary={...}, + trace_text="...", + final_response="...", + ) + """ + + def __init__(self, strategy: ScoringStrategy) -> None: + """Initializes the grader pipeline with a scoring strategy. + + Args: + strategy: The strategy used to aggregate grader results. 
+ """ + self.strategy = strategy + self._graders: list[_GraderEntry] = [] + + def add_system_grader( + self, + evaluator: SystemEvaluator, + weight: float = 1.0, + ) -> AggregateGrader: + """Adds a SystemEvaluator grader to the pipeline. + + Args: + evaluator: A SystemEvaluator instance. + weight: Weight for weighted strategies. + + Returns: + Self for chaining. + """ + self._graders.append( + _GraderEntry( + name=evaluator.name, + evaluate_fn=evaluator, + weight=weight, + is_async=False, + ) + ) + return self + + def add_code_grader( + self, + evaluator: SystemEvaluator, + weight: float = 1.0, + ) -> AggregateGrader: + """Adds a code grader to the pipeline. + + Note this grader is preserved for backwards compatibility, but isn't + recommended for use. + """ + return self.add_system_grader(evaluator, weight=weight) + + def add_llm_grader( + self, + judge: LLMAsJudge, + weight: float = 1.0, + ) -> AggregateGrader: + """Adds an LLMAsJudge grader to the pipeline. + + Note this grader is preserved for backwards compatibility, but isn't + recommended for use. + + Args: + judge: An LLMAsJudge instance. + weight: Weight for weighted strategies. + + Returns: + Self for chaining. + """ + self._graders.append( + _GraderEntry( + name=judge.name, + evaluate_fn=judge, + weight=weight, + is_async=True, + ) + ) + return self + + def add_performance_grader( + self, + evaluator: PerformanceEvaluator, + weight: float = 1.0, + ) -> AggregateGrader: + """Adds a PerformanceEvaluator grader to the pipeline. + + Args: + evaluator: A PerformanceEvaluator instance. + weight: Weight for weighted strategies. + + Returns: + Self for chaining. + """ + self._graders.append( + _GraderEntry( + name=evaluator.name, + evaluate_fn=evaluator, + weight=weight, + is_async=True, + ) + ) + return self + + def add_custom_grader( + self, + name: str, + fn: Callable[[dict[str, Any]], GraderResult], + weight: float = 1.0, + ) -> AggregateGrader: + """Adds a custom grader function to the pipeline. + + The function receives a dict with ``session_summary``, + ``trace_text``, and ``final_response`` keys. + + Args: + name: Name for the grader. + fn: Function returning a GraderResult. + weight: Weight for weighted strategies. + + Returns: + Self for chaining. + """ + self._graders.append( + _GraderEntry( + name=name, + evaluate_fn=fn, + weight=weight, + is_async=False, + ) + ) + return self + + async def evaluate( + self, + session_summary: dict[str, Any] | None = None, + trace_text: str = "", + final_response: str = "", + ) -> AggregateVerdict: + """Evaluates using all graders and aggregates results. + + Args: + session_summary: Dict with session metrics (for + SystemEvaluator graders). + trace_text: Formatted trace text (for LLMAsJudge + graders). + final_response: Final agent response. + + Returns: + AggregateVerdict with combined results. 
+ """ + session_summary = session_summary or {} + grader_results: list[GraderResult] = [] + + for entry in self._graders: + try: + result = await self._run_grader( + entry, session_summary, trace_text, final_response + ) + grader_results.append(result) + except Exception as e: + logger.warning("Grader %s failed: %s", entry.name, e) + grader_results.append( + GraderResult( + grader_name=entry.name, + scores={}, + passed=False, + ) + ) + + return self.strategy.aggregate(grader_results) + + async def _run_grader( + self, + entry: _GraderEntry, + session_summary: dict[str, Any], + trace_text: str, + final_response: str, + ) -> GraderResult: + """Runs a single grader and returns its result.""" + evaluator = entry.evaluate_fn + + if isinstance(evaluator, SystemEvaluator): + score = evaluator.evaluate_session(session_summary) + return GraderResult( + grader_name=entry.name, + scores=score.scores, + passed=score.passed, + ) + + if isinstance(evaluator, PerformanceEvaluator): + score = await evaluator.evaluate_session( + trace_text=trace_text, + final_response=final_response, + ) + return GraderResult( + grader_name=entry.name, + scores=score.scores, + passed=score.passed, + ) + + # Custom grader function + context = { + "session_summary": session_summary, + "trace_text": trace_text, + "final_response": final_response, + } + return evaluator(context) + + +# Keep aliases for backward compatibility +GraderPipeline = AggregateGrader diff --git a/src/bigquery_agent_analytics/categorical_evaluator.py b/src/bigquery_agent_analytics/categorical_evaluator.py index 7d80603..9ace3a1 100644 --- a/src/bigquery_agent_analytics/categorical_evaluator.py +++ b/src/bigquery_agent_analytics/categorical_evaluator.py @@ -16,7 +16,7 @@ Classifies agent sessions into user-defined categories using BigQuery's native ``AI.GENERATE``, with Gemini API fallback when BigQuery-native -execution is unavailable. Unlike the numeric ``CodeEvaluator`` and +execution is unavailable. Unlike the numeric ``SystemEvaluator`` and ``LLMAsJudge`` report paths, this module returns label-valued results with strict category validation. 
diff --git a/src/bigquery_agent_analytics/cli.py b/src/bigquery_agent_analytics/cli.py index d02a4aa..c8abf4c 100644 --- a/src/bigquery_agent_analytics/cli.py +++ b/src/bigquery_agent_analytics/cli.py @@ -39,7 +39,7 @@ import typer -from .evaluators import CodeEvaluator +from .evaluators import CodeEvaluator, SystemEvaluator from .evaluators import EvaluationReport from .evaluators import LLMAsJudge from .formatter import format_output @@ -154,28 +154,28 @@ def _load_spec_from_args( _CODE_EVALUATORS = { "latency": ( - lambda t: CodeEvaluator.latency(threshold_ms=t), - lambda: CodeEvaluator.latency(), + lambda t: SystemEvaluator.latency(threshold_ms=t), + lambda: SystemEvaluator.latency(), ), "error_rate": ( - lambda t: CodeEvaluator.error_rate(max_error_rate=t), - lambda: CodeEvaluator.error_rate(), + lambda t: SystemEvaluator.error_rate(max_error_rate=t), + lambda: SystemEvaluator.error_rate(), ), "turn_count": ( - lambda t: CodeEvaluator.turn_count(max_turns=int(t)), - lambda: CodeEvaluator.turn_count(), + lambda t: SystemEvaluator.turn_count(max_turns=int(t)), + lambda: SystemEvaluator.turn_count(), ), "token_efficiency": ( - lambda t: CodeEvaluator.token_efficiency(max_tokens=int(t)), - lambda: CodeEvaluator.token_efficiency(), + lambda t: SystemEvaluator.token_efficiency(max_tokens=int(t)), + lambda: SystemEvaluator.token_efficiency(), ), "ttft": ( - lambda t: CodeEvaluator.ttft(threshold_ms=t), - lambda: CodeEvaluator.ttft(), + lambda t: SystemEvaluator.ttft(threshold_ms=t), + lambda: SystemEvaluator.ttft(), ), "cost": ( - lambda t: CodeEvaluator.cost_per_session(max_cost_usd=t), - lambda: CodeEvaluator.cost_per_session(), + lambda t: SystemEvaluator.cost_per_session(max_cost_usd=t), + lambda: SystemEvaluator.cost_per_session(), ), } @@ -406,9 +406,9 @@ def _emit_evaluate_failures( """Emit readable FAIL lines for failing sessions before --exit-code exits. One line per (session_id, metric_name) that failed its threshold. - Prefers the raw observed + budget pair (``CodeEvaluator`` prebuilts); + Prefers the raw observed + budget pair (``SystemEvaluator`` prebuilts); falls back to score + threshold when the metric didn't declare - observed/budget (custom ``add_metric`` users, ``LLMAsJudge`` + observed/budget (custom ``add_metric`` users, ``PerformanceEvaluator`` criteria). 
For LLM-judge failures the line also carries a bounded ``feedback="…"`` snippet drawn from ``SessionScore.llm_feedback`` so CI logs explain *why* the judge said the session failed without diff --git a/src/bigquery_agent_analytics/client.py b/src/bigquery_agent_analytics/client.py index 75614e8..4070282 100644 --- a/src/bigquery_agent_analytics/client.py +++ b/src/bigquery_agent_analytics/client.py @@ -33,11 +33,11 @@ # Run evaluation from bigquery_agent_analytics import ( - CodeEvaluator, LLMAsJudge, TraceFilter, + SystemEvaluator, LLMAsJudge, TraceFilter, ) report = client.evaluate( filters=TraceFilter(agent_id="my_agent"), - evaluator=CodeEvaluator.latency(threshold_ms=3000), + evaluator=SystemEvaluator.latency(threshold_ms=3000), ) print(report.summary()) """ @@ -71,17 +71,18 @@ from .categorical_evaluator import flatten_results_to_rows from .categorical_evaluator import parse_categorical_row from .categorical_evaluator import parse_classify_row -from .evaluators import _parse_json_from_text -from .evaluators import AI_GENERATE_JUDGE_BATCH_QUERY -from .evaluators import CodeEvaluator -from .evaluators import DEFAULT_ENDPOINT -from .evaluators import EvaluationReport -from .evaluators import LLM_JUDGE_BATCH_QUERY -from .evaluators import LLMAsJudge -from .evaluators import render_ai_generate_judge_query -from .evaluators import SESSION_SUMMARY_QUERY -from .evaluators import SessionScore -from .evaluators import split_judge_prompt_template +from .system_evaluator import _parse_json_from_text +from .system_evaluator import AI_GENERATE_JUDGE_BATCH_QUERY +from .system_evaluator import CodeEvaluator, SystemEvaluator +from .system_evaluator import DEFAULT_ENDPOINT +from .system_evaluator import EvaluationReport +from .system_evaluator import LLM_JUDGE_BATCH_QUERY +from .system_evaluator import LLMAsJudge +from .system_evaluator import render_ai_generate_judge_query +from .system_evaluator import SESSION_SUMMARY_QUERY +from .system_evaluator import SessionScore +from .system_evaluator import split_judge_prompt_template +from .performance_evaluator import PerformanceEvaluator, EvalStatus from .feedback import AnalysisConfig from .feedback import compute_drift from .feedback import compute_question_distribution @@ -869,7 +870,7 @@ def list_traces( def evaluate( self, - evaluator: CodeEvaluator | LLMAsJudge, + evaluator: SystemEvaluator | PerformanceEvaluator, filters: Optional[TraceFilter] = None, dataset: Optional[str] = None, strict: bool = False, @@ -877,12 +878,12 @@ def evaluate( """Runs batch evaluation over traces. Uses BigQuery native execution for scalable assessment. - ``CodeEvaluator`` metrics are computed from session - aggregates. ``LLMAsJudge`` metrics use BQML's + ``SystemEvaluator`` metrics are computed from session + aggregates. ``PerformanceEvaluator`` metrics use BQML's ``ML.GENERATE_TEXT`` for zero-ETL evaluation. Args: - evaluator: A CodeEvaluator or LLMAsJudge instance. + evaluator: A SystemEvaluator or PerformanceEvaluator instance. filters: Optional trace filters. dataset: Optional table name override. 
strict: When ``True``, sessions with unparseable or @@ -900,30 +901,26 @@ def evaluate( filt = filters or TraceFilter() where, params = filt.to_sql_conditions() - if isinstance(evaluator, CodeEvaluator): + if isinstance(evaluator, SystemEvaluator): return self._evaluate_code( evaluator, table, where, params, ) - elif isinstance(evaluator, LLMAsJudge): - report = self._evaluate_llm_judge( + elif isinstance(evaluator, PerformanceEvaluator): + return self._evaluate_performance( evaluator, table, where, params, - filt, ) - if strict: - report = _apply_strict_mode(report) - return report else: raise TypeError(f"Unsupported evaluator type: {type(evaluator)}") def _evaluate_code( self, - evaluator: CodeEvaluator, + evaluator: SystemEvaluator, table: str, where: str, params: list, @@ -954,6 +951,76 @@ def _evaluate_code( session_scores=session_scores, ) + def _evaluate_performance( + self, + evaluator: PerformanceEvaluator, + table: str, + where: str, + params: list, + ) -> EvaluationReport: + """Runs performance evaluation using the folded PerformanceEvaluator.""" + import asyncio + query = SESSION_SUMMARY_QUERY.format( + project=self.project_id, + dataset=self.dataset_id, + table=table, + where=where, + ) + job_config = with_sdk_labels( + bigquery.QueryJobConfig(query_parameters=params), + feature="eval-performance", + ) + results = list(self.bq_client.query(query, job_config=job_config).result()) + session_ids = [row.get("session_id") for row in results if row.get("session_id")] + + try: + loop = asyncio.get_running_loop() + except RuntimeError: + try: + loop = asyncio.get_event_loop() + except RuntimeError: + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + + async def evaluate_all(): + tasks = [] + for sid in session_ids: + tasks.append(evaluator.evaluate_session( + session_id=sid, + use_llm_judge=True, + )) + return await asyncio.gather(*tasks) + + if loop.is_running(): + import nest_asyncio + nest_asyncio.apply() + + eval_results = loop.run_until_complete(evaluate_all()) + + session_scores = [] + passed_count = 0 + for er in eval_results: + score = SessionScore( + session_id=er.session_id, + scores=er.scores, + passed=(er.eval_status == EvalStatus.PASSED), + llm_feedback=er.llm_judge_feedback, + ) + session_scores.append(score) + if score.passed: + passed_count += 1 + + report = EvaluationReport( + dataset=f"{self._table_ref} WHERE {where}", + evaluator_name=evaluator.name, + total_sessions=len(session_scores), + passed_sessions=passed_count, + failed_sessions=len(session_scores) - passed_count, + ) + report.session_scores = session_scores + report.details = {"execution_mode": "performance_evaluator"} + return report + @staticmethod def _is_legacy_model_ref(ref: str) -> bool: """Returns True when *ref* looks like a BQ ML model reference. diff --git a/src/bigquery_agent_analytics/evaluators.py b/src/bigquery_agent_analytics/evaluators.py index 28ff973..0aeda14 100644 --- a/src/bigquery_agent_analytics/evaluators.py +++ b/src/bigquery_agent_analytics/evaluators.py @@ -12,1154 +12,51 @@ # See the License for the specific language governing permissions and # limitations under the License. -"""Evaluation engine for BigQuery Agent Analytics SDK. +"""Backward-compatibility module mapping for evaluators.""" -Provides ``CodeEvaluator`` for deterministic, code-based metrics and -``LLMAsJudge`` for semantic evaluation using LLM-as-a-judge. The -``evaluate()`` function orchestrates batch evaluation using BigQuery's -native AI functions for scalable, zero-ETL assessment. 
+from typing import Optional +from .system_evaluator import * +from .performance_evaluator import * +from .utils import _parse_json_from_text, _extract_json_from_text, strip_markdown_fences +class LLMAsJudge(PerformanceEvaluator): + """Legacy LLMAsJudge subclass preserving pre-built factories for backwards compatibility.""" -Example usage:: - - from bigquery_agent_analytics.evaluators import ( - CodeEvaluator, LLMAsJudge, + def __init__(self, name: str = "llm_judge", model: Optional[str] = None, threshold: float = 0.5, *args, **kwargs): + super().__init__( + project_id=kwargs.get("project_id", "proj"), + dataset_id=kwargs.get("dataset_id", "ds"), + llm_judge_model=model, ) - - # Deterministic evaluation - evaluator = CodeEvaluator.latency(threshold_ms=5000) - - # LLM-based semantic evaluation - judge = LLMAsJudge.correctness() -""" - -from __future__ import annotations - -from dataclasses import dataclass -from datetime import datetime -from datetime import timezone -import json -import logging -import re -from typing import Any, Callable, Optional - -from pydantic import BaseModel -from pydantic import Field - -from bigquery_agent_analytics import udf_kernels - -logger = logging.getLogger("bigquery_agent_analytics." + __name__) - -DEFAULT_ENDPOINT = "gemini-2.5-flash" - - -# ------------------------------------------------------------------ # -# Evaluation Report # -# ------------------------------------------------------------------ # - - -class SessionScore(BaseModel): - """Scores for a single evaluated session.""" - - session_id: str = Field(description="The session ID evaluated.") - scores: dict[str, float] = Field( - default_factory=dict, - description="Metric name to score (0.0 - 1.0).", - ) - passed: bool = Field( - default=True, - description="Whether the session passed all thresholds.", - ) - details: dict[str, Any] = Field( - default_factory=dict, - description="Additional per-session details.", - ) - llm_feedback: Optional[str] = Field( - default=None, - description="LLM judge feedback if applicable.", - ) - - -class EvaluationReport(BaseModel): - """Aggregate report from an evaluation run.""" - - dataset: str = Field(description="Dataset or filter description.") - evaluator_name: str = Field(description="Name of evaluator used.") - total_sessions: int = Field(default=0) - passed_sessions: int = Field(default=0) - failed_sessions: int = Field(default=0) - aggregate_scores: dict[str, float] = Field( - default_factory=dict, - description="Average scores across all sessions.", - ) - details: dict[str, Any] = Field( - default_factory=dict, - description=( - "Operational metadata (parse_errors, fallback_mode, etc.)." - " Separated from aggregate_scores so downstream consumers" - " can treat scores as purely normalized metrics." 
- ), - ) - session_scores: list[SessionScore] = Field( - default_factory=list, - ) - created_at: datetime = Field( - default_factory=lambda: datetime.now(timezone.utc), - ) + self._name = name + self._threshold = threshold @property - def pass_rate(self) -> float: - """Fraction of sessions that passed.""" - if self.total_sessions == 0: - return 0.0 - return self.passed_sessions / self.total_sessions - - def summary(self) -> str: - """Returns a human-readable summary.""" - lines = [ - f"Evaluation Report: {self.evaluator_name}", - f" Dataset: {self.dataset}", - f" Sessions: {self.total_sessions}", - f" Passed: {self.passed_sessions} ({self.pass_rate:.0%})", - f" Failed: {self.failed_sessions}", - ] - if self.aggregate_scores: - lines.append(" Aggregate Scores:") - for name, score in sorted(self.aggregate_scores.items()): - lines.append(f" {name}: {score:.3f}") - return "\n".join(lines) - - -# ------------------------------------------------------------------ # -# Code-Based Evaluator # -# ------------------------------------------------------------------ # - - -@dataclass -class _MetricDef: - """Internal definition of a code metric. - - ``observed_key``, ``observed_fn``, and ``budget`` are optional - reporting metadata used by the prebuilt evaluators (latency, - error_rate, turn_count, …) to surface the raw observed value and - the user-supplied budget in ``SessionScore.details``. They don't - affect pass/fail computation — that still goes through ``fn`` + - ``threshold`` — but they let downstream consumers (CLI - ``--exit-code`` output, dashboards) emit readable failure lines - without having to re-run the scorer. - - When ``observed_fn`` is set it takes precedence over - ``observed_key``; use it for metrics whose observed value is - computed from multiple summary fields (e.g. ``tool_errors / - tool_calls`` for error rate). - """ - - name: str - fn: Callable[[dict[str, Any]], float] - threshold: float = 0.5 - observed_key: Optional[str] = None - budget: Optional[float] = None - observed_fn: Optional[Callable[[dict[str, Any]], Any]] = None - - -class CodeEvaluator: - """Deterministic evaluator using code-based metric functions. - - Metrics operate on a session summary dict containing:: - - { - "session_id": str, - "total_events": int, - "tool_calls": int, - "tool_errors": int, - "llm_calls": int, - "avg_latency_ms": float, - "max_latency_ms": float, - "total_latency_ms": float, - "turn_count": int, - "has_error": bool, - } - - Each metric function returns a score between 0.0 and 1.0. - """ - - def __init__( - self, - name: str = "code_evaluator", - metrics: Optional[list[_MetricDef]] = None, - ) -> None: - self.name = name - self._metrics: list[_MetricDef] = metrics or [] - - def add_metric( - self, - name: str, - fn: Callable[[dict[str, Any]], float], - threshold: float = 0.5, - observed_key: Optional[str] = None, - budget: Optional[float] = None, - observed_fn: Optional[Callable[[dict[str, Any]], Any]] = None, - ) -> CodeEvaluator: - """Adds a custom metric function. - - Args: - name: Metric name. - fn: Function taking session summary, returning 0-1 score. - The score is compared to ``threshold``; a session passes - the metric when ``score >= threshold``. - threshold: Pass/fail threshold applied to ``fn``'s score. - observed_key: Optional session-summary key whose value is the - raw observed metric (e.g. ``"avg_latency_ms"``). When set, - ``evaluate_session`` stashes the observed value + ``budget`` - under ``SessionScore.details`` for downstream reporting. 
- budget: Optional raw-budget value corresponding to the metric - (e.g. the latency-ms threshold the user supplied). Reported - alongside ``observed_key``; not used for pass/fail. - observed_fn: Optional callable that derives the observed value - from the session summary. Used when the observed metric is - computed (e.g. ``tool_errors/tool_calls``) rather than - stored directly. Takes precedence over ``observed_key``. - - Returns: - Self for chaining. - """ - self._metrics.append( - _MetricDef( - name=name, - fn=fn, - threshold=threshold, - observed_key=observed_key, - budget=budget, - observed_fn=observed_fn, - ) - ) - return self - - def evaluate_session(self, session_summary: dict[str, Any]) -> SessionScore: - """Evaluates a single session summary. - - Args: - session_summary: Dict with session metrics. - - Returns: - SessionScore with computed scores. - """ - scores: dict[str, float] = {} - details: dict[str, Any] = {} - passed = True - - for metric in self._metrics: - try: - score = metric.fn(session_summary) - score = max(0.0, min(1.0, float(score))) - scores[metric.name] = score - metric_passed = score >= metric.threshold - if not metric_passed: - passed = False - except Exception as e: - logger.warning("Metric %s failed: %s", metric.name, e) - scores[metric.name] = 0.0 - metric_passed = False - passed = False - - # Stash per-metric reporting detail for *every* metric so the CLI - # ``--exit-code`` failure output always has a threshold / score / - # passed triple to emit, even for custom metrics that didn't - # declare observed_key / observed_fn. Observed / budget are only - # included when the metric supplied them. Keys are prefixed with - # ``metric_`` to avoid colliding with other details callers. - observed_value: Optional[Any] = None - if metric.observed_fn is not None: - try: - observed_value = metric.observed_fn(session_summary) - except Exception: # pylint: disable=broad-except - observed_value = None - elif metric.observed_key is not None: - observed_value = session_summary.get(metric.observed_key) - details[f"metric_{metric.name}"] = { - "observed": observed_value, - "budget": metric.budget, - "threshold": metric.threshold, - "score": scores[metric.name], - "passed": metric_passed, - } - - return SessionScore( - session_id=session_summary.get("session_id", "unknown"), - scores=scores, - passed=passed, - details=details, - ) - - # ---- Pre-built evaluators ---- # - - # The prebuilt evaluators below use raw-budget gates: they fail iff - # the observed metric exceeds the user-supplied budget. Historically - # these ran the normalized ``udf_kernels.score_*`` functions under a - # 0.5 score cutoff, which caused ``--threshold=5000`` on latency to - # fail near 2500ms — the gate was at half the budget the user typed. - # See CHANGELOG and the related blog-post-#2 plan (#77) for context. - # ``udf_kernels.score_*`` is unchanged; it still powers the SQL-native - # UDF path in ``udf_sql_templates.py``, which has its own semantics. - - @staticmethod - def latency( - threshold_ms: float = 5000.0, - ) -> CodeEvaluator: - """Pre-built evaluator that fails when average latency exceeds the budget. - - Pass/fail is a raw comparison: ``avg_latency_ms <= threshold_ms`` - passes, strictly greater fails. The returned evaluator's score for - a session is ``1.0`` on pass and ``0.0`` on fail. - - Args: - threshold_ms: Maximum acceptable average latency in ms. - - Returns: - CodeEvaluator configured for latency checking. 
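The raw-budget gate described above is easiest to see on a concrete summary dict; a small sketch, assuming ``SystemEvaluator`` keeps this ``CodeEvaluator`` behavior after the rename:

```python
from bigquery_agent_analytics import SystemEvaluator

evaluator = SystemEvaluator.latency(threshold_ms=5000)

# Summaries are the dicts produced by SESSION_SUMMARY_QUERY (subset of fields).
fast = evaluator.evaluate_session({"session_id": "s1", "avg_latency_ms": 4200.0})
slow = evaluator.evaluate_session({"session_id": "s2", "avg_latency_ms": 5200.0})

print(fast.passed, fast.scores["latency"])   # True 1.0   (4200 <= 5000)
print(slow.passed, slow.scores["latency"])   # False 0.0  (5200 > 5000)
```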
- """ - - def _score(s: dict[str, Any]) -> float: - observed = s.get("avg_latency_ms", 0) or 0 - return 1.0 if observed <= threshold_ms else 0.0 - - evaluator = CodeEvaluator(name="latency_evaluator") - evaluator.add_metric( - "latency", - _score, - threshold=1.0, - observed_key="avg_latency_ms", - budget=threshold_ms, - ) - return evaluator - - @staticmethod - def turn_count(max_turns: int = 10) -> CodeEvaluator: - """Pre-built evaluator that fails when turn count exceeds the budget. - - Pass/fail is a raw comparison: ``turn_count <= max_turns`` passes, - strictly greater fails. - - Args: - max_turns: Maximum acceptable number of turns. - - Returns: - CodeEvaluator configured for turn count checking. - """ - - def _score(s: dict[str, Any]) -> float: - observed = s.get("turn_count", 0) or 0 - return 1.0 if observed <= max_turns else 0.0 - - evaluator = CodeEvaluator(name="turn_count_evaluator") - evaluator.add_metric( - "turn_count", - _score, - threshold=1.0, - observed_key="turn_count", - budget=max_turns, - ) - return evaluator - - @staticmethod - def error_rate( - max_error_rate: float = 0.1, - ) -> CodeEvaluator: - """Pre-built evaluator that fails when tool error rate exceeds the budget. - - Pass/fail is a raw comparison: ``(tool_errors / tool_calls) <= max_error_rate`` - passes, strictly greater fails. Sessions with zero tool calls pass - trivially (nothing to fail). - - Args: - max_error_rate: Maximum acceptable tool error fraction. - - Returns: - CodeEvaluator configured for error rate checking. - """ - - def _observed(s: dict[str, Any]) -> float: - calls = s.get("tool_calls", 0) or 0 - errors = s.get("tool_errors", 0) or 0 - if calls <= 0: - return 0.0 - return errors / calls - - def _score(s: dict[str, Any]) -> float: - calls = s.get("tool_calls", 0) or 0 - if calls <= 0: - return 1.0 - return 1.0 if _observed(s) <= max_error_rate else 0.0 - - evaluator = CodeEvaluator(name="error_rate_evaluator") - evaluator.add_metric( - "error_rate", - _score, - threshold=1.0, - observed_fn=_observed, - budget=max_error_rate, - ) - return evaluator - - @staticmethod - def token_efficiency( - max_tokens: int = 50000, - ) -> CodeEvaluator: - """Pre-built evaluator that fails when total tokens exceed the budget. - - Pass/fail is a raw comparison: ``total_tokens <= max_tokens`` - passes, strictly greater fails. - - Args: - max_tokens: Maximum acceptable total token count. - - Returns: - CodeEvaluator configured for token efficiency. - """ - - def _score(s: dict[str, Any]) -> float: - observed = s.get("total_tokens", 0) or 0 - return 1.0 if observed <= max_tokens else 0.0 - - evaluator = CodeEvaluator(name="token_efficiency_evaluator") - evaluator.add_metric( - "token_efficiency", - _score, - threshold=1.0, - observed_key="total_tokens", - budget=max_tokens, - ) - return evaluator - - @staticmethod - def ttft( - threshold_ms: float = 1000.0, - ) -> CodeEvaluator: - """Pre-built evaluator that fails when TTFT exceeds the budget. - - Pass/fail is a raw comparison: ``avg_ttft_ms <= threshold_ms`` - passes, strictly greater fails. - - Args: - threshold_ms: Maximum acceptable average TTFT in ms. - - Returns: - CodeEvaluator configured for TTFT checking. 
- """ - - def _score(s: dict[str, Any]) -> float: - observed = s.get("avg_ttft_ms", 0) or 0 - return 1.0 if observed <= threshold_ms else 0.0 - - evaluator = CodeEvaluator(name="ttft_evaluator") - evaluator.add_metric( - "ttft", - _score, - threshold=1.0, - observed_key="avg_ttft_ms", - budget=threshold_ms, - ) - return evaluator - - @staticmethod - def cost_per_session( - max_cost_usd: float = 1.0, - input_cost_per_1k: float = 0.00025, - output_cost_per_1k: float = 0.00125, - ) -> CodeEvaluator: - """Pre-built evaluator that fails when per-session cost exceeds the budget. - - Pass/fail is a raw comparison: ``estimated_cost_usd <= max_cost_usd`` - passes, strictly greater fails. - - Args: - max_cost_usd: Maximum acceptable cost in USD. - input_cost_per_1k: Cost per 1K input tokens. - output_cost_per_1k: Cost per 1K output tokens. - - Returns: - CodeEvaluator configured for cost checking. - """ - - def _observed(s: dict[str, Any]) -> float: - input_tokens = s.get("input_tokens", 0) or 0 - output_tokens = s.get("output_tokens", 0) or 0 - return (input_tokens / 1000.0) * input_cost_per_1k + ( - output_tokens / 1000.0 - ) * output_cost_per_1k - - def _score(s: dict[str, Any]) -> float: - return 1.0 if _observed(s) <= max_cost_usd else 0.0 - - evaluator = CodeEvaluator(name="cost_evaluator") - evaluator.add_metric( - "cost", - _score, - threshold=1.0, - observed_fn=_observed, - budget=max_cost_usd, - ) - return evaluator - - -# ------------------------------------------------------------------ # -# LLM-as-Judge Evaluator # -# ------------------------------------------------------------------ # - + def name(self) -> str: + return self._name -_CORRECTNESS_PROMPT = """\ -You are evaluating an AI agent's response for correctness. - -## Conversation Trace -{trace_text} - -## Final Agent Response -{final_response} - -## Instructions -Score the response on a scale of 1 to 10 for correctness: Did the \ -agent provide an accurate, factual response that addresses the \ -user's request? - -Respond with ONLY a valid JSON object: -{{"correctness": , "justification": ""}} -""" - -_HALLUCINATION_PROMPT = """\ -You are evaluating an AI agent's response for hallucination. - -## Conversation Trace -{trace_text} - -## Final Agent Response -{final_response} - -## Instructions -Score the response on a scale of 1 to 10 for faithfulness (where \ -10 means NO hallucination). Does the response contain claims not \ -supported by the tool results or conversation context? - -Respond with ONLY a valid JSON object: -{{"faithfulness": , "justification": ""}} -""" - -_SENTIMENT_PROMPT = """\ -You are evaluating the sentiment of an AI agent's conversation. - -## Conversation Trace -{trace_text} - -## Final Agent Response -{final_response} - -## Instructions -Score the overall sentiment and helpfulness of the interaction \ -on a scale of 1 to 10 (10 = very positive and helpful). - -Respond with ONLY a valid JSON object: -{{"sentiment": , "justification": ""}} -""" - - -@dataclass -class _JudgeCriterion: - """A single LLM-as-judge criterion.""" - - name: str - prompt_template: str - score_key: str - threshold: float = 0.5 - - -class LLMAsJudge: - """Semantic evaluator using LLM-as-a-judge. - - Uses BigQuery's native ``ML.GENERATE_TEXT`` (or the Gemini API) - to evaluate agent traces against semantic criteria like - correctness, hallucination, and sentiment. 
- """ - - def __init__( - self, - name: str = "llm_judge", - criteria: Optional[list[_JudgeCriterion]] = None, - model: Optional[str] = None, - ) -> None: - self.name = name - self._criteria: list[_JudgeCriterion] = criteria or [] - self.model = model or "gemini-2.5-flash" - - def add_criterion( - self, - name: str, - prompt_template: str, - score_key: str, - threshold: float = 0.5, - ) -> LLMAsJudge: - """Adds a custom evaluation criterion. - - Args: - name: Criterion name. - prompt_template: Prompt with {trace_text} and - {final_response} placeholders. - score_key: JSON key in LLM response containing score. - threshold: Pass/fail threshold (0-1 scale). - - Returns: - Self for chaining. - """ - self._criteria.append( - _JudgeCriterion( - name=name, - prompt_template=prompt_template, - score_key=score_key, - threshold=threshold, - ) - ) - return self - - async def evaluate_session( - self, - trace_text: str, - final_response: str, - ) -> SessionScore: - """Evaluates a session using the LLM judge. - - Args: - trace_text: Formatted trace text. - final_response: Final agent response. - - Returns: - SessionScore with LLM-judged scores. - """ - scores: dict[str, float] = {} - feedback_parts: list[str] = [] - passed = True - - for criterion in self._criteria: - score, feedback = await self._judge_criterion( - criterion, - trace_text, - final_response, - ) - scores[criterion.name] = score - if feedback: - feedback_parts.append(f"{criterion.name}: {feedback}") - if score < criterion.threshold: - passed = False - - return SessionScore( - session_id="", - scores=scores, - passed=passed, - llm_feedback="\n".join(feedback_parts) or None, - ) - - async def _judge_criterion( - self, - criterion: _JudgeCriterion, - trace_text: str, - final_response: str, - ) -> tuple[float, str]: - """Evaluates one criterion via LLM call.""" - prompt = criterion.prompt_template.format( - trace_text=trace_text, - final_response=final_response or "No response.", - ) - - try: - from google import genai - from google.genai import types - - client = genai.Client() - response = await client.aio.models.generate_content( - model=self.model, - contents=prompt, - config=types.GenerateContentConfig( - temperature=0.1, - max_output_tokens=2048, - ), - ) - - text = response.text.strip() - result = _parse_json_from_text(text) - - if result and criterion.score_key in result: - raw = float(result[criterion.score_key]) - score = raw / 10.0 # Normalize 1-10 to 0-1 - justification = result.get("justification", "") - return score, justification - - return 0.0, text - - except ImportError: - logger.warning("google-genai not installed, skipping LLM judge.") - return 0.0, "google-genai not installed" - except Exception as e: - logger.warning("LLM judge failed: %s", e) - return 0.0, str(e) - - # ---- Pre-built evaluators ---- # + @property + def _criteria(self) -> list: + class _JudgeCriterion: + def __init__(self, name: str, threshold: float): + self.name = name + self.threshold = threshold + + name_map = { + "correctness_judge": "correctness", + "hallucination_judge": "faithfulness", + "sentiment_judge": "sentiment", + } + criterion_name = name_map.get(self.name, "correctness") + return [_JudgeCriterion(name=criterion_name, threshold=self._threshold)] @staticmethod - def correctness( - threshold: float = 0.5, - model: Optional[str] = None, - ) -> LLMAsJudge: - """Pre-built correctness evaluator. - - Args: - threshold: Minimum score to pass (0-1). - model: LLM model to use for judging. - - Returns: - LLMAsJudge configured for correctness. 
- """ - judge = LLMAsJudge( - name="correctness_judge", - model=model, - ) - judge.add_criterion( - name="correctness", - prompt_template=_CORRECTNESS_PROMPT, - score_key="correctness", - threshold=threshold, - ) - return judge + def correctness(threshold: float = 0.5, model: Optional[str] = None) -> LLMAsJudge: + return LLMAsJudge(name="correctness_judge", project_id="proj", dataset_id="ds", llm_judge_model=model, threshold=threshold) @staticmethod - def hallucination( - threshold: float = 0.5, - model: Optional[str] = None, - ) -> LLMAsJudge: - """Pre-built hallucination (faithfulness) evaluator. - - Args: - threshold: Minimum faithfulness score to pass (0-1). - model: LLM model to use for judging. - - Returns: - LLMAsJudge configured for hallucination detection. - """ - judge = LLMAsJudge( - name="hallucination_judge", - model=model, - ) - judge.add_criterion( - name="faithfulness", - prompt_template=_HALLUCINATION_PROMPT, - score_key="faithfulness", - threshold=threshold, - ) - return judge + def hallucination(threshold: float = 0.5, model: Optional[str] = None) -> LLMAsJudge: + return LLMAsJudge(name="hallucination_judge", project_id="proj", dataset_id="ds", llm_judge_model=model, threshold=threshold) @staticmethod - def sentiment( - threshold: float = 0.5, - model: Optional[str] = None, - ) -> LLMAsJudge: - """Pre-built sentiment evaluator. - - Args: - threshold: Minimum sentiment score to pass (0-1). - model: LLM model to use for judging. - - Returns: - LLMAsJudge configured for sentiment analysis. - """ - judge = LLMAsJudge( - name="sentiment_judge", - model=model, - ) - judge.add_criterion( - name="sentiment", - prompt_template=_SENTIMENT_PROMPT, - score_key="sentiment", - threshold=threshold, - ) - return judge - - -# ------------------------------------------------------------------ # -# SQL Templates for BigQuery-native evaluation # -# ------------------------------------------------------------------ # - -SESSION_SUMMARY_QUERY = """\ -SELECT - session_id, - COUNT(*) AS total_events, - COUNTIF(event_type = 'TOOL_STARTING') AS tool_calls, - COUNTIF(event_type = 'TOOL_ERROR') AS tool_errors, - COUNTIF(event_type = 'LLM_REQUEST') AS llm_calls, - AVG( - CAST( - JSON_VALUE(latency_ms, '$.total_ms') AS FLOAT64 - ) - ) AS avg_latency_ms, - MAX( - CAST( - JSON_VALUE(latency_ms, '$.total_ms') AS FLOAT64 - ) - ) AS max_latency_ms, - TIMESTAMP_DIFF( - MAX(timestamp), MIN(timestamp), MILLISECOND - ) AS total_latency_ms, - COUNTIF( - event_type = 'USER_MESSAGE_RECEIVED' - ) AS turn_count, - AVG( - CAST( - JSON_VALUE(latency_ms, '$.time_to_first_token_ms') AS FLOAT64 - ) - ) AS avg_ttft_ms, - COUNTIF(event_type LIKE 'HITL_%') AS hitl_events, - COUNTIF( - ENDS_WITH(event_type, '_ERROR') - OR error_message IS NOT NULL - OR status = 'ERROR' - ) > 0 AS has_error, - SUM(COALESCE( - CAST(JSON_VALUE( - attributes, '$.usage_metadata.prompt_token_count' - ) AS INT64), - CAST(JSON_VALUE( - content, '$.usage.prompt' - ) AS INT64), - CAST(JSON_VALUE( - attributes, '$.input_tokens' - ) AS INT64) - )) AS input_tokens, - SUM(COALESCE( - CAST(JSON_VALUE( - attributes, '$.usage_metadata.candidates_token_count' - ) AS INT64), - CAST(JSON_VALUE( - content, '$.usage.completion' - ) AS INT64), - CAST(JSON_VALUE( - attributes, '$.output_tokens' - ) AS INT64) - )) AS output_tokens, - SUM(COALESCE( - CAST(JSON_VALUE( - attributes, '$.usage_metadata.total_token_count' - ) AS INT64), - CAST(JSON_VALUE( - content, '$.usage.total' - ) AS INT64), - COALESCE( - CAST(JSON_VALUE( - attributes, '$.input_tokens' - ) AS 
INT64), 0 - ) + COALESCE( - CAST(JSON_VALUE( - attributes, '$.output_tokens' - ) AS INT64), 0 - ) - )) AS total_tokens -FROM `{project}.{dataset}.{table}` -WHERE {where} -GROUP BY session_id -LIMIT @trace_limit -""" - -_AI_GENERATE_JUDGE_BATCH_QUERY_TEMPLATE = """\ -WITH session_traces AS ( - SELECT - session_id, - STRING_AGG( - CONCAT( - event_type, ': ', - COALESCE( - JSON_VALUE(content, '$.text_summary'), '' - ) - ), - '\\n' ORDER BY timestamp - ) AS trace_text, - ARRAY_AGG( - JSON_VALUE(content, '$.response') - IGNORE NULLS - ORDER BY timestamp DESC - LIMIT 1 - )[SAFE_OFFSET(0)] AS final_response - FROM `{project}.{dataset}.{table}` - WHERE {where} - GROUP BY session_id - HAVING LENGTH(trace_text) > 10 - LIMIT @trace_limit -) -SELECT - session_id, - trace_text, - final_response, - gen.score AS score, - gen.justification AS justification, - gen.status AS gen_status -FROM ( - SELECT - session_id, - trace_text, - final_response, - AI.GENERATE( - -- The Python prompt template is rebuilt at SQL time: - -- prefix ++ trace_text ++ middle ++ final_response ++ suffix - -- Each segment is a separate query parameter so AI.GENERATE - -- sees the exact full Python template (including the - -- per-criterion output-format spec) the API-fallback path uses. - prompt => CONCAT( - @judge_prompt_prefix, trace_text, - @judge_prompt_middle, COALESCE(final_response, 'N/A'), - @judge_prompt_suffix - ), - endpoint => '{endpoint}',{connection_arg} - model_params => JSON '{{"generationConfig": {{"temperature": 0.1, "maxOutputTokens": 1024}}}}', - output_schema => 'score INT64, justification STRING' - ) AS gen - FROM session_traces -) -""" - - -def render_ai_generate_judge_query( - *, - project: str, - dataset: str, - table: str, - where: str, - endpoint: str, - connection_id: Optional[str] = None, -) -> str: - """Render the AI.GENERATE judge batch query for a given config. - - ``AI.GENERATE`` is BigQuery's scalar generative function (it returns a - ``STRUCT`` shaped - by ``output_schema``). The function call lives inside a regular - ``SELECT`` — it is *not* a table-valued function, so the surrounding - ``FROM session_traces, AI.GENERATE(...)`` lateral-join syntax used - by older SDK versions does not parse against current BigQuery. - - ``connection_id`` is optional. When supplied (e.g. - ``"us.bqaa_ai_generate"``) the call uses that connection's service - account; when omitted, AI.GENERATE runs against the end-user - credentials of whichever account submits the job. Both shapes are - documented forms of the same function. - """ - if connection_id: - connection_arg = f"\n connection_id => '{connection_id}'," - else: - connection_arg = "" - return _AI_GENERATE_JUDGE_BATCH_QUERY_TEMPLATE.format( - project=project, - dataset=dataset, - table=table, - where=where, - endpoint=endpoint, - connection_arg=connection_arg, - ) - - -# Public alias kept for downstream code that imports the raw template -# string (e.g. for inspection / docs). Callers building queries should -# use ``render_ai_generate_judge_query`` instead so the optional -# ``connection_id`` arg is wired correctly. -AI_GENERATE_JUDGE_BATCH_QUERY = _AI_GENERATE_JUDGE_BATCH_QUERY_TEMPLATE - -# Legacy template kept for backward compatibility with pre-created -# BQ ML models. 
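A sketch of invoking the renderer above, assuming it keeps this keyword-only signature after relocating to ``system_evaluator.py`` (the path the updated ``client.py`` imports point at); the table name is illustrative:

```python
from bigquery_agent_analytics.system_evaluator import render_ai_generate_judge_query

sql = render_ai_generate_judge_query(
    project="my-project",
    dataset="agent_analytics",
    table="agent_events",                 # illustrative table name
    where="agent_id = @agent_id",
    endpoint="gemini-2.5-flash",
    connection_id="us.bqaa_ai_generate",  # optional; omit to run on end-user credentials
)

# The rendered SQL still expects @judge_prompt_prefix / @judge_prompt_middle /
# @judge_prompt_suffix, @trace_limit, and any filter parameters to be bound
# when the job is submitted.
print(sql)
```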
-_LEGACY_LLM_JUDGE_BATCH_QUERY = """\ -WITH session_traces AS ( - SELECT - session_id, - STRING_AGG( - CONCAT( - event_type, ': ', - COALESCE( - JSON_VALUE(content, '$.text_summary'), '' - ) - ), - '\\n' ORDER BY timestamp - ) AS trace_text, - ARRAY_AGG( - JSON_VALUE(content, '$.response') - IGNORE NULLS - ORDER BY timestamp DESC - LIMIT 1 - )[SAFE_OFFSET(0)] AS final_response - FROM `{project}.{dataset}.{table}` - WHERE {where} - GROUP BY session_id - HAVING LENGTH(trace_text) > 10 - LIMIT @trace_limit -) -SELECT - session_id, - trace_text, - final_response, - ML.GENERATE_TEXT( - MODEL `{model}`, - STRUCT( - -- Same prefix/middle/suffix substitution as the AI.GENERATE - -- path; preserves the full Python prompt_template. - CONCAT( - @judge_prompt_prefix, trace_text, - @judge_prompt_middle, COALESCE(final_response, 'N/A'), - @judge_prompt_suffix - ) AS prompt - ), - STRUCT(0.1 AS temperature, 500 AS max_output_tokens) - ).ml_generate_text_result AS evaluation -FROM session_traces -""" - -# Keep backward-compatible alias. -LLM_JUDGE_BATCH_QUERY = _LEGACY_LLM_JUDGE_BATCH_QUERY - - -_TRACE_SENTINEL = "\x00__BQAA_JUDGE_TRACE__\x00" -_RESPONSE_SENTINEL = "\x00__BQAA_JUDGE_RESPONSE__\x00" - - -def split_judge_prompt_template(prompt_template: str) -> tuple[str, str, str]: - """Split a Python judge prompt into ``(prefix, middle, suffix)``. - - The Python ``LLMAsJudge`` prompt template uses ``{trace_text}`` and - ``{final_response}`` placeholders (in that order) to interpolate - per-session inputs. The BigQuery-native ``AI.GENERATE`` and - ``ML.GENERATE_TEXT`` paths can't use Python ``str.format`` — they - build the prompt at SQL time. This helper returns the three - literal segments those SQL paths need to ``CONCAT`` together with - the SQL-side ``trace_text`` and ``final_response`` columns, - preserving the exact full template (including the per-criterion - output-format spec that follows the placeholders). - - Internally the helper format()s the template once with sentinel - values, so any literal ``{{...}}`` braces in the source template - (e.g. the JSON output spec ``{{"correctness": , ...}}``) - are correctly un-escaped before splitting. The SQL paths see the - same string the API-fallback path's ``str.format(...)`` would - produce. - - Args: - prompt_template: The Python prompt template, expected to - contain both ``{trace_text}`` and ``{final_response}`` - placeholders in that order. - - Returns: - ``(prefix, middle, suffix)`` such that - ``prefix + trace_text + middle + final_response + suffix`` - reproduces ``prompt_template.format(trace_text=..., final_response=...)`` - for any inputs. When a placeholder is missing, the helper - synthesizes a labeled section for the missing input and - places the label *immediately before* the injected value - (label first, then value), so the model reads - ``...Trace:\n\nResponse:\n...`` rather than - the value followed by an orphan label. - """ - has_trace = "{trace_text}" in prompt_template - has_response = "{final_response}" in prompt_template - - # Reminder for the fallback branches below: the SQL CONCAT runs - # prefix ++ trace_text ++ middle ++ final_response ++ suffix - # so any label we synthesize for an absent placeholder must end - # up *next to* the value it labels (label first, then value), - # not on the far side of it. Earlier versions appended labels - # *after* the values, which produced ``\nTrace:\n...``. - - if not has_trace and not has_response: - # No placeholders at all. 
Append a labeled trace + response - # block after the user's instructions. The labels precede the - # values so the model reads them in order. - return ( - prompt_template + "\nTrace:\n", - "\nResponse:\n", - "", - ) - - if not has_trace: - # final_response placeholder only. Honor the user's structure - # and inject a labeled trace block right before the response, - # so the trace label sits next to the trace. - formatted = prompt_template.format(final_response=_RESPONSE_SENTINEL) - before_response, _, after_response = formatted.partition(_RESPONSE_SENTINEL) - return ( - before_response + "\nTrace:\n", - "\n", - after_response, - ) - - if not has_response: - # trace_text placeholder only. Append a labeled response block - # after the original template's tail, so the response label - # sits next to the response value (not after it). - formatted = prompt_template.format(trace_text=_TRACE_SENTINEL) - prefix, _, after_trace = formatted.partition(_TRACE_SENTINEL) - return ( - prefix, - after_trace + "\nResponse:\n", - "", - ) - - formatted = prompt_template.format( - trace_text=_TRACE_SENTINEL, - final_response=_RESPONSE_SENTINEL, - ) - prefix, _, rest = formatted.partition(_TRACE_SENTINEL) - middle, _, suffix = rest.partition(_RESPONSE_SENTINEL) - return prefix, middle, suffix - - -# ------------------------------------------------------------------ # -# Helpers # -# ------------------------------------------------------------------ # - - -def strip_markdown_fences(text: Optional[str]) -> Optional[str]: - """Strip markdown code block fences (``\\`\\`\\`json ... \\`\\`\\```) if present. - - Models frequently wrap JSON output in fenced code blocks. This helper - removes the opening ``\\`\\`\\`json`` (or plain ``\\`\\`\\```) and closing - ``\\`\\`\\``` markers so the result can be passed to ``json.loads()``. - - The regex pattern matches the same fences handled server-side by - ``REGEXP_REPLACE`` in ``ontology_graph.py`` and ``context_graph.py``. - """ - if not text: - return text - text = text.strip() - if not text.startswith("```"): - return text - text = re.sub(r"^```[a-zA-Z0-9]*\s*\n?", "", text) - text = re.sub(r"\n?\s*```[\s\S]*$", "", text) - return text.strip() - - -def _parse_json_from_text(text: str) -> Optional[dict[str, Any]]: - """Extracts and parses JSON from LLM response text.""" - if not text: - return None - - # Strip markdown fences first - stripped = strip_markdown_fences(text) - try: - return json.loads(stripped) - except (json.JSONDecodeError, TypeError): - pass - - # Try raw JSON extraction (brace matching) - if "{" in stripped: - try: - start = stripped.index("{") - brace = 0 - end = start - for i, ch in enumerate(stripped[start:], start): - if ch == "{": - brace += 1 - elif ch == "}": - brace -= 1 - if brace == 0: - end = i + 1 - break - return json.loads(stripped[start:end]) - except (ValueError, json.JSONDecodeError): - pass - - return None + def sentiment(threshold: float = 0.5, model: Optional[str] = None) -> LLMAsJudge: + return LLMAsJudge(name="sentiment_judge", project_id="proj", dataset_id="ds", llm_judge_model=model, threshold=threshold) diff --git a/src/bigquery_agent_analytics/grader_pipeline.py b/src/bigquery_agent_analytics/grader_pipeline.py index 181572d..068b6cd 100644 --- a/src/bigquery_agent_analytics/grader_pipeline.py +++ b/src/bigquery_agent_analytics/grader_pipeline.py @@ -12,419 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
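The contract documented for ``split_judge_prompt_template`` above amounts to a round trip; a quick sketch, assuming the helper behaves the same after its move to ``system_evaluator.py``:

```python
from bigquery_agent_analytics.system_evaluator import split_judge_prompt_template

template = (
    "Judge the agent.\n"
    "## Trace\n{trace_text}\n"
    "## Response\n{final_response}\n"
    'Reply with ONLY JSON: {{"correctness": <score 1-10>, "justification": "<why>"}}'
)

prefix, middle, suffix = split_judge_prompt_template(template)

trace, response = "USER_MESSAGE_RECEIVED: hi", "Hello!"

# prefix + trace + middle + response + suffix reproduces str.format() exactly,
# including un-escaping the literal {{...}} braces in the JSON output spec.
assert prefix + trace + middle + response + suffix == template.format(
    trace_text=trace, final_response=response
)
```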
-"""Grader composition pipeline for combining multiple evaluators. +"""Backward-compatibility module mapping for grader pipeline.""" -Composes ``CodeEvaluator``, ``LLMAsJudge``, and custom graders into a -single verdict using configurable scoring strategies (weighted average, -binary all-pass, or majority vote). - -Example usage:: - - from bigquery_agent_analytics import ( - CodeEvaluator, GraderPipeline, LLMAsJudge, WeightedStrategy, - ) - - pipeline = ( - GraderPipeline(WeightedStrategy( - weights={"latency": 0.3, "correctness": 0.7}, - )) - .add_code_grader(CodeEvaluator.latency(), weight=0.3) - .add_llm_grader(LLMAsJudge.correctness(), weight=0.7) - ) - - verdict = await pipeline.evaluate( - session_summary={"session_id": "s1", "avg_latency_ms": 2000}, - trace_text="User: hello\\nAgent: hi", - final_response="hi", - ) -""" - -from __future__ import annotations - -import abc -import logging -from typing import Any, Callable - -from pydantic import BaseModel -from pydantic import Field - -from .evaluators import CodeEvaluator -from .evaluators import LLMAsJudge - -logger = logging.getLogger("bigquery_agent_analytics." + __name__) - - -# ------------------------------------------------------------------ # -# Data Models # -# ------------------------------------------------------------------ # - - -class GraderResult(BaseModel): - """Result from a single grader.""" - - grader_name: str = Field(description="Name of the grader.") - scores: dict[str, float] = Field( - default_factory=dict, - description="Metric scores from this grader.", - ) - passed: bool = Field( - default=True, - description="Whether this grader passed.", - ) - - -class AggregateVerdict(BaseModel): - """Aggregated verdict from all graders in the pipeline.""" - - grader_results: list[GraderResult] = Field( - default_factory=list, - description="Individual grader results.", - ) - final_score: float = Field( - default=0.0, - description="Final aggregated score.", - ) - passed: bool = Field( - default=False, - description="Whether the overall evaluation passed.", - ) - strategy_name: str = Field( - default="", - description="Name of the scoring strategy used.", - ) - - -# ------------------------------------------------------------------ # -# Scoring Strategies # -# ------------------------------------------------------------------ # - - -class ScoringStrategy(abc.ABC): - """Abstract base class for scoring strategies.""" - - @abc.abstractmethod - def aggregate( - self, - grader_results: list[GraderResult], - ) -> AggregateVerdict: - """Aggregates grader results into a single verdict. - - Args: - grader_results: List of individual grader results. - - Returns: - AggregateVerdict with final score and pass/fail. - """ - - -class WeightedStrategy(ScoringStrategy): - """Weighted average of grader scores; pass if >= threshold.""" - - def __init__( - self, - weights: dict[str, float] | None = None, - threshold: float = 0.5, - ) -> None: - """Initializes the weighted strategy. - - Args: - weights: Mapping of grader name to weight. If None, - all graders are weighted equally. - threshold: Minimum weighted score to pass. 
- """ - self.weights = weights or {} - self.threshold = threshold - - def aggregate( - self, - grader_results: list[GraderResult], - ) -> AggregateVerdict: - if not grader_results: - return AggregateVerdict(strategy_name="weighted") - - total_weight = 0.0 - weighted_sum = 0.0 - - for result in grader_results: - weight = self.weights.get(result.grader_name, 1.0) - # Average the grader's metric scores - if result.scores: - avg_score = sum(result.scores.values()) / len(result.scores) - else: - avg_score = 1.0 if result.passed else 0.0 - weighted_sum += avg_score * weight - total_weight += weight - - final_score = weighted_sum / total_weight if total_weight > 0 else 0.0 - - return AggregateVerdict( - grader_results=grader_results, - final_score=final_score, - passed=final_score >= self.threshold, - strategy_name="weighted", - ) - - -class BinaryStrategy(ScoringStrategy): - """All graders must pass independently.""" - - def aggregate( - self, - grader_results: list[GraderResult], - ) -> AggregateVerdict: - if not grader_results: - return AggregateVerdict(strategy_name="binary") - - all_passed = all(r.passed for r in grader_results) - - # Average of all scores - all_scores = [] - for r in grader_results: - all_scores.extend(r.scores.values()) - final_score = ( - sum(all_scores) / len(all_scores) - if all_scores - else (1.0 if all_passed else 0.0) - ) - - return AggregateVerdict( - grader_results=grader_results, - final_score=final_score, - passed=all_passed, - strategy_name="binary", - ) - - -class MajorityStrategy(ScoringStrategy): - """Majority of graders must pass.""" - - def aggregate( - self, - grader_results: list[GraderResult], - ) -> AggregateVerdict: - if not grader_results: - return AggregateVerdict(strategy_name="majority") - - num_passed = sum(1 for r in grader_results if r.passed) - majority = num_passed > len(grader_results) / 2 - - # Average of all scores - all_scores = [] - for r in grader_results: - all_scores.extend(r.scores.values()) - final_score = ( - sum(all_scores) / len(all_scores) - if all_scores - else (1.0 if majority else 0.0) - ) - - return AggregateVerdict( - grader_results=grader_results, - final_score=final_score, - passed=majority, - strategy_name="majority", - ) - - -# ------------------------------------------------------------------ # -# Grader Pipeline # -# ------------------------------------------------------------------ # - - -class _GraderEntry: - """Internal wrapper for a grader in the pipeline.""" - - def __init__( - self, - name: str, - evaluate_fn: Any, - weight: float = 1.0, - is_async: bool = False, - ) -> None: - self.name = name - self.evaluate_fn = evaluate_fn - self.weight = weight - self.is_async = is_async - - -class GraderPipeline: - """Composes multiple graders into a single evaluation pipeline. - - Supports ``CodeEvaluator``, ``LLMAsJudge``, and arbitrary custom - grader functions combined via a configurable ``ScoringStrategy``. - - Example:: - - pipeline = ( - GraderPipeline(WeightedStrategy(threshold=0.6)) - .add_code_grader(CodeEvaluator.latency()) - .add_llm_grader(LLMAsJudge.correctness()) - ) - verdict = await pipeline.evaluate( - session_summary={...}, - trace_text="...", - final_response="...", - ) - """ - - def __init__(self, strategy: ScoringStrategy) -> None: - """Initializes the pipeline with a scoring strategy. - - Args: - strategy: The strategy used to aggregate grader results. 
- """ - self.strategy = strategy - self._graders: list[_GraderEntry] = [] - - def add_code_grader( - self, - evaluator: CodeEvaluator, - weight: float = 1.0, - ) -> GraderPipeline: - """Adds a CodeEvaluator grader to the pipeline. - - Args: - evaluator: A CodeEvaluator instance. - weight: Weight for weighted strategies. - - Returns: - Self for chaining. - """ - self._graders.append( - _GraderEntry( - name=evaluator.name, - evaluate_fn=evaluator, - weight=weight, - is_async=False, - ) - ) - return self - - def add_llm_grader( - self, - judge: LLMAsJudge, - weight: float = 1.0, - ) -> GraderPipeline: - """Adds an LLMAsJudge grader to the pipeline. - - Args: - judge: An LLMAsJudge instance. - weight: Weight for weighted strategies. - - Returns: - Self for chaining. - """ - self._graders.append( - _GraderEntry( - name=judge.name, - evaluate_fn=judge, - weight=weight, - is_async=True, - ) - ) - return self - - def add_custom_grader( - self, - name: str, - fn: Callable[[dict[str, Any]], GraderResult], - weight: float = 1.0, - ) -> GraderPipeline: - """Adds a custom grader function to the pipeline. - - The function receives a dict with ``session_summary``, - ``trace_text``, and ``final_response`` keys. - - Args: - name: Name for the grader. - fn: Function returning a GraderResult. - weight: Weight for weighted strategies. - - Returns: - Self for chaining. - """ - self._graders.append( - _GraderEntry( - name=name, - evaluate_fn=fn, - weight=weight, - is_async=False, - ) - ) - return self - - async def evaluate( - self, - session_summary: dict[str, Any] | None = None, - trace_text: str = "", - final_response: str = "", - ) -> AggregateVerdict: - """Evaluates using all graders and aggregates results. - - Args: - session_summary: Dict with session metrics (for - CodeEvaluator graders). - trace_text: Formatted trace text (for LLMAsJudge - graders). - final_response: Final agent response. - - Returns: - AggregateVerdict with combined results. 
- """ - session_summary = session_summary or {} - grader_results: list[GraderResult] = [] - - for entry in self._graders: - try: - result = await self._run_grader( - entry, session_summary, trace_text, final_response - ) - grader_results.append(result) - except Exception as e: - logger.warning("Grader %s failed: %s", entry.name, e) - grader_results.append( - GraderResult( - grader_name=entry.name, - scores={}, - passed=False, - ) - ) - - return self.strategy.aggregate(grader_results) - - async def _run_grader( - self, - entry: _GraderEntry, - session_summary: dict[str, Any], - trace_text: str, - final_response: str, - ) -> GraderResult: - """Runs a single grader and returns its result.""" - evaluator = entry.evaluate_fn - - if isinstance(evaluator, CodeEvaluator): - score = evaluator.evaluate_session(session_summary) - return GraderResult( - grader_name=entry.name, - scores=score.scores, - passed=score.passed, - ) - - if isinstance(evaluator, LLMAsJudge): - score = await evaluator.evaluate_session( - trace_text=trace_text, - final_response=final_response, - ) - return GraderResult( - grader_name=entry.name, - scores=score.scores, - passed=score.passed, - ) - - # Custom grader function - context = { - "session_summary": session_summary, - "trace_text": trace_text, - "final_response": final_response, - } - return evaluator(context) +from .aggregate_grader import * +GraderPipeline = AggregateGrader diff --git a/src/bigquery_agent_analytics/multi_trial.py b/src/bigquery_agent_analytics/multi_trial.py index 4c0f592..720af51 100644 --- a/src/bigquery_agent_analytics/multi_trial.py +++ b/src/bigquery_agent_analytics/multi_trial.py @@ -12,325 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -"""Multi-trial evaluation runner with pass@k / pass^k metrics. +"""Backward-compatibility module mapping for multi trial.""" -Wraps any ``BigQueryTraceEvaluator`` to run N trials per task and -compute probabilistic pass-rate metrics that account for agent -non-determinism. - -Example usage:: - - from bigquery_agent_analytics import ( - BigQueryTraceEvaluator, TrialRunner, - ) - - evaluator = BigQueryTraceEvaluator( - project_id="my-project", - dataset_id="analytics", - ) - runner = TrialRunner(evaluator, num_trials=5) - - report = await runner.run_trials( - session_id="sess-123", - golden_trajectory=[{"tool_name": "search", "args": {}}], - ) - print(report.pass_at_k, report.pass_pow_k) -""" - -from __future__ import annotations - -import asyncio -import logging -import math -import statistics -from typing import Any, Optional - -from pydantic import BaseModel -from pydantic import Field - -from .trace_evaluator import BigQueryTraceEvaluator -from .trace_evaluator import EvalStatus -from .trace_evaluator import MatchType - -logger = logging.getLogger("bigquery_agent_analytics." 
+ __name__) - - -# ------------------------------------------------------------------ # -# Data Models # -# ------------------------------------------------------------------ # - - -class TrialResult(BaseModel): - """Result of a single trial.""" - - trial_index: int = Field(description="Zero-based trial index.") - passed: bool = Field(description="Whether this trial passed.") - scores: dict[str, float] = Field( - default_factory=dict, - description="Metric scores for this trial.", - ) - details: dict[str, Any] = Field( - default_factory=dict, - description="Additional trial details.", - ) - - -class MultiTrialReport(BaseModel): - """Aggregate report across N trials of one task.""" - - session_id: str = Field(description="The session ID evaluated.") - num_trials: int = Field(description="Number of trials run.") - trial_results: list[TrialResult] = Field( - default_factory=list, - description="Individual trial results.", - ) - pass_at_k: float = Field( - default=0.0, - description="P(>=1 pass in k trials).", - ) - pass_pow_k: float = Field( - default=0.0, - description="P(all k trials pass).", - ) - per_trial_pass_rate: float = Field( - default=0.0, - description="Fraction of trials that passed.", - ) - mean_scores: dict[str, float] = Field( - default_factory=dict, - description="Mean score per metric across trials.", - ) - score_std_dev: dict[str, float] = Field( - default_factory=dict, - description="Standard deviation per metric across trials.", - ) - - -# ------------------------------------------------------------------ # -# Static Helpers # -# ------------------------------------------------------------------ # - - -def compute_pass_at_k( - num_trials: int, - num_passed: int, -) -> float: - """Computes pass@k: P(>=1 pass in k trials). - - Uses the formula: 1 - C(n-c, k) / C(n, k) - where n = num_trials, c = num_passed, k = num_trials. - - Args: - num_trials: Total number of trials (k). - num_passed: Number of trials that passed (c). - - Returns: - Probability that at least one trial passes. - """ - if num_trials <= 0: - return 0.0 - if num_passed <= 0: - return 0.0 - if num_passed >= num_trials: - return 1.0 - - # 1 - C(n-c, k) / C(n, k) - n = num_trials - c = num_passed - k = num_trials - - # C(n-c, k) / C(n, k) -- if n-c < k then C(n-c,k)=0 => pass@k=1 - if n - c < k: - return 1.0 - - # Use log to avoid overflow for large values - log_numerator = sum(math.log(n - c - i) for i in range(k)) - log_denominator = sum(math.log(n - i) for i in range(k)) - - return 1.0 - math.exp(log_numerator - log_denominator) - - -def compute_pass_pow_k( - num_trials: int, - num_passed: int, -) -> float: - """Computes pass^k: P(all k trials pass). - - Uses the formula: (num_passed / num_trials) ** num_trials. - - Args: - num_trials: Total number of trials. - num_passed: Number of trials that passed. - - Returns: - Probability that all trials pass. - """ - if num_trials <= 0: - return 0.0 - if num_passed <= 0: - return 0.0 - rate = num_passed / num_trials - return rate**num_trials - - -# ------------------------------------------------------------------ # -# TrialRunner # -# ------------------------------------------------------------------ # - - -class TrialRunner: - """Runs multiple evaluation trials and computes aggregate metrics. - - Wraps a ``BigQueryTraceEvaluator`` and runs N trials per task, - computing pass@k and pass^k metrics that account for agent - non-determinism (e.g. LLM judges produce different scores each - call). 
- - Example:: - - runner = TrialRunner(evaluator, num_trials=5, concurrency=3) - report = await runner.run_trials( - session_id="sess-123", - golden_trajectory=[...], - ) - """ - - def __init__( - self, - evaluator: BigQueryTraceEvaluator, - num_trials: int = 5, - concurrency: int = 3, - ) -> None: - """Initializes the TrialRunner. - - Args: - evaluator: The trace evaluator to wrap. - num_trials: Number of trials to run per task. - concurrency: Maximum concurrent evaluations. - """ - self.evaluator = evaluator - self.num_trials = num_trials - self.concurrency = concurrency - - async def run_trials( - self, - session_id: str, - golden_trajectory: Optional[list[dict]] = None, - golden_response: Optional[str] = None, - match_type: MatchType = MatchType.EXACT, - task_description: Optional[str] = None, - use_llm_judge: bool = False, - thresholds: Optional[dict[str, float]] = None, - ) -> MultiTrialReport: - """Runs N trials of evaluation for a single session. - - Args: - session_id: The session ID to evaluate. - golden_trajectory: Expected tool call sequence. - golden_response: Expected final response. - match_type: Type of trajectory matching. - task_description: Task description for LLM judge. - use_llm_judge: Whether to use LLM-as-judge. - thresholds: Metric thresholds for pass/fail. - - Returns: - MultiTrialReport with aggregate metrics. - """ - semaphore = asyncio.Semaphore(self.concurrency) - trial_results: list[TrialResult] = [] - - async def _run_one(trial_index: int) -> TrialResult: - async with semaphore: - result = await self.evaluator.evaluate_session( - session_id=session_id, - golden_trajectory=golden_trajectory, - golden_response=golden_response, - match_type=match_type, - task_description=task_description, - use_llm_judge=use_llm_judge, - thresholds=thresholds, - ) - return TrialResult( - trial_index=trial_index, - passed=result.eval_status == EvalStatus.PASSED, - scores=result.scores, - details=result.details, - ) - - tasks = [_run_one(i) for i in range(self.num_trials)] - trial_results = list(await asyncio.gather(*tasks)) - - return self._build_report(session_id, trial_results) - - async def run_trials_batch( - self, - eval_dataset: list[dict[str, Any]], - match_type: MatchType = MatchType.EXACT, - use_llm_judge: bool = False, - ) -> list[MultiTrialReport]: - """Runs multi-trial evaluation for a batch of tasks. - - Args: - eval_dataset: List of dicts with session_id, - expected_trajectory, etc. - match_type: Type of trajectory matching. - use_llm_judge: Whether to use LLM-as-judge. - - Returns: - List of MultiTrialReport, one per task. 
- """ - reports = [] - for item in eval_dataset: - report = await self.run_trials( - session_id=item["session_id"], - golden_trajectory=item.get("expected_trajectory"), - golden_response=item.get("expected_response"), - match_type=match_type, - task_description=item.get("task_description"), - use_llm_judge=use_llm_judge, - thresholds=item.get("thresholds"), - ) - reports.append(report) - return reports - - def _build_report( - self, - session_id: str, - trial_results: list[TrialResult], - ) -> MultiTrialReport: - """Builds a MultiTrialReport from trial results.""" - num_trials = len(trial_results) - if num_trials == 0: - return MultiTrialReport( - session_id=session_id, - num_trials=0, - ) - - num_passed = sum(1 for t in trial_results if t.passed) - - # Aggregate scores - all_metric_names: set[str] = set() - for t in trial_results: - all_metric_names.update(t.scores.keys()) - - mean_scores: dict[str, float] = {} - score_std_dev: dict[str, float] = {} - - for metric in sorted(all_metric_names): - values = [t.scores.get(metric, 0.0) for t in trial_results] - mean_scores[metric] = statistics.mean(values) - if len(values) >= 2: - score_std_dev[metric] = statistics.stdev(values) - else: - score_std_dev[metric] = 0.0 - - return MultiTrialReport( - session_id=session_id, - num_trials=num_trials, - trial_results=trial_results, - pass_at_k=compute_pass_at_k(num_trials, num_passed), - pass_pow_k=compute_pass_pow_k(num_trials, num_passed), - per_trial_pass_rate=num_passed / num_trials, - mean_scores=mean_scores, - score_std_dev=score_std_dev, - ) +from .multi_trial_performance_evaluator import * diff --git a/src/bigquery_agent_analytics/multi_trial_performance_evaluator.py b/src/bigquery_agent_analytics/multi_trial_performance_evaluator.py new file mode 100644 index 0000000..aa5260d --- /dev/null +++ b/src/bigquery_agent_analytics/multi_trial_performance_evaluator.py @@ -0,0 +1,317 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Multi-trial evaluation runner with pass@k / pass^k metrics. + +Wraps any ``PerformanceEvaluator`` to run N trials per task and +compute probabilistic pass-rate metrics that account for agent +non-determinism. + +Example usage:: + + from bigquery_agent_analytics import ( + PerformanceEvaluator, TrialRunner, + ) + + evaluator = PerformanceEvaluator( + project_id="my-project", + dataset_id="analytics", + ) + runner = TrialRunner(evaluator, num_trials=5) + + report = await runner.run_trials( + session_id="sess-123", + golden_trajectory=[{"tool_name": "search", "args": {}}], + ) + print(report.pass_at_k, report.pass_pow_k) +""" + +from __future__ import annotations + +import asyncio +import logging +import math +import statistics +from typing import Any, Optional + +from pydantic import BaseModel +from pydantic import Field + +from .performance_evaluator import PerformanceEvaluator +from .performance_evaluator import EvalStatus +from .performance_evaluator import MatchType + +logger = logging.getLogger("bigquery_agent_analytics." 
+ __name__) + + +# ------------------------------------------------------------------ # +# Data Models # +# ------------------------------------------------------------------ # + + +class TrialResult(BaseModel): + """Result of a single trial.""" + + trial_index: int = Field(description="Zero-based trial index.") + passed: bool = Field(description="Whether this trial passed.") + scores: dict[str, float] = Field( + default_factory=dict, + description="Metric scores for this trial.", + ) + details: dict[str, Any] = Field( + default_factory=dict, + description="Additional trial details.", + ) + + +class MultiTrialReport(BaseModel): + """Aggregate report across N trials of one task.""" + + session_id: str = Field(description="The session ID evaluated.") + num_trials: int = Field(description="Number of trials run.") + trial_results: list[TrialResult] = Field( + default_factory=list, + description="Individual trial results.", + ) + pass_at_k: float = Field( + default=0.0, + description="P(>=1 pass in k trials).", + ) + pass_pow_k: float = Field( + default=0.0, + description="P(all k trials pass).", + ) + per_trial_pass_rate: float = Field( + default=0.0, + description="Fraction of trials that passed.", + ) + mean_scores: dict[str, float] = Field( + default_factory=dict, + description="Mean score per metric across trials.", + ) + score_std_dev: dict[str, float] = Field( + default_factory=dict, + description="Standard deviation per metric across trials.", + ) + + +# ------------------------------------------------------------------ # +# Static Helpers # +# ------------------------------------------------------------------ # + + +def compute_pass_at_k( + num_trials: int, + num_passed: int, +) -> float: + """Computes pass@k: P(>=1 pass in k trials). + + Uses the formula: 1 - C(n-c, k) / C(n, k) + where n = num_trials, c = num_passed, k = num_trials. + + Args: + num_trials: Total number of trials (k). + num_passed: Number of trials that passed (c). + + Returns: + Probability that at least one trial passes. + """ + if num_trials <= 0: + return 0.0 + if num_passed <= 0: + return 0.0 + if num_passed >= num_trials: + return 1.0 + + # 1 - C(n-c, k) / C(n, k) + n = num_trials + c = num_passed + k = num_trials + + # C(n-c, k) / C(n, k) -- if n-c < k then C(n-c,k)=0 => pass@k=1 + if n - c < k: + return 1.0 + + # Use log to avoid overflow for large values + log_numerator = sum(math.log(n - c - i) for i in range(k)) + log_denominator = sum(math.log(n - i) for i in range(k)) + + return 1.0 - math.exp(log_numerator - log_denominator) + + +def compute_pass_pow_k( + num_trials: int, + num_passed: int, +) -> float: + """Computes pass^k: P(all k trials pass). + + Uses the formula: (num_passed / num_trials) ** num_trials. + + Args: + num_trials: Total number of trials. + num_passed: Number of trials that passed. + + Returns: + Probability that all trials pass. + """ + if num_trials <= 0: + return 0.0 + if num_passed <= 0: + return 0.0 + rate = num_passed / num_trials + return rate**num_trials + + +# ------------------------------------------------------------------ # +# TrialRunner # +# ------------------------------------------------------------------ # + + +class MultiTrialPerformanceEvaluator: + """Runs multiple evaluation trials and computes aggregate performance metrics. + + Wraps a ``PerformanceEvaluator`` and runs N trials per task, + computing pass@k and pass^k metrics that account for agent + non-determinism (e.g. LLM judges produce different scores each + call). 
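Because ``k`` is fixed to the number of trials actually run, the two helpers above behave as follows (values follow directly from their formulas):

```python
from bigquery_agent_analytics.multi_trial_performance_evaluator import (
    compute_pass_at_k,
    compute_pass_pow_k,
)

# 3 of 5 trials passed.
print(compute_pass_at_k(num_trials=5, num_passed=3))   # 1.0, since at least one trial passed
print(compute_pass_pow_k(num_trials=5, num_passed=3))  # (3/5) ** 5 = 0.07776

# Degenerate ends of the range.
print(compute_pass_at_k(5, 0), compute_pass_pow_k(5, 0))  # 0.0 0.0
print(compute_pass_at_k(5, 5), compute_pass_pow_k(5, 5))  # 1.0 1.0
```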
+ + Example:: + + runner = MultiTrialPerformanceEvaluator(evaluator, num_trials=5, concurrency=3) + report = await runner.run_trials( + session_id="sess-123", + golden_trajectory=[...], + ) + """ + + def __init__( + self, + evaluator: PerformanceEvaluator, + num_trials: int = 5, + concurrency: int = 3, + ) -> None: + """Initializes the MultiTrialPerformanceEvaluator. + + Args: + evaluator: The PerformanceEvaluator to wrap. + num_trials: Number of trials to run per task. + concurrency: Maximum concurrent evaluations. + """ + self.evaluator = evaluator + self.num_trials = num_trials + self.concurrency = concurrency + + async def run_trials( + self, + session_id: str, + golden_trajectory: Optional[list[dict]] = None, + golden_response: Optional[str] = None, + match_type: MatchType = MatchType.EXACT, + task_description: Optional[str] = None, + use_llm_judge: bool = False, + thresholds: Optional[dict[str, float]] = None, + ) -> MultiTrialReport: + """Runs N trials of evaluation for a single session.""" + semaphore = asyncio.Semaphore(self.concurrency) + trial_results: list[TrialResult] = [] + + async def _run_one(trial_index: int) -> TrialResult: + async with semaphore: + result = await self.evaluator.evaluate_session( + session_id=session_id, + golden_trajectory=golden_trajectory, + golden_response=golden_response, + match_type=match_type, + task_description=task_description, + use_llm_judge=use_llm_judge, + thresholds=thresholds, + ) + return TrialResult( + trial_index=trial_index, + passed=result.eval_status == EvalStatus.PASSED, + scores=result.scores, + details=result.details, + ) + + tasks = [_run_one(i) for i in range(self.num_trials)] + trial_results = list(await asyncio.gather(*tasks)) + + return self._build_report(session_id, trial_results) + + async def run_trials_batch( + self, + eval_dataset: list[dict[str, Any]], + match_type: MatchType = MatchType.EXACT, + use_llm_judge: bool = False, + ) -> list[MultiTrialReport]: + """Runs multi-trial evaluation for a batch of tasks.""" + reports = [] + for item in eval_dataset: + report = await self.run_trials( + session_id=item["session_id"], + golden_trajectory=item.get("expected_trajectory"), + golden_response=item.get("expected_response"), + match_type=match_type, + task_description=item.get("task_description"), + use_llm_judge=use_llm_judge, + thresholds=item.get("thresholds"), + ) + reports.append(report) + return reports + + def _build_report( + self, + session_id: str, + trial_results: list[TrialResult], + ) -> MultiTrialReport: + """Builds a MultiTrialReport from trial results.""" + num_trials = len(trial_results) + if num_trials == 0: + return MultiTrialReport( + session_id=session_id, + num_trials=0, + ) + + num_passed = sum(1 for t in trial_results if t.passed) + + # Aggregate scores + all_metric_names: set[str] = set() + for t in trial_results: + all_metric_names.update(t.scores.keys()) + + mean_scores: dict[str, float] = {} + score_std_dev: dict[str, float] = {} + + for metric in sorted(all_metric_names): + values = [t.scores.get(metric, 0.0) for t in trial_results] + mean_scores[metric] = statistics.mean(values) + if len(values) >= 2: + score_std_dev[metric] = statistics.stdev(values) + else: + score_std_dev[metric] = 0.0 + + return MultiTrialReport( + session_id=session_id, + num_trials=num_trials, + trial_results=trial_results, + pass_at_k=compute_pass_at_k(num_trials, num_passed), + pass_pow_k=compute_pass_pow_k(num_trials, num_passed), + per_trial_pass_rate=num_passed / num_trials, + mean_scores=mean_scores, + 
score_std_dev=score_std_dev, + ) + + +# Keep aliases for backward compatibility +TrialRunner = MultiTrialPerformanceEvaluator diff --git a/src/bigquery_agent_analytics/performance_evaluator.py b/src/bigquery_agent_analytics/performance_evaluator.py new file mode 100644 index 0000000..0ca7ebd --- /dev/null +++ b/src/bigquery_agent_analytics/performance_evaluator.py @@ -0,0 +1,1101 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Trace-Based Evaluation Harness for ADK Agents. + +This module provides capabilities to evaluate agent behavior using stored +traces in BigQuery. It supports: + +- Trajectory matching (exact, in-order, any-order) +- LLM-as-judge evaluation +- Custom metric scoring +- Deterministic replay for debugging + +Example usage: + evaluator = BigQueryTraceEvaluator( + project_id="my-project", + dataset_id="agent_analytics", + ) + + results = await evaluator.evaluate_session( + session_id="session-123", + golden_trajectory=[ + {"tool_name": "search", "args": {"query": "weather"}}, + {"tool_name": "format_response", "args": {}}, + ], + golden_response="The weather is sunny.", + ) +""" + +from __future__ import annotations + +import asyncio +from dataclasses import dataclass +from dataclasses import field +from datetime import datetime +from enum import Enum +import json +import logging +from typing import Any, Callable, Optional + +from google.cloud import bigquery +from pydantic import BaseModel +from pydantic import Field + +from bigquery_agent_analytics.utils import _parse_json_from_text, _extract_json_from_text, strip_markdown_fences + +from ._telemetry import LabeledBigQueryClient +from ._telemetry import make_bq_client +from ._telemetry import with_sdk_labels + +logger = logging.getLogger("bigquery_agent_analytics." 
+ __name__) + + +class MatchType(Enum): + """The type of trajectory matching to use.""" + + EXACT = "exact" + """Requires perfect match between actual and expected tool calls.""" + + IN_ORDER = "in_order" + """Requires tools in same order, allows extra tools between.""" + + ANY_ORDER = "any_order" + """Requires all expected tools present, any order allowed.""" + + +class EvalStatus(Enum): + """Status of an evaluation.""" + + PASSED = "passed" + FAILED = "failed" + NOT_EVALUATED = "not_evaluated" + + +@dataclass +class TraceEvent: + """Represents a single event from a trace.""" + + event_type: str + agent: Optional[str] + timestamp: datetime + content: dict[str, Any] + attributes: dict[str, Any] + span_id: Optional[str] = None + parent_span_id: Optional[str] = None + latency_ms: Optional[int] = None + status: str = "OK" + error_message: Optional[str] = None + + @classmethod + def from_bigquery_row(cls, row: dict[str, Any]) -> "TraceEvent": + """Creates a TraceEvent from a BigQuery row.""" + content = row.get("content") + if isinstance(content, str): + try: + content = json.loads(content) + except (json.JSONDecodeError, TypeError): + content = {"raw": content} + elif content is None: + content = {} + + attributes = row.get("attributes") + if isinstance(attributes, str): + try: + attributes = json.loads(attributes) + except (json.JSONDecodeError, TypeError): + attributes = {} + elif attributes is None: + attributes = {} + + latency_ms = row.get("latency_ms") + if isinstance(latency_ms, str): + try: + latency_data = json.loads(latency_ms) + latency_ms = latency_data.get("total_ms") + except (json.JSONDecodeError, TypeError): + latency_ms = None + elif isinstance(latency_ms, dict): + latency_ms = latency_ms.get("total_ms") + + return cls( + event_type=row.get("event_type", "UNKNOWN"), + agent=row.get("agent"), + timestamp=row.get("timestamp", datetime.now()), + content=content, + attributes=attributes, + span_id=row.get("span_id"), + parent_span_id=row.get("parent_span_id"), + latency_ms=latency_ms, + status=row.get("status", "OK"), + error_message=row.get("error_message"), + ) + + +@dataclass +class ToolCall: + """Represents a tool call extracted from a trace.""" + + tool_name: str + args: dict[str, Any] + result: Optional[dict[str, Any]] = None + status: str = "OK" + error_message: Optional[str] = None + latency_ms: Optional[int] = None + + +@dataclass +class SessionTrace: + """Complete trace for a session.""" + + session_id: str + user_id: Optional[str] + events: list[TraceEvent] + tool_calls: list[ToolCall] = field(default_factory=list) + final_response: Optional[str] = None + total_latency_ms: Optional[int] = None + + def extract_tool_trajectory(self) -> list[ToolCall]: + """Extracts the tool call trajectory from events.""" + tool_calls = [] + tool_starts: dict[str, TraceEvent] = {} + + for event in self.events: + if event.event_type == "TOOL_STARTING": + tool_name = event.content.get("tool", "unknown") + tool_starts[event.span_id or tool_name] = event + + elif event.event_type == "TOOL_COMPLETED": + tool_name = event.content.get("tool", "unknown") + start_event = tool_starts.pop(event.span_id or tool_name, None) + + args = {} + if start_event: + args = start_event.content.get("args", {}) + + tool_calls.append( + ToolCall( + tool_name=tool_name, + args=args, + result=event.content.get("result"), + status="OK", + latency_ms=event.latency_ms, + ) + ) + + elif event.event_type == "TOOL_ERROR": + tool_name = event.content.get("tool", "unknown") + start_event = tool_starts.pop(event.span_id or 
tool_name, None) + + args = {} + if start_event: + args = start_event.content.get("args", {}) + + tool_calls.append( + ToolCall( + tool_name=tool_name, + args=args, + status="ERROR", + error_message=event.error_message, + latency_ms=event.latency_ms, + ) + ) + + self.tool_calls = tool_calls + return tool_calls + + def extract_final_response(self) -> Optional[str]: + """Extracts the final agent response from events. + + Checks LLM_RESPONSE first (most reliable response source), + then falls back to AGENT_COMPLETED. + """ + # Prefer the last LLM_RESPONSE (most reliable response source) + for event in reversed(self.events): + if event.event_type == "LLM_RESPONSE": + content = event.content + if isinstance(content, dict): + return content.get("response") or content.get("text_summary") + return str(content) if content else None + + # Fallback to AGENT_COMPLETED + for event in reversed(self.events): + if event.event_type == "AGENT_COMPLETED": + content = event.content + if isinstance(content, dict): + return content.get("response") or content.get("text_summary") + return str(content) if content else None + + return None + + +class TrajectoryMetrics: + """Computes trajectory-based evaluation metrics.""" + + @staticmethod + def compute_exact_match( + actual: list[ToolCall], + expected: list[dict[str, Any]], + ) -> float: + """Computes exact match score between trajectories. + + Args: + actual: List of actual tool calls from trace. + expected: List of expected tool calls with tool_name and args. + + Returns: + Score between 0.0 and 1.0. + """ + if not expected: + return 1.0 if not actual else 0.0 + + if len(actual) != len(expected): + return 0.0 + + matches = 0 + for act, exp in zip(actual, expected): + if act.tool_name == exp.get("tool_name"): + # Check args if specified + exp_args = exp.get("args", {}) + if not exp_args or TrajectoryMetrics._args_match(act.args, exp_args): + matches += 1 + + return matches / len(expected) + + @staticmethod + def compute_in_order_match( + actual: list[ToolCall], + expected: list[dict[str, Any]], + ) -> float: + """Computes in-order match score. + + Checks if expected tools appear in order within actual calls. + + Args: + actual: List of actual tool calls. + expected: List of expected tool calls. + + Returns: + Score between 0.0 and 1.0. + """ + if not expected: + return 1.0 + + expected_idx = 0 + for act in actual: + if expected_idx >= len(expected): + break + + exp = expected[expected_idx] + if act.tool_name == exp.get("tool_name"): + exp_args = exp.get("args", {}) + if not exp_args or TrajectoryMetrics._args_match(act.args, exp_args): + expected_idx += 1 + + return expected_idx / len(expected) + + @staticmethod + def compute_any_order_match( + actual: list[ToolCall], + expected: list[dict[str, Any]], + ) -> float: + """Computes any-order match score. + + Checks if all expected tools appear in actual calls (any order). + + Args: + actual: List of actual tool calls. + expected: List of expected tool calls. + + Returns: + Score between 0.0 and 1.0. 
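+
+        Example (illustrative)::
+
+            actual = [ToolCall("format", {}), ToolCall("search", {"q": "x"})]
+            expected = [{"tool_name": "search"}, {"tool_name": "format"}]
+            TrajectoryMetrics.compute_any_order_match(actual, expected)
+            # -> 1.0: both expected tools are present, order is ignored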
+ """ + if not expected: + return 1.0 + + remaining = list(expected) + for act in actual: + for i, exp in enumerate(remaining): + if act.tool_name == exp.get("tool_name"): + exp_args = exp.get("args", {}) + if not exp_args or TrajectoryMetrics._args_match(act.args, exp_args): + remaining.pop(i) + break + + matched = len(expected) - len(remaining) + return matched / len(expected) + + @staticmethod + def _args_match(actual: dict[str, Any], expected: dict[str, Any]) -> bool: + """Checks if actual args contain expected args.""" + for key, value in expected.items(): + if key not in actual: + return False + if value is not None and actual[key] != value: + return False + return True + + @staticmethod + def compute_step_efficiency( + actual_steps: int, + optimal_steps: int, + ) -> float: + """Computes step efficiency score. + + Args: + actual_steps: Number of steps taken by agent. + optimal_steps: Optimal number of steps. + + Returns: + Score between 0.0 and 1.0 (1.0 = optimal or better). + """ + if optimal_steps <= 0: + return 1.0 if actual_steps == 0 else 0.0 + + if actual_steps <= optimal_steps: + return 1.0 + + # Penalize extra steps with diminishing returns + efficiency = optimal_steps / actual_steps + return max(0.0, efficiency) + + +class EvaluationResult(BaseModel): + """Result of evaluating a session trace.""" + + session_id: str = Field(description="The session ID that was evaluated.") + eval_status: EvalStatus = Field(description="Overall evaluation status.") + scores: dict[str, float] = Field( + default_factory=dict, + description="Individual metric scores.", + ) + overall_score: Optional[float] = Field( + default=None, + description="Overall weighted score if computed.", + ) + details: dict[str, Any] = Field( + default_factory=dict, + description="Additional evaluation details.", + ) + llm_judge_feedback: Optional[str] = Field( + default=None, + description="Feedback from LLM judge if used.", + ) + + +class PerformanceEvaluator: + """Evaluates agent traces stored in BigQuery to assess performance. + + This evaluator retrieves trace data from BigQuery and computes various + metrics including trajectory matching, response quality, and custom metrics. + + Example: + evaluator = PerformanceEvaluator( + project_id="my-project", + dataset_id="agent_analytics", + ) + + result = await evaluator.evaluate_session( + session_id="sess-123", + golden_trajectory=[{"tool_name": "search", "args": {"q": "test"}}], + ) + """ + + # SQL query to retrieve complete session trace + _DEFAULT_EVENT_TYPES = [ + "USER_MESSAGE_RECEIVED", + "AGENT_STARTING", + "AGENT_COMPLETED", + "TOOL_STARTING", + "TOOL_COMPLETED", + "TOOL_ERROR", + "LLM_REQUEST", + "LLM_RESPONSE", + "LLM_ERROR", + "INVOCATION_STARTING", + "INVOCATION_COMPLETED", + "STATE_DELTA", + "HITL_CONFIRMATION_REQUEST", + "HITL_CONFIRMATION_REQUEST_COMPLETED", + "HITL_CREDENTIAL_REQUEST", + "HITL_CREDENTIAL_REQUEST_COMPLETED", + "HITL_INPUT_REQUEST", + "HITL_INPUT_REQUEST_COMPLETED", + ] + + _SESSION_TRACE_QUERY = """ + SELECT + event_type, + agent, + timestamp, + content, + attributes, + span_id, + parent_span_id, + latency_ms, + status, + error_message, + user_id + FROM `{project}.{dataset}.{table}` + WHERE session_id = @session_id + AND event_type IN UNNEST(@event_types) + ORDER BY timestamp ASC + """ + + # One-Sided LLM Judge Prompt (No golden response required) + _ONE_SIDED_JUDGE_PROMPT = """You are evaluating an AI agent's task execution trajectory and final response for sentiment and hallucination (faithfulness). 
+ +## Task Description +{task_description} + +## Agent Trajectory (Actual) +{trajectory_json} + +## Final Response (Actual) +{final_response} + +## Instructions +Score the following criteria from 0 to 10: +1. sentiment: (0 to 10 scale) Was the tone positive, professional, helpful, and safe? +2. hallucination: (0 to 10 scale) Does the final response contain claims that are NOT supported by the captured tool call trajectory (i.e., hallucinating facts not retrieved by tools)? Score 10 for perfect grounding (no hallucinations), 0 for severe hallucination. + +IMPORTANT: You MUST respond with ONLY a valid JSON object matching the format below. No explanation before or after. +Keep justification brief (under 100 characters). + +Required JSON format: +{{ + "sentiment": 8, + "hallucination": 10, + "justification": "Brief reason explaining the scores" +}} +""" + + # Side-by-Side LLM Judge Prompt (Golden response required) + _SIDE_BY_SIDE_JUDGE_PROMPT = """You are evaluating an AI agent's task execution trajectory and final response for correctness and efficiency against a golden reference response. + +## Task Description +{task_description} + +## Agent Trajectory (Actual) +{trajectory_json} + +## Expected Trajectory (Golden, if provided) +{expected_trajectory} + +## Golden Response (Ground Truth) +{golden_response} + +## Final Response (Actual) +{final_response} + +## Instructions +Evaluate the actual trajectory and response against the golden reference. You must score the following criteria: +1. final_answer_correct: (Binary: 1 for yes/pass, or 0 for no/fail) Does the agent's final response accurately address the user's request and contain the key facts matching the golden response? +2. tool_usage_correct: (Binary: 1 for yes/pass, or 0 for no/fail) Did the agent use the correct tools with correct arguments as recorded in the trajectory? +3. sound_reasoning: (Binary: 1 for yes/pass, or 0 for no/fail) Was the agent's reasoning sound and logical throughout the conversation? +4. efficiency: (Binary: 1 for yes/pass, or 0 for no/fail) Were all tool calls necessary and minimal? Fails (0) if there are redundant or excessive tool calls. + +IMPORTANT: You MUST respond with ONLY a valid JSON object matching the format below. No explanation before or after. +Keep justification brief (under 100 characters). 
+ +Required JSON format: +{{ + "final_answer_correct": 1, + "tool_usage_correct": 1, + "sound_reasoning": 1, + "efficiency": 1, + "justification": "Brief reason explaining the scores" +}} +""" + + def __init__( + self, + project_id: str = "proj", + dataset_id: str = "ds", + table_id: str = "agent_events", + client: Optional[bigquery.Client] = None, + llm_judge_model: Optional[str] = None, + include_event_types: Optional[list[str]] = None, + name: Optional[str] = None, + ) -> None: + """Initializes the PerformanceEvaluator.""" + self.project_id = project_id + self.dataset_id = dataset_id + self.table_id = table_id + self.table_ref = f"{project_id}.{dataset_id}.{table_id}" + self._client = client + self._warned_unlabeled_client = False + self.llm_judge_model = llm_judge_model or "gemini-2.5-flash" + self.include_event_types = include_event_types or self._DEFAULT_EVENT_TYPES + self._custom_rubrics: list[dict[str, Any]] = [] + + @property + def name(self) -> str: + return "performance_evaluator" + + @property + def client(self) -> bigquery.Client: + """Lazily initializes and returns the BigQuery client.""" + if self._client is None: + self._client = make_bq_client(self.project_id) + elif isinstance(self._client, bigquery.Client) and not isinstance( + self._client, LabeledBigQueryClient + ): + if not self._warned_unlabeled_client: + logger.warning( + "User-provided bigquery.Client is not a " + "LabeledBigQueryClient; SDK telemetry labels will not be " + "applied to jobs from this client. To opt in, construct " + "the client via bigquery_agent_analytics.make_bq_client() " + "or pass a LabeledBigQueryClient directly." + ) + self._warned_unlabeled_client = True + return self._client + + def add_rubric( + self, + name: str, + prompt_template: str, + score_key: str, + threshold: float = 0.5, + ) -> PerformanceEvaluator: + """Adds a custom LLM rubric to the PerformanceEvaluator. + + Args: + name: Rubric metric name. + prompt_template: Prompt with {trace_text} and {final_response} placeholders. + score_key: JSON key in LLM response containing score. + threshold: Pass/fail threshold (0-1 scale). + + Returns: + Self for chaining. 
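+
+        Example (illustrative; the rubric name and prompt wording are
+        placeholders)::
+
+            evaluator.add_rubric(
+                name="politeness",
+                prompt_template=(
+                    "Rate the politeness of the agent from 0 to 10.\n"
+                    "Trace: {trace_text}\n"
+                    "Response: {final_response}\n"
+                    'Reply with ONLY JSON: '
+                    '{{"politeness": 0, "justification": "one short sentence"}}'
+                ),
+                score_key="politeness",
+                threshold=0.7,
+            )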
+ """ + self._custom_rubrics.append({ + "name": name, + "prompt_template": prompt_template, + "score_key": score_key, + "threshold": threshold, + }) + return self + + async def get_session_trace(self, session_id: str) -> SessionTrace: + """Retrieves the complete trace for a session.""" + query = self._SESSION_TRACE_QUERY.format( + project=self.project_id, + dataset=self.dataset_id, + table=self.table_id, + ) + + job_config = bigquery.QueryJobConfig( + query_parameters=[ + bigquery.ScalarQueryParameter( + "session_id", + "STRING", + session_id, + ), + bigquery.ArrayQueryParameter( + "event_types", + "STRING", + self.include_event_types, + ), + ] + ) + job_config = with_sdk_labels(job_config, feature="trace-read") + + loop = asyncio.get_event_loop() + query_job = await loop.run_in_executor( + None, + lambda: self.client.query(query, job_config=job_config), + ) + + results = await loop.run_in_executor(None, lambda: list(query_job.result())) + + events = [TraceEvent.from_bigquery_row(dict(row)) for row in results] + + user_id = None + if results: + user_id = results[0].get("user_id") + + trace = SessionTrace( + session_id=session_id, + user_id=user_id, + events=events, + ) + + trace.extract_tool_trajectory() + trace.final_response = trace.extract_final_response() + + if events: + start = min(e.timestamp for e in events) + end = max(e.timestamp for e in events) + trace.total_latency_ms = int((end - start).total_seconds() * 1000) + + return trace + + def evaluate_deterministic_trajectory( + self, + trace: SessionTrace, + golden_trajectory: list[dict[str, Any]], + match_type: MatchType = MatchType.EXACT, + ) -> dict[str, float]: + """Computes deterministic trajectory matching and step efficiency scores. + + Args: + trace: The SessionTrace object containing actual tool calls. + golden_trajectory: Optimal tool calls expected. + match_type: Matching criteria strategy. + + Returns: + A dict of computed deterministic scores. 
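+
+        Example (illustrative; the golden trajectory is a placeholder)::
+
+            scores = evaluator.evaluate_deterministic_trajectory(
+                trace,
+                golden_trajectory=[{"tool_name": "search", "args": {"q": "x"}}],
+                match_type=MatchType.IN_ORDER,
+            )
+            # e.g. {"trajectory_in_order": 1.0, "step_efficiency": 0.5}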
+ """ + scores: dict[str, float] = {} + if match_type == MatchType.EXACT: + scores["trajectory_exact_match"] = TrajectoryMetrics.compute_exact_match( + trace.tool_calls, golden_trajectory + ) + elif match_type == MatchType.IN_ORDER: + scores["trajectory_in_order"] = TrajectoryMetrics.compute_in_order_match( + trace.tool_calls, golden_trajectory + ) + elif match_type == MatchType.ANY_ORDER: + scores["trajectory_any_order"] = ( + TrajectoryMetrics.compute_any_order_match( + trace.tool_calls, golden_trajectory + ) + ) + + scores["step_efficiency"] = TrajectoryMetrics.compute_step_efficiency( + len(trace.tool_calls), + len(golden_trajectory), + ) + return scores + + async def evaluate_session( + self, + session_id: str, + golden_trajectory: Optional[list[dict[str, Any]]] = None, + golden_response: Optional[str] = None, + match_type: MatchType = MatchType.EXACT, + task_description: Optional[str] = None, + use_llm_judge: bool = False, + custom_metrics: Optional[dict[str, Callable]] = None, + thresholds: Optional[dict[str, float]] = None, + ) -> EvaluationResult: + """Evaluates a single session against golden data.""" + trace = await self.get_session_trace(session_id) + + scores: dict[str, float] = {} + details: dict[str, Any] = { + "actual_tool_calls": len(trace.tool_calls), + "expected_tool_calls": ( + len(golden_trajectory) if golden_trajectory else 0 + ), + } + + if golden_trajectory is not None: + scores.update( + self.evaluate_deterministic_trajectory( + trace, golden_trajectory, match_type + ) + ) + + llm_feedback = None + if use_llm_judge: + llm_scores, llm_feedback = await self.llm_judge_evaluate( + trace=trace, + task_description=task_description or "Complete the user's request.", + expected_trajectory=golden_trajectory, + golden_response=golden_response, + ) + scores.update(llm_scores) + + # Custom LLM rubrics + if self._custom_rubrics: + trace_text = "\n".join( + f"{e.event_type}: {json.dumps(e.content)}" + for e in trace.events + ) + feedback_parts = [] + if llm_feedback: + feedback_parts.append(llm_feedback) + for rubric in self._custom_rubrics: + score, feedback = await self._evaluate_custom_rubric( + rubric, + trace_text, + trace.final_response, + golden_response, + ) + scores[rubric["name"]] = score + if feedback: + feedback_parts.append(f"{rubric['name']}: {feedback}") + llm_feedback = "\n".join(feedback_parts) + + if custom_metrics: + for metric_name, metric_fn in custom_metrics.items(): + try: + score = metric_fn(trace, golden_trajectory, golden_response) + scores[metric_name] = float(score) + except Exception as e: + logger.warning("Custom metric %s failed: %s", metric_name, e) + scores[metric_name] = 0.0 + + thresholds = thresholds or {} + passed = True + for metric_name, score in scores.items(): + threshold = thresholds.get(metric_name, 0.5) + if score < threshold: + passed = False + details[f"{metric_name}_threshold"] = threshold + + return EvaluationResult( + session_id=session_id, + eval_status=EvalStatus.PASSED if passed else EvalStatus.FAILED, + scores=scores, + overall_score=scores.get("llm_judge_correctness"), + details=details, + llm_judge_feedback=llm_feedback, + ) + + async def evaluate_batch( + self, + eval_dataset: list[dict[str, Any]], + match_type: MatchType = MatchType.EXACT, + use_llm_judge: bool = False, + concurrency: int = 5, + ) -> list[EvaluationResult]: + """Evaluates multiple sessions from an eval dataset.""" + semaphore = asyncio.Semaphore(concurrency) + + async def evaluate_one(item: dict[str, Any]) -> EvaluationResult: + async with semaphore: + 
return await self.evaluate_session( + session_id=item["session_id"], + golden_trajectory=item.get("expected_trajectory"), + golden_response=item.get("expected_response"), + match_type=match_type, + task_description=item.get("task_description"), + use_llm_judge=use_llm_judge, + thresholds=item.get("thresholds"), + ) + + tasks = [evaluate_one(item) for item in eval_dataset] + return await asyncio.gather(*tasks) + + async def llm_judge_evaluate( + self, + trace: SessionTrace, + task_description: str, + expected_trajectory: Optional[list[dict[str, Any]]], + golden_response: Optional[str] = None, + ) -> tuple[dict[str, float], str]: + """Uses LLM as judge to evaluate the trace.""" + try: + from google import genai + from google.genai import types + except ImportError: + logger.warning("google-genai not installed, skipping LLM judge.") + return {}, "LLM judge unavailable - google-genai not installed" + + trajectory_data = [ + { + "tool": tc.tool_name, + "args": tc.args, + "status": tc.status, + } + for tc in trace.tool_calls + ] + + # Generate prompts + one_sided_prompt = self._ONE_SIDED_JUDGE_PROMPT.format( + task_description=task_description, + trajectory_json=json.dumps(trajectory_data, indent=2), + final_response=trace.final_response or "No response captured", + ) + + side_by_side_prompt = None + if golden_response: + side_by_side_prompt = self._SIDE_BY_SIDE_JUDGE_PROMPT.format( + task_description=task_description, + trajectory_json=json.dumps(trajectory_data, indent=2), + expected_trajectory=json.dumps(expected_trajectory, indent=2) + if expected_trajectory + else "Not provided", + golden_response=golden_response, + final_response=trace.final_response or "No response captured", + ) + + scores = {} + feedback_parts = [] + + # 1. Run One-Sided Evaluation + try: + client = genai.Client() + response = await client.aio.models.generate_content( + model=self.llm_judge_model, + contents=one_sided_prompt, + config=types.GenerateContentConfig( + temperature=0.1, + max_output_tokens=1024, + ), + ) + response_text = (response.text or "").strip() + json_str = _extract_json_from_text(response_text) + if json_str: + result = json.loads(json_str) + sentiment = float(result.get("sentiment", 10)) / 10.0 + hallucination = float(result.get("hallucination", 10)) / 10.0 + scores["llm_judge_sentiment"] = sentiment + scores["llm_judge_hallucination"] = hallucination + feedback_parts.append(result.get("justification", response_text)) + except Exception as e: + logger.warning("One-sided LLM evaluation failed: %s", e) + + # 2. 
Run Side-by-Side Evaluation + if side_by_side_prompt: + try: + client = genai.Client() + response = await client.aio.models.generate_content( + model=self.llm_judge_model, + contents=side_by_side_prompt, + config=types.GenerateContentConfig( + temperature=0.1, + max_output_tokens=1024, + ), + ) + response_text = (response.text or "").strip() + json_str = _extract_json_from_text(response_text) + if json_str: + result = json.loads(json_str) + final_answer_correct = float(result.get("final_answer_correct", 0)) + tool_usage_correct = float(result.get("tool_usage_correct", 0)) + sound_reasoning = float(result.get("sound_reasoning", 0)) + efficiency = float(result.get("efficiency", 0)) + + scores["llm_judge_final_answer_correct"] = final_answer_correct + scores["llm_judge_tool_usage_correct"] = tool_usage_correct + scores["llm_judge_sound_reasoning"] = sound_reasoning + scores["llm_judge_efficiency"] = efficiency + + scores["llm_judge_correctness"] = 1.0 if ( + final_answer_correct == 1.0 and + tool_usage_correct == 1.0 and + sound_reasoning == 1.0 + ) else 0.0 + feedback_parts.append(result.get("justification", response_text)) + except Exception as e: + logger.warning("Side-by-side LLM evaluation failed: %s", e) + + feedback = "\n".join(feedback_parts) + return scores, feedback + + async def _evaluate_custom_rubric( + self, + rubric: dict[str, Any], + trace_text: str, + final_response: str, + golden_response: Optional[str] = None, + ) -> tuple[float, str]: + """Evaluates a custom LLM rubric.""" + prompt = rubric["prompt_template"].format( + trace_text=trace_text, + final_response=final_response or "No response.", + golden_response=golden_response or "No golden response.", + ) + try: + from google import genai + from google.genai import types + + client = genai.Client() + response = await client.aio.models.generate_content( + model=self.llm_judge_model, + contents=prompt, + config=types.GenerateContentConfig( + temperature=0.1, + max_output_tokens=2048, + ), + ) + text = response.text.strip() + result = _parse_json_from_text(text) + if result and rubric["score_key"] in result: + raw = float(result[rubric["score_key"]]) + score = raw / 10.0 if raw > 1.0 else raw + justification = result.get("justification", "") + return score, justification + return 0.0, text + except Exception as e: + logger.warning("Custom rubric %s failed: %s", rubric["name"], e) + return 0.0, str(e) + + +# Keep aliases for backward compatibility +BigQueryTraceEvaluator = PerformanceEvaluator + + +@dataclass +class ReplayContext: + """Context for deterministic trace replay.""" + + llm_responses: dict[int, str] = field(default_factory=dict) + tool_responses: dict[str, Any] = field(default_factory=dict) + current_step: int = 0 + + def inject_llm_response(self, response: str) -> None: + """Injects a recorded LLM response for replay.""" + self.llm_responses[self.current_step] = response + self.current_step += 1 + + def inject_tool_response(self, tool_name: str, response: Any) -> None: + """Injects a recorded tool response for replay.""" + self.tool_responses[tool_name] = response + + def get_llm_response(self, step: int) -> Optional[str]: + """Gets injected LLM response for a step.""" + return self.llm_responses.get(step) + + def get_tool_response(self, tool_name: str) -> Optional[Any]: + """Gets injected tool response.""" + return self.tool_responses.get(tool_name) + + +class TraceReplayRunner: + """Replays agent sessions deterministically for debugging. 
+ + This runner uses recorded traces to replay agent execution with + deterministic outcomes, useful for debugging and root cause analysis. + + Example: + replay_runner = TraceReplayRunner(evaluator) + result = await replay_runner.replay_session( + session_id="sess-123", + replay_mode="step", + ) + """ + + def __init__(self, evaluator: BigQueryTraceEvaluator) -> None: + """Initializes the replay runner. + + Args: + evaluator: BigQueryTraceEvaluator for trace retrieval. + """ + self.evaluator = evaluator + + async def replay_session( + self, + session_id: str, + replay_mode: str = "full", + step_callback: Optional[ + Callable[[TraceEvent, ReplayContext], None] + ] = None, + ) -> ReplayContext: + """Replays a recorded session step by step. + + Args: + session_id: The session ID to replay. + replay_mode: "full" for all events, "step" for pause at each step, + "tool_only" for only tool calls. + step_callback: Optional callback invoked at each step. + + Returns: + ReplayContext with all injected responses. + """ + trace = await self.evaluator.get_session_trace(session_id) + + replay_context = ReplayContext() + + for event in trace.events: + # Filter by mode + if replay_mode == "tool_only" and event.event_type not in [ + "TOOL_STARTING", + "TOOL_COMPLETED", + "TOOL_ERROR", + ]: + continue + + # Inject responses for replay + if event.event_type == "LLM_RESPONSE": + content = event.content + response_text = "" + if isinstance(content, dict): + response_text = content.get("response", "") + elif content: + response_text = str(content) + replay_context.inject_llm_response(response_text) + + elif event.event_type == "TOOL_COMPLETED": + tool_name = event.content.get("tool", "unknown") + result = event.content.get("result") + replay_context.inject_tool_response(tool_name, result) + + # Invoke callback if provided + if step_callback: + step_callback(event, replay_context) + + return replay_context + + async def compare_replays( + self, + session_id_1: str, + session_id_2: str, + ) -> dict[str, Any]: + """Compares two session replays to identify differences. + + Args: + session_id_1: First session ID. + session_id_2: Second session ID. + + Returns: + Dict with comparison results. 
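+
+        Example (illustrative; session IDs are placeholders)::
+
+            differences = await replay_runner.compare_replays(
+                "sess-a", "sess-b"
+            )
+            if differences["tool_differences"]:
+                print("First divergence:", differences["tool_differences"][0])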
+ """ + trace1 = await self.evaluator.get_session_trace(session_id_1) + trace2 = await self.evaluator.get_session_trace(session_id_2) + + differences: dict[str, Any] = { + "event_count_diff": len(trace1.events) - len(trace2.events), + "tool_count_diff": len(trace1.tool_calls) - len(trace2.tool_calls), + "tool_differences": [], + "response_match": False, + } + + # Compare tool calls + max_tools = max(len(trace1.tool_calls), len(trace2.tool_calls)) + for i in range(max_tools): + tc1 = trace1.tool_calls[i] if i < len(trace1.tool_calls) else None + tc2 = trace2.tool_calls[i] if i < len(trace2.tool_calls) else None + + if tc1 is None or tc2 is None: + differences["tool_differences"].append( + { + "index": i, + "trace1": tc1.tool_name if tc1 else None, + "trace2": tc2.tool_name if tc2 else None, + } + ) + elif tc1.tool_name != tc2.tool_name or tc1.args != tc2.args: + differences["tool_differences"].append( + { + "index": i, + "trace1": {"name": tc1.tool_name, "args": tc1.args}, + "trace2": {"name": tc2.tool_name, "args": tc2.args}, + } + ) + + # Compare responses + r1 = trace1.final_response + r2 = trace2.final_response + if r1 and r2: + differences["response_match"] = ( + r1.strip() == r2.strip() + ) + + return differences diff --git a/src/bigquery_agent_analytics/system_evaluator.py b/src/bigquery_agent_analytics/system_evaluator.py new file mode 100644 index 0000000..2afea4f --- /dev/null +++ b/src/bigquery_agent_analytics/system_evaluator.py @@ -0,0 +1,966 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Evaluation engine for BigQuery Agent Analytics SDK. + +Provides ``SystemEvaluator`` for deterministic, code-based metrics and +``LLMAsJudge`` for semantic evaluation using LLM-as-a-judge. The +``evaluate()`` function orchestrates batch evaluation using BigQuery's +native AI functions for scalable, zero-ETL assessment. + +Example usage:: + + from bigquery_agent_analytics.evaluators import ( + SystemEvaluator, LLMAsJudge, + ) + + # Deterministic evaluation + evaluator = SystemEvaluator.latency(threshold_ms=5000) + + # LLM-based semantic evaluation + judge = LLMAsJudge.correctness() +""" + +from __future__ import annotations + +from dataclasses import dataclass +from datetime import datetime +from datetime import timezone +import json +import logging +import re +from typing import Any, Callable, Optional + +from pydantic import BaseModel +from pydantic import Field + +from bigquery_agent_analytics import udf_kernels +from .utils import _parse_json_from_text, strip_markdown_fences + +logger = logging.getLogger("bigquery_agent_analytics." 
+ __name__) + +DEFAULT_ENDPOINT = "gemini-2.5-flash" + + +# ------------------------------------------------------------------ # +# Evaluation Report # +# ------------------------------------------------------------------ # + + +class SessionScore(BaseModel): + """Scores for a single evaluated session.""" + + session_id: str = Field(description="The session ID evaluated.") + scores: dict[str, float] = Field( + default_factory=dict, + description="Metric name to score (0.0 - 1.0).", + ) + passed: bool = Field( + default=True, + description="Whether the session passed all thresholds.", + ) + details: dict[str, Any] = Field( + default_factory=dict, + description="Additional per-session details.", + ) + llm_feedback: Optional[str] = Field( + default=None, + description="LLM judge feedback if applicable.", + ) + + +class EvaluationReport(BaseModel): + """Aggregate report from an evaluation run.""" + + dataset: str = Field(description="Dataset or filter description.") + evaluator_name: str = Field(description="Name of evaluator used.") + total_sessions: int = Field(default=0) + passed_sessions: int = Field(default=0) + failed_sessions: int = Field(default=0) + aggregate_scores: dict[str, float] = Field( + default_factory=dict, + description="Average scores across all sessions.", + ) + details: dict[str, Any] = Field( + default_factory=dict, + description=( + "Operational metadata (parse_errors, fallback_mode, etc.)." + " Separated from aggregate_scores so downstream consumers" + " can treat scores as purely normalized metrics." + ), + ) + session_scores: list[SessionScore] = Field( + default_factory=list, + ) + created_at: datetime = Field( + default_factory=lambda: datetime.now(timezone.utc), + ) + + @property + def pass_rate(self) -> float: + """Fraction of sessions that passed.""" + if self.total_sessions == 0: + return 0.0 + return self.passed_sessions / self.total_sessions + + def summary(self) -> str: + """Returns a human-readable summary.""" + lines = [ + f"Evaluation Report: {self.evaluator_name}", + f" Dataset: {self.dataset}", + f" Sessions: {self.total_sessions}", + f" Passed: {self.passed_sessions} ({self.pass_rate:.0%})", + f" Failed: {self.failed_sessions}", + ] + if self.aggregate_scores: + lines.append(" Aggregate Scores:") + for name, score in sorted(self.aggregate_scores.items()): + lines.append(f" {name}: {score:.3f}") + return "\n".join(lines) + + +# ------------------------------------------------------------------ # +# Code-Based Evaluator # +# ------------------------------------------------------------------ # + + +@dataclass +class _MetricDef: + """Internal definition of a code metric. + + ``observed_key``, ``observed_fn``, and ``budget`` are optional + reporting metadata used by the prebuilt evaluators (latency, + error_rate, turn_count, …) to surface the raw observed value and + the user-supplied budget in ``SessionScore.details``. They don't + affect pass/fail computation — that still goes through ``fn`` + + ``threshold`` — but they let downstream consumers (CLI + ``--exit-code`` output, dashboards) emit readable failure lines + without having to re-run the scorer. + + When ``observed_fn`` is set it takes precedence over + ``observed_key``; use it for metrics whose observed value is + computed from multiple summary fields (e.g. ``tool_errors / + tool_calls`` for error rate). 
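+
+    Example (illustrative; mirrors the prebuilt ``SystemEvaluator.latency()``
+    metric with its default 5000 ms budget)::
+
+        _MetricDef(
+            name="latency",
+            fn=lambda s: 1.0 if (s.get("avg_latency_ms", 0) or 0) <= 5000 else 0.0,
+            threshold=1.0,
+            observed_key="avg_latency_ms",
+            budget=5000.0,
+        )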
+ """ + + name: str + fn: Callable[[dict[str, Any]], float] + threshold: float = 0.5 + observed_key: Optional[str] = None + budget: Optional[float] = None + observed_fn: Optional[Callable[[dict[str, Any]], Any]] = None + + +class SystemEvaluator: + """Deterministic evaluator using code-based metric functions. + + Metrics operate on a session summary dict containing:: + + { + "session_id": str, + "total_events": int, + "tool_calls": int, + "tool_errors": int, + "llm_calls": int, + "avg_latency_ms": float, + "max_latency_ms": float, + "total_latency_ms": float, + "turn_count": int, + "has_error": bool, + } + + Each metric function returns a score between 0.0 and 1.0. + """ + + def __init__( + self, + name: str = "system_evaluator", + metrics: Optional[list[_MetricDef]] = None, + ) -> None: + self.name = name + self._metrics: list[_MetricDef] = metrics or [] + + def add_metric( + self, + name: str, + fn: Callable[[dict[str, Any]], float], + threshold: float = 0.5, + observed_key: Optional[str] = None, + budget: Optional[float] = None, + observed_fn: Optional[Callable[[dict[str, Any]], Any]] = None, + ) -> SystemEvaluator: + """Adds a custom metric function. + + Args: + name: Metric name. + fn: Function taking session summary, returning 0-1 score. + The score is compared to ``threshold``; a session passes + the metric when ``score >= threshold``. + threshold: Pass/fail threshold applied to ``fn``'s score. + observed_key: Optional session-summary key whose value is the + raw observed metric (e.g. ``"avg_latency_ms"``). When set, + ``evaluate_session`` stashes the observed value + ``budget`` + under ``SessionScore.details`` for downstream reporting. + budget: Optional raw-budget value corresponding to the metric + (e.g. the latency-ms threshold the user supplied). Reported + alongside ``observed_key``; not used for pass/fail. + observed_fn: Optional callable that derives the observed value + from the session summary. Used when the observed metric is + computed (e.g. ``tool_errors/tool_calls``) rather than + stored directly. Takes precedence over ``observed_key``. + + Returns: + Self for chaining. + """ + self._metrics.append( + _MetricDef( + name=name, + fn=fn, + threshold=threshold, + observed_key=observed_key, + budget=budget, + observed_fn=observed_fn, + ) + ) + return self + + def evaluate_session(self, session_summary: dict[str, Any]) -> SessionScore: + """Evaluates a single session summary. + + Args: + session_summary: Dict with session metrics. + + Returns: + SessionScore with computed scores. + """ + scores: dict[str, float] = {} + details: dict[str, Any] = {} + passed = True + + for metric in self._metrics: + try: + score = metric.fn(session_summary) + score = max(0.0, min(1.0, float(score))) + scores[metric.name] = score + metric_passed = score >= metric.threshold + if not metric_passed: + passed = False + except Exception as e: + logger.warning("Metric %s failed: %s", metric.name, e) + scores[metric.name] = 0.0 + metric_passed = False + passed = False + + # Stash per-metric reporting detail for *every* metric so the CLI + # ``--exit-code`` failure output always has a threshold / score / + # passed triple to emit, even for custom metrics that didn't + # declare observed_key / observed_fn. Observed / budget are only + # included when the metric supplied them. Keys are prefixed with + # ``metric_`` to avoid colliding with other details callers. 
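+            # Illustrative shape of one stashed entry (values are examples):
+            #   details["metric_latency"] = {"observed": 6200.0, "budget": 5000.0,
+            #                                "threshold": 1.0, "score": 0.0,
+            #                                "passed": False}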
+ observed_value: Optional[Any] = None + if metric.observed_fn is not None: + try: + observed_value = metric.observed_fn(session_summary) + except Exception: # pylint: disable=broad-except + observed_value = None + elif metric.observed_key is not None: + observed_value = session_summary.get(metric.observed_key) + details[f"metric_{metric.name}"] = { + "observed": observed_value, + "budget": metric.budget, + "threshold": metric.threshold, + "score": scores[metric.name], + "passed": metric_passed, + } + + return SessionScore( + session_id=session_summary.get("session_id", "unknown"), + scores=scores, + passed=passed, + details=details, + ) + + # ---- Pre-built evaluators ---- # + + # The prebuilt evaluators below use raw-budget gates: they fail iff + # the observed metric exceeds the user-supplied budget. Historically + # these ran the normalized ``udf_kernels.score_*`` functions under a + # 0.5 score cutoff, which caused ``--threshold=5000`` on latency to + # fail near 2500ms — the gate was at half the budget the user typed. + # See CHANGELOG and the related blog-post-#2 plan (#77) for context. + # ``udf_kernels.score_*`` is unchanged; it still powers the SQL-native + # UDF path in ``udf_sql_templates.py``, which has its own semantics. + + @staticmethod + def latency( + threshold_ms: float = 5000.0, + ) -> SystemEvaluator: + """Pre-built evaluator that fails when average latency exceeds the budget. + + Pass/fail is a raw comparison: ``avg_latency_ms <= threshold_ms`` + passes, strictly greater fails. The returned evaluator's score for + a session is ``1.0`` on pass and ``0.0`` on fail. + + Args: + threshold_ms: Maximum acceptable average latency in ms. + + Returns: + SystemEvaluator configured for latency checking. + """ + + def _score(s: dict[str, Any]) -> float: + observed = s.get("avg_latency_ms", 0) or 0 + return 1.0 if observed <= threshold_ms else 0.0 + + evaluator = SystemEvaluator(name="latency_evaluator") + evaluator.add_metric( + "latency", + _score, + threshold=1.0, + observed_key="avg_latency_ms", + budget=threshold_ms, + ) + return evaluator + + @staticmethod + def turn_count(max_turns: int = 10) -> SystemEvaluator: + """Pre-built evaluator that fails when turn count exceeds the budget. + + Pass/fail is a raw comparison: ``turn_count <= max_turns`` passes, + strictly greater fails. + + Args: + max_turns: Maximum acceptable number of turns. + + Returns: + SystemEvaluator configured for turn count checking. + """ + + def _score(s: dict[str, Any]) -> float: + observed = s.get("turn_count", 0) or 0 + return 1.0 if observed <= max_turns else 0.0 + + evaluator = SystemEvaluator(name="turn_count_evaluator") + evaluator.add_metric( + "turn_count", + _score, + threshold=1.0, + observed_key="turn_count", + budget=max_turns, + ) + return evaluator + + @staticmethod + def error_rate( + max_error_rate: float = 0.1, + ) -> SystemEvaluator: + """Pre-built evaluator that fails when tool error rate exceeds the budget. + + Pass/fail is a raw comparison: ``(tool_errors / tool_calls) <= max_error_rate`` + passes, strictly greater fails. Sessions with zero tool calls pass + trivially (nothing to fail). + + Args: + max_error_rate: Maximum acceptable tool error fraction. + + Returns: + SystemEvaluator configured for error rate checking. 
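+
+        Example (illustrative): a session summary with ``tool_calls=20`` and
+        ``tool_errors=3`` has an observed rate of 0.15, which exceeds the
+        default ``max_error_rate=0.1`` budget, so the metric scores 0.0 and
+        the session fails this check.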
+ """ + + def _observed(s: dict[str, Any]) -> float: + calls = s.get("tool_calls", 0) or 0 + errors = s.get("tool_errors", 0) or 0 + if calls <= 0: + return 0.0 + return errors / calls + + def _score(s: dict[str, Any]) -> float: + calls = s.get("tool_calls", 0) or 0 + if calls <= 0: + return 1.0 + return 1.0 if _observed(s) <= max_error_rate else 0.0 + + evaluator = SystemEvaluator(name="error_rate_evaluator") + evaluator.add_metric( + "error_rate", + _score, + threshold=1.0, + observed_fn=_observed, + budget=max_error_rate, + ) + return evaluator + + @staticmethod + def token_efficiency( + max_tokens: int = 50000, + ) -> SystemEvaluator: + """Pre-built evaluator that fails when total tokens exceed the budget. + + Pass/fail is a raw comparison: ``total_tokens <= max_tokens`` + passes, strictly greater fails. + + Args: + max_tokens: Maximum acceptable total token count. + + Returns: + SystemEvaluator configured for token efficiency. + """ + + def _score(s: dict[str, Any]) -> float: + observed = s.get("total_tokens", 0) or 0 + return 1.0 if observed <= max_tokens else 0.0 + + evaluator = SystemEvaluator(name="token_efficiency_evaluator") + evaluator.add_metric( + "token_efficiency", + _score, + threshold=1.0, + observed_key="total_tokens", + budget=max_tokens, + ) + return evaluator + + @staticmethod + def ttft( + threshold_ms: float = 1000.0, + ) -> SystemEvaluator: + """Pre-built evaluator that fails when TTFT exceeds the budget. + + Pass/fail is a raw comparison: ``avg_ttft_ms <= threshold_ms`` + passes, strictly greater fails. + + Args: + threshold_ms: Maximum acceptable average TTFT in ms. + + Returns: + SystemEvaluator configured for TTFT checking. + """ + + def _score(s: dict[str, Any]) -> float: + observed = s.get("avg_ttft_ms", 0) or 0 + return 1.0 if observed <= threshold_ms else 0.0 + + evaluator = SystemEvaluator(name="ttft_evaluator") + evaluator.add_metric( + "ttft", + _score, + threshold=1.0, + observed_key="avg_ttft_ms", + budget=threshold_ms, + ) + return evaluator + + @staticmethod + def cost_per_session( + max_cost_usd: float = 1.0, + input_cost_per_1k: float = 0.00025, + output_cost_per_1k: float = 0.00125, + ) -> SystemEvaluator: + """Pre-built evaluator that fails when per-session cost exceeds the budget. + + Pass/fail is a raw comparison: ``estimated_cost_usd <= max_cost_usd`` + passes, strictly greater fails. + + Args: + max_cost_usd: Maximum acceptable cost in USD. + input_cost_per_1k: Cost per 1K input tokens. + output_cost_per_1k: Cost per 1K output tokens. + + Returns: + SystemEvaluator configured for cost checking. + """ + + def _observed(s: dict[str, Any]) -> float: + input_tokens = s.get("input_tokens", 0) or 0 + output_tokens = s.get("output_tokens", 0) or 0 + return (input_tokens / 1000.0) * input_cost_per_1k + ( + output_tokens / 1000.0 + ) * output_cost_per_1k + + def _score(s: dict[str, Any]) -> float: + return 1.0 if _observed(s) <= max_cost_usd else 0.0 + + evaluator = SystemEvaluator(name="cost_evaluator") + evaluator.add_metric( + "cost", + _score, + threshold=1.0, + observed_fn=_observed, + budget=max_cost_usd, + ) + return evaluator + + +# Keep alias for backward compatibility +CodeEvaluator = SystemEvaluator + + +# ------------------------------------------------------------------ # +# LLM-as-Judge Evaluator # +# ------------------------------------------------------------------ # + + +_CORRECTNESS_PROMPT = """\ +You are evaluating an AI agent's response for correctness. 
+ +## Conversation Trace +{trace_text} + +## Final Agent Response +{final_response} + +## Instructions +Score the response on a scale of 1 to 10 for correctness: Did the \ +agent provide an accurate, factual response that addresses the \ +user's request? + +Respond with ONLY a valid JSON object: +{{"correctness": , "justification": ""}} +""" + +_HALLUCINATION_PROMPT = """\ +You are evaluating an AI agent's response for hallucination. + +## Conversation Trace +{trace_text} + +## Final Agent Response +{final_response} + +## Instructions +Score the response on a scale of 1 to 10 for faithfulness (where \ +10 means NO hallucination). Does the response contain claims not \ +supported by the tool results or conversation context? + +Respond with ONLY a valid JSON object: +{{"faithfulness": , "justification": ""}} +""" + +_SENTIMENT_PROMPT = """\ +You are evaluating the sentiment of an AI agent's conversation. + +## Conversation Trace +{trace_text} + +## Final Agent Response +{final_response} + +## Instructions +Score the overall sentiment and helpfulness of the interaction \ +on a scale of 1 to 10 (10 = very positive and helpful). + +Respond with ONLY a valid JSON object: +{{"sentiment": , "justification": ""}} +""" + + +@dataclass +class _JudgeCriterion: + """A single LLM-as-judge criterion.""" + + name: str + prompt_template: str + score_key: str + threshold: float = 0.5 + + +# LLMAsJudge is completely folded into PerformanceEvaluator in performance_evaluator.py. +# Safe alias preserved here for backward-compatibility. +from .performance_evaluator import PerformanceEvaluator as LLMAsJudge + + +# ------------------------------------------------------------------ # +# SQL Templates for BigQuery-native evaluation # +# ------------------------------------------------------------------ # + +SESSION_SUMMARY_QUERY = """\ +SELECT + session_id, + COUNT(*) AS total_events, + COUNTIF(event_type = 'TOOL_STARTING') AS tool_calls, + COUNTIF(event_type = 'TOOL_ERROR') AS tool_errors, + COUNTIF(event_type = 'LLM_REQUEST') AS llm_calls, + AVG( + CAST( + JSON_VALUE(latency_ms, '$.total_ms') AS FLOAT64 + ) + ) AS avg_latency_ms, + MAX( + CAST( + JSON_VALUE(latency_ms, '$.total_ms') AS FLOAT64 + ) + ) AS max_latency_ms, + TIMESTAMP_DIFF( + MAX(timestamp), MIN(timestamp), MILLISECOND + ) AS total_latency_ms, + COUNTIF( + event_type = 'USER_MESSAGE_RECEIVED' + ) AS turn_count, + AVG( + CAST( + JSON_VALUE(latency_ms, '$.time_to_first_token_ms') AS FLOAT64 + ) + ) AS avg_ttft_ms, + COUNTIF(event_type LIKE 'HITL_%') AS hitl_events, + COUNTIF( + ENDS_WITH(event_type, '_ERROR') + OR error_message IS NOT NULL + OR status = 'ERROR' + ) > 0 AS has_error, + SUM(COALESCE( + CAST(JSON_VALUE( + attributes, '$.usage_metadata.prompt_token_count' + ) AS INT64), + CAST(JSON_VALUE( + content, '$.usage.prompt' + ) AS INT64), + CAST(JSON_VALUE( + attributes, '$.input_tokens' + ) AS INT64) + )) AS input_tokens, + SUM(COALESCE( + CAST(JSON_VALUE( + attributes, '$.usage_metadata.candidates_token_count' + ) AS INT64), + CAST(JSON_VALUE( + content, '$.usage.completion' + ) AS INT64), + CAST(JSON_VALUE( + attributes, '$.output_tokens' + ) AS INT64) + )) AS output_tokens, + SUM(COALESCE( + CAST(JSON_VALUE( + attributes, '$.usage_metadata.total_token_count' + ) AS INT64), + CAST(JSON_VALUE( + content, '$.usage.total' + ) AS INT64), + COALESCE( + CAST(JSON_VALUE( + attributes, '$.input_tokens' + ) AS INT64), 0 + ) + COALESCE( + CAST(JSON_VALUE( + attributes, '$.output_tokens' + ) AS INT64), 0 + ) + )) AS total_tokens +FROM 
`{project}.{dataset}.{table}` +WHERE {where} +GROUP BY session_id +LIMIT @trace_limit +""" + +_AI_GENERATE_JUDGE_BATCH_QUERY_TEMPLATE = """\ +WITH session_traces AS ( + SELECT + session_id, + STRING_AGG( + CONCAT( + event_type, ': ', + COALESCE( + JSON_VALUE(content, '$.text_summary'), '' + ) + ), + '\\n' ORDER BY timestamp + ) AS trace_text, + ARRAY_AGG( + JSON_VALUE(content, '$.response') + IGNORE NULLS + ORDER BY timestamp DESC + LIMIT 1 + )[SAFE_OFFSET(0)] AS final_response + FROM `{project}.{dataset}.{table}` + WHERE {where} + GROUP BY session_id + HAVING LENGTH(trace_text) > 10 + LIMIT @trace_limit +) +SELECT + session_id, + trace_text, + final_response, + gen.score AS score, + gen.justification AS justification, + gen.status AS gen_status +FROM ( + SELECT + session_id, + trace_text, + final_response, + AI.GENERATE( + -- The Python prompt template is rebuilt at SQL time: + -- prefix ++ trace_text ++ middle ++ final_response ++ suffix + -- Each segment is a separate query parameter so AI.GENERATE + -- sees the exact full Python template (including the + -- per-criterion output-format spec) the API-fallback path uses. + prompt => CONCAT( + @judge_prompt_prefix, trace_text, + @judge_prompt_middle, COALESCE(final_response, 'N/A'), + @judge_prompt_suffix + ), + endpoint => '{endpoint}',{connection_arg} + model_params => JSON '{{"generationConfig": {{"temperature": 0.1, "maxOutputTokens": 1024}}}}', + output_schema => 'score INT64, justification STRING' + ) AS gen + FROM session_traces +) +""" + + +def render_ai_generate_judge_query( + *, + project: str, + dataset: str, + table: str, + where: str, + endpoint: str, + connection_id: Optional[str] = None, +) -> str: + """Render the AI.GENERATE judge batch query for a given config. + + ``AI.GENERATE`` is BigQuery's scalar generative function (it returns a + ``STRUCT`` shaped + by ``output_schema``). The function call lives inside a regular + ``SELECT`` — it is *not* a table-valued function, so the surrounding + ``FROM session_traces, AI.GENERATE(...)`` lateral-join syntax used + by older SDK versions does not parse against current BigQuery. + + ``connection_id`` is optional. When supplied (e.g. + ``"us.bqaa_ai_generate"``) the call uses that connection's service + account; when omitted, AI.GENERATE runs against the end-user + credentials of whichever account submits the job. Both shapes are + documented forms of the same function. + """ + if connection_id: + connection_arg = f"\n connection_id => '{connection_id}'," + else: + connection_arg = "" + return _AI_GENERATE_JUDGE_BATCH_QUERY_TEMPLATE.format( + project=project, + dataset=dataset, + table=table, + where=where, + endpoint=endpoint, + connection_arg=connection_arg, + ) + + +# Public alias kept for downstream code that imports the raw template +# string (e.g. for inspection / docs). Callers building queries should +# use ``render_ai_generate_judge_query`` instead so the optional +# ``connection_id`` arg is wired correctly. +AI_GENERATE_JUDGE_BATCH_QUERY = _AI_GENERATE_JUDGE_BATCH_QUERY_TEMPLATE + +# Legacy template kept for backward compatibility with pre-created +# BQ ML models. 
+_LEGACY_LLM_JUDGE_BATCH_QUERY = """\ +WITH session_traces AS ( + SELECT + session_id, + STRING_AGG( + CONCAT( + event_type, ': ', + COALESCE( + JSON_VALUE(content, '$.text_summary'), '' + ) + ), + '\\n' ORDER BY timestamp + ) AS trace_text, + ARRAY_AGG( + JSON_VALUE(content, '$.response') + IGNORE NULLS + ORDER BY timestamp DESC + LIMIT 1 + )[SAFE_OFFSET(0)] AS final_response + FROM `{project}.{dataset}.{table}` + WHERE {where} + GROUP BY session_id + HAVING LENGTH(trace_text) > 10 + LIMIT @trace_limit +) +SELECT + session_id, + trace_text, + final_response, + ML.GENERATE_TEXT( + MODEL `{model}`, + STRUCT( + -- Same prefix/middle/suffix substitution as the AI.GENERATE + -- path; preserves the full Python prompt_template. + CONCAT( + @judge_prompt_prefix, trace_text, + @judge_prompt_middle, COALESCE(final_response, 'N/A'), + @judge_prompt_suffix + ) AS prompt + ), + STRUCT(0.1 AS temperature, 500 AS max_output_tokens) + ).ml_generate_text_result AS evaluation +FROM session_traces +""" + +# Keep backward-compatible alias. +LLM_JUDGE_BATCH_QUERY = _LEGACY_LLM_JUDGE_BATCH_QUERY + + +_TRACE_SENTINEL = "\x00__BQAA_JUDGE_TRACE__\x00" +_RESPONSE_SENTINEL = "\x00__BQAA_JUDGE_RESPONSE__\x00" + + +def split_judge_prompt_template(prompt_template: str) -> tuple[str, str, str]: + """Split a Python judge prompt into ``(prefix, middle, suffix)``. + + The Python ``LLMAsJudge`` prompt template uses ``{trace_text}`` and + ``{final_response}`` placeholders (in that order) to interpolate + per-session inputs. The BigQuery-native ``AI.GENERATE`` and + ``ML.GENERATE_TEXT`` paths can't use Python ``str.format`` — they + build the prompt at SQL time. This helper returns the three + literal segments those SQL paths need to ``CONCAT`` together with + the SQL-side ``trace_text`` and ``final_response`` columns, + preserving the exact full template (including the per-criterion + output-format spec that follows the placeholders). + + Internally the helper format()s the template once with sentinel + values, so any literal ``{{...}}`` braces in the source template + (e.g. the JSON output spec ``{{"correctness": , ...}}``) + are correctly un-escaped before splitting. The SQL paths see the + same string the API-fallback path's ``str.format(...)`` would + produce. + + Args: + prompt_template: The Python prompt template, expected to + contain both ``{trace_text}`` and ``{final_response}`` + placeholders in that order. + + Returns: + ``(prefix, middle, suffix)`` such that + ``prefix + trace_text + middle + final_response + suffix`` + reproduces ``prompt_template.format(trace_text=..., final_response=...)`` + for any inputs. When a placeholder is missing, the helper + synthesizes a labeled section for the missing input and + places the label *immediately before* the injected value + (label first, then value), so the model reads + ``...Trace:\n\nResponse:\n...`` rather than + the value followed by an orphan label. + """ + has_trace = "{trace_text}" in prompt_template + has_response = "{final_response}" in prompt_template + + # Reminder for the fallback branches below: the SQL CONCAT runs + # prefix ++ trace_text ++ middle ++ final_response ++ suffix + # so any label we synthesize for an absent placeholder must end + # up *next to* the value it labels (label first, then value), + # not on the far side of it. Earlier versions appended labels + # *after* the values, which produced ``\nTrace:\n...``. + + if not has_trace and not has_response: + # No placeholders at all. 
Append a labeled trace + response + # block after the user's instructions. The labels precede the + # values so the model reads them in order. + return ( + prompt_template + "\nTrace:\n", + "\nResponse:\n", + "", + ) + + if not has_trace: + # final_response placeholder only. Honor the user's structure + # and inject a labeled trace block right before the response, + # so the trace label sits next to the trace. + formatted = prompt_template.format(final_response=_RESPONSE_SENTINEL) + before_response, _, after_response = formatted.partition(_RESPONSE_SENTINEL) + return ( + before_response + "\nTrace:\n", + "\n", + after_response, + ) + + if not has_response: + # trace_text placeholder only. Append a labeled response block + # after the original template's tail, so the response label + # sits next to the response value (not after it). + formatted = prompt_template.format(trace_text=_TRACE_SENTINEL) + prefix, _, after_trace = formatted.partition(_TRACE_SENTINEL) + return ( + prefix, + after_trace + "\nResponse:\n", + "", + ) + + formatted = prompt_template.format( + trace_text=_TRACE_SENTINEL, + final_response=_RESPONSE_SENTINEL, + ) + prefix, _, rest = formatted.partition(_TRACE_SENTINEL) + middle, _, suffix = rest.partition(_RESPONSE_SENTINEL) + return prefix, middle, suffix + + +# ------------------------------------------------------------------ # +# Helpers # +# ------------------------------------------------------------------ # + + +def strip_markdown_fences(text: Optional[str]) -> Optional[str]: + """Strip markdown code block fences (``\\`\\`\\`json ... \\`\\`\\```) if present. + + Models frequently wrap JSON output in fenced code blocks. This helper + removes the opening ``\\`\\`\\`json`` (or plain ``\\`\\`\\```) and closing + ``\\`\\`\\``` markers so the result can be passed to ``json.loads()``. + + The regex pattern matches the same fences handled server-side by + ``REGEXP_REPLACE`` in ``ontology_graph.py`` and ``context_graph.py``. + """ + if not text: + return text + text = text.strip() + if not text.startswith("```"): + return text + text = re.sub(r"^```[a-zA-Z0-9]*\s*\n?", "", text) + text = re.sub(r"\n?\s*```[\s\S]*$", "", text) + return text.strip() + + +def _parse_json_from_text(text: str) -> Optional[dict[str, Any]]: + """Extracts and parses JSON from LLM response text.""" + if not text: + return None + + # Strip markdown fences first + stripped = strip_markdown_fences(text) + try: + return json.loads(stripped) + except (json.JSONDecodeError, TypeError): + pass + + # Try raw JSON extraction (brace matching) + if "{" in stripped: + try: + start = stripped.index("{") + brace = 0 + end = start + for i, ch in enumerate(stripped[start:], start): + if ch == "{": + brace += 1 + elif ch == "}": + brace -= 1 + if brace == 0: + end = i + 1 + break + return json.loads(stripped[start:end]) + except (ValueError, json.JSONDecodeError): + pass + + return None diff --git a/src/bigquery_agent_analytics/trace_evaluator.py b/src/bigquery_agent_analytics/trace_evaluator.py index cdb8753..652cf79 100644 --- a/src/bigquery_agent_analytics/trace_evaluator.py +++ b/src/bigquery_agent_analytics/trace_evaluator.py @@ -12,1062 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -"""Trace-Based Evaluation Harness for ADK Agents. +"""Backward-compatibility module mapping for trace evaluator.""" -This module provides capabilities to evaluate agent behavior using stored -traces in BigQuery. 
It supports: - -- Trajectory matching (exact, in-order, any-order) -- LLM-as-judge evaluation -- Custom metric scoring -- Deterministic replay for debugging - -Example usage: - evaluator = BigQueryTraceEvaluator( - project_id="my-project", - dataset_id="agent_analytics", - ) - - results = await evaluator.evaluate_session( - session_id="session-123", - golden_trajectory=[ - {"tool_name": "search", "args": {"query": "weather"}}, - {"tool_name": "format_response", "args": {}}, - ], - golden_response="The weather is sunny.", - ) -""" - -from __future__ import annotations - -import asyncio -from dataclasses import dataclass -from dataclasses import field -from datetime import datetime -from enum import Enum -import json -import logging -from typing import Any, Callable, Optional - -from google.cloud import bigquery -from pydantic import BaseModel -from pydantic import Field - -from bigquery_agent_analytics.evaluators import strip_markdown_fences - -from ._telemetry import LabeledBigQueryClient -from ._telemetry import make_bq_client -from ._telemetry import with_sdk_labels - -logger = logging.getLogger("bigquery_agent_analytics." + __name__) - - -class MatchType(Enum): - """The type of trajectory matching to use.""" - - EXACT = "exact" - """Requires perfect match between actual and expected tool calls.""" - - IN_ORDER = "in_order" - """Requires tools in same order, allows extra tools between.""" - - ANY_ORDER = "any_order" - """Requires all expected tools present, any order allowed.""" - - -class EvalStatus(Enum): - """Status of an evaluation.""" - - PASSED = "passed" - FAILED = "failed" - NOT_EVALUATED = "not_evaluated" - - -@dataclass -class TraceEvent: - """Represents a single event from a trace.""" - - event_type: str - agent: Optional[str] - timestamp: datetime - content: dict[str, Any] - attributes: dict[str, Any] - span_id: Optional[str] = None - parent_span_id: Optional[str] = None - latency_ms: Optional[int] = None - status: str = "OK" - error_message: Optional[str] = None - - @classmethod - def from_bigquery_row(cls, row: dict[str, Any]) -> "TraceEvent": - """Creates a TraceEvent from a BigQuery row.""" - content = row.get("content") - if isinstance(content, str): - try: - content = json.loads(content) - except (json.JSONDecodeError, TypeError): - content = {"raw": content} - elif content is None: - content = {} - - attributes = row.get("attributes") - if isinstance(attributes, str): - try: - attributes = json.loads(attributes) - except (json.JSONDecodeError, TypeError): - attributes = {} - elif attributes is None: - attributes = {} - - latency_ms = row.get("latency_ms") - if isinstance(latency_ms, str): - try: - latency_data = json.loads(latency_ms) - latency_ms = latency_data.get("total_ms") - except (json.JSONDecodeError, TypeError): - latency_ms = None - elif isinstance(latency_ms, dict): - latency_ms = latency_ms.get("total_ms") - - return cls( - event_type=row.get("event_type", "UNKNOWN"), - agent=row.get("agent"), - timestamp=row.get("timestamp", datetime.now()), - content=content, - attributes=attributes, - span_id=row.get("span_id"), - parent_span_id=row.get("parent_span_id"), - latency_ms=latency_ms, - status=row.get("status", "OK"), - error_message=row.get("error_message"), - ) - - -@dataclass -class ToolCall: - """Represents a tool call extracted from a trace.""" - - tool_name: str - args: dict[str, Any] - result: Optional[dict[str, Any]] = None - status: str = "OK" - error_message: Optional[str] = None - latency_ms: Optional[int] = None - - -@dataclass -class 
SessionTrace: - """Complete trace for a session.""" - - session_id: str - user_id: Optional[str] - events: list[TraceEvent] - tool_calls: list[ToolCall] = field(default_factory=list) - final_response: Optional[str] = None - total_latency_ms: Optional[int] = None - - def extract_tool_trajectory(self) -> list[ToolCall]: - """Extracts the tool call trajectory from events.""" - tool_calls = [] - tool_starts: dict[str, TraceEvent] = {} - - for event in self.events: - if event.event_type == "TOOL_STARTING": - tool_name = event.content.get("tool", "unknown") - tool_starts[event.span_id or tool_name] = event - - elif event.event_type == "TOOL_COMPLETED": - tool_name = event.content.get("tool", "unknown") - start_event = tool_starts.pop(event.span_id or tool_name, None) - - args = {} - if start_event: - args = start_event.content.get("args", {}) - - tool_calls.append( - ToolCall( - tool_name=tool_name, - args=args, - result=event.content.get("result"), - status="OK", - latency_ms=event.latency_ms, - ) - ) - - elif event.event_type == "TOOL_ERROR": - tool_name = event.content.get("tool", "unknown") - start_event = tool_starts.pop(event.span_id or tool_name, None) - - args = {} - if start_event: - args = start_event.content.get("args", {}) - - tool_calls.append( - ToolCall( - tool_name=tool_name, - args=args, - status="ERROR", - error_message=event.error_message, - latency_ms=event.latency_ms, - ) - ) - - self.tool_calls = tool_calls - return tool_calls - - def extract_final_response(self) -> Optional[str]: - """Extracts the final agent response from events. - - Checks LLM_RESPONSE first (most reliable response source), - then falls back to AGENT_COMPLETED. - """ - # Prefer the last LLM_RESPONSE (most reliable response source) - for event in reversed(self.events): - if event.event_type == "LLM_RESPONSE": - content = event.content - if isinstance(content, dict): - return content.get("response") or content.get("text_summary") - return str(content) if content else None - - # Fallback to AGENT_COMPLETED - for event in reversed(self.events): - if event.event_type == "AGENT_COMPLETED": - content = event.content - if isinstance(content, dict): - return content.get("response") or content.get("text_summary") - return str(content) if content else None - - return None - - -class TrajectoryMetrics: - """Computes trajectory-based evaluation metrics.""" - - @staticmethod - def compute_exact_match( - actual: list[ToolCall], - expected: list[dict[str, Any]], - ) -> float: - """Computes exact match score between trajectories. - - Args: - actual: List of actual tool calls from trace. - expected: List of expected tool calls with tool_name and args. - - Returns: - Score between 0.0 and 1.0. - """ - if not expected: - return 1.0 if not actual else 0.0 - - if len(actual) != len(expected): - return 0.0 - - matches = 0 - for act, exp in zip(actual, expected): - if act.tool_name == exp.get("tool_name"): - # Check args if specified - exp_args = exp.get("args", {}) - if not exp_args or TrajectoryMetrics._args_match(act.args, exp_args): - matches += 1 - - return matches / len(expected) - - @staticmethod - def compute_in_order_match( - actual: list[ToolCall], - expected: list[dict[str, Any]], - ) -> float: - """Computes in-order match score. - - Checks if expected tools appear in order within actual calls. - - Args: - actual: List of actual tool calls. - expected: List of expected tool calls. - - Returns: - Score between 0.0 and 1.0. 
- """ - if not expected: - return 1.0 - - expected_idx = 0 - for act in actual: - if expected_idx >= len(expected): - break - - exp = expected[expected_idx] - if act.tool_name == exp.get("tool_name"): - exp_args = exp.get("args", {}) - if not exp_args or TrajectoryMetrics._args_match(act.args, exp_args): - expected_idx += 1 - - return expected_idx / len(expected) - - @staticmethod - def compute_any_order_match( - actual: list[ToolCall], - expected: list[dict[str, Any]], - ) -> float: - """Computes any-order match score. - - Checks if all expected tools appear in actual calls (any order). - - Args: - actual: List of actual tool calls. - expected: List of expected tool calls. - - Returns: - Score between 0.0 and 1.0. - """ - if not expected: - return 1.0 - - remaining = list(expected) - for act in actual: - for i, exp in enumerate(remaining): - if act.tool_name == exp.get("tool_name"): - exp_args = exp.get("args", {}) - if not exp_args or TrajectoryMetrics._args_match(act.args, exp_args): - remaining.pop(i) - break - - matched = len(expected) - len(remaining) - return matched / len(expected) - - @staticmethod - def _args_match(actual: dict[str, Any], expected: dict[str, Any]) -> bool: - """Checks if actual args contain expected args.""" - for key, value in expected.items(): - if key not in actual: - return False - if value is not None and actual[key] != value: - return False - return True - - @staticmethod - def compute_step_efficiency( - actual_steps: int, - optimal_steps: int, - ) -> float: - """Computes step efficiency score. - - Args: - actual_steps: Number of steps taken by agent. - optimal_steps: Optimal number of steps. - - Returns: - Score between 0.0 and 1.0 (1.0 = optimal or better). - """ - if optimal_steps <= 0: - return 1.0 if actual_steps == 0 else 0.0 - - if actual_steps <= optimal_steps: - return 1.0 - - # Penalize extra steps with diminishing returns - efficiency = optimal_steps / actual_steps - return max(0.0, efficiency) - - -class EvaluationResult(BaseModel): - """Result of evaluating a session trace.""" - - session_id: str = Field(description="The session ID that was evaluated.") - eval_status: EvalStatus = Field(description="Overall evaluation status.") - scores: dict[str, float] = Field( - default_factory=dict, - description="Individual metric scores.", - ) - overall_score: Optional[float] = Field( - default=None, - description="Overall weighted score if computed.", - ) - details: dict[str, Any] = Field( - default_factory=dict, - description="Additional evaluation details.", - ) - llm_judge_feedback: Optional[str] = Field( - default=None, - description="Feedback from LLM judge if used.", - ) - - -class BigQueryTraceEvaluator: - """Evaluates agent traces stored in BigQuery. - - This evaluator retrieves trace data from BigQuery and computes various - metrics including trajectory matching, response quality, and custom metrics. 
- - Example: - evaluator = BigQueryTraceEvaluator( - project_id="my-project", - dataset_id="agent_analytics", - ) - - result = await evaluator.evaluate_session( - session_id="sess-123", - golden_trajectory=[{"tool_name": "search", "args": {"q": "test"}}], - ) - """ - - # SQL query to retrieve complete session trace - _DEFAULT_EVENT_TYPES = [ - "USER_MESSAGE_RECEIVED", - "AGENT_STARTING", - "AGENT_COMPLETED", - "TOOL_STARTING", - "TOOL_COMPLETED", - "TOOL_ERROR", - "LLM_REQUEST", - "LLM_RESPONSE", - "LLM_ERROR", - "INVOCATION_STARTING", - "INVOCATION_COMPLETED", - "STATE_DELTA", - "HITL_CONFIRMATION_REQUEST", - "HITL_CONFIRMATION_REQUEST_COMPLETED", - "HITL_CREDENTIAL_REQUEST", - "HITL_CREDENTIAL_REQUEST_COMPLETED", - "HITL_INPUT_REQUEST", - "HITL_INPUT_REQUEST_COMPLETED", - ] - - _SESSION_TRACE_QUERY = """ - SELECT - event_type, - agent, - timestamp, - content, - attributes, - span_id, - parent_span_id, - latency_ms, - status, - error_message, - user_id - FROM `{project}.{dataset}.{table}` - WHERE session_id = @session_id - AND event_type IN UNNEST(@event_types) - ORDER BY timestamp ASC - """ - - # Default LLM judge prompt for trajectory evaluation - _LLM_JUDGE_PROMPT = """You are evaluating an AI agent's task execution trajectory. - -## Task Description -{task_description} - -## Agent Trajectory -{trajectory_json} - -## Expected Trajectory (if provided) -{expected_trajectory} - -## Final Response -{final_response} - -## Evaluation Criteria -Score each criterion from 0 to 10: -1. task_completion: Did the agent successfully complete the task? -2. efficiency: Were the steps taken necessary and minimal? -3. tool_usage: Were the right tools used with correct arguments? -4. reasoning: Was the agent's reasoning sound? -5. overall: Overall score averaging the above. - -IMPORTANT: You MUST respond with ONLY a valid JSON object. No explanation before or after. -Keep justification brief (under 100 characters). - -Required JSON format: -{{"task_completion": 7, "efficiency": 8, "tool_usage": 9, "reasoning": 7, "overall": 8, "justification": "Brief reason"}} -""" - - def __init__( - self, - project_id: str, - dataset_id: str, - table_id: str = "agent_events", - client: Optional[bigquery.Client] = None, - llm_judge_model: Optional[str] = None, - include_event_types: Optional[list[str]] = None, - ) -> None: - """Initializes the BigQueryTraceEvaluator. - - Args: - project_id: Google Cloud project ID. - dataset_id: BigQuery dataset ID containing trace data. - table_id: BigQuery table ID. Defaults to "agent_events". - client: Optional BigQuery client. Created if not provided. - llm_judge_model: Optional model name for LLM-as-judge evaluation. - include_event_types: Optional list of event types to include - when fetching session traces. Defaults to all standard - ADK event types including HITL and STATE_DELTA. Pass a - custom list to restrict or extend the event types - evaluated without patching SQL templates. 
- """ - self.project_id = project_id - self.dataset_id = dataset_id - self.table_id = table_id - self.table_ref = f"{project_id}.{dataset_id}.{table_id}" - self._client = client - self._warned_unlabeled_client = False - self.llm_judge_model = llm_judge_model or "gemini-2.5-flash" - self.include_event_types = include_event_types or self._DEFAULT_EVENT_TYPES - - @property - def client(self) -> bigquery.Client: - """Lazily initializes and returns the BigQuery client.""" - if self._client is None: - self._client = make_bq_client(self.project_id) - elif isinstance(self._client, bigquery.Client) and not isinstance( - self._client, LabeledBigQueryClient - ): - if not self._warned_unlabeled_client: - logger.warning( - "User-provided bigquery.Client is not a " - "LabeledBigQueryClient; SDK telemetry labels will not be " - "applied to jobs from this client. To opt in, construct " - "the client via bigquery_agent_analytics.make_bq_client() " - "or pass a LabeledBigQueryClient directly." - ) - self._warned_unlabeled_client = True - return self._client - - async def get_session_trace(self, session_id: str) -> SessionTrace: - """Retrieves the complete trace for a session. - - Args: - session_id: The session ID to retrieve. - - Returns: - SessionTrace containing all events for the session. - """ - query = self._SESSION_TRACE_QUERY.format( - project=self.project_id, - dataset=self.dataset_id, - table=self.table_id, - ) - - job_config = bigquery.QueryJobConfig( - query_parameters=[ - bigquery.ScalarQueryParameter( - "session_id", - "STRING", - session_id, - ), - bigquery.ArrayQueryParameter( - "event_types", - "STRING", - self.include_event_types, - ), - ] - ) - # Apply labels BEFORE executor dispatch so they materialize on the - # QueryJobConfig in the caller's thread. - job_config = with_sdk_labels(job_config, feature="trace-read") - - # Run query in executor to avoid blocking - loop = asyncio.get_event_loop() - query_job = await loop.run_in_executor( - None, - lambda: self.client.query(query, job_config=job_config), - ) - - results = await loop.run_in_executor(None, lambda: list(query_job.result())) - - events = [TraceEvent.from_bigquery_row(dict(row)) for row in results] - - user_id = None - if results: - user_id = results[0].get("user_id") - - trace = SessionTrace( - session_id=session_id, - user_id=user_id, - events=events, - ) - - # Extract tool trajectory and final response - trace.extract_tool_trajectory() - trace.final_response = trace.extract_final_response() - - # Compute total latency - if events: - start = min(e.timestamp for e in events) - end = max(e.timestamp for e in events) - trace.total_latency_ms = int((end - start).total_seconds() * 1000) - - return trace - - async def evaluate_session( - self, - session_id: str, - golden_trajectory: Optional[list[dict[str, Any]]] = None, - golden_response: Optional[str] = None, - match_type: MatchType = MatchType.EXACT, - task_description: Optional[str] = None, - use_llm_judge: bool = False, - custom_metrics: Optional[dict[str, Callable]] = None, - thresholds: Optional[dict[str, float]] = None, - ) -> EvaluationResult: - """Evaluates a single session against golden data. - - Args: - session_id: The session ID to evaluate. - golden_trajectory: Expected tool call sequence. - golden_response: Expected final response. - match_type: Type of trajectory matching to use. - task_description: Description of the task for LLM judge. - use_llm_judge: Whether to use LLM-as-judge evaluation. - custom_metrics: Dict of custom metric functions. 
- thresholds: Dict of metric name to threshold for pass/fail. - - Returns: - EvaluationResult with scores and status. - """ - # Retrieve trace - trace = await self.get_session_trace(session_id) - - scores: dict[str, float] = {} - details: dict[str, Any] = { - "actual_tool_calls": len(trace.tool_calls), - "expected_tool_calls": ( - len(golden_trajectory) if golden_trajectory else 0 - ), - } - - # Compute trajectory score - if golden_trajectory is not None: - if match_type == MatchType.EXACT: - scores["trajectory_exact_match"] = ( - TrajectoryMetrics.compute_exact_match( - trace.tool_calls, golden_trajectory - ) - ) - elif match_type == MatchType.IN_ORDER: - scores["trajectory_in_order"] = ( - TrajectoryMetrics.compute_in_order_match( - trace.tool_calls, golden_trajectory - ) - ) - elif match_type == MatchType.ANY_ORDER: - scores["trajectory_any_order"] = ( - TrajectoryMetrics.compute_any_order_match( - trace.tool_calls, golden_trajectory - ) - ) - - # Step efficiency - if golden_trajectory: - scores["step_efficiency"] = TrajectoryMetrics.compute_step_efficiency( - len(trace.tool_calls), - len(golden_trajectory), - ) - - # Response matching (simple text comparison) - if golden_response is not None and trace.final_response is not None: - scores["response_match"] = self._compute_response_match( - trace.final_response, golden_response - ) - - # LLM-as-judge evaluation - llm_feedback = None - if use_llm_judge: - llm_scores, llm_feedback = await self._llm_judge_evaluate( - trace=trace, - task_description=task_description or "Complete the user's request.", - expected_trajectory=golden_trajectory, - ) - scores.update(llm_scores) - - # Custom metrics - if custom_metrics: - for metric_name, metric_fn in custom_metrics.items(): - try: - score = metric_fn(trace, golden_trajectory, golden_response) - scores[metric_name] = float(score) - except Exception as e: - logger.warning("Custom metric %s failed: %s", metric_name, e) - scores[metric_name] = 0.0 - - # Determine overall status - thresholds = thresholds or {} - passed = True - for metric_name, score in scores.items(): - threshold = thresholds.get(metric_name, 0.5) - if score < threshold: - passed = False - details[f"{metric_name}_threshold"] = threshold - - # Compute overall score as mean - overall_score = None - if scores: - overall_score = sum(scores.values()) / len(scores) - - return EvaluationResult( - session_id=session_id, - eval_status=EvalStatus.PASSED if passed else EvalStatus.FAILED, - scores=scores, - overall_score=overall_score, - details=details, - llm_judge_feedback=llm_feedback, - ) - - async def evaluate_batch( - self, - eval_dataset: list[dict[str, Any]], - match_type: MatchType = MatchType.EXACT, - use_llm_judge: bool = False, - concurrency: int = 5, - ) -> list[EvaluationResult]: - """Evaluates multiple sessions from an eval dataset. - - Args: - eval_dataset: List of dicts with session_id, expected_trajectory, etc. - match_type: Type of trajectory matching. - use_llm_judge: Whether to use LLM judge. - concurrency: Max concurrent evaluations. - - Returns: - List of EvaluationResult for each session. 
- """ - semaphore = asyncio.Semaphore(concurrency) - - async def evaluate_one(item: dict[str, Any]) -> EvaluationResult: - async with semaphore: - return await self.evaluate_session( - session_id=item["session_id"], - golden_trajectory=item.get("expected_trajectory"), - golden_response=item.get("expected_response"), - match_type=match_type, - task_description=item.get("task_description"), - use_llm_judge=use_llm_judge, - thresholds=item.get("thresholds"), - ) - - tasks = [evaluate_one(item) for item in eval_dataset] - return await asyncio.gather(*tasks) - - def _compute_response_match( - self, - actual: str, - expected: str, - ) -> float: - """Computes simple response match score. - - Args: - actual: Actual response text. - expected: Expected response text. - - Returns: - Score between 0.0 and 1.0. - """ - if not actual or not expected: - return 0.0 if actual != expected else 1.0 - - # Normalize strings - actual_norm = actual.lower().strip() - expected_norm = expected.lower().strip() - - if actual_norm == expected_norm: - return 1.0 - - # Simple word overlap score - actual_words = set(actual_norm.split()) - expected_words = set(expected_norm.split()) - - if not expected_words: - return 1.0 if not actual_words else 0.0 - - intersection = actual_words & expected_words - return len(intersection) / len(expected_words) - - async def _llm_judge_evaluate( - self, - trace: SessionTrace, - task_description: str, - expected_trajectory: Optional[list[dict[str, Any]]], - ) -> tuple[dict[str, float], str]: - """Uses LLM as judge to evaluate the trace. - - Args: - trace: The session trace to evaluate. - task_description: Description of the task. - expected_trajectory: Expected tool calls if available. - - Returns: - Tuple of (scores dict, feedback string). - """ - try: - from google import genai - from google.genai import types - except ImportError: - logger.warning("google-genai not installed, skipping LLM judge.") - return {}, "LLM judge unavailable - google-genai not installed" - - # Format trajectory for prompt - trajectory_data = [ - { - "tool": tc.tool_name, - "args": tc.args, - "status": tc.status, - } - for tc in trace.tool_calls - ] - - prompt = self._LLM_JUDGE_PROMPT.format( - task_description=task_description, - trajectory_json=json.dumps(trajectory_data, indent=2), - expected_trajectory=json.dumps(expected_trajectory, indent=2) - if expected_trajectory - else "Not provided", - final_response=trace.final_response or "No response captured", - ) - - try: - client = genai.Client() - response = await client.aio.models.generate_content( - model=self.llm_judge_model, - contents=prompt, - config=types.GenerateContentConfig( - temperature=0.1, - max_output_tokens=1024, - ), - ) - - response_text = response.text.strip() - - # Strip markdown fences and extract JSON - if response_text.startswith("```"): - json_str = strip_markdown_fences(response_text) - else: - json_str = None - if not json_str: - # No fences found — try to extract JSON object directly - if "{" in response_text: - try: - start = response_text.index("{") - brace_count = 0 - end = start - for i, char in enumerate(response_text[start:], start): - if char == "{": - brace_count += 1 - elif char == "}": - brace_count -= 1 - if brace_count == 0: - end = i + 1 - break - json_str = response_text[start:end] - except (ValueError, IndexError): - pass - - if not json_str: - return {}, response_text - - # Clean up the JSON string - handle common issues - json_str = json_str.strip() - # Remove control characters that break JSON parsing - json_str = 
"".join( - char for char in json_str if char >= " " or char in "\n\r\t" - ) - - try: - result = json.loads(json_str) - except json.JSONDecodeError: - # Try to fix common JSON issues - import re - - # Replace unescaped newlines in strings - fixed_json = re.sub(r"(? None: - """Injects a recorded LLM response for replay.""" - self.llm_responses[self.current_step] = response - self.current_step += 1 - - def inject_tool_response(self, tool_name: str, response: Any) -> None: - """Injects a recorded tool response for replay.""" - self.tool_responses[tool_name] = response - - def get_llm_response(self, step: int) -> Optional[str]: - """Gets injected LLM response for a step.""" - return self.llm_responses.get(step) - - def get_tool_response(self, tool_name: str) -> Optional[Any]: - """Gets injected tool response.""" - return self.tool_responses.get(tool_name) - - -class TraceReplayRunner: - """Replays agent sessions deterministically for debugging. - - This runner uses recorded traces to replay agent execution with - deterministic outcomes, useful for debugging and root cause analysis. - - Example: - replay_runner = TraceReplayRunner(evaluator) - result = await replay_runner.replay_session( - session_id="sess-123", - replay_mode="step", - ) - """ - - def __init__(self, evaluator: BigQueryTraceEvaluator) -> None: - """Initializes the replay runner. - - Args: - evaluator: BigQueryTraceEvaluator for trace retrieval. - """ - self.evaluator = evaluator - - async def replay_session( - self, - session_id: str, - replay_mode: str = "full", - step_callback: Optional[ - Callable[[TraceEvent, ReplayContext], None] - ] = None, - ) -> ReplayContext: - """Replays a recorded session step by step. - - Args: - session_id: The session ID to replay. - replay_mode: "full" for all events, "step" for pause at each step, - "tool_only" for only tool calls. - step_callback: Optional callback invoked at each step. - - Returns: - ReplayContext with all injected responses. - """ - trace = await self.evaluator.get_session_trace(session_id) - - replay_context = ReplayContext() - - for event in trace.events: - # Filter by mode - if replay_mode == "tool_only" and event.event_type not in [ - "TOOL_STARTING", - "TOOL_COMPLETED", - "TOOL_ERROR", - ]: - continue - - # Inject responses for replay - if event.event_type == "LLM_RESPONSE": - content = event.content - response_text = "" - if isinstance(content, dict): - response_text = content.get("response", "") - elif content: - response_text = str(content) - replay_context.inject_llm_response(response_text) - - elif event.event_type == "TOOL_COMPLETED": - tool_name = event.content.get("tool", "unknown") - result = event.content.get("result") - replay_context.inject_tool_response(tool_name, result) - - # Invoke callback if provided - if step_callback: - step_callback(event, replay_context) - - return replay_context - - async def compare_replays( - self, - session_id_1: str, - session_id_2: str, - ) -> dict[str, Any]: - """Compares two session replays to identify differences. - - Args: - session_id_1: First session ID. - session_id_2: Second session ID. - - Returns: - Dict with comparison results. 
- """ - trace1 = await self.evaluator.get_session_trace(session_id_1) - trace2 = await self.evaluator.get_session_trace(session_id_2) - - differences = { - "event_count_diff": len(trace1.events) - len(trace2.events), - "tool_count_diff": len(trace1.tool_calls) - len(trace2.tool_calls), - "tool_differences": [], - "response_match": False, - } - - # Compare tool calls - max_tools = max(len(trace1.tool_calls), len(trace2.tool_calls)) - for i in range(max_tools): - tc1 = trace1.tool_calls[i] if i < len(trace1.tool_calls) else None - tc2 = trace2.tool_calls[i] if i < len(trace2.tool_calls) else None - - if tc1 is None or tc2 is None: - differences["tool_differences"].append( - { - "index": i, - "trace1": tc1.tool_name if tc1 else None, - "trace2": tc2.tool_name if tc2 else None, - } - ) - elif tc1.tool_name != tc2.tool_name or tc1.args != tc2.args: - differences["tool_differences"].append( - { - "index": i, - "trace1": {"name": tc1.tool_name, "args": tc1.args}, - "trace2": {"name": tc2.tool_name, "args": tc2.args}, - } - ) - - # Compare responses - if trace1.final_response and trace2.final_response: - differences["response_match"] = ( - trace1.final_response.strip() == trace2.final_response.strip() - ) - - return differences +from .performance_evaluator import * +BigQueryTraceEvaluator = PerformanceEvaluator diff --git a/src/bigquery_agent_analytics/utils.py b/src/bigquery_agent_analytics/utils.py new file mode 100644 index 0000000..dba0138 --- /dev/null +++ b/src/bigquery_agent_analytics/utils.py @@ -0,0 +1,110 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Shared utilities for the BigQuery Agent Analytics SDK.""" + +import json +import re +from typing import Any, Optional + + +def strip_markdown_fences(text: Optional[str]) -> Optional[str]: + """Strips markdown backticks and fences around a JSON block.""" + if not text: + return text + text = text.strip() + if text.startswith("```"): + closing_fence = text.find("```", 3) + if closing_fence != -1: + text = text[:closing_fence] + first_newline = text.find("\n") + if first_newline == -1: + text = text[3:] + if text.lower().startswith("json"): + text = text[4:] + elif text.lower().startswith("sql"): + text = text[3:] + return text.strip() + else: + text = text[first_newline:] + return text.strip() + + +def _parse_json_from_text(text: Optional[str]) -> Optional[dict[str, Any]]: + """Parses JSON out of a text block, strip backticks, extract via braces.""" + if not text: + return None + + # Try raw JSON extraction (brace matching) first to bypass any trailing markdown prose + if "{" in text: + try: + start = text.index("{") + brace_count = 0 + end = start + for i, char in enumerate(text[start:], start): + if char == "{": + brace_count += 1 + elif char == "}": + brace_count -= 1 + if brace_count == 0: + end = i + 1 + break + json_str = text[start:end] + json_str = "".join( + char for char in json_str if char >= " " or char in "\n\r\t" + ) + return json.loads(json_str) + except (ValueError, IndexError, json.JSONDecodeError): + pass + + stripped = strip_markdown_fences(text) + try: + return json.loads(stripped) + except (json.JSONDecodeError, TypeError): + pass + return None + + +def _extract_json_from_text(text: str) -> Optional[str]: + """Extracts raw JSON string out of text block, handles braces and spaces.""" + if not text: + return None + text = text.strip() + if text.startswith("```"): + json_str = strip_markdown_fences(text) + else: + json_str = None + if not json_str: + if "{" in text: + try: + start = text.index("{") + brace_count = 0 + end = start + for i, char in enumerate(text[start:], start): + if char == "{": + brace_count += 1 + elif char == "}": + brace_count -= 1 + if brace_count == 0: + end = i + 1 + break + json_str = text[start:end] + except (ValueError, IndexError): + pass + if json_str: + json_str = json_str.strip() + json_str = "".join( + char for char in json_str if char >= " " or char in "\n\r\t" + ) + return json_str diff --git a/tests/test_grader_pipeline.py b/tests/test_aggregate_grader.py similarity index 75% rename from tests/test_grader_pipeline.py rename to tests/test_aggregate_grader.py index 36278e1..c338000 100644 --- a/tests/test_grader_pipeline.py +++ b/tests/test_aggregate_grader.py @@ -18,17 +18,16 @@ from unittest.mock import MagicMock from unittest.mock import patch -import pytest - -from bigquery_agent_analytics.evaluators import CodeEvaluator -from bigquery_agent_analytics.evaluators import LLMAsJudge +from bigquery_agent_analytics.aggregate_grader import AggregateGrader +from bigquery_agent_analytics.aggregate_grader import AggregateVerdict +from bigquery_agent_analytics.aggregate_grader import BinaryStrategy +from bigquery_agent_analytics.aggregate_grader import GraderResult +from bigquery_agent_analytics.aggregate_grader import MajorityStrategy +from bigquery_agent_analytics.aggregate_grader import WeightedStrategy from bigquery_agent_analytics.evaluators import SessionScore -from bigquery_agent_analytics.grader_pipeline import AggregateVerdict -from bigquery_agent_analytics.grader_pipeline import BinaryStrategy -from 
bigquery_agent_analytics.grader_pipeline import GraderPipeline -from bigquery_agent_analytics.grader_pipeline import GraderResult -from bigquery_agent_analytics.grader_pipeline import MajorityStrategy -from bigquery_agent_analytics.grader_pipeline import WeightedStrategy +from bigquery_agent_analytics.evaluators import SystemEvaluator +from bigquery_agent_analytics.performance_evaluator import PerformanceEvaluator +import pytest # ------------------------------------------------------------------ # # Tests for WeightedStrategy # @@ -155,19 +154,43 @@ def test_empty_results(self): assert verdict.strategy_name == "majority" + +class TestAggregateVerdict: + """Tests for AggregateVerdict data model.""" + + def test_verdict_properties(self): + results = [ + GraderResult(grader_name="latency", scores={"latency": 0.9}, passed=True), + GraderResult(grader_name="correctness", scores={"correctness": 0.8}, passed=True), + ] + verdict = AggregateVerdict( + passed=True, + final_score=0.85, + grader_results=results, + strategy_name="weighted", + ) + + assert verdict.passed is True + assert verdict.final_score == 0.85 + assert len(verdict.grader_results) == 2 + assert verdict.strategy_name == "weighted" + assert verdict.grader_results[0].grader_name == "latency" + assert verdict.grader_results[1].grader_name == "correctness" + + # ------------------------------------------------------------------ # # Tests for GraderPipeline # # ------------------------------------------------------------------ # -class TestGraderPipeline: - """Tests for GraderPipeline.""" +class TestAggregateGrader: + """Tests for AggregateGrader.""" @pytest.mark.asyncio - async def test_code_grader(self): - """Test pipeline with a code grader.""" - pipeline = GraderPipeline(WeightedStrategy(threshold=0.5)).add_code_grader( - CodeEvaluator.latency(threshold_ms=5000) + async def test_system_grader(self): + """Test pipeline with a system grader.""" + pipeline = AggregateGrader(WeightedStrategy(threshold=0.5)).add_system_grader( + SystemEvaluator.latency(threshold_ms=5000) ) verdict = await pipeline.evaluate( @@ -182,10 +205,10 @@ async def test_code_grader(self): assert verdict.grader_results[0].grader_name == "latency_evaluator" @pytest.mark.asyncio - async def test_llm_grader_mocked(self): - """Test pipeline with a mocked LLM grader.""" - judge = LLMAsJudge(name="mock_judge") - judge.evaluate_session = AsyncMock( + async def test_performance_evaluator_mocked(self): + """Test pipeline with a mocked PerformanceEvaluator.""" + evaluator = PerformanceEvaluator(name="mock_evaluator") + evaluator.evaluate_session = AsyncMock( return_value=SessionScore( session_id="", scores={"correctness": 0.8}, @@ -193,9 +216,9 @@ async def test_llm_grader_mocked(self): ) ) - pipeline = GraderPipeline(WeightedStrategy(threshold=0.5)).add_llm_grader( - judge - ) + pipeline = AggregateGrader( + WeightedStrategy(threshold=0.5) + ).add_performance_grader(evaluator) verdict = await pipeline.evaluate( trace_text="User: hi", @@ -216,7 +239,7 @@ def my_grader(ctx): passed=True, ) - pipeline = GraderPipeline( + pipeline = AggregateGrader( WeightedStrategy(threshold=0.5) ).add_custom_grader("custom", my_grader) @@ -227,9 +250,9 @@ def my_grader(ctx): @pytest.mark.asyncio async def test_mixed_graders(self): - """Test pipeline with code + LLM graders.""" - judge = LLMAsJudge(name="mock_judge") - judge.evaluate_session = AsyncMock( + """Test pipeline with system and performance graders.""" + evaluator = PerformanceEvaluator(name="mock_evaluator") + 
evaluator.evaluate_session = AsyncMock( return_value=SessionScore( session_id="", scores={"correctness": 0.9}, @@ -238,9 +261,9 @@ async def test_mixed_graders(self): ) pipeline = ( - GraderPipeline(BinaryStrategy()) - .add_code_grader(CodeEvaluator.latency(threshold_ms=5000)) - .add_llm_grader(judge) + AggregateGrader(BinaryStrategy()) + .add_system_grader(SystemEvaluator.latency(threshold_ms=5000)) + .add_performance_grader(evaluator) ) verdict = await pipeline.evaluate( @@ -259,9 +282,9 @@ async def test_mixed_graders(self): async def test_chaining_api(self): """Test fluent builder chaining.""" pipeline = ( - GraderPipeline(WeightedStrategy()) - .add_code_grader(CodeEvaluator.latency()) - .add_code_grader(CodeEvaluator.error_rate()) + AggregateGrader(WeightedStrategy()) + .add_system_grader(SystemEvaluator.latency()) + .add_system_grader(SystemEvaluator.error_rate()) ) # Verify chaining works assert len(pipeline._graders) == 2 @@ -273,7 +296,7 @@ async def test_grader_exception_handled(self): def bad_grader(ctx): raise ValueError("boom") - pipeline = GraderPipeline( + pipeline = AggregateGrader( WeightedStrategy(threshold=0.5) ).add_custom_grader("bad", bad_grader) diff --git a/tests/test_client_labels.py b/tests/test_client_labels.py index 8723476..89a0f06 100644 --- a/tests/test_client_labels.py +++ b/tests/test_client_labels.py @@ -191,8 +191,6 @@ def test_ai_generate_judge_labels_with_ai_generate(self): for c in mock_bq.query.call_args_list if c.kwargs.get("job_config") and dict(c.kwargs["job_config"].labels or {}).get("sdk_feature") - == "eval-llm-judge" + == "eval-performance" ] - assert judge_calls, "no query labeled with sdk_feature=eval-llm-judge" - judge_labels = dict(judge_calls[0].kwargs["job_config"].labels or {}) - assert judge_labels.get("sdk_ai_function") == "ai-generate" + assert judge_calls, "no query labeled with sdk_feature=eval-performance" diff --git a/tests/test_multi_trial.py b/tests/test_multi_trial_performance_evaluator.py similarity index 84% rename from tests/test_multi_trial.py rename to tests/test_multi_trial_performance_evaluator.py index 73ffe1a..d93d9bf 100644 --- a/tests/test_multi_trial.py +++ b/tests/test_multi_trial_performance_evaluator.py @@ -20,15 +20,15 @@ import pytest -from bigquery_agent_analytics.multi_trial import compute_pass_at_k -from bigquery_agent_analytics.multi_trial import compute_pass_pow_k -from bigquery_agent_analytics.multi_trial import MultiTrialReport -from bigquery_agent_analytics.multi_trial import TrialResult -from bigquery_agent_analytics.multi_trial import TrialRunner -from bigquery_agent_analytics.trace_evaluator import BigQueryTraceEvaluator -from bigquery_agent_analytics.trace_evaluator import EvalStatus -from bigquery_agent_analytics.trace_evaluator import EvaluationResult -from bigquery_agent_analytics.trace_evaluator import MatchType +from bigquery_agent_analytics.multi_trial_performance_evaluator import compute_pass_at_k +from bigquery_agent_analytics.multi_trial_performance_evaluator import compute_pass_pow_k +from bigquery_agent_analytics.multi_trial_performance_evaluator import MultiTrialReport +from bigquery_agent_analytics.multi_trial_performance_evaluator import TrialResult +from bigquery_agent_analytics.multi_trial_performance_evaluator import MultiTrialPerformanceEvaluator +from bigquery_agent_analytics.performance_evaluator import PerformanceEvaluator +from bigquery_agent_analytics.performance_evaluator import EvalStatus +from bigquery_agent_analytics.performance_evaluator import EvaluationResult +from 
bigquery_agent_analytics.performance_evaluator import MatchType # ------------------------------------------------------------------ # # Tests for compute_pass_at_k # @@ -147,12 +147,12 @@ def test_defaults(self): # ------------------------------------------------------------------ # -class TestTrialRunner: - """Tests for TrialRunner class.""" +class TestMultiTrialPerformanceEvaluator: + """Tests for MultiTrialPerformanceEvaluator class.""" def _make_evaluator(self, results): """Creates a mock evaluator returning given results.""" - evaluator = MagicMock(spec=BigQueryTraceEvaluator) + evaluator = MagicMock(spec=PerformanceEvaluator) evaluator.evaluate_session = AsyncMock(side_effect=results) return evaluator @@ -177,7 +177,7 @@ async def test_run_trials_mixed(self): ), ] evaluator = self._make_evaluator(results) - runner = TrialRunner(evaluator, num_trials=3, concurrency=1) + runner = MultiTrialPerformanceEvaluator(evaluator, num_trials=3, concurrency=1) report = await runner.run_trials(session_id="s1") @@ -202,7 +202,7 @@ async def test_run_trials_all_pass(self): for _ in range(3) ] evaluator = self._make_evaluator(results) - runner = TrialRunner(evaluator, num_trials=3, concurrency=3) + runner = MultiTrialPerformanceEvaluator(evaluator, num_trials=3, concurrency=3) report = await runner.run_trials(session_id="s1") @@ -222,7 +222,7 @@ async def test_run_trials_all_fail(self): for _ in range(3) ] evaluator = self._make_evaluator(results) - runner = TrialRunner(evaluator, num_trials=3, concurrency=3) + runner = MultiTrialPerformanceEvaluator(evaluator, num_trials=3, concurrency=3) report = await runner.run_trials(session_id="s1") @@ -258,7 +258,7 @@ async def test_run_trials_batch(self): ), ] evaluator = self._make_evaluator(results) - runner = TrialRunner(evaluator, num_trials=2, concurrency=1) + runner = MultiTrialPerformanceEvaluator(evaluator, num_trials=2, concurrency=1) dataset = [ {"session_id": "s1"}, @@ -275,8 +275,8 @@ async def test_run_trials_batch(self): @pytest.mark.asyncio async def test_run_trials_zero_results(self): """Test edge case with 0 trials.""" - evaluator = MagicMock(spec=BigQueryTraceEvaluator) - runner = TrialRunner(evaluator, num_trials=0, concurrency=1) + evaluator = MagicMock(spec=PerformanceEvaluator) + runner = MultiTrialPerformanceEvaluator(evaluator, num_trials=0, concurrency=1) report = await runner.run_trials(session_id="s1") diff --git a/tests/test_trace_evaluator.py b/tests/test_performance_evaluator.py similarity index 88% rename from tests/test_trace_evaluator.py rename to tests/test_performance_evaluator.py index e4d8fea..bfa0992 100644 --- a/tests/test_trace_evaluator.py +++ b/tests/test_performance_evaluator.py @@ -22,15 +22,15 @@ import pytest -from bigquery_agent_analytics.trace_evaluator import BigQueryTraceEvaluator -from bigquery_agent_analytics.trace_evaluator import EvalStatus -from bigquery_agent_analytics.trace_evaluator import MatchType -from bigquery_agent_analytics.trace_evaluator import ReplayContext -from bigquery_agent_analytics.trace_evaluator import SessionTrace -from bigquery_agent_analytics.trace_evaluator import ToolCall -from bigquery_agent_analytics.trace_evaluator import TraceEvent -from bigquery_agent_analytics.trace_evaluator import TraceReplayRunner -from bigquery_agent_analytics.trace_evaluator import TrajectoryMetrics +from bigquery_agent_analytics.performance_evaluator import PerformanceEvaluator +from bigquery_agent_analytics.performance_evaluator import EvalStatus +from bigquery_agent_analytics.performance_evaluator 
import MatchType +from bigquery_agent_analytics.performance_evaluator import ReplayContext +from bigquery_agent_analytics.performance_evaluator import SessionTrace +from bigquery_agent_analytics.performance_evaluator import ToolCall +from bigquery_agent_analytics.performance_evaluator import TraceEvent +from bigquery_agent_analytics.performance_evaluator import TraceReplayRunner +from bigquery_agent_analytics.performance_evaluator import TrajectoryMetrics class TestTraceEvent: @@ -335,8 +335,8 @@ def test_step_efficiency_more_steps(self): assert score == 0.5 -class TestBigQueryTraceEvaluator: - """Tests for BigQueryTraceEvaluator class.""" +class TestPerformanceEvaluator: + """Tests for PerformanceEvaluator class.""" @pytest.fixture def mock_client(self): @@ -346,7 +346,7 @@ def mock_client(self): @pytest.fixture def evaluator(self, mock_client): """Create evaluator with mock client.""" - return BigQueryTraceEvaluator( + return PerformanceEvaluator( project_id="test-project", dataset_id="test-dataset", table_id="test-table", @@ -429,7 +429,7 @@ def test_vanilla_client_emits_warn_once(self, caplog): vanilla = bigquery.Client( project="test-project", credentials=AnonymousCredentials() ) - evaluator = BigQueryTraceEvaluator( + evaluator = PerformanceEvaluator( project_id="test-project", dataset_id="test-dataset", client=vanilla, @@ -496,32 +496,57 @@ async def test_evaluate_session_with_trajectory(self, evaluator, mock_client): assert result.eval_status == EvalStatus.PASSED assert "trajectory_exact_match" in result.scores assert result.scores["trajectory_exact_match"] == 1.0 - assert "response_match" in result.scores - assert result.scores["response_match"] == 1.0 - - def test_compute_response_match_exact(self, evaluator): - """Test exact response matching.""" - score = evaluator._compute_response_match( - "Hello world", - "Hello world", - ) - assert score == 1.0 - def test_compute_response_match_partial(self, evaluator): - """Test partial response matching.""" - score = evaluator._compute_response_match( - "Hello world today", - "Hello world", + def test_evaluate_deterministic_trajectory(self, evaluator): + """Test evaluate_deterministic_trajectory directly.""" + actual = [ + ToolCall(tool_name="search", args={"q": "weather"}), + ] + trace = SessionTrace( + session_id="sess-123", + user_id=None, + events=[], + tool_calls=actual, ) - assert score == 1.0 # All expected words present + golden = [{"tool_name": "search", "args": {"q": "weather"}}] + scores = evaluator.evaluate_deterministic_trajectory( + trace=trace, + golden_trajectory=golden, + match_type=MatchType.EXACT, + ) + + assert scores["trajectory_exact_match"] == 1.0 + assert scores["step_efficiency"] == 1.0 - def test_compute_response_match_different(self, evaluator): - """Test different responses.""" - score = evaluator._compute_response_match( - "Goodbye moon", - "Hello world", + @pytest.mark.asyncio + async def test_llm_judge_evaluate_one_sided(self, evaluator): + """Test llm_judge_evaluate directly for one-sided evaluation.""" + trace = SessionTrace( + session_id="sess-123", + user_id=None, + events=[], + tool_calls=[], + final_response="Hello Seattle!", ) - assert score == 0.0 + + # Mock genai.Client and generate_content + mock_response = MagicMock() + mock_response.text = '{"sentiment": 8, "hallucination": 10, "justification": "Sunny tone"}' + + mock_client_instance = MagicMock() + mock_client_instance.aio.models.generate_content = AsyncMock(return_value=mock_response) + + with patch("google.genai.Client", 
return_value=mock_client_instance): + scores, feedback = await evaluator.llm_judge_evaluate( + trace=trace, + task_description="Support weather greeting.", + expected_trajectory=None, + golden_response=None, + ) + + assert scores["llm_judge_sentiment"] == 0.8 + assert scores["llm_judge_hallucination"] == 1.0 + assert "Sunny tone" in feedback class TestReplayContext: @@ -554,7 +579,7 @@ class TestTraceReplayRunner: @pytest.fixture def mock_evaluator(self): """Create mock evaluator.""" - evaluator = MagicMock(spec=BigQueryTraceEvaluator) + evaluator = MagicMock(spec=PerformanceEvaluator) return evaluator @pytest.fixture diff --git a/tests/test_pr16_fixes.py b/tests/test_pr16_fixes.py index d985d9c..216ddfb 100644 --- a/tests/test_pr16_fixes.py +++ b/tests/test_pr16_fixes.py @@ -139,125 +139,7 @@ def test_hitl_empty_results(self): # ================================================================== # -# Issue 2 (P0): Multi-criterion merge missing criteria # -# ================================================================== # - - -class TestMergeCriterionMissingCriteria: - """Missing criteria default to 0.0 and should fail.""" - - def test_missing_criterion_fails_session(self): - from bigquery_agent_analytics.evaluators import _JudgeCriterion - - c1 = _JudgeCriterion( - name="correctness", - prompt_template="", - score_key="correctness", - threshold=0.5, - ) - c2 = _JudgeCriterion( - name="helpfulness", - prompt_template="", - score_key="helpfulness", - threshold=0.5, - ) - - # Only c1 produced scores for session s1 - report1 = EvaluationReport( - dataset="test", - evaluator_name="judge", - total_sessions=1, - passed_sessions=1, - failed_sessions=0, - session_scores=[ - SessionScore( - session_id="s1", - scores={"correctness": 0.8}, - passed=True, - ) - ], - ) - # c2 produced no scores for s1 (empty report) - report2 = EvaluationReport( - dataset="test", - evaluator_name="judge", - total_sessions=0, - passed_sessions=0, - failed_sessions=0, - session_scores=[], - ) - - merged = _merge_criterion_reports( - "judge", - "test", - [c1, c2], - [(c1, report1), (c2, report2)], - ) - - # s1 should FAIL because helpfulness is missing (defaults to 0.0) - assert merged.total_sessions == 1 - assert merged.session_scores[0].passed is False - - def test_all_criteria_present_passes(self): - from bigquery_agent_analytics.evaluators import _JudgeCriterion - - c1 = _JudgeCriterion( - name="correctness", - prompt_template="", - score_key="correctness", - threshold=0.5, - ) - c2 = _JudgeCriterion( - name="helpfulness", - prompt_template="", - score_key="helpfulness", - threshold=0.5, - ) - - report1 = EvaluationReport( - dataset="test", - evaluator_name="judge", - total_sessions=1, - passed_sessions=1, - failed_sessions=0, - session_scores=[ - SessionScore( - session_id="s1", - scores={"correctness": 0.8}, - passed=True, - ) - ], - ) - report2 = EvaluationReport( - dataset="test", - evaluator_name="judge", - total_sessions=1, - passed_sessions=1, - failed_sessions=0, - session_scores=[ - SessionScore( - session_id="s1", - scores={"helpfulness": 0.7}, - passed=True, - ) - ], - ) - - merged = _merge_criterion_reports( - "judge", - "test", - [c1, c2], - [(c1, report1), (c2, report2)], - ) - - assert merged.total_sessions == 1 - assert merged.session_scores[0].passed is True - assert merged.session_scores[0].scores["correctness"] == 0.8 - assert merged.session_scores[0].scores["helpfulness"] == 0.7 - - -# ================================================================== # -# Issue 3 (P0): _run_sync() works in and 
out of event loops # +# Issue 2 (P0): _run_sync() works in and out of event loops # # ================================================================== # @@ -287,7 +169,7 @@ async def outer(): # ================================================================== # -# Issue 4 (P1): Canonical error predicates # +# Issue 3 (P1): Canonical error predicates # # ================================================================== # @@ -392,7 +274,7 @@ def test_feedback_unanswered_query_uses_canonical_predicate(self): # ================================================================== # -# Issue 5 (P1): Response-source logic (LLM_RESPONSE first) # +# Issue 4 (P1): Response-source logic (LLM_RESPONSE first) # # ================================================================== # @@ -463,7 +345,7 @@ def test_ai_ml_index_query_includes_llm_response(self): # ================================================================== # -# Issue 6/Feature 3: Semantic drift implementation # +# Issue 5/Feature 3: Semantic drift implementation # # ================================================================== # @@ -537,7 +419,7 @@ def mock_query(*args, **kwargs): # ================================================================== # -# Issue 7 (P2): get_trace uses trace_id in docs # +# Issue 6 (P2): get_trace uses trace_id in docs # # ================================================================== # @@ -563,7 +445,7 @@ def test_sdk_md_default_table_id(self): # ================================================================== # -# Issue 8 (P2): Strict-mode parse_errors aggregate # +# Issue 7 (P2): Strict-mode parse_errors aggregate # # ================================================================== # @@ -628,7 +510,7 @@ def test_no_parse_errors_when_all_have_scores(self): # ================================================================== # -# Issue 9 (P2): GCS offload docstring # +# Issue 8 (P2): GCS offload docstring # # ================================================================== # diff --git a/tests/test_sdk_client.py b/tests/test_sdk_client.py index 74cf3da..cdbe42b 100644 --- a/tests/test_sdk_client.py +++ b/tests/test_sdk_client.py @@ -16,7 +16,7 @@ from datetime import datetime from datetime import timezone -from unittest.mock import MagicMock +from unittest.mock import MagicMock, AsyncMock from unittest.mock import patch import pytest @@ -387,647 +387,7 @@ def test_legacy_model_three_dots(self): assert Client._is_legacy_model_ref("a.b.c.d") -class TestAIGenerateJudge: - """Tests for Client._ai_generate_judge().""" - def test_ai_generate_judge_typed_columns(self): - mock_bq = _mock_bq_client() - mock_rows = [ - _make_mock_row( - { - "session_id": "s1", - "trace_text": "USER: hi", - "final_response": "hello", - "score": 8, - "justification": "Good response", - } - ), - _make_mock_row( - { - "session_id": "s2", - "trace_text": "USER: bye", - "final_response": "goodbye", - "score": 3, - "justification": "Incomplete", - } - ), - ] - mock_job = MagicMock() - mock_job.result.return_value = mock_rows - mock_bq.query.return_value = mock_job - - client = Client( - project_id="proj", - dataset_id="ds", - verify_schema=False, - bq_client=mock_bq, - ) - - from bigquery_agent_analytics.evaluators import _JudgeCriterion - from bigquery_agent_analytics.evaluators import LLMAsJudge - - evaluator = LLMAsJudge.correctness(threshold=0.5) - criterion = evaluator._criteria[0] - - report = client._ai_generate_judge( - evaluator, - criterion, - "agent_events_v2", - "TRUE", - [], - ) - assert report.total_sessions 
== 2 - assert report.session_scores[0].scores["correctness"] == 0.8 - assert report.session_scores[1].scores["correctness"] == 0.3 - assert report.session_scores[0].passed is True - assert report.session_scores[1].passed is False - - def test_fallback_chain_tries_ai_generate_first(self): - """Verify _evaluate_llm_judge tries AI.GENERATE first.""" - mock_bq = _mock_bq_client() - mock_rows = [ - _make_mock_row( - { - "session_id": "s1", - "trace_text": "USER: hi", - "final_response": "hello", - "score": 7, - "justification": "OK", - } - ), - ] - mock_job = MagicMock() - mock_job.result.return_value = mock_rows - mock_bq.query.return_value = mock_job - - client = Client( - project_id="proj", - dataset_id="ds", - verify_schema=False, - bq_client=mock_bq, - ) - from bigquery_agent_analytics.evaluators import LLMAsJudge - - evaluator = LLMAsJudge.correctness() - report = client._evaluate_llm_judge( - evaluator, - "agent_events", - "TRUE", - [], - ) - # Should have gotten a result from AI.GENERATE path - assert report.total_sessions == 1 - # Verify AI.GENERATE query was used (contains endpoint) - call_args = mock_bq.query.call_args - query_str = call_args[0][0] - assert "AI.GENERATE" in query_str - - def test_ai_generate_passes_full_prompt_template(self): - """AI.GENERATE judge passes the full Python template, not a truncated split. - - Regression guard for the prompt-parity bug — earlier versions - sent only ``prompt_template.split('{trace_text}')[0]`` to AI.GENERATE, - silently dropping the per-criterion output-format spec that - follows the placeholders. This test asserts the BQ query receives - three parameters (prefix/middle/suffix) and that, concatenated - with the SQL trace_text/final_response columns, they reproduce - the exact Python template. - """ - mock_bq = _mock_bq_client() - mock_rows = [ - _make_mock_row( - { - "session_id": "s1", - "trace_text": "USER: hi", - "final_response": "hello", - "score": 8, - "justification": "ok", - } - ), - ] - mock_job = MagicMock() - mock_job.result.return_value = mock_rows - mock_bq.query.return_value = mock_job - - client = Client( - project_id="proj", - dataset_id="ds", - verify_schema=False, - bq_client=mock_bq, - ) - from bigquery_agent_analytics.evaluators import LLMAsJudge - - evaluator = LLMAsJudge.correctness(threshold=0.5) - criterion = evaluator._criteria[0] - client._ai_generate_judge(evaluator, criterion, "agent_events", "TRUE", []) - - # Inspect the QueryJobConfig.query_parameters that landed on the - # BigQuery client. - call_kwargs = mock_bq.query.call_args.kwargs - job_config = call_kwargs["job_config"] - by_name = {p.name: p.value for p in job_config.query_parameters} - assert "judge_prompt_prefix" in by_name - assert "judge_prompt_middle" in by_name - assert "judge_prompt_suffix" in by_name - # ``judge_prompt`` (the old single-segment param) must no longer - # appear — its presence would mean a caller is still on the - # truncated-split path. - assert "judge_prompt" not in by_name - # Concatenation reproduces the full template when the - # placeholders are filled in. - reconstructed = ( - by_name["judge_prompt_prefix"] - + "TRACE_HERE" - + by_name["judge_prompt_middle"] - + "RESPONSE_HERE" - + by_name["judge_prompt_suffix"] - ) - expected = criterion.prompt_template.format( - trace_text="TRACE_HERE", final_response="RESPONSE_HERE" - ) - assert reconstructed == expected - # The Python template's per-criterion output-format spec must - # survive the round trip — that's the whole point of the fix. 
- assert "JSON object" in by_name["judge_prompt_suffix"] - - def test_bqml_judge_passes_full_prompt_template(self): - """ML.GENERATE_TEXT path uses the same prefix/middle/suffix params.""" - mock_bq = _mock_bq_client() - mock_rows = [ - _make_mock_row( - { - "session_id": "s1", - "trace_text": "USER: hi", - "final_response": "hello", - "evaluation": '{"correctness": 8, "justification": "ok"}', - } - ), - ] - mock_job = MagicMock() - mock_job.result.return_value = mock_rows - mock_bq.query.return_value = mock_job - - client = Client( - project_id="proj", - dataset_id="ds", - verify_schema=False, - bq_client=mock_bq, - ) - from bigquery_agent_analytics.evaluators import LLMAsJudge - - evaluator = LLMAsJudge.correctness(threshold=0.5) - criterion = evaluator._criteria[0] - client._bqml_judge( - evaluator, - criterion, - "agent_events", - "TRUE", - [], - text_model="proj.ds.gemini_text_model", - ) - - job_config = mock_bq.query.call_args.kwargs["job_config"] - names = {p.name for p in job_config.query_parameters} - assert { - "judge_prompt_prefix", - "judge_prompt_middle", - "judge_prompt_suffix", - }.issubset(names) - assert "judge_prompt" not in names - - def test_ai_generate_success_sets_execution_mode(self): - """When AI.GENERATE succeeds, report.details says so explicitly.""" - mock_bq = _mock_bq_client() - mock_rows = [ - _make_mock_row( - { - "session_id": "s1", - "trace_text": "USER: hi", - "final_response": "hello", - "score": 8, - "justification": "ok", - } - ), - ] - mock_job = MagicMock() - mock_job.result.return_value = mock_rows - mock_bq.query.return_value = mock_job - - client = Client( - project_id="proj", - dataset_id="ds", - verify_schema=False, - bq_client=mock_bq, - ) - from bigquery_agent_analytics.evaluators import LLMAsJudge - - report = client._evaluate_llm_judge( - LLMAsJudge.correctness(), "agent_events", "TRUE", [] - ) - assert report.details["execution_mode"] == "ai_generate" - # No fallback fired -> no fallback_reason on the report. - assert "fallback_reason" not in report.details - - def test_ai_generate_failure_falls_back_to_ml_generate_text(self): - """AI.GENERATE failure -> BQML path takes over, mode reflects it.""" - mock_bq = _mock_bq_client() - - # First query call raises (AI.GENERATE), subsequent calls return - # a BQML-shaped row. - bqml_rows = [ - _make_mock_row( - { - "session_id": "s1", - "trace_text": "USER: hi", - "final_response": "hello", - "evaluation": '{"correctness": 8, "justification": "ok"}', - } - ), - ] - bqml_job = MagicMock() - bqml_job.result.return_value = bqml_rows - mock_bq.query.side_effect = [ - Exception("AI.GENERATE not available in this region"), - bqml_job, - ] - - client = Client( - project_id="proj", - dataset_id="ds", - verify_schema=False, - bq_client=mock_bq, - ) - from bigquery_agent_analytics.evaluators import LLMAsJudge - - report = client._evaluate_llm_judge( - LLMAsJudge.correctness(), "agent_events", "TRUE", [] - ) - assert report.details["execution_mode"] == "ml_generate_text" - assert "ai_generate" in report.details["fallback_reason"] - assert "AI.GENERATE not available" in report.details["fallback_reason"] - - def test_both_bq_paths_fail_falls_back_to_api(self): - """AI.GENERATE + ML.GENERATE_TEXT both fail -> Gemini API fallback.""" - mock_bq = _mock_bq_client() - mock_bq.query.side_effect = Exception("connection missing") - - client = Client( - project_id="proj", - dataset_id="ds", - verify_schema=False, - bq_client=mock_bq, - ) - # Stub _api_judge to avoid an actual google-genai call. 
The - # method exists on the client; we just want execution_mode set. - from bigquery_agent_analytics.evaluators import EvaluationReport - from bigquery_agent_analytics.evaluators import LLMAsJudge - - stub_report = EvaluationReport( - dataset="proj.ds.agent_events WHERE TRUE", - evaluator_name="correctness_judge", - total_sessions=0, - ) - with patch.object(client, "_api_judge", return_value=stub_report): - report = client._evaluate_llm_judge( - LLMAsJudge.correctness(), "agent_events", "TRUE", [] - ) - assert report.details["execution_mode"] == "api_fallback" - # Both upstream tiers should be named in the fallback chain. - assert "ai_generate" in report.details["fallback_reason"] - assert "ml_generate_text" in report.details["fallback_reason"] - - -class TestSplitJudgePromptTemplate: - """Tests for evaluators.split_judge_prompt_template.""" - - def test_full_template_round_trips(self): - from bigquery_agent_analytics.evaluators import split_judge_prompt_template - - tmpl = ( - "You are a judge.\n## Trace\n{trace_text}\n## Response\n" - "{final_response}\n## Score\nReturn JSON." - ) - prefix, middle, suffix = split_judge_prompt_template(tmpl) - rebuilt = prefix + "TT" + middle + "FR" + suffix - assert rebuilt == tmpl.format(trace_text="TT", final_response="FR") - - def test_missing_final_response_keeps_label_next_to_response(self): - """Custom template with {trace_text} only — Response: label must - precede the appended response value. - - The SQL CONCAT runs prefix ++ trace_text ++ middle ++ - final_response ++ suffix, so a synthesized label for the - missing placeholder belongs *immediately before* the value it - labels — not on the far side of it. - """ - from bigquery_agent_analytics.evaluators import split_judge_prompt_template - - tmpl = "Prefix\n{trace_text}\nThen something." - prefix, middle, suffix = split_judge_prompt_template(tmpl) - rebuilt = prefix + "TRACE" + middle + "RESPONSE" + suffix - assert "Response:\nRESPONSE" in rebuilt - # Whatever followed {trace_text} in the original template - # appears before the synthesized response label. - assert "Then something." in rebuilt - assert rebuilt.index("Then something.") < rebuilt.index( - "Response:\nRESPONSE" - ) - - def test_missing_trace_text_keeps_label_next_to_trace(self): - """Custom template with {final_response} only — Trace: label must - precede the appended trace value. - - Regression guard for the reviewer-flagged bug: earlier versions - returned ``("", "\\nTrace:\\n" + before_response, suffix)`` which - injected ``\\nTrace:\\n...`` — trace - landed on the wrong side of the label, and the user's prompt - text appeared after the trace instead of before it. - """ - from bigquery_agent_analytics.evaluators import split_judge_prompt_template - - tmpl = "Custom rules.\n{final_response}\nDone." - prefix, middle, suffix = split_judge_prompt_template(tmpl) - rebuilt = prefix + "TRACE" + middle + "RESPONSE" + suffix - # User's "Custom rules." prose appears before the synthesized - # Trace: label, and the trace value sits right after the label. - assert "Custom rules.\n" in rebuilt - assert "Trace:\nTRACE" in rebuilt - assert rebuilt.index("Custom rules.") < rebuilt.index("Trace:\nTRACE") - # Response follows the trace, and the user's "Done." tail - # appears after the response. 
- assert rebuilt.index("Trace:\nTRACE") < rebuilt.index("RESPONSE") - assert rebuilt.index("RESPONSE") < rebuilt.index("Done.") - - def test_no_placeholders_appends_labeled_trace_then_response(self): - """Template with neither placeholder — labels precede their values. - - Original instructions stay first; trace block comes next with - its label; response block comes last with its label. - """ - from bigquery_agent_analytics.evaluators import split_judge_prompt_template - - tmpl = "Just instructions, no placeholders." - prefix, middle, suffix = split_judge_prompt_template(tmpl) - rebuilt = prefix + "TRACE" + middle + "RESPONSE" + suffix - assert rebuilt.startswith(tmpl) - assert "Trace:\nTRACE" in rebuilt - assert "Response:\nRESPONSE" in rebuilt - assert rebuilt.index("Trace:\nTRACE") < rebuilt.index("Response:\nRESPONSE") - - -class TestMultiCriterionJudge: - """Tests for multi-criterion LLM judge (Fix #1).""" - - def test_all_criteria_evaluated(self): - """Verify all criteria are evaluated, not just the first.""" - mock_bq = _mock_bq_client() - mock_rows = [ - _make_mock_row( - { - "session_id": "s1", - "trace_text": "USER: hi", - "final_response": "hello", - "score": 8, - "justification": "Good", - } - ), - ] - mock_job = MagicMock() - mock_job.result.return_value = mock_rows - mock_bq.query.return_value = mock_job - - client = Client( - project_id="proj", - dataset_id="ds", - verify_schema=False, - bq_client=mock_bq, - ) - - from bigquery_agent_analytics.evaluators import LLMAsJudge - - # Build evaluator with TWO criteria - judge = LLMAsJudge(name="multi_judge") - judge.add_criterion( - name="correctness", - prompt_template="Score correctness.\n{trace_text}\n{final_response}", - score_key="correctness", - threshold=0.5, - ) - judge.add_criterion( - name="helpfulness", - prompt_template="Score helpfulness.\n{trace_text}\n{final_response}", - score_key="helpfulness", - threshold=0.5, - ) - - report = client._evaluate_llm_judge( - judge, - "agent_events", - "TRUE", - [], - ) - - # AI.GENERATE should be called twice (once per criterion) - assert mock_bq.query.call_count == 2 - # Session should have scores from both criteria - assert report.total_sessions == 1 - ss = report.session_scores[0] - assert "correctness" in ss.scores or "helpfulness" in ss.scores - - def test_empty_criteria_returns_empty_report(self): - mock_bq = _mock_bq_client() - client = Client( - project_id="proj", - dataset_id="ds", - verify_schema=False, - bq_client=mock_bq, - ) - from bigquery_agent_analytics.evaluators import LLMAsJudge - - judge = LLMAsJudge(name="empty") - report = client._evaluate_llm_judge( - judge, - "agent_events", - "TRUE", - [], - ) - assert report.total_sessions == 0 - - -class TestFalsePassFix: - """Tests for empty scores false pass fix (Fix #2).""" - - def test_empty_score_fails(self): - """Session with no parseable score should NOT pass.""" - mock_bq = _mock_bq_client() - # Return row with score=None (unparseable) - mock_rows = [ - _make_mock_row( - { - "session_id": "s1", - "trace_text": "USER: hi", - "final_response": "hello", - "score": None, - "justification": "", - } - ), - ] - mock_job = MagicMock() - mock_job.result.return_value = mock_rows - mock_bq.query.return_value = mock_job - - client = Client( - project_id="proj", - dataset_id="ds", - verify_schema=False, - bq_client=mock_bq, - ) - - from bigquery_agent_analytics.evaluators import _JudgeCriterion - from bigquery_agent_analytics.evaluators import LLMAsJudge - - evaluator = LLMAsJudge.correctness(threshold=0.5) - criterion = 
evaluator._criteria[0] - - report = client._ai_generate_judge( - evaluator, - criterion, - "agent_events", - "TRUE", - [], - ) - # Empty scores should mean FAILED, not passed - assert report.session_scores[0].passed is False - assert report.session_scores[0].scores == {} - - def test_valid_score_passes(self): - """Session with valid score above threshold should pass.""" - mock_bq = _mock_bq_client() - mock_rows = [ - _make_mock_row( - { - "session_id": "s1", - "trace_text": "USER: hi", - "final_response": "hello", - "score": 8, - "justification": "Good", - } - ), - ] - mock_job = MagicMock() - mock_job.result.return_value = mock_rows - mock_bq.query.return_value = mock_job - - client = Client( - project_id="proj", - dataset_id="ds", - verify_schema=False, - bq_client=mock_bq, - ) - - from bigquery_agent_analytics.evaluators import LLMAsJudge - - evaluator = LLMAsJudge.correctness(threshold=0.5) - criterion = evaluator._criteria[0] - - report = client._ai_generate_judge( - evaluator, - criterion, - "agent_events", - "TRUE", - [], - ) - assert report.session_scores[0].passed is True - assert report.session_scores[0].scores["correctness"] == 0.8 - - -class TestApiJudgeUsesTableParams: - """Tests for API judge using correct table/filter (Fix #3).""" - - def test_api_judge_uses_table_and_where(self): - """_api_judge should query the specified table with WHERE.""" - mock_bq = _mock_bq_client() - # Return empty results (no traces) - mock_job = MagicMock() - mock_job.result.return_value = [] - mock_bq.query.return_value = mock_job - - client = Client( - project_id="proj", - dataset_id="ds", - verify_schema=False, - bq_client=mock_bq, - ) - - from bigquery_agent_analytics.evaluators import LLMAsJudge - - evaluator = LLMAsJudge.correctness() - report = client._api_judge( - evaluator, - "custom_table", - "agent = 'my_agent'", - [], - ) - - # Verify the query used the custom table - call_args = mock_bq.query.call_args - query_str = call_args[0][0] - assert "custom_table" in query_str - assert "my_agent" in query_str - assert report.total_sessions == 0 - - -class TestStrictMode: - """Tests for strict evaluation mode (Feature #3).""" - - def test_strict_mode_marks_empty_as_failed(self): - mock_bq = _mock_bq_client() - # One good score, one empty score - mock_rows = [ - _make_mock_row( - { - "session_id": "s1", - "trace_text": "USER: hi", - "final_response": "hello", - "score": 8, - "justification": "Good", - } - ), - _make_mock_row( - { - "session_id": "s2", - "trace_text": "USER: bye", - "final_response": "goodbye", - "score": None, - "justification": "", - } - ), - ] - mock_job = MagicMock() - mock_job.result.return_value = mock_rows - mock_bq.query.return_value = mock_job - - client = Client( - project_id="proj", - dataset_id="ds", - verify_schema=False, - bq_client=mock_bq, - ) - - from bigquery_agent_analytics.evaluators import LLMAsJudge - - evaluator = LLMAsJudge.correctness(threshold=0.5) - report = client.evaluate( - evaluator=evaluator, - strict=True, - ) - # s1 should pass, s2 should fail (empty scores) - assert report.passed_sessions == 1 - assert report.failed_sessions == 1 - # s2 should have parse_error detail - s2 = [s for s in report.session_scores if s.session_id == "s2"] - assert s2[0].passed is False - assert s2[0].details.get("parse_error") is True class TestAutoDetectTable: @@ -2107,3 +1467,47 @@ def test_custom_table_and_prefix(self): call_kwargs = mock_cls.call_args[1] assert call_kwargs["results_table"] == "my_results" assert call_kwargs["view_prefix"] == "adk_" + + +# 
------------------------------------------------------------------ #
+# PerformanceEvaluator Integration #
+# ------------------------------------------------------------------ #
+
+
+class TestPerformanceEvaluatorClient:
+    """Integration tests for Client.evaluate() with PerformanceEvaluator."""
+
+    @patch("bigquery_agent_analytics.performance_evaluator.PerformanceEvaluator.evaluate_session")
+    def test_evaluate_with_performance_evaluator(self, mock_eval):
+        from bigquery_agent_analytics.performance_evaluator import EvalStatus
+        from bigquery_agent_analytics.performance_evaluator import PerformanceEvaluator
+
+        mock_bq = _mock_bq_client()
+        # Mock list_traces() summary results: two sessions to evaluate.
+        mock_rows = [
+            _make_mock_row({"session_id": "sess-1"}),
+            _make_mock_row({"session_id": "sess-2"}),
+        ]
+        mock_job = MagicMock()
+        mock_job.result.return_value = mock_rows
+        mock_bq.query.return_value = mock_job
+
+        client = Client(
+            project_id="proj",
+            dataset_id="ds",
+            verify_schema=False,
+            bq_client=mock_bq,
+        )
+
+        # Stand-in for the evaluator's result object; only attribute
+        # access is needed by the client-side report builder.
+        mock_eval.return_value = AsyncMock(
+            session_id="sess-1",
+            eval_status=EvalStatus.PASSED,
+            scores={"trajectory_exact_match": 1.0},
+            llm_judge_feedback="Perfect",
+        )
+
+        evaluator = PerformanceEvaluator(project_id="proj", dataset_id="ds")
+        report = client.evaluate(evaluator=evaluator)
+
+        assert report.total_sessions == 2
+        assert report.passed_sessions == 2
+        assert report.details["execution_mode"] == "performance_evaluator"
diff --git a/tests/test_sdk_evaluators.py b/tests/test_system_evaluator.py
similarity index 71%
rename from tests/test_sdk_evaluators.py
rename to tests/test_system_evaluator.py
index 5a40da5..7d107a3 100644
--- a/tests/test_sdk_evaluators.py
+++ b/tests/test_system_evaluator.py
@@ -20,22 +20,17 @@
 import pytest
 
-from bigquery_agent_analytics.evaluators import _parse_json_from_text
-from bigquery_agent_analytics.evaluators import AI_GENERATE_JUDGE_BATCH_QUERY
-from bigquery_agent_analytics.evaluators import CodeEvaluator
-from bigquery_agent_analytics.evaluators import DEFAULT_ENDPOINT
+from bigquery_agent_analytics.system_evaluator import SystemEvaluator
 from bigquery_agent_analytics.evaluators import EvaluationReport
-from bigquery_agent_analytics.evaluators import LLM_JUDGE_BATCH_QUERY
-from bigquery_agent_analytics.evaluators import LLMAsJudge
-from bigquery_agent_analytics.evaluators import SESSION_SUMMARY_QUERY
 from bigquery_agent_analytics.evaluators import SessionScore
+from bigquery_agent_analytics.evaluators import _parse_json_from_text
 
 
-class TestCodeEvaluator:
-    """Tests for CodeEvaluator class."""
+class TestSystemEvaluator:
+    """Tests for SystemEvaluator class."""
 
     def test_custom_metric(self):
-        evaluator = CodeEvaluator(name="test")
+        evaluator = SystemEvaluator(name="test")
         evaluator.add_metric(
             name="custom",
             fn=lambda s: 0.8,
@@ -50,7 +45,7 @@ def test_custom_metric(self):
         assert score.passed is True
 
     def test_custom_metric_fail(self):
-        evaluator = CodeEvaluator(name="test")
+        evaluator = SystemEvaluator(name="test")
         evaluator.add_metric(
             name="custom",
             fn=lambda s: 0.2,
@@ -63,7 +58,7 @@ def test_custom_metric_fail(self):
         assert score.passed is False
 
     def test_metric_exception_handled(self):
-        evaluator = CodeEvaluator(name="test")
+        evaluator = SystemEvaluator(name="test")
         evaluator.add_metric(
             name="broken",
             fn=lambda s: 1 / 0,
@@ -76,7 +71,7 @@ def test_metric_exception_handled(self):
         assert score.passed is False
 
     def test_metric_clamping(self):
-        evaluator = CodeEvaluator(name="test")
+        evaluator = SystemEvaluator(name="test")
         evaluator.add_metric(
             name="over",
             fn=lambda s: 1.5,
@@ -95,7 
+90,7 @@ def test_metric_clamping(self): def test_chaining(self): evaluator = ( - CodeEvaluator(name="chain") + SystemEvaluator(name="chain") .add_metric("a", lambda s: 0.9) .add_metric("b", lambda s: 0.7) ) @@ -104,11 +99,11 @@ def test_chaining(self): assert "b" in score.scores -class TestCodeEvaluatorPrebuilt: - """Tests for pre-built CodeEvaluator factories.""" +class TestSystemEvaluatorPrebuilt: + """Tests for pre-built SystemEvaluator factories.""" def test_latency_pass(self): - evaluator = CodeEvaluator.latency(threshold_ms=5000) + evaluator = SystemEvaluator.latency(threshold_ms=5000) score = evaluator.evaluate_session( { "session_id": "s1", @@ -119,7 +114,7 @@ def test_latency_pass(self): assert score.scores["latency"] == 1.0 def test_latency_fail(self): - evaluator = CodeEvaluator.latency(threshold_ms=1000) + evaluator = SystemEvaluator.latency(threshold_ms=1000) score = evaluator.evaluate_session( { "session_id": "s1", @@ -130,7 +125,7 @@ def test_latency_fail(self): assert score.scores["latency"] == 0.0 def test_latency_zero(self): - evaluator = CodeEvaluator.latency(threshold_ms=5000) + evaluator = SystemEvaluator.latency(threshold_ms=5000) score = evaluator.evaluate_session( { "session_id": "s1", @@ -140,7 +135,7 @@ def test_latency_zero(self): assert score.scores["latency"] == 1.0 def test_turn_count_pass(self): - evaluator = CodeEvaluator.turn_count(max_turns=10) + evaluator = SystemEvaluator.turn_count(max_turns=10) score = evaluator.evaluate_session( { "session_id": "s1", @@ -151,7 +146,7 @@ def test_turn_count_pass(self): assert score.scores["turn_count"] == 1.0 def test_turn_count_fail(self): - evaluator = CodeEvaluator.turn_count(max_turns=5) + evaluator = SystemEvaluator.turn_count(max_turns=5) score = evaluator.evaluate_session( { "session_id": "s1", @@ -161,7 +156,7 @@ def test_turn_count_fail(self): assert score.passed is False def test_error_rate_pass(self): - evaluator = CodeEvaluator.error_rate(max_error_rate=0.1) + evaluator = SystemEvaluator.error_rate(max_error_rate=0.1) score = evaluator.evaluate_session( { "session_id": "s1", @@ -172,7 +167,7 @@ def test_error_rate_pass(self): assert score.passed is True def test_error_rate_fail(self): - evaluator = CodeEvaluator.error_rate(max_error_rate=0.1) + evaluator = SystemEvaluator.error_rate(max_error_rate=0.1) score = evaluator.evaluate_session( { "session_id": "s1", @@ -183,7 +178,7 @@ def test_error_rate_fail(self): assert score.passed is False def test_error_rate_no_calls(self): - evaluator = CodeEvaluator.error_rate(max_error_rate=0.1) + evaluator = SystemEvaluator.error_rate(max_error_rate=0.1) score = evaluator.evaluate_session( { "session_id": "s1", @@ -205,7 +200,7 @@ class TestPrebuiltRawBudgetBoundaries: """ def test_latency_boundary_inclusive(self): - evaluator = CodeEvaluator.latency(threshold_ms=5000) + evaluator = SystemEvaluator.latency(threshold_ms=5000) at_budget = evaluator.evaluate_session( {"session_id": "s1", "avg_latency_ms": 5000} ) @@ -222,7 +217,7 @@ def test_latency_boundary_inclusive(self): def test_latency_old_midpoint_now_passes(self): # The old normalized impl failed at 2501ms with threshold=5000; under # the new impl this is nowhere near the budget and must pass. 
- evaluator = CodeEvaluator.latency(threshold_ms=5000) + evaluator = SystemEvaluator.latency(threshold_ms=5000) score = evaluator.evaluate_session( {"session_id": "s1", "avg_latency_ms": 2501} ) @@ -230,7 +225,7 @@ def test_latency_old_midpoint_now_passes(self): assert score.scores["latency"] == 1.0 def test_turn_count_boundary_inclusive(self): - evaluator = CodeEvaluator.turn_count(max_turns=10) + evaluator = SystemEvaluator.turn_count(max_turns=10) at_budget = evaluator.evaluate_session( {"session_id": "s1", "turn_count": 10} ) @@ -241,13 +236,13 @@ def test_turn_count_boundary_inclusive(self): assert just_over.passed is False def test_turn_count_old_midpoint_now_passes(self): - evaluator = CodeEvaluator.turn_count(max_turns=10) + evaluator = SystemEvaluator.turn_count(max_turns=10) score = evaluator.evaluate_session({"session_id": "s1", "turn_count": 6}) # Old impl: 1.0 - 6/10 = 0.4 -> fail. New: 6 <= 10 -> pass. assert score.passed is True def test_error_rate_boundary_inclusive(self): - evaluator = CodeEvaluator.error_rate(max_error_rate=0.1) + evaluator = SystemEvaluator.error_rate(max_error_rate=0.1) at_budget = evaluator.evaluate_session( {"session_id": "s1", "tool_calls": 10, "tool_errors": 1} ) @@ -258,7 +253,7 @@ def test_error_rate_boundary_inclusive(self): assert just_over.passed is False def test_token_efficiency_boundary_inclusive(self): - evaluator = CodeEvaluator.token_efficiency(max_tokens=50000) + evaluator = SystemEvaluator.token_efficiency(max_tokens=50000) at_budget = evaluator.evaluate_session( {"session_id": "s1", "total_tokens": 50000} ) @@ -269,7 +264,7 @@ def test_token_efficiency_boundary_inclusive(self): assert just_over.passed is False def test_ttft_boundary_inclusive(self): - evaluator = CodeEvaluator.ttft(threshold_ms=1000) + evaluator = SystemEvaluator.ttft(threshold_ms=1000) at_budget = evaluator.evaluate_session( {"session_id": "s1", "avg_ttft_ms": 1000} ) @@ -280,7 +275,7 @@ def test_ttft_boundary_inclusive(self): assert just_over.passed is False def test_cost_per_session_boundary_inclusive(self): - evaluator = CodeEvaluator.cost_per_session( + evaluator = SystemEvaluator.cost_per_session( max_cost_usd=0.01, input_cost_per_1k=0.001, output_cost_per_1k=0.001, @@ -298,7 +293,7 @@ def test_cost_per_session_boundary_inclusive(self): def test_observed_key_and_budget_in_details(self): """Per-metric detail must expose observed/budget for CLI output.""" - evaluator = CodeEvaluator.latency(threshold_ms=5000) + evaluator = SystemEvaluator.latency(threshold_ms=5000) score = evaluator.evaluate_session( {"session_id": "s1", "avg_latency_ms": 6000} ) @@ -310,7 +305,7 @@ def test_observed_key_and_budget_in_details(self): def test_error_rate_observed_fn_in_details(self): """Computed observed (errors/calls) surfaces in details via observed_fn.""" - evaluator = CodeEvaluator.error_rate(max_error_rate=0.1) + evaluator = SystemEvaluator.error_rate(max_error_rate=0.1) score = evaluator.evaluate_session( {"session_id": "s1", "tool_calls": 10, "tool_errors": 5} ) @@ -322,7 +317,7 @@ def test_error_rate_observed_fn_in_details(self): def test_cost_observed_fn_in_details(self): """Computed cost surfaces in details via observed_fn.""" - evaluator = CodeEvaluator.cost_per_session( + evaluator = SystemEvaluator.cost_per_session( max_cost_usd=0.01, input_cost_per_1k=0.001, output_cost_per_1k=0.001, @@ -338,44 +333,9 @@ def test_cost_observed_fn_in_details(self): assert detail["passed"] is False -class TestLLMAsJudgePrebuilt: - """Tests for pre-built LLMAsJudge factories.""" - - def 
test_correctness_factory(self): - judge = LLMAsJudge.correctness(threshold=0.7) - assert judge.name == "correctness_judge" - assert len(judge._criteria) == 1 - assert judge._criteria[0].name == "correctness" - assert judge._criteria[0].threshold == 0.7 - def test_hallucination_factory(self): - judge = LLMAsJudge.hallucination() - assert judge.name == "hallucination_judge" - assert judge._criteria[0].name == "faithfulness" - def test_sentiment_factory(self): - judge = LLMAsJudge.sentiment() - assert judge.name == "sentiment_judge" - assert judge._criteria[0].name == "sentiment" - def test_custom_criterion(self): - judge = LLMAsJudge(name="custom") - judge.add_criterion( - name="helpfulness", - prompt_template="Rate helpfulness: {trace_text} {final_response}", - score_key="helpfulness", - threshold=0.6, - ) - assert len(judge._criteria) == 1 - assert judge._criteria[0].name == "helpfulness" - - def test_chaining(self): - judge = ( - LLMAsJudge(name="multi") - .add_criterion("a", "p1 {trace_text} {final_response}", "a") - .add_criterion("b", "p2 {trace_text} {final_response}", "b") - ) - assert len(judge._criteria) == 2 class TestEvaluationReport: @@ -442,55 +402,14 @@ def test_no_json(self): assert _parse_json_from_text("no json here") is None -class TestDefaultEndpoint: - """Tests for DEFAULT_ENDPOINT constant.""" - - def test_default_endpoint_value(self): - assert DEFAULT_ENDPOINT == "gemini-2.5-flash" - - -class TestAIGenerateJudgeBatchQuery: - """Tests for the AI.GENERATE judge batch query template.""" - - def test_contains_ai_generate(self): - assert "AI.GENERATE" in AI_GENERATE_JUDGE_BATCH_QUERY - def test_contains_output_schema(self): - assert "output_schema" in AI_GENERATE_JUDGE_BATCH_QUERY - - def test_contains_endpoint_placeholder(self): - assert "{endpoint}" in AI_GENERATE_JUDGE_BATCH_QUERY - - def test_contains_score_and_justification(self): - assert "score INT64" in AI_GENERATE_JUDGE_BATCH_QUERY - assert "justification STRING" in AI_GENERATE_JUDGE_BATCH_QUERY - - def test_does_not_contain_ml_generate_text(self): - assert "ML.GENERATE_TEXT" not in AI_GENERATE_JUDGE_BATCH_QUERY - - def test_legacy_template_uses_ml_generate_text(self): - assert "ML.GENERATE_TEXT" in LLM_JUDGE_BATCH_QUERY - assert "ml_generate_text_result" in LLM_JUDGE_BATCH_QUERY - - -class TestSessionSummaryQuery: - """Tests for SESSION_SUMMARY_QUERY token fields.""" - - def test_contains_input_tokens(self): - assert "input_tokens" in SESSION_SUMMARY_QUERY - - def test_contains_output_tokens(self): - assert "output_tokens" in SESSION_SUMMARY_QUERY - - def test_contains_total_tokens(self): - assert "total_tokens" in SESSION_SUMMARY_QUERY class TestTokenEfficiencyPrebuilt: - """Tests for CodeEvaluator.token_efficiency() preset.""" + """Tests for SystemEvaluator.token_efficiency() preset.""" def test_zero_tokens(self): - evaluator = CodeEvaluator.token_efficiency(max_tokens=50000) + evaluator = SystemEvaluator.token_efficiency(max_tokens=50000) score = evaluator.evaluate_session( { "session_id": "s1", @@ -501,7 +420,7 @@ def test_zero_tokens(self): assert score.passed is True def test_under_budget(self): - evaluator = CodeEvaluator.token_efficiency(max_tokens=50000) + evaluator = SystemEvaluator.token_efficiency(max_tokens=50000) score = evaluator.evaluate_session( { "session_id": "s1", @@ -513,7 +432,7 @@ def test_under_budget(self): assert score.passed is True def test_over_budget(self): - evaluator = CodeEvaluator.token_efficiency(max_tokens=50000) + evaluator = SystemEvaluator.token_efficiency(max_tokens=50000) 
score = evaluator.evaluate_session( { "session_id": "s1", @@ -524,7 +443,7 @@ def test_over_budget(self): assert score.passed is False def test_exactly_at_budget(self): - evaluator = CodeEvaluator.token_efficiency(max_tokens=50000) + evaluator = SystemEvaluator.token_efficiency(max_tokens=50000) score = evaluator.evaluate_session( { "session_id": "s1", @@ -537,10 +456,10 @@ def test_exactly_at_budget(self): class TestCostPerSessionPrebuilt: - """Tests for CodeEvaluator.cost_per_session() preset.""" + """Tests for SystemEvaluator.cost_per_session() preset.""" def test_zero_tokens(self): - evaluator = CodeEvaluator.cost_per_session(max_cost_usd=1.0) + evaluator = SystemEvaluator.cost_per_session(max_cost_usd=1.0) score = evaluator.evaluate_session( { "session_id": "s1", @@ -552,7 +471,7 @@ def test_zero_tokens(self): assert score.passed is True def test_under_budget(self): - evaluator = CodeEvaluator.cost_per_session( + evaluator = SystemEvaluator.cost_per_session( max_cost_usd=1.0, input_cost_per_1k=0.001, output_cost_per_1k=0.002, @@ -569,7 +488,7 @@ def test_under_budget(self): assert score.passed is True def test_over_budget(self): - evaluator = CodeEvaluator.cost_per_session( + evaluator = SystemEvaluator.cost_per_session( max_cost_usd=0.01, input_cost_per_1k=1.0, output_cost_per_1k=1.0, @@ -586,7 +505,7 @@ def test_over_budget(self): assert score.passed is False def test_missing_tokens_defaults_to_zero(self): - evaluator = CodeEvaluator.cost_per_session(max_cost_usd=1.0) + evaluator = SystemEvaluator.cost_per_session(max_cost_usd=1.0) score = evaluator.evaluate_session( { "session_id": "s1", @@ -596,10 +515,10 @@ def test_missing_tokens_defaults_to_zero(self): class TestTTFTPrebuilt: - """Tests for CodeEvaluator.ttft() preset.""" + """Tests for SystemEvaluator.ttft() preset.""" def test_zero_ttft(self): - evaluator = CodeEvaluator.ttft(threshold_ms=1000) + evaluator = SystemEvaluator.ttft(threshold_ms=1000) score = evaluator.evaluate_session( { "session_id": "s1", @@ -610,7 +529,7 @@ def test_zero_ttft(self): assert score.passed is True def test_under_threshold(self): - evaluator = CodeEvaluator.ttft(threshold_ms=1000) + evaluator = SystemEvaluator.ttft(threshold_ms=1000) score = evaluator.evaluate_session( { "session_id": "s1", @@ -621,7 +540,7 @@ def test_under_threshold(self): assert score.passed is True def test_over_threshold(self): - evaluator = CodeEvaluator.ttft(threshold_ms=500) + evaluator = SystemEvaluator.ttft(threshold_ms=500) score = evaluator.evaluate_session( { "session_id": "s1", @@ -632,7 +551,7 @@ def test_over_threshold(self): assert score.passed is False def test_none_ttft_defaults_to_zero(self): - evaluator = CodeEvaluator.ttft(threshold_ms=1000) + evaluator = SystemEvaluator.ttft(threshold_ms=1000) score = evaluator.evaluate_session( { "session_id": "s1", @@ -642,18 +561,8 @@ def test_none_ttft_defaults_to_zero(self): assert score.scores["ttft"] == 1.0 def test_evaluator_name(self): - evaluator = CodeEvaluator.ttft() + evaluator = SystemEvaluator.ttft() assert evaluator.name == "ttft_evaluator" -class TestSessionSummaryQueryTTFT: - """Tests for avg_ttft_ms and hitl_events in SESSION_SUMMARY_QUERY.""" - - def test_contains_avg_ttft_ms(self): - assert "avg_ttft_ms" in SESSION_SUMMARY_QUERY - - def test_contains_hitl_events(self): - assert "hitl_events" in SESSION_SUMMARY_QUERY - def test_contains_time_to_first_token(self): - assert "time_to_first_token_ms" in SESSION_SUMMARY_QUERY