Commit 914119a

Refactor: Rename CodeEvaluator to SystemEvaluator
Rename CodeEvaluator to SystemEvaluator to align with its focus on system-level metrics. A CodeEvaluator alias is kept in evaluators.py for backward compatibility.
1 parent 292320b commit 914119a
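
The evaluators.py hunk itself is not included in this excerpt, so the exact form of the retained alias is an assumption; the commit message only promises that the old name stays importable. A minimal sketch:

```python
# evaluators.py (sketch): keep the old name working after the rename.
# The plain-assignment form below is an assumption; only the existence
# of a CodeEvaluator alias comes from the commit message.

class SystemEvaluator:
    """Deterministic, code-defined metric evaluation (formerly CodeEvaluator)."""
    ...

# Existing imports such as `from bigquery_agent_analytics import CodeEvaluator`
# keep resolving:
CodeEvaluator = SystemEvaluator
```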

19 files changed: 193 additions & 170 deletions

README.md

Lines changed: 1 addition & 1 deletion
@@ -123,7 +123,7 @@ src/bigquery_agent_analytics/
 │   └── formatter.py        # Output formatting (json/text/table)
 │
 ├── Evaluation
-│   ├── evaluators.py       # CodeEvaluator + LLMAsJudge
+│   ├── evaluators.py       # SystemEvaluator + LLMAsJudge
 │   ├── trace_evaluator.py  # Trajectory matching & replay
 │   ├── multi_trial.py      # Multi-trial runner + pass@k
 │   ├── grader_pipeline.py  # Grader composition pipeline

SDK.md

Lines changed: 16 additions & 16 deletions
@@ -112,29 +112,29 @@ traces = client.list_traces(

 ## 3. Code-Based Evaluation (Deterministic Metrics)

-`CodeEvaluator` runs deterministic, code-defined metric functions against session summaries. Each metric returns a score between 0.0 and 1.0.
+`SystemEvaluator` runs deterministic, code-defined metric functions against session summaries. Each metric returns a score between 0.0 and 1.0.

 ### Pre-Built Evaluators

 The SDK ships with six ready-to-use evaluators:

 ```python
-from bigquery_agent_analytics import CodeEvaluator
+from bigquery_agent_analytics import SystemEvaluator

 # Latency: score degrades linearly as avg latency approaches threshold
-evaluator = CodeEvaluator.latency(threshold_ms=5000)
+evaluator = SystemEvaluator.latency(threshold_ms=5000)

 # Turn count: penalizes sessions with too many back-and-forth turns
-evaluator = CodeEvaluator.turn_count(max_turns=10)
+evaluator = SystemEvaluator.turn_count(max_turns=10)

 # Error rate: penalizes high tool error rates
-evaluator = CodeEvaluator.error_rate(max_error_rate=0.1)
+evaluator = SystemEvaluator.error_rate(max_error_rate=0.1)

 # Token efficiency: checks total token usage stays within budget
-evaluator = CodeEvaluator.token_efficiency(max_tokens=50000)
+evaluator = SystemEvaluator.token_efficiency(max_tokens=50000)

 # Cost per session: checks estimated USD cost stays under budget
-evaluator = CodeEvaluator.cost_per_session(
+evaluator = SystemEvaluator.cost_per_session(
     max_cost_usd=1.0,
     input_cost_per_1k=0.00025,
     output_cost_per_1k=0.00125,
@@ -147,7 +147,7 @@ Define your own metric functions and chain multiple metrics together:

 ```python
 evaluator = (
-    CodeEvaluator(name="my_quality_check")
+    SystemEvaluator(name="my_quality_check")
     .add_metric(
         name="latency",
         fn=lambda s: 1.0 - min(s.get("avg_latency_ms", 0) / 5000, 1.0),
@@ -190,7 +190,7 @@ Run evaluation across all sessions matching a filter:
 from bigquery_agent_analytics import TraceFilter

 report = client.evaluate(
-    evaluator=CodeEvaluator.latency(threshold_ms=3000),
+    evaluator=SystemEvaluator.latency(threshold_ms=3000),
     filters=TraceFilter(agent_id="my_agent"),
 )

@@ -535,7 +535,7 @@ pass_pow_k = compute_pass_pow_k(num_trials=10, num_passed=8) # ~0.107

 ## 7. Grader Composition Pipeline

-Combine multiple evaluators (`CodeEvaluator` + `LLMAsJudge` + custom functions) into a single aggregated verdict using configurable scoring strategies.
+Combine multiple evaluators (`SystemEvaluator` + `LLMAsJudge` + custom functions) into a single aggregated verdict using configurable scoring strategies.

 ### Scoring Strategies

@@ -549,7 +549,7 @@ Combine multiple evaluators (`CodeEvaluator` + `LLMAsJudge` + custom functions)

 ```python
 from bigquery_agent_analytics import (
-    CodeEvaluator, GraderPipeline, LLMAsJudge,
+    SystemEvaluator, GraderPipeline, LLMAsJudge,
     WeightedStrategy, GraderResult,
 )

@@ -562,8 +562,8 @@ pipeline = (
         },
         threshold=0.6,
     ))
-    .add_code_grader(CodeEvaluator.latency(threshold_ms=5000), weight=0.2)
-    .add_code_grader(CodeEvaluator.cost_per_session(max_cost_usd=0.50), weight=0.1)
+    .add_code_grader(SystemEvaluator.latency(threshold_ms=5000), weight=0.2)
+    .add_code_grader(SystemEvaluator.cost_per_session(max_cost_usd=0.50), weight=0.1)
     .add_llm_grader(LLMAsJudge.correctness(threshold=0.7), weight=0.7)
 )

@@ -592,8 +592,8 @@ from bigquery_agent_analytics import BinaryStrategy

 pipeline = (
     GraderPipeline(BinaryStrategy())
-    .add_code_grader(CodeEvaluator.latency(threshold_ms=3000))
-    .add_code_grader(CodeEvaluator.error_rate(max_error_rate=0.05))
+    .add_code_grader(SystemEvaluator.latency(threshold_ms=3000))
+    .add_code_grader(SystemEvaluator.error_rate(max_error_rate=0.05))
     .add_llm_grader(LLMAsJudge.hallucination(threshold=0.8))
 )

@@ -623,7 +623,7 @@ def business_rules_grader(context):

 pipeline = (
     GraderPipeline(BinaryStrategy())
-    .add_code_grader(CodeEvaluator.latency())
+    .add_code_grader(SystemEvaluator.latency())
     .add_custom_grader("business_rules", business_rules_grader)
 )
 ```
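
Taken together, the hunks above rename the whole documented surface of SDK.md. A short sketch of the renamed API in use; the summary dict and the `evaluate_session()` call shape are assumptions based on the prose above ("runs deterministic, code-defined metric functions against session summaries") and the mode table in docs/design.md:

```python
from bigquery_agent_analytics import SystemEvaluator

# Chain the documented latency kernel onto a named evaluator.
evaluator = SystemEvaluator(name="post_rename_check").add_metric(
    name="latency",
    # 1.0 at 0 ms, decaying linearly to 0.0 at or beyond 5000 ms.
    fn=lambda s: 1.0 - min(s.get("avg_latency_ms", 0) / 5000, 1.0),
)

summary = {"avg_latency_ms": 1250}            # hypothetical session summary
result = evaluator.evaluate_session(summary)  # assumed call shape
```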

docs/design.md

Lines changed: 11 additions & 11 deletions
@@ -150,7 +150,7 @@ As demonstrated in the [e2e demo](../examples/e2e_demo.py):

 **Phase 2 — Evaluation:**
 1. `Client.get_trace()` retrieves all events for a session
-2. `CodeEvaluator` preset factories assess latency, turn count, error rate, token efficiency
+2. `SystemEvaluator` preset factories assess latency, turn count, error rate, token efficiency
 3. `LLMAsJudge.correctness()` performs semantic evaluation via BigQuery `AI.GENERATE`
 4. `BigQueryTraceEvaluator.evaluate_session()` performs trajectory matching against golden tool sequences

@@ -208,7 +208,7 @@ As demonstrated in the [e2e demo](../examples/e2e_demo.py):
 │ categorical_evaluator│ │ ontology_* (6 modules)│ │ cli │
 │ categorical_views │ │ (YAML → AI.GENERATE → │ │ (Typer commands) │
 │ (label evaluation) │ │ tables → PG → GQL) │ │ │
-└──────────────────────┘ └──────────────────────┘ └──────────────────┘
+└──────────────────┘ └──────────────────┘ └──────────────────┘

 ┌──────────────────┐ ┌───────────────────┐
 │ udf_kernels │ │ serialization │
@@ -248,7 +248,7 @@ Aggregations, filtering, joins, and even LLM evaluation (via `AI.GENERATE`) are
 LLM-based evaluation can run via (1) BigQuery `AI.GENERATE`, (2) legacy BigQuery ML `ML.GENERATE_TEXT`, or (3) the Gemini API directly. This maximizes compatibility across different GCP configurations.

 **Decision 4: Composition over inheritance.**
-The `GraderPipeline` composes `CodeEvaluator`, `LLMAsJudge`, and custom functions via a builder pattern rather than requiring them to share a common base class. The `BigQueryMemoryService` composes four internal services rather than extending a single monolithic class.
+The `GraderPipeline` composes `SystemEvaluator`, `LLMAsJudge`, and custom functions via a builder pattern rather than requiring them to share a common base class. The `BigQueryMemoryService` composes four internal services rather than extending a single monolithic class.

 ---

@@ -396,7 +396,7 @@ Each field generates a separate `AND` condition with a corresponding `bigquery.S

 This module contains two evaluator classes and the SQL templates that power batch evaluation.

-#### 4.3.1 `CodeEvaluator`
+#### 4.3.1 `SystemEvaluator`

 Deterministic evaluation using code-defined metric functions.

@@ -626,7 +626,7 @@ Combines heterogeneous evaluators into a unified verdict using a strategy patter

        ┌──────────────┼──────────────┐
        ▼              ▼              ▼
-  CodeEvaluator   LLMAsJudge     Custom Fn
+ SystemEvaluator  LLMAsJudge     Custom Fn
     (sync)         (async)        (sync)
        │              │              │
        ▼              ▼              ▼
@@ -1258,7 +1258,7 @@ results = client.query(formatted, job_config=job_config)

 ```
 Evaluation
-├── Deterministic (CodeEvaluator)
+├── Deterministic (SystemEvaluator)
 │   ├── Latency
 │   ├── Turn count
 │   ├── Error rate
@@ -1306,7 +1306,7 @@ All evaluation scores in the SDK are normalized to `[0.0, 1.0]`:

 | Mode | Evaluator | Where Computation Runs |
 |------|-----------|----------------------|
-| Single session (sync) | `CodeEvaluator.evaluate_session()` | Python |
+| Single session (sync) | `SystemEvaluator.evaluate_session()` | Python |
 | Single session (async) | `LLMAsJudge.evaluate_session()` | Gemini API |
 | Batch via Client | `Client.evaluate()` | BigQuery (SQL + AI.GENERATE) |
 | Trajectory matching | `BigQueryTraceEvaluator.evaluate_session()` | BigQuery (fetch) + Python (matching) |
@@ -1405,7 +1405,7 @@ Synchronous (user-facing):
 ├── Client.drift_detection()
 ├── Client.insights()
 ├── Client.deep_analysis()
-├── CodeEvaluator.evaluate_session()
+├── SystemEvaluator.evaluate_session()
 ├── EvalSuite.*
 ├── EvalValidator.*
 └── BigFramesEvaluator.*
@@ -1465,10 +1465,10 @@ results = await asyncio.gather(*[_run_one(t) for t in tasks])

 ## 10. Extensibility & Plugin Points

-### 10.1 Custom Metrics (CodeEvaluator)
+### 10.1 Custom Metrics (SystemEvaluator)

 ```python
-evaluator = CodeEvaluator(name="custom").add_metric(
+evaluator = SystemEvaluator(name="custom").add_metric(
     name="business_metric",
     fn=lambda session: your_scoring_logic(session),
     threshold=0.7,
@@ -1571,7 +1571,7 @@ All tests mock BigQuery — no GCP credentials or live BigQuery access is needed
 ```
 tests/
 ├── test_sdk_client.py      # Client integration tests
-├── test_sdk_evaluators.py  # CodeEvaluator + LLMAsJudge
+├── test_sdk_evaluators.py  # SystemEvaluator + LLMAsJudge
 ├── test_sdk_trace.py       # Trace/Span reconstruction
 ├── test_sdk_feedback.py    # Drift detection
 ├── test_sdk_insights.py    # Insights pipeline
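
Two invariants recur across these design.md hunks: every score is normalized to `[0.0, 1.0]`, and `GraderPipeline` combines heterogeneous graders under a scoring strategy. A sketch of the weighted aggregation that a `WeightedStrategy` implies, reusing the 0.2/0.1/0.7 weights from the SDK.md example above; the function and dict keys here are illustrative, not the SDK's actual strategy class:

```python
def weighted_verdict(scores: dict[str, float],
                     weights: dict[str, float],
                     threshold: float = 0.6) -> tuple[float, bool]:
    """Combine normalized [0.0, 1.0] grader scores into a single verdict."""
    total = sum(weights.values())
    aggregate = sum(scores[name] * weights[name] for name in scores) / total
    return aggregate, aggregate >= threshold

# Weights mirror the SDK.md pipeline: 0.2 latency, 0.1 cost, 0.7 LLM judge.
aggregate, passed = weighted_verdict(
    scores={"latency": 0.9, "cost": 1.0, "llm_correctness": 0.65},
    weights={"latency": 0.2, "cost": 0.1, "llm_correctness": 0.7},
)
# aggregate == 0.735, so passed is True at the default 0.6 threshold
```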

docs/hatteras_evaluation.md

Lines changed: 3 additions & 3 deletions
@@ -7,7 +7,7 @@ agent sessions into user-defined categories directly against traces stored in
 BigQuery, without relying on an external service.

 This should be implemented as a new categorical evaluation subsystem, not as
-an overload of the existing numeric `CodeEvaluator` / `LLMAsJudge` report
+an overload of the existing numeric `SystemEvaluator` / `LLMAsJudge` report
 path.

 The goal is to support Hatteras-like functionality inside the SDK:
@@ -22,7 +22,7 @@ The goal is to support Hatteras-like functionality inside the SDK:

 Today the SDK supports two major evaluation modes:

-- deterministic numeric scoring via `CodeEvaluator`
+- deterministic numeric scoring via `SystemEvaluator`
 - semantic numeric scoring via `LLMAsJudge`

 What is missing is a first-class way to answer questions like:
@@ -60,7 +60,7 @@ That capability is useful for:
 This design is not proposing:

 - a full clone of an external Hatteras service
-- a replacement for `CodeEvaluator`
+- a replacement for `SystemEvaluator`
 - a replacement for `LLMAsJudge`
 - a new remote function or Python UDF surface in the first phase
 - real-time ingestion-time classification in phase 1

docs/implementation_plan_concept_index_runtime.md

Lines changed: 1 addition & 1 deletion
@@ -165,7 +165,7 @@ Work: `bigquery_ontology/contrib/advertising/` stub with Yahoo's resolver (if co
 - `src/bigquery_ontology/graph_ddl_compiler.py` — add `compile_concept_index(ontology, binding, *, output_table) -> str`. Preserve `compile_graph()` contract byte-identically. No changes to existing function bodies.
 - `src/bigquery_ontology/cli.py:299` — `compile` command gains `--emit-concept-index` and `--concept-index-table` flags. When absent, behavior is byte-identical to today.
 - `src/bigquery_ontology/__init__.py` — add `from .graph_ddl_compiler import compile_concept_index` so the new public function is importable as `from bigquery_ontology import compile_concept_index`, matching the existing pattern for `compile_graph` (`__init__.py:50` today).
-- `src/bigquery_agent_analytics/__init__.py` — add the new public surface to the try/except re-export block (same pattern as `Client`, `CodeEvaluator`, etc.):
+- `src/bigquery_agent_analytics/__init__.py` — add the new public surface to the try/except re-export block (same pattern as `Client`, `SystemEvaluator`, etc.):
   - `OntologyRuntime` from `.ontology_runtime`
   - `EntityResolver`, `ExactMatchResolver`, `SynonymResolver`, `Candidate`, `ResolveResult` from `.entity_resolver`
   - `ConceptIndexMismatchError`, `ConceptIndexProvenanceMissing`, `ConceptIndexInconsistentPair`, `ConceptIndexRefreshed` from `.ontology_runtime`
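
The last bullet points at the package's try/except re-export block. A sketch of what the addition might look like, using only the names listed above; the guard shape is an assumption modeled on how `Client` and `SystemEvaluator` are said to be re-exported:

```python
# src/bigquery_agent_analytics/__init__.py (sketch)
try:
    from .ontology_runtime import (
        OntologyRuntime,
        ConceptIndexMismatchError,
        ConceptIndexProvenanceMissing,
        ConceptIndexInconsistentPair,
        ConceptIndexRefreshed,
    )
    from .entity_resolver import (
        Candidate,
        EntityResolver,
        ExactMatchResolver,
        ResolveResult,
        SynonymResolver,
    )
except ImportError:
    # Keep the package importable when the optional ontology surface
    # is unavailable, matching the existing re-export pattern.
    pass
```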

docs/implementation_plan_remote_function.md

Lines changed: 32 additions & 15 deletions
@@ -219,13 +219,30 @@ Dispatch logic:
 ```python
 # Map CLI --evaluator to SDK factory
 EVALUATOR_FACTORIES = {
-    "latency": lambda t: CodeEvaluator.latency(threshold_ms=t),
-    "error_rate": lambda t: CodeEvaluator.error_rate(max_error_rate=t),
-    "turn_count": lambda t: CodeEvaluator.turn_count(max_turns=int(t)),
-    "token_efficiency": lambda t: CodeEvaluator.token_efficiency(max_tokens=int(t)),
-    "ttft": lambda t: CodeEvaluator.ttft(threshold_ms=t),
-    "cost": lambda t: CodeEvaluator.cost_per_session(max_cost_usd=t),
-    "llm-judge": None,  # special handling
+    "latency": (
+        lambda t: SystemEvaluator.latency(threshold_ms=t),
+        lambda: SystemEvaluator.latency(),
+    ),
+    "error_rate": (
+        lambda t: SystemEvaluator.error_rate(max_error_rate=t),
+        lambda: SystemEvaluator.error_rate(),
+    ),
+    "turn_count": (
+        lambda t: SystemEvaluator.turn_count(max_turns=int(t)),
+        lambda: SystemEvaluator.turn_count(),
+    ),
+    "token_efficiency": (
+        lambda t: SystemEvaluator.token_efficiency(max_tokens=int(t)),
+        lambda: SystemEvaluator.token_efficiency(),
+    ),
+    "ttft": (
+        lambda t: SystemEvaluator.ttft(threshold_ms=t),
+        lambda: SystemEvaluator.ttft(),
+    ),
+    "cost": (
+        lambda t: SystemEvaluator.cost_per_session(max_cost_usd=t),
+        lambda: SystemEvaluator.cost_per_session(),
+    ),
 }
 ```

@@ -289,7 +306,7 @@ import functions_framework
 from flask import jsonify

 from bigquery_agent_analytics import Client, serialize
-from bigquery_agent_analytics import CodeEvaluator, LLMAsJudge
+from bigquery_agent_analytics import SystemEvaluator, LLMAsJudge
 from bigquery_agent_analytics import TraceFilter


@@ -385,18 +402,18 @@ def _dispatch(client, operation, params):


 def _build_evaluator(params):
-    """Build CodeEvaluator from params dict."""
+    """Build SystemEvaluator from params dict."""
     metric = params.get("metric", "latency")
     threshold = params.get("threshold", 5000)
     factories = {
-        "latency": lambda t: CodeEvaluator.latency(threshold_ms=t),
-        "error_rate": lambda t: CodeEvaluator.error_rate(max_error_rate=t),
-        "turn_count": lambda t: CodeEvaluator.turn_count(max_turns=int(t)),
-        "token_efficiency": lambda t: CodeEvaluator.token_efficiency(
+        "latency": lambda t: SystemEvaluator.latency(threshold_ms=t),
+        "error_rate": lambda t: SystemEvaluator.error_rate(max_error_rate=t),
+        "turn_count": lambda t: SystemEvaluator.turn_count(max_turns=int(t)),
+        "token_efficiency": lambda t: SystemEvaluator.token_efficiency(
            max_tokens=int(t)
        ),
-        "ttft": lambda t: CodeEvaluator.ttft(threshold_ms=t),
-        "cost": lambda t: CodeEvaluator.cost_per_session(max_cost_usd=t),
+        "ttft": lambda t: SystemEvaluator.ttft(threshold_ms=t),
+        "cost": lambda t: SystemEvaluator.cost_per_session(max_cost_usd=t),
     }
     factory = factories.get(metric)
     if not factory:
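
The reshaped `EVALUATOR_FACTORIES` pairs a thresholded factory with a zero-argument default for each preset, and the old `"llm-judge": None` sentinel no longer appears in the mapping. One way the CLI dispatch might consume the pair when `--threshold` is optional; `build_evaluator` is illustrative, not part of this diff:

```python
def build_evaluator(name: str, threshold: float | None):
    """Use the thresholded factory when a threshold is given, else the preset default."""
    with_threshold, default = EVALUATOR_FACTORIES[name]
    return with_threshold(threshold) if threshold is not None else default()

# e.g. `--evaluator latency --threshold 3000` vs a bare `--evaluator latency`
evaluator = build_evaluator("latency", 3000)
fallback = build_evaluator("latency", None)
```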

docs/prd_unified_analytics_interface.md

Lines changed: 3 additions & 3 deletions
@@ -109,7 +109,7 @@ All operations go through a single multiplexed function:
 | Operation | SDK Method | Params (JSON keys) | Output |
 |-----------|-----------|---------------------|--------|
 | `analyze` | `Client.get_session_trace()` + metrics | `session_id` | JSON with span count, error count, latency, tool calls |
-| `evaluate` | `CodeEvaluator` | `session_id`, `metric`, `threshold` | JSON with passed, score, details |
+| `evaluate` | `SystemEvaluator` | `session_id`, `metric`, `threshold` | JSON with passed, score, details |
 | `judge` | `LLMAsJudge` | `session_id`, `criterion` | JSON with score, feedback |
 | `insights` | Facet extraction | `session_id` | JSON with intent, outcome, friction |
 | `drift` | Drift detection | `golden_dataset`, `agent_filter`, `start_date`, `end_date` | JSON with coverage, gaps |
@@ -443,7 +443,7 @@ import functions_framework
 import json
 import os
 from flask import jsonify
-from bigquery_agent_analytics import Client, CodeEvaluator, LLMAsJudge, TraceFilter
+from bigquery_agent_analytics import Client, SystemEvaluator, LLMAsJudge, TraceFilter

 # Initialized once per cold start. Config comes from userDefinedContext
 # (forwarded by BigQuery) or environment variables as fallback.
@@ -490,7 +490,7 @@ def _dispatch(client, operation, params):
             "final_response": trace.final_response,
         }
     elif operation == "evaluate":
-        evaluator = CodeEvaluator.latency(threshold_ms=params["threshold"])
+        evaluator = SystemEvaluator.latency(threshold_ms=params["threshold"])
         report = client.evaluate(evaluator=evaluator,
                                  filters=TraceFilter(session_ids=[params["session_id"]]))
         return report.details[0] if report.details else {}
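
The `evaluate` branch above builds the latency preset regardless of the `metric` key that the operations table lists among the params. A sketch of how the branch could honor it, borrowing the factory-mapping style from docs/implementation_plan_remote_function.md; `_METRIC_FACTORIES` and the helper are illustrative:

```python
# Illustrative generalization of the `evaluate` branch; factory names
# mirror the SystemEvaluator presets shown elsewhere in this commit.
_METRIC_FACTORIES = {
    "latency": lambda t: SystemEvaluator.latency(threshold_ms=t),
    "error_rate": lambda t: SystemEvaluator.error_rate(max_error_rate=t),
    "turn_count": lambda t: SystemEvaluator.turn_count(max_turns=int(t)),
}

def _evaluator_from_params(params):
    """Map the table's `metric` + `threshold` params onto a preset factory."""
    factory = _METRIC_FACTORIES.get(params.get("metric", "latency"))
    return factory(params["threshold"])
```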

docs/python_udf_support_design.md

Lines changed: 4 additions & 4 deletions
@@ -172,7 +172,7 @@ primitive:

 | SDK area | Python UDF fit | Required redesign |
 |----------|----------------|-------------------|
-| `Client.evaluate(CodeEvaluator, filters)` | Partial | SQL builds per-session summaries first; UDF computes scores from summary fields |
+| `Client.evaluate(SystemEvaluator, filters)` | Partial | SQL builds per-session summaries first; UDF computes scores from summary fields |
 | `Client.deep_analysis()` / question distribution | Partial | SQL does grouping / embeddings / top-k; UDF can help with categorization or normalization |
 | `Client.drift_detection()` | Partial | SQL computes set logic; UDF may help with text normalization or thresholding |
 | `Client.insights()` | Partial | Best split into SQL extraction + optional UDF post-processing; not a direct port |
@@ -224,7 +224,7 @@ That is maintainable. Reusing the entire client inside a Python UDF is not.

 The current evaluator score math is not implemented as standalone top-level
 functions today. It lives inside factory-method closures such as
-`CodeEvaluator.latency()` and `CodeEvaluator.error_rate()` in
+`SystemEvaluator.latency()` and `SystemEvaluator.error_rate()` in
 [evaluators.py](/Users/haiyuancao/BigQuery-Agent-Analytics-SDK/src/bigquery_agent_analytics/evaluators.py).

 That means the first implementation step is a deliberate refactor:
@@ -281,7 +281,7 @@ the shared extraction helper.

 ### 7.2 Tier 2: code-evaluator score kernels

-These should map directly to the existing `CodeEvaluator` math:
+These should map directly to the existing `SystemEvaluator` math:

 ```sql
 CREATE FUNCTION `PROJECT.UDF_DATASET.bqaa_score_latency`(
@@ -497,7 +497,7 @@ Remote Function should still be described as:
 - Add `udf_kernels.py`
 - Move reusable evaluator math into standalone pure functions
 - Move reusable event semantic helpers into a UDF-safe layer
-- Add unit tests proving parity with existing `CodeEvaluator` behavior
+- Add unit tests proving parity with existing `SystemEvaluator` behavior

 ### Phase U2: Tier 1 and Tier 2 UDFs

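
Phase U1's "standalone pure functions" are the prerequisite for Tier 2 SQL kernels such as the `bqaa_score_latency` function started above. A sketch of one extracted kernel; the linear-decay math mirrors the latency lambda documented in SDK.md, while the module name, function name, and zero-threshold guard are assumptions:

```python
# udf_kernels.py (sketch)
def score_latency(avg_latency_ms: float, threshold_ms: float = 5000.0) -> float:
    """Return 1.0 at 0 ms, decaying linearly to 0.0 at or beyond the threshold."""
    if threshold_ms <= 0:  # assumed guard; not specified in the design doc
        return 0.0
    return 1.0 - min(avg_latency_ms / threshold_ms, 1.0)

# Parity checks of the kind Phase U1 calls for:
assert score_latency(0) == 1.0
assert score_latency(2500) == 0.5
assert score_latency(10_000) == 0.0
```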
