Skip to content

Commit 39d51e4

Browse files
Chibionos, claude, and akshaylive
authored
fix: use agent model for eval simulations (#1555)
Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Co-authored-by: Akshaya Shanbhogue <akshaya.shanbhogue@uipath.com>
1 parent 48d9f4e commit 39d51e4

11 files changed

Lines changed: 110 additions & 43 deletions

File tree

packages/uipath/pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[project]
22
name = "uipath"
3-
version = "2.10.46"
3+
version = "2.10.47"
44
description = "Python SDK and CLI for UiPath Platform, enabling programmatic interaction with automation services, process management, and deployment tools."
55
readme = { file = "README.md", content-type = "text/markdown" }
66
requires-python = ">=3.11"

packages/uipath/src/uipath/_cli/cli_debug.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
from uipath._cli._utils._studio_project import StudioClient
1010
from uipath.core.tracing import UiPathTraceManager
1111
from uipath.eval.mocks import UiPathMockRuntime
12+
from uipath.eval.mocks._mock_runtime import load_simulation_config
1213
from uipath.platform.common import ResourceOverwritesContext, UiPathConfig
1314
from uipath.runtime import (
1415
UiPathExecuteOptions,
@@ -163,8 +164,19 @@ async def execute_debug_runtime():
163164
trigger_poll_interval=trigger_poll_interval,
164165
)
165166

167+
# Build mocking context with agent model for simulations
168+
schema = await runtime.get_schema()
169+
agent_model = None
170+
if schema.metadata and "settings" in schema.metadata:
171+
agent_model = schema.metadata["settings"].get("model")
172+
173+
mocking_context = load_simulation_config(
174+
agent_model=agent_model
175+
)
176+
166177
mock_runtime = UiPathMockRuntime(
167178
delegate=debug_runtime,
179+
mocking_context=mocking_context,
168180
)
169181

170182
try:

packages/uipath/src/uipath/_cli/cli_eval.py

Lines changed: 0 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,6 @@
2525
from uipath.runtime import (
2626
UiPathRuntimeContext,
2727
UiPathRuntimeFactoryRegistry,
28-
UiPathRuntimeSchema,
2928
)
3029
from uipath.telemetry._track import flush_events
3130
from uipath.tracing import (
@@ -65,27 +64,6 @@ def setup_reporting_prereq(no_report: bool) -> bool:
6564
return True
6665

6766

68-
def _get_agent_model(schema: UiPathRuntimeSchema) -> str | None:
69-
"""Get agent model from the runtime schema metadata.
70-
71-
The model is read from schema.metadata["settings"]["model"] which is
72-
populated by the low-code agents runtime from agent.json.
73-
74-
Returns:
75-
The model name from agent settings, or None if not found.
76-
"""
77-
try:
78-
if schema.metadata and "settings" in schema.metadata:
79-
settings = schema.metadata["settings"]
80-
model = settings.get("model")
81-
if model:
82-
logger.debug(f"Got agent model from schema.metadata: {model}")
83-
return model
84-
return None
85-
except Exception:
86-
return None
87-
88-
8967
def _resolve_model_settings_override(
9068
model_settings_id: str, evaluation_set: EvaluationSet
9169
) -> dict[str, Any] | None:
@@ -431,7 +409,6 @@ async def execute_eval():
431409
eval_context.evaluators = await EvalHelpers.load_evaluators(
432410
resolved_eval_set_path,
433411
eval_context.evaluation_set,
434-
_get_agent_model(eval_context.runtime_schema),
435412
)
436413

437414
# Runtime is not required anymore.

packages/uipath/src/uipath/eval/helpers.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@
77

88
from pydantic import ValidationError
99

10+
from uipath.runtime.schema import UiPathRuntimeSchema
11+
1012
from .evaluators.base_evaluator import GenericBaseEvaluator
1113
from .evaluators.evaluator_factory import EvaluatorFactory
1214
from .mocks._types import InputMockingStrategy, LLMMockingStrategy
@@ -277,3 +279,24 @@ async def load_evaluators(
277279
)
278280

279281
return evaluators
282+
283+
284+
def get_agent_model(schema: UiPathRuntimeSchema) -> str | None:
285+
"""Get agent model from the runtime schema metadata.
286+
287+
The model is read from schema.metadata["settings"]["model"] which is
288+
populated by the low-code agents runtime from agent.json.
289+
290+
Returns:
291+
The model name from agent settings, or None if not found.
292+
"""
293+
try:
294+
if schema.metadata and "settings" in schema.metadata:
295+
settings = schema.metadata["settings"]
296+
model = settings.get("model")
297+
if model:
298+
logger.debug(f"Got agent model from schema.metadata: {model}")
299+
return model
300+
return None
301+
except Exception:
302+
return None

packages/uipath/src/uipath/eval/mocks/_input_mocker.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
"""LLM Input Mocker implementation."""
22

33
import json
4+
import logging
45
from datetime import datetime
56
from typing import Any
67

@@ -9,6 +10,7 @@
910
from uipath.core.tracing import traced
1011
from uipath.platform import UiPath
1112
from uipath.platform.chat import UiPathLlmChatService
13+
from uipath.platform.chat._llm_gateway_service import ChatModels
1214

1315
from .._execution_context import eval_set_run_id_context
1416
from ._mock_context import cache_manager_context
@@ -17,6 +19,8 @@
1719
InputMockingStrategy,
1820
)
1921

22+
logger = logging.getLogger(__name__)
23+
2024

2125
def get_input_mocking_prompt(
2226
input_schema: str,
@@ -117,6 +121,11 @@ async def generate_llm_input(
117121
else {}
118122
)
119123

124+
simulation_model = completion_kwargs.get(
125+
"model", ChatModels.gpt_4_1_mini_2025_04_14
126+
)
127+
logger.info(f"Simulating input generation using model: {simulation_model}")
128+
120129
if cache_manager is not None:
121130
cache_key_data = {
122131
"response_format": response_format,

packages/uipath/src/uipath/eval/mocks/_llm_mocker.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
from uipath.core.tracing import traced
1111
from uipath.platform import UiPath
1212
from uipath.platform.chat import UiPathLlmChatService
13-
from uipath.platform.chat._llm_gateway_service import _cleanup_schema
13+
from uipath.platform.chat._llm_gateway_service import ChatModels, _cleanup_schema
1414

1515
from .._execution_context import (
1616
eval_set_run_id_context,
@@ -182,6 +182,13 @@ async def response(
182182
else {}
183183
)
184184

185+
simulation_model = completion_kwargs.get(
186+
"model", ChatModels.gpt_4_1_mini_2025_04_14
187+
)
188+
logger.info(
189+
f"Simulating tool '{function_name}' using model: {simulation_model}"
190+
)
191+
185192
formatted_prompt = PROMPT.format(**prompt_generation_args)
186193

187194
cache_key_data = {

packages/uipath/src/uipath/eval/mocks/_mock_runtime.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,13 +28,14 @@
2828
LLMMockingStrategy,
2929
MockingContext,
3030
MockingStrategyType,
31+
ModelSettings,
3132
ToolSimulation,
3233
)
3334

3435
logger = logging.getLogger(__name__)
3536

3637

37-
def load_simulation_config() -> MockingContext | None:
38+
def load_simulation_config(agent_model: str | None = None) -> MockingContext | None:
3839
"""Load simulation.json from current directory and convert to MockingContext.
3940
4041
Returns:
@@ -63,11 +64,21 @@ def load_simulation_config() -> MockingContext | None:
6364
if not tools_to_simulate:
6465
return None
6566

66-
# Create LLM mocking strategy
67+
# Honor model from simulation config if specified, otherwise use the agent model
68+
simulation_model = simulation_data.get("model")
69+
model = (
70+
ModelSettings(model=simulation_model)
71+
if simulation_model
72+
else ModelSettings(model=agent_model)
73+
if agent_model
74+
else None
75+
)
76+
6777
mocking_strategy = LLMMockingStrategy(
6878
type=MockingStrategyType.LLM,
6979
prompt=simulation_data.get("instructions", ""),
7080
tools_to_simulate=tools_to_simulate,
81+
model=model,
7182
)
7283

7384
# Create MockingContext for debugging

packages/uipath/src/uipath/eval/runtime/runtime.py

Lines changed: 27 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -47,13 +47,14 @@
4747
from .._execution_context import ExecutionSpanCollector
4848
from ..evaluators.base_evaluator import GenericBaseEvaluator
4949
from ..evaluators.output_evaluator import OutputEvaluationCriteria
50+
from ..helpers import get_agent_model
5051
from ..mocks._cache_manager import CacheManager
5152
from ..mocks._input_mocker import (
5253
generate_llm_input,
5354
)
5455
from ..mocks._mock_context import cache_manager_context
5556
from ..mocks._mock_runtime import UiPathMockRuntime
56-
from ..mocks._types import MockingContext
57+
from ..mocks._types import LLMMockingStrategy, MockingContext, ModelSettings
5758
from ..models import EvaluationResult
5859
from ..models.evaluation_set import (
5960
EvaluationItem,
@@ -526,12 +527,25 @@ async def _execute_eval(
526527
eval_item=eval_item,
527528
),
528529
)
530+
# Set agent model on the mocking strategy if not already set
531+
mocking_strategy = eval_item.mocking_strategy
532+
if (
533+
mocking_strategy
534+
and isinstance(mocking_strategy, LLMMockingStrategy)
535+
and not mocking_strategy.model
536+
):
537+
mocking_model = get_agent_model(self.context.runtime_schema)
538+
if mocking_model:
539+
mocking_strategy = mocking_strategy.model_copy(
540+
update={"model": ModelSettings(model=mocking_model)}
541+
)
542+
529543
agent_execution_output = await self.execute_runtime(
530544
eval_item,
531545
execution_id,
532546
input_overrides=self.context.input_overrides,
533547
mocking_context=MockingContext(
534-
strategy=eval_item.mocking_strategy,
548+
strategy=mocking_strategy,
535549
name=eval_item.name,
536550
inputs=eval_item.inputs,
537551
),
@@ -811,8 +825,18 @@ async def _generate_input_for_eval(
811825
or getattr(eval_item, "expected_output", None)
812826
or {}
813827
)
828+
# Set agent model on the input mocking strategy if not already set
829+
input_strategy = eval_item.input_mocking_strategy
830+
# If input strategy does not specify a model, extract it
831+
if input_strategy and not input_strategy.model:
832+
input_generation_model = get_agent_model(self.context.runtime_schema)
833+
if input_generation_model:
834+
input_strategy = input_strategy.model_copy(
835+
update={"model": ModelSettings(model=input_generation_model)}
836+
)
837+
814838
generated_input = await generate_llm_input(
815-
eval_item.input_mocking_strategy,
839+
input_strategy,
816840
(await self.get_schema()).input,
817841
expected_behavior=eval_item.expected_agent_behavior or "",
818842
expected_output=expected_output,

packages/uipath/tests/cli/eval/test_eval_runtime_metadata.py

Lines changed: 10 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
"""Tests for UiPathEvalRuntime metadata loading functionality.
22
33
This module tests:
4-
- _get_agent_model() - cached agent model retrieval
4+
- get_agent_model() - cached agent model retrieval
55
- get_schema() - cached schema retrieval
66
"""
77

@@ -10,11 +10,9 @@
1010

1111
import pytest
1212

13-
from uipath._cli.cli_eval import (
14-
_get_agent_model,
15-
)
1613
from uipath.core.events import EventBus
1714
from uipath.core.tracing import UiPathTraceManager
15+
from uipath.eval.helpers import get_agent_model
1816
from uipath.eval.runtime import UiPathEvalContext, UiPathEvalRuntime
1917
from uipath.runtime import (
2018
UiPathExecuteOptions,
@@ -119,34 +117,34 @@ async def dispose(self) -> None:
119117

120118

121119
class TestGetAgentModel:
122-
"""Tests for _get_agent_model function."""
120+
"""Tests for get_agent_model function."""
123121

124122
@pytest.mark.asyncio
125123
async def test_returns_agent_model(self):
126-
"""Test that _get_agent_model returns the correct model from schema."""
124+
"""Test that get_agent_model returns the correct model from schema."""
127125
schema = MockRuntimeSchema()
128126
schema.metadata = {"settings": {"model": "gpt-4o-2024-11-20"}}
129127

130-
model = _get_agent_model(schema)
128+
model = get_agent_model(schema)
131129
assert model == "gpt-4o-2024-11-20"
132130

133131
@pytest.mark.asyncio
134132
async def test_returns_none_when_no_model(self):
135-
"""Test that _get_agent_model returns None when runtime has no model."""
133+
"""Test that get_agent_model returns None when runtime has no model."""
136134
schema = MockRuntimeSchema()
137135

138-
model = _get_agent_model(schema)
136+
model = get_agent_model(schema)
139137
assert model is None
140138

141139
@pytest.mark.asyncio
142140
async def test_returns_model_consistently(self):
143-
"""Test that _get_agent_model returns consistent results."""
141+
"""Test that get_agent_model returns consistent results."""
144142
schema = MockRuntimeSchema()
145143
schema.metadata = {"settings": {"model": "consistent-model"}}
146144

147145
# Multiple calls should return the same value
148-
model1 = _get_agent_model(schema)
149-
model2 = _get_agent_model(schema)
146+
model1 = get_agent_model(schema)
147+
model2 = get_agent_model(schema)
150148

151149
assert model1 == model2 == "consistent-model"
152150

packages/uipath/tests/cli/test_debug_simulation.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -241,6 +241,9 @@ def test_debug_always_wraps_with_mock_runtime(
241241
) as mock_factory_get:
242242
mock_runtime = Mock()
243243
mock_runtime.dispose = AsyncMock()
244+
mock_runtime.get_schema = AsyncMock(
245+
return_value=Mock(metadata=None)
246+
)
244247

245248
mock_factory = Mock()
246249
mock_factory.new_runtime = AsyncMock(return_value=mock_runtime)
@@ -305,6 +308,9 @@ def test_debug_wraps_with_mock_runtime_on_error(
305308
) as mock_factory_get:
306309
mock_runtime = Mock()
307310
mock_runtime.dispose = AsyncMock()
311+
mock_runtime.get_schema = AsyncMock(
312+
return_value=Mock(metadata=None)
313+
)
308314

309315
mock_factory = Mock()
310316
mock_factory.new_runtime = AsyncMock(return_value=mock_runtime)

0 commit comments

Comments (0)