src/uipath/_cli/_evals/_runtime.py (44 changes: 38 additions & 6 deletions)
@@ -7,7 +7,8 @@
from time import time
from typing import Any, Dict, Generic, List, Optional, Sequence, TypeVar

from opentelemetry.sdk.trace import ReadableSpan
from opentelemetry import context as context_api
from opentelemetry.sdk.trace import ReadableSpan, Span
from opentelemetry.sdk.trace.export import SpanExporter, SpanExportResult

from ..._events._event_bus import EventBus
@@ -24,6 +25,7 @@
from ...eval.models.models import AgentExecution, EvalItemResult
from .._runtime._contracts import (
UiPathBaseRuntime,
UiPathExecutionBatchTraceProcessor,
UiPathRuntimeContext,
UiPathRuntimeFactory,
UiPathRuntimeResult,
@@ -41,7 +43,11 @@
UiPathEvalOutput,
UiPathEvalRunExecutionOutput,
)
from .mocks.mocks import set_evaluation_item
from ._span_collection import ExecutionSpanCollector
from .mocks.mocks import (
clear_execution_context,
set_execution_context,
)

T = TypeVar("T", bound=UiPathBaseRuntime)
C = TypeVar("C", bound=UiPathRuntimeContext)
@@ -78,6 +84,24 @@ def shutdown(self) -> None:
self.clear()


class ExecutionSpanProcessor(UiPathExecutionBatchTraceProcessor):
"""Span processor that adds spans to ExecutionSpanCollector when they start."""

def __init__(self, span_exporter: SpanExporter, collector: ExecutionSpanCollector):
super().__init__(span_exporter)
self.collector = collector

def on_start(
self, span: Span, parent_context: Optional[context_api.Context] = None
) -> None:
super().on_start(span, parent_context)

if span.attributes and "execution.id" in span.attributes:
exec_id = span.attributes["execution.id"]
if isinstance(exec_id, str):
self.collector.add_span(span, exec_id)


class ExecutionLogsExporter:
"""Custom exporter that stores multiple execution log handlers."""

@@ -127,8 +151,15 @@ def __init__(
self.context: UiPathEvalContext = context
self.factory: UiPathRuntimeFactory[T, C] = factory
self.event_bus: EventBus = event_bus

self.span_exporter: ExecutionSpanExporter = ExecutionSpanExporter()
self.factory.add_span_exporter(self.span_exporter)
self.span_collector: ExecutionSpanCollector = ExecutionSpanCollector()

# Span processor feeds both exporter and collector
span_processor = ExecutionSpanProcessor(self.span_exporter, self.span_collector)
self.factory.tracer_span_processors.append(span_processor)
self.factory.tracer_provider.add_span_processor(span_processor)

self.logs_exporter: ExecutionLogsExporter = ExecutionLogsExporter()
self.execution_id = str(uuid.uuid4())

@@ -180,7 +211,6 @@ async def execute(self) -> Optional[UiPathRuntimeResult]:
evaluation_set_name=evaluation_set.name,
evaluation_set_results=eval_run_result_list,
)

# Computing evaluator averages
evaluator_averages: Dict[str, float] = defaultdict(float)
evaluator_count: Dict[str, int] = defaultdict(int)
@@ -194,7 +224,6 @@ async def execute(self) -> Optional[UiPathRuntimeResult]:
evaluator_averages[eval_id] = (
evaluator_averages[eval_id] / evaluator_count[eval_id]
)

await event_bus.publish(
EvaluationEvents.UPDATE_EVAL_SET_RUN,
EvalSetRunUpdatedEvent(
Expand Down Expand Up @@ -289,7 +318,7 @@ async def _execute_eval(
evaluators: List[BaseEvaluator[Any]],
event_bus: EventBus,
) -> EvaluationRunResult:
set_evaluation_item(eval_item)
set_execution_context(eval_item, self.span_collector)

await event_bus.publish(
EvaluationEvents.CREATE_EVAL_RUN,
Expand Down Expand Up @@ -383,6 +412,8 @@ async def _execute_eval(
eval_run_updated_event,
wait_for_completion=False,
)
finally:
clear_execution_context()

return evaluation_run_results

@@ -391,6 +422,7 @@ def _get_and_clear_execution_data(
) -> tuple[List[ReadableSpan], list[logging.LogRecord]]:
spans = self.span_exporter.get_spans(execution_id)
self.span_exporter.clear(execution_id)
self.span_collector.clear(execution_id)

logs = self.logs_exporter.get_logs(execution_id)
self.logs_exporter.clear(execution_id)
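The runtime change above registers one ExecutionSpanProcessor with both the factory's processor list and the tracer provider, so spans reach the exporter and the live collector through a single hook. Below is a minimal, self-contained sketch of that pattern against plain opentelemetry-sdk (DemoCollector and DemoProcessor are hypothetical stand-ins, not part of this PR). One caveat worth encoding: on_start only sees attributes supplied at span creation, not values set later via span.set_attribute().

from collections import defaultdict
from typing import Dict, List, Optional

from opentelemetry import context as context_api, trace
from opentelemetry.sdk.trace import ReadableSpan, Span, SpanProcessor, TracerProvider


class DemoCollector:
    """Buckets live spans by execution id (stand-in for ExecutionSpanCollector)."""

    def __init__(self) -> None:
        self._spans: Dict[str, List[ReadableSpan]] = defaultdict(list)

    def add_span(self, span: Span, execution_id: str) -> None:
        self._spans[execution_id].append(span)

    def get_spans(self, execution_id: str) -> List[ReadableSpan]:
        return list(self._spans[execution_id])


class DemoProcessor(SpanProcessor):
    """Captures spans at start time, keyed by their "execution.id" attribute."""

    def __init__(self, collector: DemoCollector) -> None:
        self.collector = collector

    def on_start(
        self, span: Span, parent_context: Optional[context_api.Context] = None
    ) -> None:
        # Only attributes passed to start_span() are visible at this point;
        # attributes set later through span.set_attribute() are not.
        if span.attributes and "execution.id" in span.attributes:
            exec_id = span.attributes["execution.id"]
            if isinstance(exec_id, str):
                self.collector.add_span(span, exec_id)


provider = TracerProvider()
collector = DemoCollector()
provider.add_span_processor(DemoProcessor(collector))
tracer = trace.get_tracer(__name__, tracer_provider=provider)

with tracer.start_as_current_span("tool_call", attributes={"execution.id": "run-1"}):
    pass  # the span is already in the collector before it ends

assert len(collector.get_spans("run-1")) == 1

Collecting at on_start rather than at export time is what lets the mocker inspect a partial trace while a run is still in progress: an open span appears in the collector before its output.value attribute even exists.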
src/uipath/_cli/_evals/_span_collection.py (24 changes: 24 additions & 0 deletions)
@@ -0,0 +1,24 @@
from collections import defaultdict
from typing import Dict, List, Optional

from opentelemetry.sdk.trace import ReadableSpan, Span


class ExecutionSpanCollector:
"""Collects spans as they are created during execution."""

def __init__(self):
# { execution_id -> list of spans }
self._spans: Dict[str, List[ReadableSpan]] = defaultdict(list)

def add_span(self, span: Span, execution_id: str) -> None:
self._spans[execution_id].append(span)

def get_spans(self, execution_id: str) -> List[ReadableSpan]:
return self._spans.get(execution_id, [])

def clear(self, execution_id: Optional[str] = None) -> None:
if execution_id:
self._spans.pop(execution_id, None)
else:
self._spans.clear()
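A short usage sketch of the collector's semantics (assuming the module above is importable). Note that get_spans reads through Dict.get, so querying an unknown id returns [] without creating an entry; when an entry does exist, the internal list itself is returned, so callers should treat it as read-only:

from uipath._cli._evals._span_collection import ExecutionSpanCollector

collector = ExecutionSpanCollector()

# Unknown ids are safe to query and leave no entry behind.
assert collector.get_spans("missing") == []

# clear() is tolerant as well: per-execution or global.
collector.clear("missing")  # no-op thanks to pop(..., None)
collector.clear()           # wipes every execution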
src/uipath/_cli/_evals/mocks/llm_mocker.py (18 changes: 16 additions & 2 deletions)
@@ -6,6 +6,9 @@

from pydantic import BaseModel

from uipath.tracing._traced import traced
from uipath.tracing._utils import _SpanUtils

from .._models._evaluation_set import (
EvaluationItem,
LLMMockingStrategy,
@@ -51,7 +54,7 @@
3. Always include the entire output regardless of token length.
4. Consider the context of the current test run and the agent being tested. If the agent is acting on a property, make sure the output includes that property.

Respond ONLY with valid JSON that would be a realistic and completetool response. Do not include any explanations or markdown.
Respond ONLY with valid JSON that would be a realistic and complete tool response. Do not include any explanations or markdown.
"""

logger = logging.getLogger(__name__)
@@ -79,6 +82,7 @@ def __init__(self, evaluation_item: EvaluationItem):
self.evaluation_item = evaluation_item
assert isinstance(self.evaluation_item.mocking_strategy, LLMMockingStrategy)

@traced(name="__mocker__")
async def response(
self, func: Callable[[T], R], params: dict[str, Any], *args: T, **kwargs
) -> R:
@@ -92,6 +96,8 @@ async def response(
from uipath import UiPath
from uipath._services.llm_gateway_service import _cleanup_schema

from .mocks import evaluation_context, span_collector_context

llm = UiPath().llm
return_type: Any = func.__annotations__.get("return", None)
if return_type is None:
@@ -116,9 +122,17 @@ class OutputSchema(BaseModel):
example_calls = [
call for call in example_calls if isinstance(call, ExampleCall)
]

test_run_history = "(empty)"
eval_item = evaluation_context.get()
span_collector = span_collector_context.get()
if eval_item and span_collector:
spans = span_collector.get_spans(eval_item.id)
test_run_history = _SpanUtils.spans_to_llm_context(spans)

prompt_input: dict[str, Any] = {
"toolRunExamples": example_calls,
"testRunHistory": [], # This should contain ordered spans.
"testRunHistory": test_run_history,
"toolInfo": {
"name": function_name,
"description": params.get("description"),
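Two details in this file work together: @traced(name="__mocker__") wraps the mocker's own LLM round-trip in a distinctly named span, and the ContextVar lookups fetch the spans collected so far for the current eval item. A condensed sketch of that lookup (build_history is a hypothetical helper, not part of this PR; it assumes the context variables added in mocks.py below):

from uipath._cli._evals.mocks.mocks import evaluation_context, span_collector_context
from uipath.tracing._utils import _SpanUtils


def build_history() -> str:
    """Return the LLM-ready trace of the current eval run, or "(empty)"."""
    eval_item = evaluation_context.get()
    span_collector = span_collector_context.get()
    if eval_item is None or span_collector is None:
        return "(empty)"  # not inside an eval run, or collection is disabled
    spans = span_collector.get_spans(eval_item.id)
    return _SpanUtils.spans_to_llm_context(spans)

Because both values live in ContextVars, concurrent eval items running in separate asyncio tasks each resolve their own item and collector. The __mocker__ span name matters later: spans_to_llm_context uses it to filter the mocker's internal LLM calls out of the history handed back to the mock.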
src/uipath/_cli/_evals/mocks/mocks.py (31 changes: 25 additions & 6 deletions)
@@ -5,30 +5,49 @@
from typing import Any, Callable, Optional

from uipath._cli._evals._models._evaluation_set import EvaluationItem
from uipath._cli._evals._span_collection import ExecutionSpanCollector
from uipath._cli._evals.mocks.mocker import Mocker, UiPathNoMockFoundError
from uipath._cli._evals.mocks.mocker_factory import MockerFactory

# Context variables for evaluation items and mockers
evaluation_context: ContextVar[Optional[EvaluationItem]] = ContextVar(
"evaluation", default=None
)

mocker_context: ContextVar[Optional[Mocker]] = ContextVar("mocker", default=None)

# Span collector for trace access during mocking
span_collector_context: ContextVar[Optional[ExecutionSpanCollector]] = ContextVar(
"span_collector", default=None
)

logger = logging.getLogger(__name__)


def set_evaluation_item(item: EvaluationItem) -> None:
"""Set an evaluation item within an evaluation set."""
evaluation_context.set(item)
def set_execution_context(
eval_item: EvaluationItem, span_collector: ExecutionSpanCollector
) -> None:
"""Set the execution context for an evaluation run for mocking and trace access."""
evaluation_context.set(eval_item)

try:
if item.mocking_strategy:
mocker_context.set(MockerFactory.create(item))
if eval_item.mocking_strategy:
mocker_context.set(MockerFactory.create(eval_item))
else:
mocker_context.set(None)
except Exception:
logger.warning(f"Failed to create mocker for evaluation {item.name}")
logger.warning(f"Failed to create mocker for evaluation {eval_item.name}")
mocker_context.set(None)

span_collector_context.set(span_collector)


def clear_execution_context() -> None:
"""Clear the execution context after evaluation completes."""
evaluation_context.set(None)
mocker_context.set(None)
span_collector_context.set(None)


async def get_mocked_response(
func: Callable[[Any], Any], params: dict[str, Any], *args, **kwargs
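A minimal sketch (pure contextvars, no UiPath imports) of why set_execution_context and clear_execution_context come as a pair: the runtime calls the clear in a finally block, so a failing eval cannot leak its item into the next run on the same context.

from contextvars import ContextVar
from typing import Optional

evaluation: ContextVar[Optional[str]] = ContextVar("evaluation", default=None)


def run_eval(item: str) -> None:
    evaluation.set(item)
    try:
        assert evaluation.get() == item  # mockers resolve the item from here
    finally:
        evaluation.set(None)  # mirrors clear_execution_context()


run_eval("eval-1")
assert evaluation.get() is None  # nothing leaks into the next sequential run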
src/uipath/tracing/_utils.py (52 changes: 52 additions & 0 deletions)
@@ -319,3 +319,55 @@ def format_args_for_trace(
f"Error formatting arguments for trace: {e}. Using args and kwargs directly."
)
return {"args": args, "kwargs": kwargs}

@staticmethod
def _has_ancestor_with_name(
span: ReadableSpan, ancestor_name: str, span_map: Dict[int, ReadableSpan]
) -> bool:
"""Check if this span or any of its ancestors has a given name."""
if span.name == ancestor_name:
return True

current = span
while current.parent is not None:
parent_span = span_map.get(current.parent.span_id)
if parent_span is None:
break
if parent_span.name == ancestor_name:
return True
current = parent_span

return False

@staticmethod
def spans_to_llm_context(spans: list[ReadableSpan]) -> str:
"""Convert spans to a formatted conversation history string suitable for LLM context.

Includes function calls (including LLM calls) with their inputs and outputs.
"""
# Build span_id -> span map for parent chain traversal
span_map = {span.get_span_context().span_id: span for span in spans}

history = []
for span in spans:
attributes = dict(span.attributes) if span.attributes else {}

input_value = attributes.get("input.value")
output_value = attributes.get("output.value")

if not input_value or not output_value:
continue

# Skip spans that are internal LLM calls (e.g., for tool mocking in evals)
if _SpanUtils._has_ancestor_with_name(span, "__mocker__", span_map):
continue

history.append(f"Function: {span.name}")
history.append(f"Input: {input_value}")
history.append(f"Output: {output_value}")
history.append("")

if not history:
return "(empty)"

return "\n".join(history)
tests/cli/eval/mocks/test_mocks.py (21 changes: 12 additions & 9 deletions)
@@ -1,4 +1,5 @@
from typing import Any
from unittest.mock import MagicMock

import pytest
from _pytest.monkeypatch import MonkeyPatch
@@ -10,9 +11,11 @@
MockitoMockingStrategy,
)
from uipath._cli._evals.mocks.mocker import UiPathMockResponseGenerationError
from uipath._cli._evals.mocks.mocks import set_evaluation_item
from uipath._cli._evals.mocks.mocks import set_execution_context
from uipath.eval.mocks import mockable

_mock_span_collector = MagicMock()


def test_mockito_mockable_sync():
# Arrange
@@ -51,7 +54,7 @@ def foofoo(*args, **kwargs):
assert isinstance(evaluation.mocking_strategy, MockitoMockingStrategy)

# Act & Assert
set_evaluation_item(evaluation)
set_execution_context(evaluation, _mock_span_collector)
assert foo() == "bar1"
assert foo() == "bar2"
assert foo() == "bar2"
@@ -63,13 +66,13 @@ def foofoo(*args, **kwargs):
assert foofoo()

evaluation.mocking_strategy.behaviors[0].arguments.kwargs = {"x": 1}
set_evaluation_item(evaluation)
set_execution_context(evaluation, _mock_span_collector)
assert foo(x=1) == "bar1"

evaluation.mocking_strategy.behaviors[0].arguments.kwargs = {
"x": {"_target_": "mockito.any"}
}
set_evaluation_item(evaluation)
set_execution_context(evaluation, _mock_span_collector)
assert foo(x=2) == "bar1"


@@ -111,7 +114,7 @@ async def foofoo(*args, **kwargs):
assert isinstance(evaluation.mocking_strategy, MockitoMockingStrategy)

# Act & Assert
set_evaluation_item(evaluation)
set_execution_context(evaluation, _mock_span_collector)
assert await foo() == "bar1"
assert await foo() == "bar2"
assert await foo() == "bar2"
@@ -123,13 +126,13 @@ async def foofoo(*args, **kwargs):
assert await foofoo()

evaluation.mocking_strategy.behaviors[0].arguments.kwargs = {"x": 1}
set_evaluation_item(evaluation)
set_execution_context(evaluation, _mock_span_collector)
assert await foo(x=1) == "bar1"

evaluation.mocking_strategy.behaviors[0].arguments.kwargs = {
"x": {"_target_": "mockito.any"}
}
set_evaluation_item(evaluation)
set_execution_context(evaluation, _mock_span_collector)
assert await foo(x=2) == "bar1"


@@ -201,7 +204,7 @@ def foofoo(*args, **kwargs):
},
)
# Act & Assert
set_evaluation_item(evaluation)
set_execution_context(evaluation, _mock_span_collector)

assert foo() == "bar1"
with pytest.raises(NotImplementedError):
@@ -274,7 +277,7 @@ async def foofoo(*args, **kwargs):
},
)
# Act & Assert
set_evaluation_item(evaluation)
set_execution_context(evaluation, _mock_span_collector)

assert await foo() == "bar1"
with pytest.raises(NotImplementedError):
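The shared module-level _mock_span_collector keeps these tests terse, but the ContextVars it populates survive across test functions. A possible tightening (not part of this PR) would be a fixture that pairs every set with the new clear:

import pytest
from unittest.mock import MagicMock

from uipath._cli._evals.mocks.mocks import (
    clear_execution_context,
    set_execution_context,
)


@pytest.fixture
def execution_context():
    """Yield a setter; always reset the ContextVars afterwards."""

    def _set(evaluation):
        set_execution_context(evaluation, MagicMock())

    yield _set
    clear_execution_context()

A test would then take execution_context as an argument and call execution_context(evaluation) instead of set_execution_context(evaluation, _mock_span_collector), guaranteeing cleanup even when an assertion fails mid-test.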