diff --git a/src/uipath/_cli/_evals/_runtime.py b/src/uipath/_cli/_evals/_runtime.py
index 271af0a7f..75392dcb3 100644
--- a/src/uipath/_cli/_evals/_runtime.py
+++ b/src/uipath/_cli/_evals/_runtime.py
@@ -7,7 +7,8 @@
 from time import time
 from typing import Any, Dict, Generic, List, Optional, Sequence, TypeVar
 
-from opentelemetry.sdk.trace import ReadableSpan
+from opentelemetry import context as context_api
+from opentelemetry.sdk.trace import ReadableSpan, Span
 from opentelemetry.sdk.trace.export import SpanExporter, SpanExportResult
 
 from ..._events._event_bus import EventBus
@@ -24,6 +25,7 @@
 from ...eval.models.models import AgentExecution, EvalItemResult
 from .._runtime._contracts import (
     UiPathBaseRuntime,
+    UiPathExecutionBatchTraceProcessor,
     UiPathRuntimeContext,
     UiPathRuntimeFactory,
     UiPathRuntimeResult,
@@ -41,7 +43,11 @@
     UiPathEvalOutput,
     UiPathEvalRunExecutionOutput,
 )
-from .mocks.mocks import set_evaluation_item
+from ._span_collection import ExecutionSpanCollector
+from .mocks.mocks import (
+    clear_execution_context,
+    set_execution_context,
+)
 
 T = TypeVar("T", bound=UiPathBaseRuntime)
 C = TypeVar("C", bound=UiPathRuntimeContext)
@@ -78,6 +84,24 @@
     def shutdown(self) -> None:
         self.clear()
 
 
+class ExecutionSpanProcessor(UiPathExecutionBatchTraceProcessor):
+    """Span processor that adds spans to ExecutionSpanCollector when they start."""
+
+    def __init__(self, span_exporter: SpanExporter, collector: ExecutionSpanCollector):
+        super().__init__(span_exporter)
+        self.collector = collector
+
+    def on_start(
+        self, span: Span, parent_context: Optional[context_api.Context] = None
+    ) -> None:
+        super().on_start(span, parent_context)
+
+        if span.attributes and "execution.id" in span.attributes:
+            exec_id = span.attributes["execution.id"]
+            if isinstance(exec_id, str):
+                self.collector.add_span(span, exec_id)
+
+
 class ExecutionLogsExporter:
     """Custom exporter that stores multiple execution log handlers."""
@@ -127,8 +151,15 @@ def __init__(
         self.context: UiPathEvalContext = context
         self.factory: UiPathRuntimeFactory[T, C] = factory
         self.event_bus: EventBus = event_bus
+
         self.span_exporter: ExecutionSpanExporter = ExecutionSpanExporter()
-        self.factory.add_span_exporter(self.span_exporter)
+        self.span_collector: ExecutionSpanCollector = ExecutionSpanCollector()
+
+        # Span processor feeds both exporter and collector
+        span_processor = ExecutionSpanProcessor(self.span_exporter, self.span_collector)
+        self.factory.tracer_span_processors.append(span_processor)
+        self.factory.tracer_provider.add_span_processor(span_processor)
+
         self.logs_exporter: ExecutionLogsExporter = ExecutionLogsExporter()
         self.execution_id = str(uuid.uuid4())
@@ -180,7 +211,6 @@ async def execute(self) -> Optional[UiPathRuntimeResult]:
                 evaluation_set_name=evaluation_set.name,
                 evaluation_set_results=eval_run_result_list,
             )
-
            # Computing evaluator averages
            evaluator_averages: Dict[str, float] = defaultdict(float)
            evaluator_count: Dict[str, int] = defaultdict(int)
@@ -194,7 +224,6 @@ async def execute(self) -> Optional[UiPathRuntimeResult]:
                evaluator_averages[eval_id] = (
                    evaluator_averages[eval_id] / evaluator_count[eval_id]
                )
-
            await event_bus.publish(
                EvaluationEvents.UPDATE_EVAL_SET_RUN,
                EvalSetRunUpdatedEvent(
@@ -289,7 +318,7 @@ async def _execute_eval(
        evaluators: List[BaseEvaluator[Any]],
        event_bus: EventBus,
    ) -> EvaluationRunResult:
-        set_evaluation_item(eval_item)
+        set_execution_context(eval_item, self.span_collector)
 
        await event_bus.publish(
            EvaluationEvents.CREATE_EVAL_RUN,
@@ -383,6 +412,8 @@ async def _execute_eval(
                    eval_run_updated_event,
                    wait_for_completion=False,
                )
+        finally:
+            clear_execution_context()
 
        return evaluation_run_results
 
@@ -391,6 +422,7 @@ def _get_and_clear_execution_data(
    ) -> tuple[List[ReadableSpan], list[logging.LogRecord]]:
        spans = self.span_exporter.get_spans(execution_id)
        self.span_exporter.clear(execution_id)
+        self.span_collector.clear(execution_id)
        logs = self.logs_exporter.get_logs(execution_id)
        self.logs_exporter.clear(execution_id)
diff --git a/src/uipath/_cli/_evals/_span_collection.py b/src/uipath/_cli/_evals/_span_collection.py
new file mode 100644
index 000000000..ba00eba8f
--- /dev/null
+++ b/src/uipath/_cli/_evals/_span_collection.py
@@ -0,0 +1,24 @@
+from collections import defaultdict
+from typing import Dict, List, Optional
+
+from opentelemetry.sdk.trace import ReadableSpan, Span
+
+
+class ExecutionSpanCollector:
+    """Collects spans as they are created during execution."""
+
+    def __init__(self):
+        # { execution_id -> list of spans }
+        self._spans: Dict[str, List[ReadableSpan]] = defaultdict(list)
+
+    def add_span(self, span: Span, execution_id: str) -> None:
+        self._spans[execution_id].append(span)
+
+    def get_spans(self, execution_id: str) -> List[ReadableSpan]:
+        return self._spans.get(execution_id, [])
+
+    def clear(self, execution_id: Optional[str] = None) -> None:
+        if execution_id:
+            self._spans.pop(execution_id, None)
+        else:
+            self._spans.clear()
diff --git a/src/uipath/_cli/_evals/mocks/llm_mocker.py b/src/uipath/_cli/_evals/mocks/llm_mocker.py
index e86644592..95ab29754 100644
--- a/src/uipath/_cli/_evals/mocks/llm_mocker.py
+++ b/src/uipath/_cli/_evals/mocks/llm_mocker.py
@@ -6,6 +6,9 @@
 
 from pydantic import BaseModel
 
+from uipath.tracing._traced import traced
+from uipath.tracing._utils import _SpanUtils
+
 from .._models._evaluation_set import (
     EvaluationItem,
     LLMMockingStrategy,
@@ -51,7 +54,7 @@
 3. Always include the entire output regardless of token length.
 3. Consider the context of the current test run and the agent being tested.
 If the agent is acting on a property, make sure the output includes that property.
-Respond ONLY with valid JSON that would be a realistic and completetool response. Do not include any explanations or markdown.
+Respond ONLY with valid JSON that would be a realistic and complete tool response. Do not include any explanations or markdown.
 """
 
 logger = logging.getLogger(__name__)
@@ -79,6 +82,7 @@ def __init__(self, evaluation_item: EvaluationItem):
        self.evaluation_item = evaluation_item
        assert isinstance(self.evaluation_item.mocking_strategy, LLMMockingStrategy)
 
+    @traced(name="__mocker__")
    async def response(
        self, func: Callable[[T], R], params: dict[str, Any], *args: T, **kwargs
    ) -> R:
@@ -92,6 +96,8 @@ async def response(
        from uipath import UiPath
        from uipath._services.llm_gateway_service import _cleanup_schema
 
+        from .mocks import evaluation_context, span_collector_context
+
        llm = UiPath().llm
        return_type: Any = func.__annotations__.get("return", None)
        if return_type is None:
@@ -116,9 +122,17 @@ class OutputSchema(BaseModel):
            example_calls = [
                call for call in example_calls if isinstance(call, ExampleCall)
            ]
+
+            test_run_history = "(empty)"
+            eval_item = evaluation_context.get()
+            span_collector = span_collector_context.get()
+            if eval_item and span_collector:
+                spans = span_collector.get_spans(eval_item.id)
+                test_run_history = _SpanUtils.spans_to_llm_context(spans)
+
            prompt_input: dict[str, Any] = {
                "toolRunExamples": example_calls,
-                "testRunHistory": [],  # This should contain ordered spans.
+                "testRunHistory": test_run_history,
                "toolInfo": {
                    "name": function_name,
                    "description": params.get("description"),
diff --git a/src/uipath/_cli/_evals/mocks/mocks.py b/src/uipath/_cli/_evals/mocks/mocks.py
index 9a20809c3..48f0896dd 100644
--- a/src/uipath/_cli/_evals/mocks/mocks.py
+++ b/src/uipath/_cli/_evals/mocks/mocks.py
@@ -5,30 +5,49 @@
 from typing import Any, Callable, Optional
 
 from uipath._cli._evals._models._evaluation_set import EvaluationItem
+from uipath._cli._evals._span_collection import ExecutionSpanCollector
 from uipath._cli._evals.mocks.mocker import Mocker, UiPathNoMockFoundError
 from uipath._cli._evals.mocks.mocker_factory import MockerFactory
 
+# Context variables for evaluation items and mockers
 evaluation_context: ContextVar[Optional[EvaluationItem]] = ContextVar(
     "evaluation", default=None
 )
 mocker_context: ContextVar[Optional[Mocker]] = ContextVar("mocker", default=None)
 
+# Span collector for trace access during mocking
+span_collector_context: ContextVar[Optional[ExecutionSpanCollector]] = ContextVar(
+    "span_collector", default=None
+)
+
 logger = logging.getLogger(__name__)
 
 
-def set_evaluation_item(item: EvaluationItem) -> None:
-    """Set an evaluation item within an evaluation set."""
-    evaluation_context.set(item)
+def set_execution_context(
+    eval_item: EvaluationItem, span_collector: ExecutionSpanCollector
+) -> None:
+    """Set the execution context for an evaluation run for mocking and trace access."""
+    evaluation_context.set(eval_item)
+
     try:
-        if item.mocking_strategy:
-            mocker_context.set(MockerFactory.create(item))
+        if eval_item.mocking_strategy:
+            mocker_context.set(MockerFactory.create(eval_item))
         else:
             mocker_context.set(None)
     except Exception:
-        logger.warning(f"Failed to create mocker for evaluation {item.name}")
+        logger.warning(f"Failed to create mocker for evaluation {eval_item.name}")
         mocker_context.set(None)
 
+    span_collector_context.set(span_collector)
+
+
+def clear_execution_context() -> None:
+    """Clear the execution context after evaluation completes."""
+    evaluation_context.set(None)
+    mocker_context.set(None)
+    span_collector_context.set(None)
+
+
 async def get_mocked_response(
     func: Callable[[Any], Any], params: dict[str, Any], *args, **kwargs
diff --git a/src/uipath/tracing/_utils.py b/src/uipath/tracing/_utils.py
index b82c216b9..d225d13af 100644
--- a/src/uipath/tracing/_utils.py
+++ b/src/uipath/tracing/_utils.py
@@ -319,3 +319,55 @@ def format_args_for_trace(
                f"Error formatting arguments for trace: {e}. Using args and kwargs directly."
            )
            return {"args": args, "kwargs": kwargs}
+
+    @staticmethod
+    def _has_ancestor_with_name(
+        span: ReadableSpan, ancestor_name: str, span_map: Dict[int, ReadableSpan]
+    ) -> bool:
+        """Check if this span or any of its ancestors has a given name."""
+        if span.name == ancestor_name:
+            return True
+
+        current = span
+        while current.parent is not None:
+            parent_span = span_map.get(current.parent.span_id)
+            if parent_span is None:
+                break
+            if parent_span.name == ancestor_name:
+                return True
+            current = parent_span
+
+        return False
+
+    @staticmethod
+    def spans_to_llm_context(spans: list[ReadableSpan]) -> str:
+        """Convert spans to a formatted conversation history string suitable for LLM context.
+
+        Includes function calls (including LLM calls) with their inputs and outputs.
+        """
+        # Build span_id -> span map for parent chain traversal
+        span_map = {span.get_span_context().span_id: span for span in spans}
+
+        history = []
+        for span in spans:
+            attributes = dict(span.attributes) if span.attributes else {}
+
+            input_value = attributes.get("input.value")
+            output_value = attributes.get("output.value")
+
+            if not input_value or not output_value:
+                continue
+
+            # Skip spans that are internal LLM calls (eg. for tool mocking in evals)
+            if _SpanUtils._has_ancestor_with_name(span, "__mocker__", span_map):
+                continue
+
+            history.append(f"Function: {span.name}")
+            history.append(f"Input: {input_value}")
+            history.append(f"Output: {output_value}")
+            history.append("")
+
+        if not history:
+            return "(empty)"
+
+        return "\n".join(history)
diff --git a/tests/cli/eval/mocks/test_mocks.py b/tests/cli/eval/mocks/test_mocks.py
index d374bb74c..78b64b0f4 100644
--- a/tests/cli/eval/mocks/test_mocks.py
+++ b/tests/cli/eval/mocks/test_mocks.py
@@ -1,4 +1,5 @@
 from typing import Any
+from unittest.mock import MagicMock
 
 import pytest
 from _pytest.monkeypatch import MonkeyPatch
@@ -10,9 +11,11 @@
     MockitoMockingStrategy,
 )
 from uipath._cli._evals.mocks.mocker import UiPathMockResponseGenerationError
-from uipath._cli._evals.mocks.mocks import set_evaluation_item
+from uipath._cli._evals.mocks.mocks import set_execution_context
 from uipath.eval.mocks import mockable
 
+_mock_span_collector = MagicMock()
+
 
 def test_mockito_mockable_sync():
     # Arrange
@@ -51,7 +54,7 @@ def foofoo(*args, **kwargs):
     assert isinstance(evaluation.mocking_strategy, MockitoMockingStrategy)
 
     # Act & Assert
-    set_evaluation_item(evaluation)
+    set_execution_context(evaluation, _mock_span_collector)
     assert foo() == "bar1"
     assert foo() == "bar2"
     assert foo() == "bar2"
@@ -63,13 +66,13 @@ def foofoo(*args, **kwargs):
     assert foofoo()
 
     evaluation.mocking_strategy.behaviors[0].arguments.kwargs = {"x": 1}
-    set_evaluation_item(evaluation)
+    set_execution_context(evaluation, _mock_span_collector)
     assert foo(x=1) == "bar1"
 
     evaluation.mocking_strategy.behaviors[0].arguments.kwargs = {
         "x": {"_target_": "mockito.any"}
     }
-    set_evaluation_item(evaluation)
+    set_execution_context(evaluation, _mock_span_collector)
     assert foo(x=2) == "bar1"
 
 
@@ -111,7 +114,7 @@ async def foofoo(*args, **kwargs):
     assert isinstance(evaluation.mocking_strategy, MockitoMockingStrategy)
 
     # Act & Assert
-    set_evaluation_item(evaluation)
+    set_execution_context(evaluation, _mock_span_collector)
     assert await foo() == "bar1"
     assert await foo() == "bar2"
     assert await foo() == "bar2"
@@ -123,13 +126,13 @@ async def foofoo(*args, **kwargs):
     assert await foofoo()
 
     evaluation.mocking_strategy.behaviors[0].arguments.kwargs = {"x": 1}
-    set_evaluation_item(evaluation)
+    set_execution_context(evaluation, _mock_span_collector)
     assert await foo(x=1) == "bar1"
 
     evaluation.mocking_strategy.behaviors[0].arguments.kwargs = {
         "x": {"_target_": "mockito.any"}
     }
-    set_evaluation_item(evaluation)
+    set_execution_context(evaluation, _mock_span_collector)
     assert await foo(x=2) == "bar1"
 
 
@@ -201,7 +204,7 @@ def foofoo(*args, **kwargs):
         },
     )
     # Act & Assert
-    set_evaluation_item(evaluation)
+    set_execution_context(evaluation, _mock_span_collector)
     assert foo() == "bar1"
 
     with pytest.raises(NotImplementedError):
@@ -274,7 +277,7 @@ async def foofoo(*args, **kwargs):
         },
     )
     # Act & Assert
-    set_evaluation_item(evaluation)
+    set_execution_context(evaluation, _mock_span_collector)
     assert await foo() == "bar1"
 
     with pytest.raises(NotImplementedError):
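
Note (not part of the patch): the sketch below is a minimal, self-contained illustration of the span-collection pattern this diff introduces: a processor that files live spans by their `execution.id` attribute at `on_start`, plus the flat `Function/Input/Output` rendering that `spans_to_llm_context` builds for the mocker prompt. It uses only the OpenTelemetry SDK; `DemoCollector`, `DemoProcessor`, and `demo_spans_to_context` are illustrative stand-ins, not APIs from this repository.

```python
from collections import defaultdict
from typing import Dict, List, Optional

from opentelemetry import context as context_api
from opentelemetry.sdk.trace import ReadableSpan, Span, SpanProcessor, TracerProvider


class DemoCollector:
    """Groups live spans by their execution.id attribute (stand-in for ExecutionSpanCollector)."""

    def __init__(self) -> None:
        self._spans: Dict[str, List[ReadableSpan]] = defaultdict(list)

    def add_span(self, span: Span, execution_id: str) -> None:
        self._spans[execution_id].append(span)

    def get_spans(self, execution_id: str) -> List[ReadableSpan]:
        return self._spans.get(execution_id, [])


class DemoProcessor(SpanProcessor):
    """Files spans at start time, mirroring ExecutionSpanProcessor.on_start in the diff."""

    def __init__(self, collector: DemoCollector) -> None:
        self.collector = collector

    def on_start(
        self, span: Span, parent_context: Optional[context_api.Context] = None
    ) -> None:
        if span.attributes and "execution.id" in span.attributes:
            exec_id = span.attributes["execution.id"]
            if isinstance(exec_id, str):
                self.collector.add_span(span, exec_id)

    def on_end(self, span: ReadableSpan) -> None:
        # Nothing to do at end time; the collector already holds the span.
        pass


def demo_spans_to_context(spans: List[ReadableSpan]) -> str:
    """Flat Function/Input/Output rendering, mirroring spans_to_llm_context."""
    lines: List[str] = []
    for span in spans:
        attrs = dict(span.attributes) if span.attributes else {}
        if not attrs.get("input.value") or not attrs.get("output.value"):
            continue
        lines += [
            f"Function: {span.name}",
            f"Input: {attrs['input.value']}",
            f"Output: {attrs['output.value']}",
            "",
        ]
    return "\n".join(lines) if lines else "(empty)"


if __name__ == "__main__":
    collector = DemoCollector()
    provider = TracerProvider()
    provider.add_span_processor(DemoProcessor(collector))
    tracer = provider.get_tracer("demo")

    # The span is collected at on_start; input/output attributes set later are
    # still visible because the collector holds the live Span object.
    with tracer.start_as_current_span(
        "lookup_invoice", attributes={"execution.id": "exec-1"}
    ) as span:
        span.set_attribute("input.value", '{"invoice_id": 42}')
        span.set_attribute("output.value", '{"status": "paid"}')

    print(demo_spans_to_context(collector.get_spans("exec-1")))
```

Collecting at `on_start` rather than waiting for export is what lets the mocker read a partial run history mid-execution, which appears to be the motivation for adding the processor alongside the existing exporter.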
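Note (not part of the patch): `set_execution_context` / `clear_execution_context` lean on `contextvars`, so the evaluation item, mocker, and span collector become ambient state for whatever runs inside one evaluation without being threaded through every call signature. A rough sketch of that lifecycle, with hypothetical names (`run_one_eval`, `mocked_tool`) standing in for the real runtime and mocker code:

```python
import asyncio
from contextvars import ContextVar
from typing import Optional

# Stand-in for evaluation_context / span_collector_context in mocks.py.
eval_id_var: ContextVar[Optional[str]] = ContextVar("eval_id", default=None)


async def mocked_tool() -> str:
    # Deep inside the agent, the mocker reads the ambient evaluation id
    # without it being passed down explicitly.
    eval_id = eval_id_var.get()
    return f"mock response for {eval_id or 'no active evaluation'}"


async def run_one_eval(eval_id: str) -> None:
    eval_id_var.set(eval_id)      # like set_execution_context(...)
    try:
        print(await mocked_tool())
    finally:
        eval_id_var.set(None)     # like clear_execution_context()


async def main() -> None:
    # Each evaluation runs with its own value; clearing in `finally`
    # guarantees nothing leaks into the next run even if one raises.
    await run_one_eval("eval-1")
    await run_one_eval("eval-2")
    print(await mocked_tool())    # -> "mock response for no active evaluation"


asyncio.run(main())
```

Clearing in `finally` mirrors the `finally: clear_execution_context()` added to `_execute_eval`, so a failed evaluation cannot leak its mocking context into the next one.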