src/uipath/_cli/_evals/_runtime.py (44 changes: 38 additions & 6 deletions)
@@ -7,7 +7,8 @@
from time import time
from typing import Any, Dict, Generic, List, Optional, Sequence, TypeVar

from opentelemetry.sdk.trace import ReadableSpan
from opentelemetry import context as context_api
from opentelemetry.sdk.trace import ReadableSpan, Span
from opentelemetry.sdk.trace.export import SpanExporter, SpanExportResult

from ..._events._event_bus import EventBus
@@ -24,6 +25,7 @@
from ...eval.models.models import AgentExecution, EvalItemResult
from .._runtime._contracts import (
UiPathBaseRuntime,
UiPathExecutionBatchTraceProcessor,
UiPathRuntimeContext,
UiPathRuntimeFactory,
UiPathRuntimeResult,
@@ -41,7 +43,11 @@
UiPathEvalOutput,
UiPathEvalRunExecutionOutput,
)
from .mocks.mocks import set_evaluation_item
from ._span_collection import ExecutionSpanCollector
from .mocks.mocks import (
clear_execution_context,
set_execution_context,
)

T = TypeVar("T", bound=UiPathBaseRuntime)
C = TypeVar("C", bound=UiPathRuntimeContext)
@@ -78,6 +84,24 @@ def shutdown(self) -> None:
self.clear()


class ExecutionSpanProcessor(UiPathExecutionBatchTraceProcessor):
"""Span processor that adds spans to ExecutionSpanCollector when they start."""

def __init__(self, span_exporter: SpanExporter, collector: ExecutionSpanCollector):
super().__init__(span_exporter)
self.collector = collector

def on_start(
self, span: Span, parent_context: Optional[context_api.Context] = None
) -> None:
super().on_start(span, parent_context)

if span.attributes and "execution.id" in span.attributes:
exec_id = span.attributes["execution.id"]
if isinstance(exec_id, str):
self.collector.add_span(span, exec_id)


class ExecutionLogsExporter:
"""Custom exporter that stores multiple execution log handlers."""

@@ -127,8 +151,15 @@ def __init__(
self.context: UiPathEvalContext = context
self.factory: UiPathRuntimeFactory[T, C] = factory
self.event_bus: EventBus = event_bus

self.span_exporter: ExecutionSpanExporter = ExecutionSpanExporter()
self.factory.add_span_exporter(self.span_exporter)
self.span_collector: ExecutionSpanCollector = ExecutionSpanCollector()

# Span processor feeds both exporter and collector
span_processor = ExecutionSpanProcessor(self.span_exporter, self.span_collector)
self.factory.tracer_span_processors.append(span_processor)
self.factory.tracer_provider.add_span_processor(span_processor)

self.logs_exporter: ExecutionLogsExporter = ExecutionLogsExporter()
self.execution_id = str(uuid.uuid4())

@@ -180,7 +211,6 @@ async def execute(self) -> Optional[UiPathRuntimeResult]:
evaluation_set_name=evaluation_set.name,
evaluation_set_results=eval_run_result_list,
)

# Computing evaluator averages
evaluator_averages: Dict[str, float] = defaultdict(float)
evaluator_count: Dict[str, int] = defaultdict(int)
@@ -194,7 +224,6 @@ async def execute(self) -> Optional[UiPathRuntimeResult]:
evaluator_averages[eval_id] = (
evaluator_averages[eval_id] / evaluator_count[eval_id]
)

await event_bus.publish(
EvaluationEvents.UPDATE_EVAL_SET_RUN,
EvalSetRunUpdatedEvent(
Expand Down Expand Up @@ -289,7 +318,7 @@ async def _execute_eval(
evaluators: List[BaseEvaluator[Any]],
event_bus: EventBus,
) -> EvaluationRunResult:
set_evaluation_item(eval_item)
set_execution_context(eval_item, self.span_collector)

await event_bus.publish(
EvaluationEvents.CREATE_EVAL_RUN,
Expand Down Expand Up @@ -383,6 +412,8 @@ async def _execute_eval(
eval_run_updated_event,
wait_for_completion=False,
)
finally:
clear_execution_context()

return evaluation_run_results

@@ -391,6 +422,7 @@ def _get_and_clear_execution_data(
) -> tuple[List[ReadableSpan], list[logging.LogRecord]]:
spans = self.span_exporter.get_spans(execution_id)
self.span_exporter.clear(execution_id)
self.span_collector.clear(execution_id)

logs = self.logs_exporter.get_logs(execution_id)
self.logs_exporter.clear(execution_id)
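The runtime change above registers one ExecutionSpanProcessor with both the factory's processor list and the tracer provider, so spans reach the exporter and the live collector through a single hook. Below is a minimal, self-contained sketch of that pattern against plain opentelemetry-sdk (DemoCollector and DemoProcessor are hypothetical stand-ins, not part of this PR). One caveat worth encoding: on_start only sees attributes supplied at span creation, not values set later via span.set_attribute().

from collections import defaultdict
from typing import Dict, List, Optional

from opentelemetry import context as context_api, trace
from opentelemetry.sdk.trace import ReadableSpan, Span, SpanProcessor, TracerProvider


class DemoCollector:
    """Buckets live spans by execution id (stand-in for ExecutionSpanCollector)."""

    def __init__(self) -> None:
        self._spans: Dict[str, List[ReadableSpan]] = defaultdict(list)

    def add_span(self, span: Span, execution_id: str) -> None:
        self._spans[execution_id].append(span)

    def get_spans(self, execution_id: str) -> List[ReadableSpan]:
        return list(self._spans[execution_id])


class DemoProcessor(SpanProcessor):
    """Captures spans at start time, keyed by their "execution.id" attribute."""

    def __init__(self, collector: DemoCollector) -> None:
        self.collector = collector

    def on_start(
        self, span: Span, parent_context: Optional[context_api.Context] = None
    ) -> None:
        # Only attributes passed to start_span() are visible at this point;
        # attributes set later through span.set_attribute() are not.
        if span.attributes and "execution.id" in span.attributes:
            exec_id = span.attributes["execution.id"]
            if isinstance(exec_id, str):
                self.collector.add_span(span, exec_id)


provider = TracerProvider()
collector = DemoCollector()
provider.add_span_processor(DemoProcessor(collector))
tracer = trace.get_tracer(__name__, tracer_provider=provider)

with tracer.start_as_current_span("tool_call", attributes={"execution.id": "run-1"}):
    pass  # the span is already in the collector before it ends

assert len(collector.get_spans("run-1")) == 1

Collecting at on_start rather than at export time is what lets the mocker inspect a partial trace while a run is still in progress: an open span appears in the collector before its output.value attribute even exists.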
src/uipath/_cli/_evals/_span_collection.py (24 changes: 24 additions & 0 deletions)
@@ -0,0 +1,24 @@
from collections import defaultdict
from typing import Dict, List, Optional

from opentelemetry.sdk.trace import ReadableSpan, Span


class ExecutionSpanCollector:
"""Collects spans as they are created during execution."""

def __init__(self):
# { execution_id -> list of spans }
self._spans: Dict[str, List[ReadableSpan]] = defaultdict(list)

def add_span(self, span: Span, execution_id: str) -> None:
self._spans[execution_id].append(span)

def get_spans(self, execution_id: str) -> List[ReadableSpan]:
return self._spans.get(execution_id, [])

def clear(self, execution_id: Optional[str] = None) -> None:
if execution_id:
self._spans.pop(execution_id, None)
else:
self._spans.clear()
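A short usage sketch of the collector's semantics (assuming the module above is importable). Note that get_spans reads through Dict.get, so querying an unknown id returns [] without creating an entry; when an entry does exist, the internal list itself is returned, so callers should treat it as read-only:

from uipath._cli._evals._span_collection import ExecutionSpanCollector

collector = ExecutionSpanCollector()

# Unknown ids are safe to query and leave no entry behind.
assert collector.get_spans("missing") == []

# clear() is tolerant as well: per-execution or global.
collector.clear("missing")  # no-op thanks to pop(..., None)
collector.clear()           # wipes every execution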
src/uipath/_cli/_evals/mocks/llm_mocker.py (18 changes: 16 additions & 2 deletions)
@@ -6,6 +6,9 @@

from pydantic import BaseModel

from uipath.tracing._traced import traced
from uipath.tracing._utils import _SpanUtils

from .._models._evaluation_set import (
EvaluationItem,
LLMMockingStrategy,
@@ -51,7 +54,7 @@
3. Always include the entire output regardless of token length.
4. Consider the context of the current test run and the agent being tested. If the agent is acting on a property, make sure the output includes that property.

Respond ONLY with valid JSON that would be a realistic and completetool response. Do not include any explanations or markdown.
Respond ONLY with valid JSON that would be a realistic and complete tool response. Do not include any explanations or markdown.
"""

logger = logging.getLogger(__name__)
@@ -79,6 +82,7 @@ def __init__(self, evaluation_item: EvaluationItem):
self.evaluation_item = evaluation_item
assert isinstance(self.evaluation_item.mocking_strategy, LLMMockingStrategy)

@traced(name="__mocker__")
async def response(
self, func: Callable[[T], R], params: dict[str, Any], *args: T, **kwargs
) -> R:
@@ -92,6 +96,8 @@ async def response(
from uipath import UiPath
from uipath._services.llm_gateway_service import _cleanup_schema

from .mocks import evaluation_context, span_collector_context

llm = UiPath().llm
return_type: Any = func.__annotations__.get("return", None)
if return_type is None:
@@ -116,9 +122,17 @@ class OutputSchema(BaseModel):
example_calls = [
call for call in example_calls if isinstance(call, ExampleCall)
]

test_run_history = "(empty)"
eval_item = evaluation_context.get()
span_collector = span_collector_context.get()
if eval_item and span_collector:
spans = span_collector.get_spans(eval_item.id)
test_run_history = _SpanUtils.spans_to_llm_context(spans)

prompt_input: dict[str, Any] = {
"toolRunExamples": example_calls,
"testRunHistory": [], # This should contain ordered spans.
"testRunHistory": test_run_history,
"toolInfo": {
"name": function_name,
"description": params.get("description"),
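Two details in this file work together: @traced(name="__mocker__") wraps the mocker's own LLM round-trip in a distinctly named span, and the ContextVar lookups fetch the spans collected so far for the current eval item. A condensed sketch of that lookup (build_history is a hypothetical helper, not part of this PR; it assumes the context variables added in mocks.py below):

from uipath._cli._evals.mocks.mocks import evaluation_context, span_collector_context
from uipath.tracing._utils import _SpanUtils


def build_history() -> str:
    """Return the LLM-ready trace of the current eval run, or "(empty)"."""
    eval_item = evaluation_context.get()
    span_collector = span_collector_context.get()
    if eval_item is None or span_collector is None:
        return "(empty)"  # not inside an eval run, or collection is disabled
    spans = span_collector.get_spans(eval_item.id)
    return _SpanUtils.spans_to_llm_context(spans)

Because both values live in ContextVars, concurrent eval items running in separate asyncio tasks each resolve their own item and collector. The __mocker__ span name matters later: spans_to_llm_context uses it to filter the mocker's internal LLM calls out of the history handed back to the mock.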
src/uipath/_cli/_evals/mocks/mocks.py (31 changes: 25 additions & 6 deletions)
@@ -5,30 +5,49 @@
from typing import Any, Callable, Optional

from uipath._cli._evals._models._evaluation_set import EvaluationItem
from uipath._cli._evals._span_collection import ExecutionSpanCollector
from uipath._cli._evals.mocks.mocker import Mocker, UiPathNoMockFoundError
from uipath._cli._evals.mocks.mocker_factory import MockerFactory

# Context variables for evaluation items and mockers
evaluation_context: ContextVar[Optional[EvaluationItem]] = ContextVar(
"evaluation", default=None
)

mocker_context: ContextVar[Optional[Mocker]] = ContextVar("mocker", default=None)

# Span collector for trace access during mocking
span_collector_context: ContextVar[Optional[ExecutionSpanCollector]] = ContextVar(
"span_collector", default=None
)

logger = logging.getLogger(__name__)


def set_evaluation_item(item: EvaluationItem) -> None:
"""Set an evaluation item within an evaluation set."""
evaluation_context.set(item)
def set_execution_context(
eval_item: EvaluationItem, span_collector: ExecutionSpanCollector
) -> None:
"""Set the execution context for an evaluation run for mocking and trace access."""
evaluation_context.set(eval_item)

try:
if item.mocking_strategy:
mocker_context.set(MockerFactory.create(item))
if eval_item.mocking_strategy:
mocker_context.set(MockerFactory.create(eval_item))
else:
mocker_context.set(None)
except Exception:
logger.warning(f"Failed to create mocker for evaluation {item.name}")
logger.warning(f"Failed to create mocker for evaluation {eval_item.name}")
mocker_context.set(None)

span_collector_context.set(span_collector)


def clear_execution_context() -> None:
"""Clear the execution context after evaluation completes."""
evaluation_context.set(None)
mocker_context.set(None)
span_collector_context.set(None)


async def get_mocked_response(
func: Callable[[Any], Any], params: dict[str, Any], *args, **kwargs
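A minimal sketch (pure contextvars, no UiPath imports) of why set_execution_context and clear_execution_context come as a pair: the runtime calls the clear in a finally block, so a failing eval cannot leak its item into the next run on the same context.

from contextvars import ContextVar
from typing import Optional

evaluation: ContextVar[Optional[str]] = ContextVar("evaluation", default=None)


def run_eval(item: str) -> None:
    evaluation.set(item)
    try:
        assert evaluation.get() == item  # mockers resolve the item from here
    finally:
        evaluation.set(None)  # mirrors clear_execution_context()


run_eval("eval-1")
assert evaluation.get() is None  # nothing leaks into the next sequential run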
src/uipath/tracing/_utils.py (52 changes: 52 additions & 0 deletions)
@@ -319,3 +319,55 @@ def format_args_for_trace(
f"Error formatting arguments for trace: {e}. Using args and kwargs directly."
)
return {"args": args, "kwargs": kwargs}

@staticmethod
def _has_ancestor_with_name(
span: ReadableSpan, ancestor_name: str, span_map: Dict[int, ReadableSpan]
) -> bool:
"""Check if this span or any of its ancestors has a given name."""
if span.name == ancestor_name:
return True

current = span
while current.parent is not None:
parent_span = span_map.get(current.parent.span_id)
if parent_span is None:
break
if parent_span.name == ancestor_name:
return True
current = parent_span

return False

@staticmethod
def spans_to_llm_context(spans: list[ReadableSpan]) -> str:
"""Convert spans to a formatted conversation history string suitable for LLM context.

Includes function calls (including LLM calls) with their inputs and outputs.
"""
# Build span_id -> span map for parent chain traversal
span_map = {span.get_span_context().span_id: span for span in spans}

history = []
for span in spans:
attributes = dict(span.attributes) if span.attributes else {}

input_value = attributes.get("input.value")
output_value = attributes.get("output.value")

if not input_value or not output_value:
continue

# Skip spans that are internal LLM calls (e.g., for tool mocking in evals)
if _SpanUtils._has_ancestor_with_name(span, "__mocker__", span_map):
continue

history.append(f"Function: {span.name}")
history.append(f"Input: {input_value}")
history.append(f"Output: {output_value}")
history.append("")

if not history:
return "(empty)"

return "\n".join(history)
tests/cli/eval/mocks/test_mocks.py (21 changes: 12 additions & 9 deletions)
@@ -1,4 +1,5 @@
from typing import Any
from unittest.mock import MagicMock

import pytest
from _pytest.monkeypatch import MonkeyPatch
@@ -10,9 +11,11 @@
MockitoMockingStrategy,
)
from uipath._cli._evals.mocks.mocker import UiPathMockResponseGenerationError
from uipath._cli._evals.mocks.mocks import set_evaluation_item
from uipath._cli._evals.mocks.mocks import set_execution_context
from uipath.eval.mocks import mockable

_mock_span_collector = MagicMock()


def test_mockito_mockable_sync():
# Arrange
@@ -51,7 +54,7 @@ def foofoo(*args, **kwargs):
assert isinstance(evaluation.mocking_strategy, MockitoMockingStrategy)

# Act & Assert
set_evaluation_item(evaluation)
set_execution_context(evaluation, _mock_span_collector)
assert foo() == "bar1"
assert foo() == "bar2"
assert foo() == "bar2"
@@ -63,13 +66,13 @@ def foofoo(*args, **kwargs):
assert foofoo()

evaluation.mocking_strategy.behaviors[0].arguments.kwargs = {"x": 1}
set_evaluation_item(evaluation)
set_execution_context(evaluation, _mock_span_collector)
assert foo(x=1) == "bar1"

evaluation.mocking_strategy.behaviors[0].arguments.kwargs = {
"x": {"_target_": "mockito.any"}
}
set_evaluation_item(evaluation)
set_execution_context(evaluation, _mock_span_collector)
assert foo(x=2) == "bar1"


@@ -111,7 +114,7 @@ async def foofoo(*args, **kwargs):
assert isinstance(evaluation.mocking_strategy, MockitoMockingStrategy)

# Act & Assert
set_evaluation_item(evaluation)
set_execution_context(evaluation, _mock_span_collector)
assert await foo() == "bar1"
assert await foo() == "bar2"
assert await foo() == "bar2"
@@ -123,13 +126,13 @@ async def foofoo(*args, **kwargs):
assert await foofoo()

evaluation.mocking_strategy.behaviors[0].arguments.kwargs = {"x": 1}
set_evaluation_item(evaluation)
set_execution_context(evaluation, _mock_span_collector)
assert await foo(x=1) == "bar1"

evaluation.mocking_strategy.behaviors[0].arguments.kwargs = {
"x": {"_target_": "mockito.any"}
}
set_evaluation_item(evaluation)
set_execution_context(evaluation, _mock_span_collector)
assert await foo(x=2) == "bar1"


@@ -201,7 +204,7 @@ def foofoo(*args, **kwargs):
},
)
# Act & Assert
set_evaluation_item(evaluation)
set_execution_context(evaluation, _mock_span_collector)

assert foo() == "bar1"
with pytest.raises(NotImplementedError):
@@ -274,7 +277,7 @@ async def foofoo(*args, **kwargs):
},
)
# Act & Assert
set_evaluation_item(evaluation)
set_execution_context(evaluation, _mock_span_collector)

assert await foo() == "bar1"
with pytest.raises(NotImplementedError):
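The shared module-level _mock_span_collector keeps these tests terse, but the ContextVars it populates survive across test functions. A possible tightening (not part of this PR) would be a fixture that pairs every set with the new clear:

import pytest
from unittest.mock import MagicMock

from uipath._cli._evals.mocks.mocks import (
    clear_execution_context,
    set_execution_context,
)


@pytest.fixture
def execution_context():
    """Yield a setter; always reset the ContextVars afterwards."""

    def _set(evaluation):
        set_execution_context(evaluation, MagicMock())

    yield _set
    clear_execution_context()

A test would then take execution_context as an argument and call execution_context(evaluation) instead of set_execution_context(evaluation, _mock_span_collector), guaranteeing cleanup even when an assertion fails mid-test.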