From 26118708a2c1f7a9e9d236ee8b93fad598eb6bac Mon Sep 17 00:00:00 2001 From: ahibrahim Date: Tue, 7 Oct 2025 17:59:29 +0300 Subject: [PATCH 01/12] push base --- .../tool_output_utilization/asset.yaml | 4 + .../evaluator/_tool_output_utilization.py | 227 ++++++++++++++++++ .../evaluator/tool_output_utilization.prompty | 221 +++++++++++++++++ .../builtin/tool_output_utilization/spec.yaml | 49 ++++ 4 files changed, 501 insertions(+) create mode 100644 assets/evaluators/builtin/tool_output_utilization/asset.yaml create mode 100644 assets/evaluators/builtin/tool_output_utilization/evaluator/_tool_output_utilization.py create mode 100644 assets/evaluators/builtin/tool_output_utilization/evaluator/tool_output_utilization.prompty create mode 100644 assets/evaluators/builtin/tool_output_utilization/spec.yaml diff --git a/assets/evaluators/builtin/tool_output_utilization/asset.yaml b/assets/evaluators/builtin/tool_output_utilization/asset.yaml new file mode 100644 index 0000000000..845a698b96 --- /dev/null +++ b/assets/evaluators/builtin/tool_output_utilization/asset.yaml @@ -0,0 +1,4 @@ +type: evaluator +spec: spec.yaml +categories: +- Evaluator \ No newline at end of file diff --git a/assets/evaluators/builtin/tool_output_utilization/evaluator/_tool_output_utilization.py b/assets/evaluators/builtin/tool_output_utilization/evaluator/_tool_output_utilization.py new file mode 100644 index 0000000000..ed0ad9297e --- /dev/null +++ b/assets/evaluators/builtin/tool_output_utilization/evaluator/_tool_output_utilization.py @@ -0,0 +1,227 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +import os +import math +import logging +from typing import Dict, Union, List, Optional + +from typing_extensions import overload, override + +from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget +from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase +from azure.ai.evaluation._common.utils import ( + reformat_conversation_history, + reformat_agent_response, + reformat_tool_definitions, + filter_to_used_tools, +) +from azure.ai.evaluation._common._experimental import experimental + +logger = logging.getLogger(__name__) + + +@experimental +class ToolOutputUtilizationEvaluator(PromptyEvaluatorBase[Union[str, float]]): + """The Tool Output Utilization evaluator assesses how well an AI-generated response utilizes the provided tool definitions based on: + + - Alignment with instructions and definitions + - Accuracy and clarity of the response + - Proper use of provided tool definitions + + Scoring is based on five levels: + 1. Fully Inadherent - Response completely ignores instructions. + 2. Barely Adherent - Partial alignment with critical gaps. + 3. Moderately Adherent - Meets core requirements but lacks precision. + 4. Mostly Adherent - Clear and accurate with minor issues. + 5. Fully Adherent - Flawless adherence to instructions. + + The evaluation includes a step-by-step reasoning process, a brief explanation, and a final integer score. + + + :param model_config: Configuration for the Azure OpenAI model. + :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration, + ~azure.ai.evaluation.OpenAIModelConfiguration] + + .. admonition:: Example: + .. 
literalinclude:: ../samples/evaluation_samples_evaluate.py + :start-after: [START tool_output_utilization_evaluator] + :end-before: [END tool_output_utilization_evaluator] + :language: python + :dedent: 8 + :caption: Initialize and call an TaskAdherenceEvaluator with a query and response. + + .. admonition:: Example using Azure AI Project URL: + + .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py + :start-after: [START tool_output_utilization_evaluator] + :end-before: [END tool_output_utilization_evaluator] + :language: python + :dedent: 8 + :caption: Initialize and call TaskAdherenceEvaluator using Azure AI Project URL in the following format + https://{resource_name}.services.ai.azure.com/api/projects/{project_name} + + """ + + _PROMPTY_FILE = "tool_output_utilization.prompty" + _RESULT_KEY = "tool_output_utilization" + _OPTIONAL_PARAMS = ["tool_definitions"] + + _DEFAULT_TOOL_OUTPUT_UTILIZATION_SCORE = 3 + + id = "azureai://built-in/evaluators/tool_output_utilization" + """Evaluator identifier, experimental and to be used only with evaluation in cloud.""" + + @override + def __init__( + self, + model_config, + *, + threshold=_DEFAULT_TOOL_OUTPUT_UTILIZATION_SCORE, + credential=None, + **kwargs, + ): + current_dir = os.path.dirname(__file__) + prompty_path = os.path.join(current_dir, self._PROMPTY_FILE) + self.threshold = threshold + super().__init__( + model_config=model_config, + prompty_file=prompty_path, + result_key=self._RESULT_KEY, + credential=credential, + **kwargs, + ) + + @overload + def __call__( + self, + *, + query: Union[str, List[dict]], + response: Union[str, List[dict]], + tool_definitions: Optional[Union[dict, List[dict]]] = None, + ) -> Dict[str, Union[str, float]]: + """Evaluate task adherence for a given query, response, and optional tool defintions. + The query and response can be either a string or a list of messages. + + + Example with string inputs and no tools: + evaluator = TaskAdherenceEvaluator(model_config) + query = "What is the weather today?" + response = "The weather is sunny." + + result = evaluator(query=query, response=response) + + Example with list of messages: + evaluator = TaskAdherenceEvaluator(model_config) + query = [{'role': 'system', 'content': 'You are a friendly and helpful customer service agent.'}, {'createdAt': 1700000060, 'role': 'user', 'content': [{'type': 'text', 'text': 'Hi, I need help with the last 2 orders on my account #888. Could you please update me on their status?'}]}] + response = [{'createdAt': 1700000070, 'run_id': '0', 'role': 'assistant', 'content': [{'type': 'text', 'text': 'Hello! Let me quickly look up your account details.'}]}, {'createdAt': 1700000075, 'run_id': '0', 'role': 'assistant', 'content': [{'type': 'tool_call', 'tool_call': {'id': 'tool_call_20250310_001', 'type': 'function', 'function': {'name': 'get_orders', 'arguments': {'account_number': '888'}}}}]}, {'createdAt': 1700000080, 'run_id': '0', 'tool_call_id': 'tool_call_20250310_001', 'role': 'tool', 'content': [{'type': 'tool_result', 'tool_result': '[{ "order_id": "123" }, { "order_id": "124" }]'}]}, {'createdAt': 1700000085, 'run_id': '0', 'role': 'assistant', 'content': [{'type': 'text', 'text': 'Thanks for your patience. I see two orders on your account. 
Let me fetch the details for both.'}]}, {'createdAt': 1700000090, 'run_id': '0', 'role': 'assistant', 'content': [{'type': 'tool_call', 'tool_call': {'id': 'tool_call_20250310_002', 'type': 'function', 'function': {'name': 'get_order', 'arguments': {'order_id': '123'}}}}, {'type': 'tool_call', 'tool_call': {'id': 'tool_call_20250310_003', 'type': 'function', 'function': {'name': 'get_order', 'arguments': {'order_id': '124'}}}}]}, {'createdAt': 1700000095, 'run_id': '0', 'tool_call_id': 'tool_call_20250310_002', 'role': 'tool', 'content': [{'type': 'tool_result', 'tool_result': '{ "order": { "id": "123", "status": "shipped", "delivery_date": "2025-03-15" } }'}]}, {'createdAt': 1700000100, 'run_id': '0', 'tool_call_id': 'tool_call_20250310_003', 'role': 'tool', 'content': [{'type': 'tool_result', 'tool_result': '{ "order": { "id": "124", "status": "delayed", "expected_delivery": "2025-03-20" } }'}]}, {'createdAt': 1700000105, 'run_id': '0', 'role': 'assistant', 'content': [{'type': 'text', 'text': 'The order with ID 123 has been shipped and is expected to be delivered on March 15, 2025. However, the order with ID 124 is delayed and should now arrive by March 20, 2025. Is there anything else I can help you with?'}]}] + tool_definitions = [{'name': 'get_orders', 'description': 'Get the list of orders for a given account number.', 'parameters': {'type': 'object', 'properties': {'account_number': {'type': 'string', 'description': 'The account number to get the orders for.'}}}}, {'name': 'get_order', 'description': 'Get the details of a specific order.', 'parameters': {'type': 'object', 'properties': {'order_id': {'type': 'string', 'description': 'The order ID to get the details for.'}}}}, {'name': 'initiate_return', 'description': 'Initiate the return process for an order.', 'parameters': {'type': 'object', 'properties': {'order_id': {'type': 'string', 'description': 'The order ID for the return process.'}}}}, {'name': 'update_shipping_address', 'description': 'Update the shipping address for a given account.', 'parameters': {'type': 'object', 'properties': {'account_number': {'type': 'string', 'description': 'The account number to update.'}, 'new_address': {'type': 'string', 'description': 'The new shipping address.'}}}}] + + result = evaluator(query=query, response=response, tool_definitions=tool_definitions) + + :keyword query: The query being evaluated, either a string or a list of messages. + :paramtype query: Union[str, List[dict]] + :keyword response: The response being evaluated, either a string or a list of messages (full agent response potentially including tool calls) + :paramtype response: Union[str, List[dict]] + :keyword tool_definitions: An optional list of messages containing the tool definitions the agent is aware of. + :paramtype tool_definitions: Optional[Union[dict, List[dict]]] + :return: A dictionary with the task adherence evaluation results. + :rtype: Dict[str, Union[str, float]] + """ + + @override + def __call__( # pylint: disable=docstring-missing-param + self, + *args, + **kwargs, + ): + """ + Invokes the instance using the overloaded __call__ signature. + + For detailed parameter types and return value documentation, see the overloaded __call__ definition. + """ + return super().__call__(*args, **kwargs) + + @override + async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # type: ignore[override] + """Do Task Adherence evaluation. + :param eval_input: The input to the evaluator. 
Expected to contain whatever inputs are needed for the _flow method + :type eval_input: Dict + :return: The evaluation result. + :rtype: Dict + """ + # we override the _do_eval method as we want the output to be a dictionary, + # which is a different schema than _base_prompty_eval.py + if "query" not in eval_input and "response" not in eval_input: + raise EvaluationException( + message=f"Both query and response must be provided as input to the Task Adherence evaluator.", + internal_message=f"Both query and response must be provided as input to the Task Adherence evaluator.", + blame=ErrorBlame.USER_ERROR, + category=ErrorCategory.MISSING_FIELD, + target=ErrorTarget.TOOL_OUTPUT_UTILIZATION_EVALUATOR, + ) + if ( + "tool_definitions" in eval_input + and eval_input["tool_definitions"] is not None + ): + tool_definitions = eval_input["tool_definitions"] + filtered_tool_definitions = filter_to_used_tools( + tool_definitions=tool_definitions, + msgs_lists=[eval_input["query"], eval_input["response"]], + logger=logger + ) + eval_input["tool_definitions"] = reformat_tool_definitions( + filtered_tool_definitions, logger + ) + + eval_input["query"] = reformat_conversation_history( + eval_input["query"], logger, include_system_messages=False, include_tool_messages=True + ) + eval_input["response"] = reformat_agent_response( + eval_input["response"], logger, include_tool_messages=True + ) + + # print('-------------') + # for k, v in eval_input.items(): + # print(f'{k}:\n{v}') + # print('---') + # print('-------------') + + llm_output = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input) + if isinstance(llm_output, dict): + output_label = llm_output.get("label", None) + if output_label is None: + if logger: + logger.warning( + "LLM output does not contain 'label' key, returning NaN for the score." + ) + output_label = "fail" + + output_label = output_label.lower() + if output_label not in ["pass", "fail"]: + if logger: + logger.warning( + f"LLM output label is not 'pass' or 'fail' (got '{output_label}'), returning NaN for the score." + ) + + + # `faulty_details`, `reason`, `label` + score = 1.0 if output_label == "pass" else 0.0 + score_result = output_label + reason = llm_output.get("reason", "") + faulty_details = llm_output.get("faulty_details", []) + if faulty_details: + reason += " Issues found: " + "; ".join(faulty_details) + + return { + f"{self._result_key}": score, + f"{self._result_key}_result": score_result, + f"{self._result_key}_reason": reason, + # Uncomment the following line in the next iteration after UI contracts are validated. + # f"{self._result_key}_additional_details": llm_output + } + if logger: + logger.warning( + "LLM output is not a dictionary, returning NaN for the score." + ) + return {self._result_key: math.nan} diff --git a/assets/evaluators/builtin/tool_output_utilization/evaluator/tool_output_utilization.prompty b/assets/evaluators/builtin/tool_output_utilization/evaluator/tool_output_utilization.prompty new file mode 100644 index 0000000000..5fd406e48d --- /dev/null +++ b/assets/evaluators/builtin/tool_output_utilization/evaluator/tool_output_utilization.prompty @@ -0,0 +1,221 @@ +--- +name: Tool Output Utilization Evaluator +description: Binary evaluator that judges whether an agent correctly understands and *uses* the outputs returned by tools it invoked (APIs, search/retrieval, DB queries, etc.). 
This evaluator focuses ONLY on incorrect, missing, or fabricated uses of tool outputs — whether they are used in the final response to the user or reused as inputs to subsequent tool calls. It does NOT judge tool selection, correctness of new inputs, or general reasoning quality. +model: + api: chat + parameters: + temperature: 0.0 + max_tokens: 1500 + top_p: 1.0 + presence_penalty: 0 + frequency_penalty: 0 + response_format: + type: json_object + +inputs: + query: + type: string + response: + type: string + tool_definitions: + type: string +--- +system: +You are **Tool Output Utilization Judge**, an expert evaluator whose only task is to decide whether the AGENT correctly interpreted and *used* TOOL OUTPUTS whenproducing the RESPONSE. + +Key constraints: + +- **Focus exclusively** on uses of tool outputs. A "use" means any appearance or + incorporation of a prior tool output (from `query`) within the agent's `response` + — either as part of the textual content to the user or as a parameter inside a new tool call. +- Do **not** judge whether the agent chose the right tool, made the right new call, + or used the correct input format. Those are evaluated separately. +- Treat `query` as the authoritative source of all prior conversation. +- Treat `response` as the agent's latest message, which may: + 1. State facts that come from tool outputs. + 2. Contain tool calls that reference or reuse prior tool outputs. +- Use `tool_definitions` for contextual understanding of tool structures (fields, types, units, etc.). +- Conservative rule: if any tool-derived information appears incorrectly used in RESPONSE, omitted when relevant, or fabricated, mark it as a fault. + +INPUT +===== + +CONVERSATION_HISTORY: {{query}} +AGENT_RESPONSE: {{response}} +TOOL_DEFINITIONS: {{tool_definitions}} + +> `CONVERSATION_HISTORY` includes all prior turns and any tool results. +> `AGENT_RESPONSE` is the model's latest message. +> `TOOL_DEFINITIONS` describe the tool schemas used. + +user: +ROLE +==== + +You are Tool Output Utilization Judge. Evaluate whether the RESPONSE correctly: + +- Reflects the factual content of prior tool outputs from `query`, and +- Reuses any of those tool outputs correctly when incorporating them into new tool calls or the textual response. + +TASK +==== + +Produce exactly one JSON object (and nothing else) with these keys in **this exact order**: + +1. `faulty_details`: array of strings — list only the faults found (empty array if none). + Each entry can follow one of these formats: + - "claim -> MISMATCH (expected X, saw Y) mapped to tool_name.field_path" + - "claim -> FABRICATED (no supporting tool field)" + - "use -> FABRICATED (referenced value not found in prior tool outputs)" + - "use -> MISMATCH (expected X, used Y) mapped to tool_name.field_path" + + +2. `reason`: short 1–2 sentence summary of why PASS or FAIL. +3. `label`: string `"pass"` or `"fail"`. + +> Output must be valid JSON, all lowercase keys, no extra text or markdown. + +EVALUATION STEPS +================ + +1. Identify all **instances** in the RESPONSE where tool outputs are *used*: + - Either referenced in text (factual claims to the user), or + - Reused as parameters in new tool calls. +2. For each instance: + - Cross-check against the corresponding tool outputs in `query`. + - If the usage faithfully matches the tool output (exact or paraphrased) → OK. + - If the agent uses wrong values, wrong entities, incorrect transformations, or fabricates data → record as fault. +3. 
Populate the JSON object: + - `faulty_details`: all detected issues (empty if none). + - `reason`: concise rationale. + - `label`: `"pass"` or `"fail"`. + +SCORING RULES +============= + +- **PASS:** No faulty uses of tool outputs found (empty `faulty_details`) in the RESPONSE. +- **FAIL:** Any misuse, fabrication, omission, or misinterpretation of a tool output, + including when a prior tool output is reused incorrectly in a new tool call in the RESPONSE. + +IMPLEMENTATION NOTES +==================== + +- Do NOT evaluate: + - The correctness of *which tool* was used. + - Whether new tool inputs are valid by themselves. + - Task success or completeness. +- Your judgment concerns *only* whether previously returned tool outputs are + correctly understood and reused where they appear. +- If multiple faulty uses exist, list all in `faulty_details`. +- When uncertain whether a value use is correct, treat it as a fault and explain why. +- If tool outputs are missing but the response claims to use them, that counts as a fabricated use. +- If a tool fails, that is outside your scope; unless the response misuses or misreports the failed output. + +> [TOOL CALLS] and [TOOL RESULTS] are internal, user does not see them. + +EXAMPLES (few-shot — using the new JSON schema and key order) + +### Example 1 - PASS +QUERY: +User turn 1: + Can you transfer $500 from my checking to my savings account? + +Agent turn 1: + [TOOL_CALL] get_account_balances(user_id="USER456") + [TOOL_RESULT] {'accounts': [{'account_id': 'CHK001', 'type': 'checking', 'balance': 1250.75}, {'account_id': 'SAV001', 'type': 'savings', 'balance': 3400.20}]} + You have $1,250.75 in checking and $3,400.20 in savings. You have enough for the transfer. + +User turn 2: + Great, please go ahead. + +RESPONSE: +[TOOL_CALL] transfer_funds(from_account="CHK001", to_account="SAV001", amount=500) +[TOOL_RESULT] {'transaction_id':'TXN789123','status':'completed','from_account':'CHK001','to_account':'SAV001','amount':500,'new_balances':{'CHK001':750.75,'SAV001':3900.20}} +Transfer completed successfully. Checking now has $750.75 and savings $3,900.20. + +EXPECTED JSON: +{ + "faulty_details": [], + "reason": "All tool-derived claims and uses in the response match the prior tool outputs correctly.", + "label": "pass", +} + + +### Example 2 - FAIL (unit misinterpretation) +QUERY: +User turn 1: + What's the current temperature in Rome? + +RESPONSE: +[TOOL_CALL] weather_api(city="Rome") +[TOOL_RESULT] {"city":"Rome","temp":28,"condition":"Sunny"} +It’s currently 28°F and sunny in Rome. + +TOOL_DEFINITIONS: +- weather_api: Gets current weather for a city and returns temperature in Celsius (inputs: city) (outputs: city, temp, condition) + +EXPECTED JSON: +{ + "faulty_details": [ + "claim -> MISMATCH (expected 28°C, saw 28°F) mapped to weather_api.temp" + ], + "reason": "Agent incorrectly reported the temperature in Fahrenheit instead of Celsius as provided by the tool output.", + "label": "fail", +} + + +### Example 3 - FAIL (fabricated inventory claim) +QUERY: +User turn 1: + Can you check if item B123 is available? + +RESPONSE: +[TOOL_CALL] inventory_api(item_id="B123") +[TOOL_RESULT] {"item_id":"B123","qty":0,"eta":"2025-10-07"} +The item is in stock till the 7th of October. 
+ +EXPECTED JSON: +{ + "faulty_details": [ + "claim -> FABRICATED (no supporting tool field; inventory_api.qty is 0, eta is future date)" + ], + "reason": "Agent fabricated a claim that the item is in stock, whereas the tool output indicates a quantity of 0 and misunderstood availability eta", + "label": "fail", +} + + +### Example 4 - FAIL (misuse of prior tool output in new tool call) +QUERY: +User turn 1: + Show my latest account transactions. + +Agent turn 1: + [TOOL_CALL] get_account_list(user_id="U123") + [TOOL_RESULT] {"accounts":[{"id":"CHK100","type":"checking"},{"id":"SAV200","type":"savings"}]} + You have two accounts: CHK100 and SAV200. Which one do you want transactions for? + +User turn 2: + Please get the transaction history for my checking account. + +RESPONSE: +[TOOL_CALL] get_transactions(account_id="SAV200") +[TOOL_RESULT] ... +The latest transactions for your checking account are: ... + +EXPECTED JSON: +{ + "faulty_details": [ + "use -> MISMATCH (expected CHK100, used SAV200) mapped to get_transactions.account_id" + ], + "reason": "Agent incorrectly used the savings account ID instead of the requested checking account ID, leading to a mismatch in the transactions reported.", + "label": "fail", +} + +-- +END OF EXAMPLES + +FINAL NOTES: + +- Output must be exactly one JSON object and must follow the key order: `faulty_details`, `reason`, `label`. + +# Output diff --git a/assets/evaluators/builtin/tool_output_utilization/spec.yaml b/assets/evaluators/builtin/tool_output_utilization/spec.yaml new file mode 100644 index 0000000000..0f905bfea6 --- /dev/null +++ b/assets/evaluators/builtin/tool_output_utilization/spec.yaml @@ -0,0 +1,49 @@ +type: "evaluator" +name: "builtin.tool_output_utilization" +version: 1 +displayName: "Tool-Output-Utilization-Evaluator" +description: "" +evaluatorType: "builtin" +evaluatorSubType: "code" +categories: ["agents"] +tags: + provider: "Microsoft" +initParameterSchema: + type: "object" + properties: + deployment_name: + type: "string" + threshold: + type: "number" + credential: + type: "object" + required: ["deployment_name"] +dataMappingSchema: + type: "object" + properties: + query: + anyOf: + - type: "string" + - type: "array" + items: + type: "object" + response: + anyOf: + - type: "string" + - type: "array" + items: + type: "object" + tool_definitions: + anyOf: + - type: "object" + - type: "array" + items: + type: "object" + required: ["query", "response", "tool_definitions"] +outputSchema: + intent_resolution: + type: "ordinal" + desirable_direction: "increase" + min_value: 0 + max_value: 1 +path: ./evaluator \ No newline at end of file From 592b00e7d0e0d1098701807b5512d2de2a0b38b7 Mon Sep 17 00:00:00 2001 From: ahibrahim Date: Wed, 8 Oct 2025 14:17:40 +0300 Subject: [PATCH 02/12] update spec --- assets/evaluators/builtin/tool_output_utilization/spec.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/assets/evaluators/builtin/tool_output_utilization/spec.yaml b/assets/evaluators/builtin/tool_output_utilization/spec.yaml index 0f905bfea6..ee1b119342 100644 --- a/assets/evaluators/builtin/tool_output_utilization/spec.yaml +++ b/assets/evaluators/builtin/tool_output_utilization/spec.yaml @@ -2,7 +2,7 @@ type: "evaluator" name: "builtin.tool_output_utilization" version: 1 displayName: "Tool-Output-Utilization-Evaluator" -description: "" +description: "| | |\n| -- | -- |\n| Score range | Binary [0-1]: 1 indicates correct tool output usage, 0 indicates faulty usage. |\n| What is this metric? 
| Tool Output Utilization evaluates whether an agent correctly understands and uses the outputs returned by tools it invoked (APIs, search/retrieval, DB queries, etc.). |\n| How does it work? | It focuses exclusively on detecting incorrect, missing, or fabricated uses of tool outputs in the agent's response, checking if tool-derived information is accurately reflected or reused. |\n| When to use it? | Use this metric for agent-based systems that rely on tools, ensuring agents properly interpret and utilize tool results without fabrication or misrepresentation. |\n" evaluatorType: "builtin" evaluatorSubType: "code" categories: ["agents"] @@ -41,7 +41,7 @@ dataMappingSchema: type: "object" required: ["query", "response", "tool_definitions"] outputSchema: - intent_resolution: + tool_output_utilization: type: "ordinal" desirable_direction: "increase" min_value: 0 From cef3f273fe75c6aa23d349afb5ef6d3c4f0258e8 Mon Sep 17 00:00:00 2001 From: ahibrahim Date: Wed, 8 Oct 2025 17:24:29 +0300 Subject: [PATCH 03/12] updates --- .../evaluator/_tool_output_utilization.py | 106 +++++++++--------- 1 file changed, 50 insertions(+), 56 deletions(-) diff --git a/assets/evaluators/builtin/tool_output_utilization/evaluator/_tool_output_utilization.py b/assets/evaluators/builtin/tool_output_utilization/evaluator/_tool_output_utilization.py index ed0ad9297e..104772b45f 100644 --- a/assets/evaluators/builtin/tool_output_utilization/evaluator/_tool_output_utilization.py +++ b/assets/evaluators/builtin/tool_output_utilization/evaluator/_tool_output_utilization.py @@ -1,6 +1,6 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT License. - +# --------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# --------------------------------------------------------- import os import math import logging @@ -8,14 +8,20 @@ from typing_extensions import overload, override -from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget +from azure.ai.evaluation._exceptions import ( + EvaluationException, + ErrorBlame, + ErrorCategory, + ErrorTarget, +) from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase -from azure.ai.evaluation._common.utils import ( +from ..._common.utils import ( reformat_conversation_history, reformat_agent_response, reformat_tool_definitions, filter_to_used_tools, ) +from azure.ai.evaluation._model_configurations import Message from azure.ai.evaluation._common._experimental import experimental logger = logging.getLogger(__name__) @@ -23,20 +29,13 @@ @experimental class ToolOutputUtilizationEvaluator(PromptyEvaluatorBase[Union[str, float]]): - """The Tool Output Utilization evaluator assesses how well an AI-generated response utilizes the provided tool definitions based on: + """The Tool Output Utilization Evaluator assesses how effectively an AI agent utilizes the outputs from tools and whether it accurately incorporates this information into its responses. - - Alignment with instructions and definitions - - Accuracy and clarity of the response - - Proper use of provided tool definitions + Scoring is based on two levels: + 1. Pass - The agent effectively utilizes tool outputs and accurately incorporates the information into its response. + 2. Fail - The agent fails to properly utilize tool outputs or incorrectly incorporates the information into its response. - Scoring is based on five levels: - 1. Fully Inadherent - Response completely ignores instructions. - 2. 
Barely Adherent - Partial alignment with critical gaps. - 3. Moderately Adherent - Meets core requirements but lacks precision. - 4. Mostly Adherent - Clear and accurate with minor issues. - 5. Fully Adherent - Flawless adherence to instructions. - - The evaluation includes a step-by-step reasoning process, a brief explanation, and a final integer score. + The evaluation includes the score, a brief explanation, and a final pass/fail result. :param model_config: Configuration for the Azure OpenAI model. @@ -49,7 +48,7 @@ class ToolOutputUtilizationEvaluator(PromptyEvaluatorBase[Union[str, float]]): :end-before: [END tool_output_utilization_evaluator] :language: python :dedent: 8 - :caption: Initialize and call an TaskAdherenceEvaluator with a query and response. + :caption: Initialize and call a ToolOutputUtilizationEvaluator with a query and response. .. admonition:: Example using Azure AI Project URL: @@ -58,7 +57,7 @@ class ToolOutputUtilizationEvaluator(PromptyEvaluatorBase[Union[str, float]]): :end-before: [END tool_output_utilization_evaluator] :language: python :dedent: 8 - :caption: Initialize and call TaskAdherenceEvaluator using Azure AI Project URL in the following format + :caption: Initialize and call ToolOutputUtilizationEvaluator using Azure AI Project URL in the following format https://{resource_name}.services.ai.azure.com/api/projects/{project_name} """ @@ -98,21 +97,21 @@ def __call__( *, query: Union[str, List[dict]], response: Union[str, List[dict]], - tool_definitions: Optional[Union[dict, List[dict]]] = None, + tool_definitions: Union[dict, List[dict]], ) -> Dict[str, Union[str, float]]: - """Evaluate task adherence for a given query, response, and optional tool defintions. + """Evaluate tool output utilization for a given query, response, and optional tool defintions. The query and response can be either a string or a list of messages. Example with string inputs and no tools: - evaluator = TaskAdherenceEvaluator(model_config) + evaluator = ToolOutputUtilizationEvaluator(model_config) query = "What is the weather today?" response = "The weather is sunny." result = evaluator(query=query, response=response) Example with list of messages: - evaluator = TaskAdherenceEvaluator(model_config) + evaluator = ToolOutputUtilizationEvaluator(model_config) query = [{'role': 'system', 'content': 'You are a friendly and helpful customer service agent.'}, {'createdAt': 1700000060, 'role': 'user', 'content': [{'type': 'text', 'text': 'Hi, I need help with the last 2 orders on my account #888. Could you please update me on their status?'}]}] response = [{'createdAt': 1700000070, 'run_id': '0', 'role': 'assistant', 'content': [{'type': 'text', 'text': 'Hello! Let me quickly look up your account details.'}]}, {'createdAt': 1700000075, 'run_id': '0', 'role': 'assistant', 'content': [{'type': 'tool_call', 'tool_call': {'id': 'tool_call_20250310_001', 'type': 'function', 'function': {'name': 'get_orders', 'arguments': {'account_number': '888'}}}}]}, {'createdAt': 1700000080, 'run_id': '0', 'tool_call_id': 'tool_call_20250310_001', 'role': 'tool', 'content': [{'type': 'tool_result', 'tool_result': '[{ "order_id": "123" }, { "order_id": "124" }]'}]}, {'createdAt': 1700000085, 'run_id': '0', 'role': 'assistant', 'content': [{'type': 'text', 'text': 'Thanks for your patience. I see two orders on your account. 
Let me fetch the details for both.'}]}, {'createdAt': 1700000090, 'run_id': '0', 'role': 'assistant', 'content': [{'type': 'tool_call', 'tool_call': {'id': 'tool_call_20250310_002', 'type': 'function', 'function': {'name': 'get_order', 'arguments': {'order_id': '123'}}}}, {'type': 'tool_call', 'tool_call': {'id': 'tool_call_20250310_003', 'type': 'function', 'function': {'name': 'get_order', 'arguments': {'order_id': '124'}}}}]}, {'createdAt': 1700000095, 'run_id': '0', 'tool_call_id': 'tool_call_20250310_002', 'role': 'tool', 'content': [{'type': 'tool_result', 'tool_result': '{ "order": { "id": "123", "status": "shipped", "delivery_date": "2025-03-15" } }'}]}, {'createdAt': 1700000100, 'run_id': '0', 'tool_call_id': 'tool_call_20250310_003', 'role': 'tool', 'content': [{'type': 'tool_result', 'tool_result': '{ "order": { "id": "124", "status": "delayed", "expected_delivery": "2025-03-20" } }'}]}, {'createdAt': 1700000105, 'run_id': '0', 'role': 'assistant', 'content': [{'type': 'text', 'text': 'The order with ID 123 has been shipped and is expected to be delivered on March 15, 2025. However, the order with ID 124 is delayed and should now arrive by March 20, 2025. Is there anything else I can help you with?'}]}] tool_definitions = [{'name': 'get_orders', 'description': 'Get the list of orders for a given account number.', 'parameters': {'type': 'object', 'properties': {'account_number': {'type': 'string', 'description': 'The account number to get the orders for.'}}}}, {'name': 'get_order', 'description': 'Get the details of a specific order.', 'parameters': {'type': 'object', 'properties': {'order_id': {'type': 'string', 'description': 'The order ID to get the details for.'}}}}, {'name': 'initiate_return', 'description': 'Initiate the return process for an order.', 'parameters': {'type': 'object', 'properties': {'order_id': {'type': 'string', 'description': 'The order ID for the return process.'}}}}, {'name': 'update_shipping_address', 'description': 'Update the shipping address for a given account.', 'parameters': {'type': 'object', 'properties': {'account_number': {'type': 'string', 'description': 'The account number to update.'}, 'new_address': {'type': 'string', 'description': 'The new shipping address.'}}}}] @@ -124,8 +123,8 @@ def __call__( :keyword response: The response being evaluated, either a string or a list of messages (full agent response potentially including tool calls) :paramtype response: Union[str, List[dict]] :keyword tool_definitions: An optional list of messages containing the tool definitions the agent is aware of. - :paramtype tool_definitions: Optional[Union[dict, List[dict]]] - :return: A dictionary with the task adherence evaluation results. + :paramtype tool_definitions: Union[dict, List[dict]] + :return: A dictionary with the tool output utilization evaluation results. :rtype: Dict[str, Union[str, float]] """ @@ -144,7 +143,7 @@ def __call__( # pylint: disable=docstring-missing-param @override async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # type: ignore[override] - """Do Task Adherence evaluation. + """Do Tool Output Utilization evaluation. :param eval_input: The input to the evaluator. Expected to contain whatever inputs are needed for the _flow method :type eval_input: Dict :return: The evaluation result. 
@@ -152,40 +151,38 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t """ # we override the _do_eval method as we want the output to be a dictionary, # which is a different schema than _base_prompty_eval.py - if "query" not in eval_input and "response" not in eval_input: + if ( + ("query" not in eval_input) + and ("response" not in eval_input) + and ("tool_definitions" not in eval_input) + ): raise EvaluationException( - message=f"Both query and response must be provided as input to the Task Adherence evaluator.", - internal_message=f"Both query and response must be provided as input to the Task Adherence evaluator.", + message="Query, response, and tool_definitions are required inputs to the Tool Output Utilization evaluator.", + internal_message="Query, response, and tool_definitions are required inputs to the Tool Output Utilization evaluator.", blame=ErrorBlame.USER_ERROR, category=ErrorCategory.MISSING_FIELD, target=ErrorTarget.TOOL_OUTPUT_UTILIZATION_EVALUATOR, ) - if ( - "tool_definitions" in eval_input - and eval_input["tool_definitions"] is not None - ): - tool_definitions = eval_input["tool_definitions"] - filtered_tool_definitions = filter_to_used_tools( - tool_definitions=tool_definitions, - msgs_lists=[eval_input["query"], eval_input["response"]], - logger=logger - ) - eval_input["tool_definitions"] = reformat_tool_definitions( - filtered_tool_definitions, logger - ) - + + tool_definitions = eval_input["tool_definitions"] + filtered_tool_definitions = filter_to_used_tools( + tool_definitions=tool_definitions, + msgs_lists=[eval_input["query"], eval_input["response"]], + logger=logger, + ) + eval_input["tool_definitions"] = reformat_tool_definitions( + filtered_tool_definitions, logger + ) + eval_input["query"] = reformat_conversation_history( - eval_input["query"], logger, include_system_messages=False, include_tool_messages=True + eval_input["query"], + logger, + include_system_messages=True, + include_tool_messages=True, ) eval_input["response"] = reformat_agent_response( eval_input["response"], logger, include_tool_messages=True ) - - # print('-------------') - # for k, v in eval_input.items(): - # print(f'{k}:\n{v}') - # print('---') - # print('-------------') llm_output = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input) if isinstance(llm_output, dict): @@ -196,19 +193,18 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t "LLM output does not contain 'label' key, returning NaN for the score." ) output_label = "fail" - + output_label = output_label.lower() if output_label not in ["pass", "fail"]: if logger: logger.warning( f"LLM output label is not 'pass' or 'fail' (got '{output_label}'), returning NaN for the score." ) - - - # `faulty_details`, `reason`, `label` + score = 1.0 if output_label == "pass" else 0.0 score_result = output_label reason = llm_output.get("reason", "") + faulty_details = llm_output.get("faulty_details", []) if faulty_details: reason += " Issues found: " + "; ".join(faulty_details) @@ -217,8 +213,6 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t f"{self._result_key}": score, f"{self._result_key}_result": score_result, f"{self._result_key}_reason": reason, - # Uncomment the following line in the next iteration after UI contracts are validated. 
- # f"{self._result_key}_additional_details": llm_output } if logger: logger.warning( From 718eb641690b4d6d30b28f13c56f8ffb1342181d Mon Sep 17 00:00:00 2001 From: ahibrahim Date: Wed, 8 Oct 2025 23:46:53 +0300 Subject: [PATCH 04/12] update with utils --- .../evaluator/_tool_output_utilization.py | 223 +++++++++++++++++- 1 file changed, 217 insertions(+), 6 deletions(-) diff --git a/assets/evaluators/builtin/tool_output_utilization/evaluator/_tool_output_utilization.py b/assets/evaluators/builtin/tool_output_utilization/evaluator/_tool_output_utilization.py index 104772b45f..cac4afbace 100644 --- a/assets/evaluators/builtin/tool_output_utilization/evaluator/_tool_output_utilization.py +++ b/assets/evaluators/builtin/tool_output_utilization/evaluator/_tool_output_utilization.py @@ -13,20 +13,231 @@ ErrorBlame, ErrorCategory, ErrorTarget, + ErrorMessage ) from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase -from ..._common.utils import ( - reformat_conversation_history, - reformat_agent_response, - reformat_tool_definitions, - filter_to_used_tools, +from azure.ai.evaluation._common.utils import ( + _extract_text_from_content ) -from azure.ai.evaluation._model_configurations import Message from azure.ai.evaluation._common._experimental import experimental logger = logging.getLogger(__name__) +# ``` updated _exceptions.py +# Extend ErrorTarget enum if needed +if not hasattr(ErrorTarget, 'TOOL_OUTPUT_UTILIZATION_EVALUATOR'): + ErrorTarget.TOOL_OUTPUT_UTILIZATION_EVALUATOR = 'ToolOutputUtilizationEvaluator' +# ``` + +# ``` updated utils.py +def filter_to_used_tools(tool_definitions, msgs_lists, logger=None): + """Filters the tool definitions to only include those that were actually used in the messages lists.""" + try: + used_tool_names = set() + any_tools_used = False + for msgs in msgs_lists: + for msg in msgs: + if msg.get("role") == "assistant" and "content" in msg: + for content in msg.get("content", []): + if content.get("type") == "tool_call": + any_tools_used = True + if "tool_call" in content and "function" in content["tool_call"]: + used_tool_names.add(content["tool_call"]["function"]) + elif "name" in content: + used_tool_names.add(content["name"]) + + filtered_tools = [tool for tool in tool_definitions if tool.get("name") in used_tool_names] + if any_tools_used and not filtered_tools: + if logger: + logger.warning("No tool definitions matched the tools used in the messages. Returning original list.") + filtered_tools = tool_definitions + + return filtered_tools + except Exception as e: + if logger: + logger.warning(f"Failed to filter tool definitions, returning original list. 
Error: {e}") + return tool_definitions + + +def _get_conversation_history(query, include_system_messages=False, include_tool_messages=False): + all_user_queries, all_agent_responses = [], [] + cur_user_query, cur_agent_response = [], [] + system_message = None + + for msg in query: + role = msg.get("role") + if not role: + continue + if include_system_messages and role == "system": + system_message = msg.get("content", "") + + elif role == "user" and "content" in msg: + if cur_agent_response: + formatted_agent_response = _get_agent_response( + cur_agent_response, include_tool_messages=include_tool_messages + ) + all_agent_responses.append([formatted_agent_response]) + cur_agent_response = [] + text_in_msg = _extract_text_from_content(msg["content"]) + if text_in_msg: + cur_user_query.append(text_in_msg) + + elif role in ("assistant", "tool"): + if cur_user_query: + all_user_queries.append(cur_user_query) + cur_user_query = [] + cur_agent_response.append(msg) + + if cur_user_query: + all_user_queries.append(cur_user_query) + if cur_agent_response: + formatted_agent_response = _get_agent_response(cur_agent_response, include_tool_messages=include_tool_messages) + all_agent_responses.append([formatted_agent_response]) + + if len(all_user_queries) != len(all_agent_responses) + 1: + raise EvaluationException( + message=ErrorMessage.MALFORMED_CONVERSATION_HISTORY, + internal_message=ErrorMessage.MALFORMED_CONVERSATION_HISTORY, + target=ErrorTarget.CONVERSATION_HISTORY_PARSING, + category=ErrorCategory.INVALID_VALUE, + blame=ErrorBlame.USER_ERROR, + ) + + result = {"user_queries": all_user_queries, "agent_responses": all_agent_responses} + if include_system_messages and system_message: + result["system_message"] = system_message + return result + + +def _pretty_format_conversation_history(conversation_history): + """Formats the conversation history for better readability.""" + formatted_history = "" + if conversation_history.get("system_message"): + formatted_history += "SYSTEM_PROMPT:\n" + formatted_history += " " + conversation_history["system_message"] + "\n\n" + for i, (user_query, agent_response) in enumerate( + zip(conversation_history["user_queries"], conversation_history["agent_responses"] + [None]) + ): + formatted_history += f"User turn {i+1}:\n" + for msg in user_query: + formatted_history += " " + "\n ".join(msg) + formatted_history += "\n\n" + if agent_response: + formatted_history += f"Agent turn {i+1}:\n" + for msg in agent_response: + formatted_history += " " + "\n ".join(msg) + formatted_history += "\n\n" + return formatted_history + + +def reformat_conversation_history(query, logger=None, include_system_messages=False, include_tool_messages=False): + """Reformats the conversation history to a more compact representation.""" + try: + conversation_history = _get_conversation_history( + query, + include_system_messages=include_system_messages, + include_tool_messages=include_tool_messages, + ) + return _pretty_format_conversation_history(conversation_history) + except Exception as e: + # If the conversation history cannot be parsed for whatever reason (e.g. the converter format changed), the original query is returned + # This is a fallback to ensure that the evaluation can still proceed. However the accuracy of the evaluation will be affected. 
+ # From our tests the negative impact on IntentResolution is: + # Higher intra model variance (0.142 vs 0.046) + # Higher inter model variance (0.345 vs 0.607) + # Lower percentage of mode in Likert scale (73.4% vs 75.4%) + # Lower pairwise agreement between LLMs (85% vs 90% at the pass/fail level with threshold of 3) + if logger: + logger.warning(f"Conversation history could not be parsed, falling back to original query: {query}") + print(e) + return query + + +def _get_agent_response(agent_response_msgs, include_tool_messages=False): + """Extracts formatted agent response including text, and optionally tool calls/results.""" + agent_response_text = [] + tool_results = {} + + # First pass: collect tool results + if include_tool_messages: + for msg in agent_response_msgs: + if msg.get("role") == "tool" and "tool_call_id" in msg: + for content in msg.get("content", []): + if content.get("type") == "tool_result": + result = content.get("tool_result") + tool_results[msg["tool_call_id"]] = f"[TOOL_RESULT] {result}" + + # Second pass: parse assistant messages and tool calls + for msg in agent_response_msgs: + if "role" in msg and msg.get("role") == "assistant" and "content" in msg: + text = _extract_text_from_content(msg["content"]) + if text: + agent_response_text.extend(text) + if include_tool_messages: + for content in msg.get("content", []): + # Todo: Verify if this is the correct way to handle tool calls + if content.get("type") == "tool_call": + if "tool_call" in content and "function" in content.get("tool_call", {}): + tc = content.get("tool_call", {}) + func_name = tc.get("function", {}).get("name", "") + args = tc.get("function", {}).get("arguments", {}) + tool_call_id = tc.get("id") + else: + tool_call_id = content.get("tool_call_id") + func_name = content.get("name", "") + args = content.get("arguments", {}) + args_str = ", ".join(f'{k}="{v}"' for k, v in args.items()) + call_line = f"[TOOL_CALL] {func_name}({args_str})" + agent_response_text.append(call_line) + if tool_call_id in tool_results: + agent_response_text.append(tool_results[tool_call_id]) + + return agent_response_text + + +def reformat_agent_response(response, logger=None, include_tool_messages=False): + try: + if response is None or response == []: + return "" + agent_response = _get_agent_response(response, include_tool_messages=include_tool_messages) + if agent_response == []: + # If no message could be extracted, likely the format changed, fallback to the original response in that case + if logger: + logger.warning( + f"Empty agent response extracted, likely due to input schema change. Falling back to using the original response: {response}" + ) + return response + return "\n".join(agent_response) + except: + # If the agent response cannot be parsed for whatever reason (e.g. the converter format changed), the original response is returned + # This is a fallback to ensure that the evaluation can still proceed. See comments on reformat_conversation_history for more details. 
+ if logger: + logger.warning(f"Agent response could not be parsed, falling back to original response: {response}") + return response + + +def reformat_tool_definitions(tool_definitions, logger=None): + try: + output_lines = ["TOOL_DEFINITIONS:"] + for tool in tool_definitions: + name = tool.get("name", "unnamed_tool") + desc = tool.get("description", "").strip() + params = tool.get("parameters", {}).get("properties", {}) + param_names = ", ".join(params.keys()) if params else "no parameters" + output_lines.append(f"- {name}: {desc} (inputs: {param_names})") + return "\n".join(output_lines) + except Exception as e: + # If the tool definitions cannot be parsed for whatever reason, the original tool definitions are returned + # This is a fallback to ensure that the evaluation can still proceed. See comments on reformat_conversation_history for more details. + if logger: + logger.warning( + f"Tool definitions could not be parsed, falling back to original definitions: {tool_definitions}" + ) + return tool_definitions +### ```` + + @experimental class ToolOutputUtilizationEvaluator(PromptyEvaluatorBase[Union[str, float]]): """The Tool Output Utilization Evaluator assesses how effectively an AI agent utilizes the outputs from tools and whether it accurately incorporates this information into its responses. From 867ca8aec9a39f2b438d6a0dc6beb0bfc16fac07 Mon Sep 17 00:00:00 2001 From: ahibrahim Date: Thu, 9 Oct 2025 12:57:10 +0300 Subject: [PATCH 05/12] prefix with an underscore --- .../evaluator/_tool_output_utilization.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/assets/evaluators/builtin/tool_output_utilization/evaluator/_tool_output_utilization.py b/assets/evaluators/builtin/tool_output_utilization/evaluator/_tool_output_utilization.py index cac4afbace..79f0352eb8 100644 --- a/assets/evaluators/builtin/tool_output_utilization/evaluator/_tool_output_utilization.py +++ b/assets/evaluators/builtin/tool_output_utilization/evaluator/_tool_output_utilization.py @@ -31,7 +31,7 @@ # ``` # ``` updated utils.py -def filter_to_used_tools(tool_definitions, msgs_lists, logger=None): +def _filter_to_used_tools(tool_definitions, msgs_lists, logger=None): """Filters the tool definitions to only include those that were actually used in the messages lists.""" try: used_tool_names = set() @@ -376,7 +376,7 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t ) tool_definitions = eval_input["tool_definitions"] - filtered_tool_definitions = filter_to_used_tools( + filtered_tool_definitions = _filter_to_used_tools( tool_definitions=tool_definitions, msgs_lists=[eval_input["query"], eval_input["response"]], logger=logger, From 36efbe926562d7cf30bac66cbe8a974a5e678d38 Mon Sep 17 00:00:00 2001 From: ahibrahim Date: Mon, 13 Oct 2025 12:56:26 +0300 Subject: [PATCH 06/12] fix code style and health --- .../evaluator/_tool_output_utilization.py | 57 +++++++++++++++---- 1 file changed, 47 insertions(+), 10 deletions(-) diff --git a/assets/evaluators/builtin/tool_output_utilization/evaluator/_tool_output_utilization.py b/assets/evaluators/builtin/tool_output_utilization/evaluator/_tool_output_utilization.py index 79f0352eb8..03ec38d65f 100644 --- a/assets/evaluators/builtin/tool_output_utilization/evaluator/_tool_output_utilization.py +++ b/assets/evaluators/builtin/tool_output_utilization/evaluator/_tool_output_utilization.py @@ -4,7 +4,7 @@ import os import math import logging -from typing import Dict, Union, List, Optional +from typing import Dict, Union, List from 
typing_extensions import overload, override @@ -30,9 +30,10 @@ ErrorTarget.TOOL_OUTPUT_UTILIZATION_EVALUATOR = 'ToolOutputUtilizationEvaluator' # ``` + # ``` updated utils.py def _filter_to_used_tools(tool_definitions, msgs_lists, logger=None): - """Filters the tool definitions to only include those that were actually used in the messages lists.""" + """Filter the tool definitions to only include those that were actually used in the messages lists.""" try: used_tool_names = set() any_tools_used = False @@ -61,6 +62,18 @@ def _filter_to_used_tools(tool_definitions, msgs_lists, logger=None): def _get_conversation_history(query, include_system_messages=False, include_tool_messages=False): + """Parse conversation history from a list of messages into structured format. + + :param query: List of message dictionaries containing the conversation history + :type query: List[dict] + :param include_system_messages: Whether to include system messages in the output + :type include_system_messages: bool + :param include_tool_messages: Whether to include tool-related messages in agent responses + :type include_tool_messages: bool + :return: Dict containing parsed user_queries, agent_responses, and optionally system_message + :rtype: Dict[str, Union[List[List[str]], str]] + :raises EvaluationException: If conversation history is malformed (mismatched user/agent turns + """ all_user_queries, all_agent_responses = [], [] cur_user_query, cur_agent_response = [], [] system_message = None @@ -111,7 +124,7 @@ def _get_conversation_history(query, include_system_messages=False, include_tool def _pretty_format_conversation_history(conversation_history): - """Formats the conversation history for better readability.""" + """Format the conversation history for better readability.""" formatted_history = "" if conversation_history.get("system_message"): formatted_history += "SYSTEM_PROMPT:\n" @@ -155,7 +168,7 @@ def reformat_conversation_history(query, logger=None, include_system_messages=Fa def _get_agent_response(agent_response_msgs, include_tool_messages=False): - """Extracts formatted agent response including text, and optionally tool calls/results.""" + """Extract formatted agent response including text, and optionally tool calls/results.""" agent_response_text = [] tool_results = {} @@ -197,6 +210,17 @@ def _get_agent_response(agent_response_msgs, include_tool_messages=False): def reformat_agent_response(response, logger=None, include_tool_messages=False): + """Reformat agent response to a standardized string format. + + :param response: The agent response to reformat, can be None, empty list, or list of messages + :type response: Union[None, List[dict], str] + :param logger: Optional logger for warning messages + :type logger: Optional[logging.Logger] + :param include_tool_messages: Whether to include tool call and result information + :type include_tool_messages: bool + :return: Formatted agent response as a string, or original response if parsing fails + :rtype: str + """ try: if response is None or response == []: return "" @@ -209,15 +233,24 @@ def reformat_agent_response(response, logger=None, include_tool_messages=False): ) return response return "\n".join(agent_response) - except: + except Exception as e: # If the agent response cannot be parsed for whatever reason (e.g. the converter format changed), the original response is returned # This is a fallback to ensure that the evaluation can still proceed. See comments on reformat_conversation_history for more details. 
if logger: - logger.warning(f"Agent response could not be parsed, falling back to original response: {response}") + logger.warning(f"Agent response could not be parsed, falling back to original response. Error: {e}") return response def reformat_tool_definitions(tool_definitions, logger=None): + """Reformat tool definitions into a human-readable string format. + + :param tool_definitions: List of tool definition dictionaries containing name, description, and parameters + :type tool_definitions: List[dict] + :param logger: Optional logger for warning messages + :type logger: Optional[logging.Logger] + :return: Formatted tool definitions as a string, or original definitions if parsing fails + :rtype: str + """ try: output_lines = ["TOOL_DEFINITIONS:"] for tool in tool_definitions: @@ -232,10 +265,10 @@ def reformat_tool_definitions(tool_definitions, logger=None): # This is a fallback to ensure that the evaluation can still proceed. See comments on reformat_conversation_history for more details. if logger: logger.warning( - f"Tool definitions could not be parsed, falling back to original definitions: {tool_definitions}" + f"Tool definitions could not be parsed, falling back to original definitions: {tool_definitions}. Error: {e}" ) return tool_definitions -### ```` +# ``` @experimental @@ -291,6 +324,7 @@ def __init__( credential=None, **kwargs, ): + """Initialize the Tool Output Utilization Evaluator.""" current_dir = os.path.dirname(__file__) prompty_path = os.path.join(current_dir, self._PROMPTY_FILE) self.threshold = threshold @@ -311,9 +345,10 @@ def __call__( tool_definitions: Union[dict, List[dict]], ) -> Dict[str, Union[str, float]]: """Evaluate tool output utilization for a given query, response, and optional tool defintions. + The query and response can be either a string or a list of messages. - + Example with string inputs and no tools: evaluator = ToolOutputUtilizationEvaluator(model_config) query = "What is the weather today?" @@ -339,6 +374,7 @@ def __call__( :rtype: Dict[str, Union[str, float]] """ + @override def __call__( # pylint: disable=docstring-missing-param self, @@ -346,7 +382,7 @@ def __call__( # pylint: disable=docstring-missing-param **kwargs, ): """ - Invokes the instance using the overloaded __call__ signature. + Invoke the instance using the overloaded __call__ signature. For detailed parameter types and return value documentation, see the overloaded __call__ definition. """ @@ -355,6 +391,7 @@ def __call__( # pylint: disable=docstring-missing-param @override async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # type: ignore[override] """Do Tool Output Utilization evaluation. + :param eval_input: The input to the evaluator. Expected to contain whatever inputs are needed for the _flow method :type eval_input: Dict :return: The evaluation result. 
From aed519b86aef926d1280d3abee1a624d37a09f02 Mon Sep 17 00:00:00 2001 From: ahibrahim Date: Mon, 13 Oct 2025 22:06:19 +0300 Subject: [PATCH 07/12] update errortarget usage --- .../evaluator/_tool_output_utilization.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/assets/evaluators/builtin/tool_output_utilization/evaluator/_tool_output_utilization.py b/assets/evaluators/builtin/tool_output_utilization/evaluator/_tool_output_utilization.py index 03ec38d65f..97c1600bda 100644 --- a/assets/evaluators/builtin/tool_output_utilization/evaluator/_tool_output_utilization.py +++ b/assets/evaluators/builtin/tool_output_utilization/evaluator/_tool_output_utilization.py @@ -4,6 +4,7 @@ import os import math import logging +from enum import Enum from typing import Dict, Union, List from typing_extensions import overload, override @@ -26,8 +27,15 @@ # ``` updated _exceptions.py # Extend ErrorTarget enum if needed -if not hasattr(ErrorTarget, 'TOOL_OUTPUT_UTILIZATION_EVALUATOR'): - ErrorTarget.TOOL_OUTPUT_UTILIZATION_EVALUATOR = 'ToolOutputUtilizationEvaluator' +def _create_extended_error_target(ErrorTarget): + """Create an extended ErrorTarget enum that includes TOOL_INPUT_ACCURACY_EVALUATOR.""" + existing_members = {member.name: member.value for member in ErrorTarget} + existing_members['TOOL_INPUT_ACCURACY_EVALUATOR'] = 'ToolInputAccuracyEvaluator' + + ExtendedErrorTarget = Enum('ExtendedErrorTarget', existing_members) + return ExtendedErrorTarget + +ErrorTarget = _create_extended_error_target(ErrorTarget) # ``` From 373c7c985d22f357b78a472ef7179cb1dd583202 Mon Sep 17 00:00:00 2001 From: ahibrahim Date: Tue, 14 Oct 2025 00:31:59 +0300 Subject: [PATCH 08/12] update errortarget --- .../evaluator/_tool_output_utilization.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/assets/evaluators/builtin/tool_output_utilization/evaluator/_tool_output_utilization.py b/assets/evaluators/builtin/tool_output_utilization/evaluator/_tool_output_utilization.py index 97c1600bda..0b053e8b91 100644 --- a/assets/evaluators/builtin/tool_output_utilization/evaluator/_tool_output_utilization.py +++ b/assets/evaluators/builtin/tool_output_utilization/evaluator/_tool_output_utilization.py @@ -28,12 +28,12 @@ # ``` updated _exceptions.py # Extend ErrorTarget enum if needed def _create_extended_error_target(ErrorTarget): - """Create an extended ErrorTarget enum that includes TOOL_INPUT_ACCURACY_EVALUATOR.""" + """Create an extended ErrorTarget enum that includes TOOL_OUTPUT_UTILIZATION_EVALUATOR.""" existing_members = {member.name: member.value for member in ErrorTarget} - existing_members['TOOL_INPUT_ACCURACY_EVALUATOR'] = 'ToolInputAccuracyEvaluator' - - ExtendedErrorTarget = Enum('ExtendedErrorTarget', existing_members) - return ExtendedErrorTarget + existing_members['TOOL_OUTPUT_UTILIZATION_EVALUATOR'] = 'ToolOutputUtilizationEvaluator' + + ErrorTarget = Enum('ExtendedErrorTarget', existing_members) + return ErrorTarget ErrorTarget = _create_extended_error_target(ErrorTarget) # ``` From b6293e31b2b31bfe50900942e6283a8a0f6e3d89 Mon Sep 17 00:00:00 2001 From: ahibrahim Date: Tue, 14 Oct 2025 01:01:02 +0300 Subject: [PATCH 09/12] nits --- .../evaluator/_tool_output_utilization.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/assets/evaluators/builtin/tool_output_utilization/evaluator/_tool_output_utilization.py b/assets/evaluators/builtin/tool_output_utilization/evaluator/_tool_output_utilization.py index 0b053e8b91..20063bde0d 100644 
--- a/assets/evaluators/builtin/tool_output_utilization/evaluator/_tool_output_utilization.py +++ b/assets/evaluators/builtin/tool_output_utilization/evaluator/_tool_output_utilization.py @@ -35,6 +35,7 @@ def _create_extended_error_target(ErrorTarget): ErrorTarget = Enum('ExtendedErrorTarget', existing_members) return ErrorTarget + ErrorTarget = _create_extended_error_target(ErrorTarget) # ``` @@ -71,7 +72,7 @@ def _filter_to_used_tools(tool_definitions, msgs_lists, logger=None): def _get_conversation_history(query, include_system_messages=False, include_tool_messages=False): """Parse conversation history from a list of messages into structured format. - + :param query: List of message dictionaries containing the conversation history :type query: List[dict] :param include_system_messages: Whether to include system messages in the output @@ -219,7 +220,7 @@ def _get_agent_response(agent_response_msgs, include_tool_messages=False): def reformat_agent_response(response, logger=None, include_tool_messages=False): """Reformat agent response to a standardized string format. - + :param response: The agent response to reformat, can be None, empty list, or list of messages :type response: Union[None, List[dict], str] :param logger: Optional logger for warning messages @@ -251,7 +252,7 @@ def reformat_agent_response(response, logger=None, include_tool_messages=False): def reformat_tool_definitions(tool_definitions, logger=None): """Reformat tool definitions into a human-readable string format. - + :param tool_definitions: List of tool definition dictionaries containing name, description, and parameters :type tool_definitions: List[dict] :param logger: Optional logger for warning messages @@ -353,10 +354,8 @@ def __call__( tool_definitions: Union[dict, List[dict]], ) -> Dict[str, Union[str, float]]: """Evaluate tool output utilization for a given query, response, and optional tool defintions. - - The query and response can be either a string or a list of messages. - + The query and response can be either a string or a list of messages. Example with string inputs and no tools: evaluator = ToolOutputUtilizationEvaluator(model_config) query = "What is the weather today?" @@ -382,7 +381,6 @@ def __call__( :rtype: Dict[str, Union[str, float]] """ - @override def __call__( # pylint: disable=docstring-missing-param self, From 64f2c71525e3d88d7c1439e79a4e2d4dcf9381c4 Mon Sep 17 00:00:00 2001 From: ahibrahim Date: Tue, 14 Oct 2025 01:01:08 +0300 Subject: [PATCH 10/12] nits --- .../evaluator/_tool_output_utilization.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/assets/evaluators/builtin/tool_output_utilization/evaluator/_tool_output_utilization.py b/assets/evaluators/builtin/tool_output_utilization/evaluator/_tool_output_utilization.py index 20063bde0d..b5f7b288ca 100644 --- a/assets/evaluators/builtin/tool_output_utilization/evaluator/_tool_output_utilization.py +++ b/assets/evaluators/builtin/tool_output_utilization/evaluator/_tool_output_utilization.py @@ -252,7 +252,7 @@ def reformat_agent_response(response, logger=None, include_tool_messages=False): def reformat_tool_definitions(tool_definitions, logger=None): """Reformat tool definitions into a human-readable string format. 
- + :param tool_definitions: List of tool definition dictionaries containing name, description, and parameters :type tool_definitions: List[dict] :param logger: Optional logger for warning messages From 7cdc4513894228bc0b085d8eb4e2a5992da2c71a Mon Sep 17 00:00:00 2001 From: ahibrahim Date: Tue, 14 Oct 2025 16:21:19 +0300 Subject: [PATCH 11/12] update example to flake8 friendly --- .../evaluator/_tool_output_utilization.py | 45 ++++++++++++++++--- 1 file changed, 38 insertions(+), 7 deletions(-) diff --git a/assets/evaluators/builtin/tool_output_utilization/evaluator/_tool_output_utilization.py b/assets/evaluators/builtin/tool_output_utilization/evaluator/_tool_output_utilization.py index b5f7b288ca..b2cf31f243 100644 --- a/assets/evaluators/builtin/tool_output_utilization/evaluator/_tool_output_utilization.py +++ b/assets/evaluators/builtin/tool_output_utilization/evaluator/_tool_output_utilization.py @@ -1,6 +1,6 @@ -# --------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# --------------------------------------------------------- +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + import os import math import logging @@ -282,7 +282,9 @@ def reformat_tool_definitions(tool_definitions, logger=None): @experimental class ToolOutputUtilizationEvaluator(PromptyEvaluatorBase[Union[str, float]]): - """The Tool Output Utilization Evaluator assesses how effectively an AI agent utilizes the outputs from tools and whether it accurately incorporates this information into its responses. + """Evaluate how effectively an AI agent uses tool outputs. + + This evaluator checks whether the agent correctly incorporates information from tools into its responses. Scoring is based on two levels: 1. Pass - The agent effectively utilizes tool outputs and accurately incorporates the information into its response. @@ -365,9 +367,38 @@ def __call__( Example with list of messages: evaluator = ToolOutputUtilizationEvaluator(model_config) - query = [{'role': 'system', 'content': 'You are a friendly and helpful customer service agent.'}, {'createdAt': 1700000060, 'role': 'user', 'content': [{'type': 'text', 'text': 'Hi, I need help with the last 2 orders on my account #888. Could you please update me on their status?'}]}] - response = [{'createdAt': 1700000070, 'run_id': '0', 'role': 'assistant', 'content': [{'type': 'text', 'text': 'Hello! Let me quickly look up your account details.'}]}, {'createdAt': 1700000075, 'run_id': '0', 'role': 'assistant', 'content': [{'type': 'tool_call', 'tool_call': {'id': 'tool_call_20250310_001', 'type': 'function', 'function': {'name': 'get_orders', 'arguments': {'account_number': '888'}}}}]}, {'createdAt': 1700000080, 'run_id': '0', 'tool_call_id': 'tool_call_20250310_001', 'role': 'tool', 'content': [{'type': 'tool_result', 'tool_result': '[{ "order_id": "123" }, { "order_id": "124" }]'}]}, {'createdAt': 1700000085, 'run_id': '0', 'role': 'assistant', 'content': [{'type': 'text', 'text': 'Thanks for your patience. I see two orders on your account. 
Let me fetch the details for both.'}]}, {'createdAt': 1700000090, 'run_id': '0', 'role': 'assistant', 'content': [{'type': 'tool_call', 'tool_call': {'id': 'tool_call_20250310_002', 'type': 'function', 'function': {'name': 'get_order', 'arguments': {'order_id': '123'}}}}, {'type': 'tool_call', 'tool_call': {'id': 'tool_call_20250310_003', 'type': 'function', 'function': {'name': 'get_order', 'arguments': {'order_id': '124'}}}}]}, {'createdAt': 1700000095, 'run_id': '0', 'tool_call_id': 'tool_call_20250310_002', 'role': 'tool', 'content': [{'type': 'tool_result', 'tool_result': '{ "order": { "id": "123", "status": "shipped", "delivery_date": "2025-03-15" } }'}]}, {'createdAt': 1700000100, 'run_id': '0', 'tool_call_id': 'tool_call_20250310_003', 'role': 'tool', 'content': [{'type': 'tool_result', 'tool_result': '{ "order": { "id": "124", "status": "delayed", "expected_delivery": "2025-03-20" } }'}]}, {'createdAt': 1700000105, 'run_id': '0', 'role': 'assistant', 'content': [{'type': 'text', 'text': 'The order with ID 123 has been shipped and is expected to be delivered on March 15, 2025. However, the order with ID 124 is delayed and should now arrive by March 20, 2025. Is there anything else I can help you with?'}]}] - tool_definitions = [{'name': 'get_orders', 'description': 'Get the list of orders for a given account number.', 'parameters': {'type': 'object', 'properties': {'account_number': {'type': 'string', 'description': 'The account number to get the orders for.'}}}}, {'name': 'get_order', 'description': 'Get the details of a specific order.', 'parameters': {'type': 'object', 'properties': {'order_id': {'type': 'string', 'description': 'The order ID to get the details for.'}}}}, {'name': 'initiate_return', 'description': 'Initiate the return process for an order.', 'parameters': {'type': 'object', 'properties': {'order_id': {'type': 'string', 'description': 'The order ID for the return process.'}}}}, {'name': 'update_shipping_address', 'description': 'Update the shipping address for a given account.', 'parameters': {'type': 'object', 'properties': {'account_number': {'type': 'string', 'description': 'The account number to update.'}, 'new_address': {'type': 'string', 'description': 'The new shipping address.'}}}}] + query = [ + {"role": "system","content": "You are a helpful customer service assistant."}, + {"role": "user","content": [{"type": "text","text": "Hi, can you check the status of my last order?"}]}, + ] + + response = [ + { + "role": "assistant", + "content": [{"type": "text", "text": "Sure! 
Let me look that up for you."}], + }, + { + "role": "assistant", + "content": [{"type": "tool_call","tool_call": {"id": "tool_1","type": "function","function": {"name": "get_order_status","arguments": {"order_id": "123"}}}}], + }, + { + "role": "tool", + "tool_call_id": "tool_1", + "content": [{"type": "tool_result","tool_result": '{"order_id": "123", "status": "shipped"}'}], + }, + { + "role": "assistant", + "content": [{"type": "text","text": "Your order 123 has been shipped and is on its way!"}], + }, + ] + + tool_definitions = [ + { + "name": "get_order_status", + "description": "Retrieve the status of an order by its ID.", + "parameters": {"type": "object","properties": {"order_id": {"type": "string","description": "The order ID to check."}}}, + } + ] result = evaluator(query=query, response=response, tool_definitions=tool_definitions) From 054caf35b65a005d03254df466aaf2392d698741 Mon Sep 17 00:00:00 2001 From: ahibrahim Date: Tue, 14 Oct 2025 16:54:33 +0300 Subject: [PATCH 12/12] formats --- .../evaluator/_tool_output_utilization.py | 151 ++++++++++++------ 1 file changed, 101 insertions(+), 50 deletions(-) diff --git a/assets/evaluators/builtin/tool_output_utilization/evaluator/_tool_output_utilization.py b/assets/evaluators/builtin/tool_output_utilization/evaluator/_tool_output_utilization.py index b2cf31f243..fd0ad71aa9 100644 --- a/assets/evaluators/builtin/tool_output_utilization/evaluator/_tool_output_utilization.py +++ b/assets/evaluators/builtin/tool_output_utilization/evaluator/_tool_output_utilization.py @@ -14,12 +14,10 @@ ErrorBlame, ErrorCategory, ErrorTarget, - ErrorMessage + ErrorMessage, ) from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase -from azure.ai.evaluation._common.utils import ( - _extract_text_from_content -) +from azure.ai.evaluation._common.utils import _extract_text_from_content from azure.ai.evaluation._common._experimental import experimental logger = logging.getLogger(__name__) @@ -30,9 +28,9 @@ def _create_extended_error_target(ErrorTarget): """Create an extended ErrorTarget enum that includes TOOL_OUTPUT_UTILIZATION_EVALUATOR.""" existing_members = {member.name: member.value for member in ErrorTarget} - existing_members['TOOL_OUTPUT_UTILIZATION_EVALUATOR'] = 'ToolOutputUtilizationEvaluator' + existing_members["TOOL_OUTPUT_UTILIZATION_EVALUATOR"] = "ToolOutputUtilizationEvaluator" - ErrorTarget = Enum('ExtendedErrorTarget', existing_members) + ErrorTarget = Enum("ExtendedErrorTarget", existing_members) return ErrorTarget @@ -139,7 +137,10 @@ def _pretty_format_conversation_history(conversation_history): formatted_history += "SYSTEM_PROMPT:\n" formatted_history += " " + conversation_history["system_message"] + "\n\n" for i, (user_query, agent_response) in enumerate( - zip(conversation_history["user_queries"], conversation_history["agent_responses"] + [None]) + zip( + conversation_history["user_queries"], + conversation_history["agent_responses"] + [None], + ) ): formatted_history += f"User turn {i+1}:\n" for msg in user_query: @@ -163,8 +164,9 @@ def reformat_conversation_history(query, logger=None, include_system_messages=Fa ) return _pretty_format_conversation_history(conversation_history) except Exception as e: - # If the conversation history cannot be parsed for whatever reason (e.g. the converter format changed), the original query is returned - # This is a fallback to ensure that the evaluation can still proceed. However the accuracy of the evaluation will be affected. 
+ # If the conversation history cannot be parsed for whatever reason, the original query is returned + # This is a fallback to ensure that the evaluation can still proceed. + # However the accuracy of the evaluation will be affected. # From our tests the negative impact on IntentResolution is: # Higher intra model variance (0.142 vs 0.046) # Higher inter model variance (0.345 vs 0.607) @@ -235,16 +237,19 @@ def reformat_agent_response(response, logger=None, include_tool_messages=False): return "" agent_response = _get_agent_response(response, include_tool_messages=include_tool_messages) if agent_response == []: - # If no message could be extracted, likely the format changed, fallback to the original response in that case + # If no message could be extracted, fallback to the original response in that case if logger: logger.warning( - f"Empty agent response extracted, likely due to input schema change. Falling back to using the original response: {response}" + "Empty agent response extracted, likely due to input schema change. " + f"Falling back to using the original response: {response}" ) return response return "\n".join(agent_response) except Exception as e: - # If the agent response cannot be parsed for whatever reason (e.g. the converter format changed), the original response is returned - # This is a fallback to ensure that the evaluation can still proceed. See comments on reformat_conversation_history for more details. + # If the agent response cannot be parsed for whatever reason (e.g. the converter format changed), + # the original response is returned + # This is a fallback to ensure that the evaluation can still proceed. + # See comments on reformat_conversation_history for more details. if logger: logger.warning(f"Agent response could not be parsed, falling back to original response. Error: {e}") return response @@ -271,12 +276,16 @@ def reformat_tool_definitions(tool_definitions, logger=None): return "\n".join(output_lines) except Exception as e: # If the tool definitions cannot be parsed for whatever reason, the original tool definitions are returned - # This is a fallback to ensure that the evaluation can still proceed. See comments on reformat_conversation_history for more details. + # This is a fallback to ensure that the evaluation can still proceed. + # See comments on reformat_conversation_history for more details. if logger: logger.warning( - f"Tool definitions could not be parsed, falling back to original definitions: {tool_definitions}. Error: {e}" + "Tool definitions could not be parsed, falling back to original definitions" + f": {tool_definitions}. Error: {e}" ) return tool_definitions + + # ``` @@ -287,8 +296,8 @@ class ToolOutputUtilizationEvaluator(PromptyEvaluatorBase[Union[str, float]]): This evaluator checks whether the agent correctly incorporates information from tools into its responses. Scoring is based on two levels: - 1. Pass - The agent effectively utilizes tool outputs and accurately incorporates the information into its response. - 2. Fail - The agent fails to properly utilize tool outputs or incorrectly incorporates the information into its response. + 1. Pass - effectively utilizes tool outputs and accurately incorporates the information into its response. + 2. Fail - fails to properly utilize tool outputs or incorrectly incorporates the information into its response. The evaluation includes the score, a brief explanation, and a final pass/fail result. 
@@ -312,9 +321,9 @@ class ToolOutputUtilizationEvaluator(PromptyEvaluatorBase[Union[str, float]]): :end-before: [END tool_output_utilization_evaluator] :language: python :dedent: 8 - :caption: Initialize and call ToolOutputUtilizationEvaluator using Azure AI Project URL in the following format + :caption: Initialize and call ToolOutputUtilizationEvaluator + using Azure AI Project URL in the following format https://{resource_name}.services.ai.azure.com/api/projects/{project_name} - """ _PROMPTY_FILE = "tool_output_utilization.prompty" @@ -368,27 +377,62 @@ def __call__( Example with list of messages: evaluator = ToolOutputUtilizationEvaluator(model_config) query = [ - {"role": "system","content": "You are a helpful customer service assistant."}, - {"role": "user","content": [{"type": "text","text": "Hi, can you check the status of my last order?"}]}, + { + "role": "system", + "content": "You are a helpful customer service assistant.", + }, + { + "role": "user", + "content": [ + { + "type": "text", + "text": "Hi, can you check the status of my last order?", + } + ], + }, ] response = [ { "role": "assistant", - "content": [{"type": "text", "text": "Sure! Let me look that up for you."}], + "content": [ + {"type": "text", "text": "Sure! Let me look that up for you."} + ], }, { "role": "assistant", - "content": [{"type": "tool_call","tool_call": {"id": "tool_1","type": "function","function": {"name": "get_order_status","arguments": {"order_id": "123"}}}}], + "content": [ + { + "type": "tool_call", + "tool_call": { + "id": "tool_1", + "type": "function", + "function": { + "name": "get_order_status", + "arguments": {"order_id": "123"}, + }, + }, + } + ], }, { "role": "tool", "tool_call_id": "tool_1", - "content": [{"type": "tool_result","tool_result": '{"order_id": "123", "status": "shipped"}'}], + "content": [ + { + "type": "tool_result", + "tool_result": '{"order_id": "123", "status": "shipped"}', + } + ], }, { "role": "assistant", - "content": [{"type": "text","text": "Your order 123 has been shipped and is on its way!"}], + "content": [ + { + "type": "text", + "text": "Your order 123 has been shipped and is on its way!", + } + ], }, ] @@ -396,15 +440,25 @@ def __call__( { "name": "get_order_status", "description": "Retrieve the status of an order by its ID.", - "parameters": {"type": "object","properties": {"order_id": {"type": "string","description": "The order ID to check."}}}, + "parameters": { + "type": "object", + "properties": { + "order_id": { + "type": "string", + "description": "The order ID to check.", + } + }, + }, } ] + result = evaluator(query=query, response=response, tool_definitions=tool_definitions) :keyword query: The query being evaluated, either a string or a list of messages. :paramtype query: Union[str, List[dict]] - :keyword response: The response being evaluated, either a string or a list of messages (full agent response potentially including tool calls) + :keyword response: The response being evaluated, either a string or a list of messages + (full agent response potentially including tool calls) :paramtype response: Union[str, List[dict]] :keyword tool_definitions: An optional list of messages containing the tool definitions the agent is aware of. :paramtype tool_definitions: Union[dict, List[dict]] @@ -418,8 +472,7 @@ def __call__( # pylint: disable=docstring-missing-param *args, **kwargs, ): - """ - Invoke the instance using the overloaded __call__ signature. + """Invoke the instance using the overloaded __call__ signature. 
For detailed parameter types and return value documentation, see the overloaded __call__ definition. """ @@ -429,21 +482,24 @@ def __call__( # pylint: disable=docstring-missing-param async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # type: ignore[override] """Do Tool Output Utilization evaluation. - :param eval_input: The input to the evaluator. Expected to contain whatever inputs are needed for the _flow method + :param eval_input: The input to the evaluator. Expected to contain whatever inputs are needed for the _flow + method :type eval_input: Dict :return: The evaluation result. :rtype: Dict """ # we override the _do_eval method as we want the output to be a dictionary, # which is a different schema than _base_prompty_eval.py - if ( - ("query" not in eval_input) - and ("response" not in eval_input) - and ("tool_definitions" not in eval_input) - ): + if ("query" not in eval_input) and ("response" not in eval_input) and ("tool_definitions" not in eval_input): raise EvaluationException( - message="Query, response, and tool_definitions are required inputs to the Tool Output Utilization evaluator.", - internal_message="Query, response, and tool_definitions are required inputs to the Tool Output Utilization evaluator.", + message=( + "Query, response, and tool_definitions are required inputs to " + "the Tool Output Utilization evaluator." + ), + internal_message=( + "Query, response, and tool_definitions are required inputs " + "to the Tool Output Utilization evaluator." + ), blame=ErrorBlame.USER_ERROR, category=ErrorCategory.MISSING_FIELD, target=ErrorTarget.TOOL_OUTPUT_UTILIZATION_EVALUATOR, @@ -455,9 +511,7 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t msgs_lists=[eval_input["query"], eval_input["response"]], logger=logger, ) - eval_input["tool_definitions"] = reformat_tool_definitions( - filtered_tool_definitions, logger - ) + eval_input["tool_definitions"] = reformat_tool_definitions(filtered_tool_definitions, logger) eval_input["query"] = reformat_conversation_history( eval_input["query"], @@ -465,25 +519,24 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t include_system_messages=True, include_tool_messages=True, ) - eval_input["response"] = reformat_agent_response( - eval_input["response"], logger, include_tool_messages=True - ) + eval_input["response"] = reformat_agent_response(eval_input["response"], logger, include_tool_messages=True) llm_output = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input) if isinstance(llm_output, dict): output_label = llm_output.get("label", None) if output_label is None: if logger: - logger.warning( - "LLM output does not contain 'label' key, returning NaN for the score." - ) + logger.warning("LLM output does not contain 'label' key, returning NaN for the score.") output_label = "fail" output_label = output_label.lower() if output_label not in ["pass", "fail"]: if logger: logger.warning( - f"LLM output label is not 'pass' or 'fail' (got '{output_label}'), returning NaN for the score." + ( + f"LLM output label is not 'pass' or 'fail' (got '{output_label}'), " + "returning NaN for the score." + ) ) score = 1.0 if output_label == "pass" else 0.0 @@ -500,7 +553,5 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t f"{self._result_key}_reason": reason, } if logger: - logger.warning( - "LLM output is not a dictionary, returning NaN for the score." 
- ) + logger.warning("LLM output is not a dictionary, returning NaN for the score.") return {self._result_key: math.nan}
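
Note on the ErrorTarget handling in patches 07-08 (illustrative, not part of the series): the helper rebuilds the enum through Python's functional Enum API so that the new name becomes a real enum member rather than a plain class attribute assigned onto the existing class. A minimal standalone sketch of that pattern follows; BaseTarget and its members are hypothetical stand-ins for the SDK's real ErrorTarget, used only to keep the snippet self-contained.

    from enum import Enum


    class BaseTarget(Enum):
        # Hypothetical stand-ins for the existing ErrorTarget members.
        UNKNOWN = "Unknown"
        EVAL_RUN = "EvalRun"


    def extend_with_member(base_enum, name, value):
        # Same idea as _create_extended_error_target: copy the existing
        # name/value pairs and build a fresh Enum with one extra member.
        members = {member.name: member.value for member in base_enum}
        members[name] = value
        return Enum("ExtendedTarget", members)


    Target = extend_with_member(
        BaseTarget, "TOOL_OUTPUT_UTILIZATION_EVALUATOR", "ToolOutputUtilizationEvaluator"
    )
    print(Target.TOOL_OUTPUT_UTILIZATION_EVALUATOR.value)  # ToolOutputUtilizationEvaluator
    print(Target.EVAL_RUN.value)  # existing members carry over into the rebuilt enum

One trade-off of this approach: members of the rebuilt enum are not instances of the original ErrorTarget, so isinstance checks against the SDK's enum would not match them. The evaluator only passes the member as an error-target label when raising EvaluationException, so this does not affect the behavior introduced by the patches above.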