apify · jirispilka · Oct 16, 2025 · Oct 13, 2025 · Oct 13, 2025 · Oct 13, 2025
diff --git a/.env.example b/.env.example
@@ -1,3 +1,8 @@
 APIFY_TOKEN=
-# ANTHROPIC_API_KEY is only required when you want to run examples/clientStdioChat.js
-ANTHROPIC_API_KEY=
+
+# EVALS (variables required by evals/config.ts)
+PHOENIX_API_KEY=
+PHOENIX_BASE_URL=
+
+OPENROUTER_API_KEY=
+OPENROUTER_BASE_URL=https://openrouter.ai/api/v1
diff --git a/.github/workflows/evaluations.yaml b/.github/workflows/evaluations.yaml
@@ -0,0 +1,46 @@
+# This workflow runs MCP tool calling evaluations on master branch merges
+# It evaluates AI models' ability to correctly identify and call MCP tools.
+
+name: MCP tool calling evaluations
+
+on:
+    # Run evaluations on master branch merges
+    push:
+        branches:
+            - 'master'
+    # Also run on PRs with the 'validated' label for testing
+    pull_request:
+        types: [labeled, synchronize, reopened]
+
+jobs:
+    evaluations:
+        name: MCP tool calling evaluations
+        runs-on: ubuntu-latest
+        # Run on master pushes or PRs with the 'validated' label
+        if: github.event_name == 'push' || contains(github.event.pull_request.labels.*.name, 'validated')
+
+        steps:
+            -   name: Checkout code
+                uses: actions/checkout@v4
+
+            -   name: Use Node.js 22
+                uses: actions/setup-node@v4
+                with:
+                    node-version: 22
+                    cache: 'npm'
+                    cache-dependency-path: 'package-lock.json'
+
+            -   name: Install Node dependencies
+                run: npm ci --include=dev
+
+            -   name: Build project
+                run: npm run build
+
+            -   name: Run evaluations
+                run: npm run evals:run
+                env:
+                    GITHUB_PR_NUMBER: ${{ github.event_name == 'pull_request' && github.event.number || 'master' }}
+                    PHOENIX_API_KEY: ${{ secrets.PHOENIX_API_KEY }}
+                    PHOENIX_BASE_URL: ${{ secrets.PHOENIX_BASE_URL }}
+                    OPENROUTER_BASE_URL: ${{ secrets.OPENROUTER_BASE_URL }}
+                    OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }}
diff --git a/.gitignore b/.gitignore
@@ -28,3 +28,10 @@ key.pem

 # Ignore MCP config for Opencode client
 opencode.json
+
+# Python cache files
+__pycache__/
+*.pyc
+*.pyo
+*.pyd
+.Python
diff --git a/eslint.config.mjs b/eslint.config.mjs
@@ -2,7 +2,7 @@ import apifyTypeScriptConfig from '@apify/eslint-config/ts.js';

 // eslint-disable-next-line import/no-default-export
 export default [
-    { ignores: ['**/dist'] }, // Ignores need to happen first
+    { ignores: ['**/dist', '**/.venv', 'evals/**'] }, // Ignores need to happen first
    ...apifyTypeScriptConfig,
    {
        languageOptions: {

diff --git a/evals/README.md b/evals/README.md
@@ -0,0 +1,122 @@
+# MCP tool selection evaluation
+
+Evaluates MCP server tool selection. Phoenix is used only for storing and visualizing results.
+
+## CI Workflow
+
+The evaluation workflow runs automatically on:
+- **Master branch pushes** - for production evaluations (saves CI cycles)
+- **PRs with `validated` label** - for testing evaluation changes before merging
+
+To trigger evaluations on a PR, add the `validated` label to your pull request.
+
+## Two evaluation methods
+
+1. **exact match** (`tool-exact-match`) - binary tool name validation
+2. **LLM judge** (`tool-selection-llm`) - Phoenix classifier with structured prompt
+
+## Why OpenRouter?
+
+A unified API for Gemini, Claude, and GPT models; no separate integrations are needed.
+
+## Judge model
+
+- model: `openai/gpt-4o-mini`
+- prompt: structured eval with context + tool definitions
+- output: "correct"/"incorrect" → 1.0/0.0 score (and explanation)
+
+## Config (`config.ts`)
+
+```typescript
+MODELS_TO_EVALUATE = ['openai/gpt-4o-mini', 'anthropic/claude-3.5-haiku', 'google/gemini-2.5-flash']
+PASS_THRESHOLD = 0.7
+TOOL_SELECTION_EVAL_MODEL = 'openai/gpt-4o-mini'
+```
+
+## Setup
+
+```bash
+export PHOENIX_BASE_URL="your_url"
+export PHOENIX_API_KEY="your_key"
+export OPENROUTER_API_KEY="your_key"
+export OPENROUTER_BASE_URL="https://openrouter.ai/api/v1"
+
+npm ci
+npm run evals:create-dataset  # one-time
+npm run evals:run
+```
+
+## Test cases
+
+40+ cases across 7 tool categories: `fetch-actor-details`, `search-actors`, `apify-slash-rag-web-browser`, `search-apify-docs`, `call-actor`, `get-actor-output`, `fetch-apify-docs`
+
+## Output
+
+- Phoenix dashboard with detailed results
+- console: pass/fail per model + evaluator
+- exit code: 0 = success, 1 = failure
+
+## Adding new test cases
+
+### How to contribute?
+
+1. **Create an issue or PR** with your new test cases
+2. **Explain why it should pass** - add a `reference` field with clear reasoning
+3. **Test locally** before submitting
+4. **Publish** - we'll review and merge
+
+### Test case structure
+
+Each test case in `test-cases.json` has this structure:
+
+```json
+{
+  "id": "unique-test-id",
+  "category": "tool-category",
+  "query": "user query text",
+  "expectedTools": ["tool-name"],
+  "reference": "explanation of why this should pass (optional)",
+  "context": [/* conversation history (optional) */]
+}
+```
+
+### Simple examples
+
+**Basic tool selection:**
+```json
+{
+  "id": "fetch-actor-details-1",
+  "category": "fetch-actor-details",
+  "query": "What are the details of apify/instagram-scraper?",
+  "expectedTools": ["fetch-actor-details"]
+}
+```
+
+**With reference explanation:**
+```json
+{
+  "id": "fetch-actor-details-3",
+  "category": "fetch-actor-details",
+  "query": "Scrape details of apify/google-search-scraper",
+  "expectedTools": ["fetch-actor-details"],
+  "reference": "It should call the fetch-actor-details with the actor ID 'apify/google-search-scraper' and return the actor's documentation."
+}
+```
+
+### Advanced examples with context
+
+**Multi-step conversation flow:**
+```json
+{
+  "id": "weather-mcp-search-then-call-1",
+  "category": "flow",
+  "query": "Now, use the mcp to check the weather in Prague, Czechia?",
+  "expectedTools": ["call-actor"],
+  "context": [
+    { "role": "user", "content": "Search for weather MCP server" },
+    { "role": "assistant", "content": "I'll help you to do that" },
+    { "role": "tool_use", "tool": "search-actors", "input": {"search": "weather mcp", "limit": 5} },
+    { "role": "tool_result", "tool_use_id": 12, "content": "Tool 'search-actors' successful, Actor found: jiri.spilka/weather-mcp-server" }
+  ]
+}
+```
diff --git a/evals/config.ts b/evals/config.ts
@@ -0,0 +1,114 @@
+/**
+ * Configuration for Apify MCP Server evaluations.
+ */
+
+import { readFileSync } from 'node:fs';
+import { dirname, join } from 'node:path';
+import { fileURLToPath } from 'node:url';
+
+// Reads the `version` field from the sibling test-cases.json; throws when it is missing so DATASET_NAME never ends in "undefined"
+function getTestCasesVersion(): string {
+    const testCasesPath = join(dirname(fileURLToPath(import.meta.url)), 'test-cases.json');
+    const { version } = JSON.parse(readFileSync(testCasesPath, 'utf-8')) as { version?: unknown };
+    if ((typeof version !== 'string' && typeof version !== 'number') || version === '') {
+        throw new Error(`Missing or invalid "version" field in ${testCasesPath}`);
+    }
+    return String(version); // a numeric version is accepted and rendered as text
+}
+
+// Evaluator names: exact tool-name match and the LLM judge
+export const EVALUATOR_NAMES = {
+    TOOLS_EXACT_MATCH: 'tool-exact-match',
+    TOOL_SELECTION_LLM: 'tool-selection-llm',
+} as const;
+
+export type EvaluatorName = typeof EVALUATOR_NAMES[keyof typeof EVALUATOR_NAMES]; // union of the literal names above
+
+// Model identifiers (provider/model form) whose tool selection is evaluated
+export const MODELS_TO_EVALUATE = [
+    'openai/gpt-4o-mini',
+    'anthropic/claude-3.5-haiku',
+    'google/gemini-2.5-flash',
+];
+
+export const TOOL_SELECTION_EVAL_MODEL = 'openai/gpt-4o-mini'; // judge model for the LLM evaluator
+
+export const PASS_THRESHOLD = 0.7; // presumably the minimum score needed to pass — confirm in the runner
+
+export const DATASET_NAME = `mcp_server_dataset_v${getTestCasesVersion()}`; // version is read from test-cases.json at import time
+
+// System prompt (presumably sent to each evaluated model — confirm in the runner)
+export const SYSTEM_PROMPT = 'You are a helpful assistant';
+// Judge prompt; the {{name}} placeholders are presumably substituted by the evaluator. NOTE(review): it asks for DECISION/EXPLANATION and also for a single-word reply — confirm intent.
+export const TOOL_CALLING_BASE_TEMPLATE = `
+You are an evaluation assistant evaluating user queries and tool calls to
+determine whether a tool was chosen and if it was a right tool.
+
+The tool calls have been generated by a separate agent, and chosen from the list of
+tools provided below. It is your job to decide whether that agent chose
+the right tool to call.
+
+[BEGIN DATA]
+************
+[User's previous interaction with the assistant]: {{context}}
+[User query]: {{query}}
+************
+[LLM decided to call these tools]: {{tool_calls}}
+[LLM response]: {{llm_response}}
+************
+[END DATA]
+
+DECISION: [correct or incorrect]
+EXPLANATION: [Super short explanation of why the tool choice was correct or incorrect]
+
+Your response must be single word, either "correct" or "incorrect",
+and should not contain any text or characters aside from that word.
+
+"correct" means the correct tool call was chosen, the correct parameters
+were extracted from the query, the tool call generated is runnable and correct,
+and that no outside information not present in the query was used
+in the generated query.
+
+"incorrect" means that the chosen tool was not correct
+or that the tool signature includes parameter values that don't match
+the formats specified in the tool signatures below.
+
+You must not use any outside information or make assumptions.
+Base your decision solely on the information provided in [BEGIN DATA] ... [END DATA],
+the [Tool Definitions], and the [Reference instructions] (if provided).
+Reference instructions are optional and are intended to help you understand the use case and make your decision.
+
+[Reference instructions]: {{reference}}
+
+[Tool definitions]: {{tool_definitions}}
+`;
+/**
+ * Collects the environment variables that validateEnvVars() treats as required.
+ * @returns map from variable name to its current value (undefined when unset)
+ */
+export function getRequiredEnvVars(): Record<string, string | undefined> {
+    const { PHOENIX_BASE_URL, PHOENIX_API_KEY, OPENROUTER_API_KEY, OPENROUTER_BASE_URL } = process.env;
+    return { PHOENIX_BASE_URL, PHOENIX_API_KEY, OPENROUTER_API_KEY, OPENROUTER_BASE_URL };
+}
+
+// Cleans a header value (e.g. Authorization) taken from CI secrets: drops CR/LF, trims, removes one
+// leading/trailing double quote, then trims again so whitespace that sat inside the quotes is not kept.
+export function sanitizeHeaderValue(value?: string): string | undefined {
+    if (value == null) return value; // null/undefined pass through unchanged
+    return value.replace(/[\r\n]/g, '').trim().replace(/^"|"$/g, '').trim();
+}
+
+/**
+ * Verifies that every variable from getRequiredEnvVars() is set to a non-empty value.
+ * @returns true when all are present; otherwise logs the missing names via console.error and returns false
+ */
+export function validateEnvVars(): boolean {
+    const absent: string[] = [];
+    for (const [name, current] of Object.entries(getRequiredEnvVars())) {
+        if (!current) absent.push(name);
+    }
+    if (absent.length === 0) return true;
+    // eslint-disable-next-line no-console
+    console.error(`Missing required environment variables: ${absent.join(', ')}`);
+    return false;
+}