diff --git a/evals/README.md b/evals/README.md new file mode 100644 index 0000000..45cbfe1 --- /dev/null +++ b/evals/README.md @@ -0,0 +1,348 @@ +# Browserbase MCP Server Evaluation Tests + +This directory contains comprehensive evaluation tests for the Browserbase MCP Server using [MCPVals](https://github.com/modelcontextprotocol/mcpvals), a testing framework that uses Claude to autonomously execute test workflows based on natural language descriptions. + +## Quick Start + +1. **Install dependencies:** + + ```bash + npm run test:install + ``` + +2. **Set up environment variables:** + + ```bash + export BROWSERBASE_API_KEY="your_api_key_here" + export BROWSERBASE_PROJECT_ID="your_project_id_here" + export ANTHROPIC_API_KEY="your_anthropic_key_here" + ``` + +3. **Run basic tests:** + ```bash + npm test + ``` + +## Test Configurations + +We provide three levels of testing: + +### 1. Minimal Tests (`mcp-eval-minimal.config.json`) + +- **Purpose**: Quick smoke tests to verify basic functionality +- **Duration**: ~2-3 minutes +- **Tests**: 3 workflows covering navigation, extraction, and multi-session basics +- **Usage**: `npm run test:minimal` + +### 2. Standard Tests (`mcp-eval.config.json`) + +- **Purpose**: Comprehensive functionality testing +- **Duration**: ~5-10 minutes +- **Tests**: 7 workflows covering all major features +- **Usage**: `npm test` + +### 3. 
Advanced Tests (`mcp-eval-advanced.config.json`) + +- **Purpose**: Complex scenarios with LLM judge evaluation +- **Duration**: ~10-15 minutes +- **Tests**: 6 workflows with subjective quality assessment +- **Usage**: `npm run test:advanced` (requires `OPENAI_API_KEY`) + +## Available Scripts + +| Script | Description | +| ----------------------- | --------------------------------- | +| `npm test` | Run standard evaluation tests | +| `npm run test:minimal` | Run minimal smoke tests | +| `npm run test:advanced` | Run advanced tests with LLM judge | +| `npm run test:all` | Run all test suites | +| `npm run test:debug` | Run tests with debug output | +| `npm run test:json` | Run tests with JSON output | +| `npm run test:install` | Install test dependencies | +| `npm run test:runner` | Direct access to test runner | + +## Understanding the Test Framework + +### How MCPVals Works + +MCPVals uses Claude to autonomously execute test workflows: + +1. **Natural Language Instructions**: Tests are written as natural language prompts +2. **Autonomous Execution**: Claude examines available MCP tools and plans execution +3. **Tool Invocation**: Claude calls the appropriate MCP tools to accomplish tasks +4. **Deterministic Evaluation**: Results are evaluated against expected outcomes + +### Test Structure + +Each test workflow contains: + +```json +{ + "name": "test-name", + "description": "What this test validates", + "steps": [ + { + "user": "Natural language instruction", + "expectedState": "Expected substring in output" + } + ], + "expectTools": ["list", "of", "expected", "tools"] +} +``` + +### Evaluation Metrics + +Each test is evaluated on three metrics: + +1. **End-to-End Success** (0-100%): Did the workflow achieve the expected final state? +2. **Tool Invocation Order** (0-100%): Were the expected tools called in the correct sequence? +3. **Tool Call Health** (0-100%): Did all tool calls complete successfully without errors? 
+ +**Overall Score** = Average of all three metrics + +## Test Workflows + +### Minimal Test Suite + +| Workflow | Description | Expected Tools | +| -------------------------- | ------------------------ | ------------------------------------------------- | +| `smoke-test-navigation` | Basic browser navigation | session_create, navigate, session_close | +| `smoke-test-extraction` | Basic content extraction | session_create, navigate, extract, session_close | +| `smoke-test-multi-session` | Multi-session management | multi_session_create, session_list, session_close | + +### Standard Test Suite + +| Workflow | Description | Key Features | +| --------------------------- | -------------------------- | ---------------------------- | +| `basic-navigation-test` | Navigation to Google | Basic browser control | +| `search-and-extract-test` | Search and extract results | Form interaction, extraction | +| `observe-and-interact-test` | Element observation | DOM inspection | +| `screenshot-test` | Screenshot capture | Visual documentation | +| `multi-session-test` | Parallel browser sessions | Multi-session management | +| `form-interaction-test` | Form filling | Input handling | +| `error-handling-test` | Error scenarios | Error recovery | + +### Advanced Test Suite + +| Workflow | Description | LLM Judge | +| ---------------------------- | --------------------------- | --------- | +| `e-commerce-workflow` | Realistic browsing patterns | ✓ | +| `form-interaction-workflow` | Complex form handling | ✓ | +| `dynamic-content-handling` | JavaScript content | ✓ | +| `multi-session-workflow` | Advanced multi-session | ✓ | +| `error-recovery-workflow` | Error handling & recovery | ✓ | +| `comprehensive-feature-test` | All features combined | ✓ | + +## Environment Setup + +### Required Environment Variables + +Copy the example environment file and update with your credentials: + +```bash +cp evals/env.example .env +# Edit .env with your actual API keys +``` + +Or set the environment 
variables directly: + +```bash +# Browserbase credentials (required) +export BROWSERBASE_API_KEY="bb_api_key_..." +export BROWSERBASE_PROJECT_ID="bb_project_id_..." + +# Anthropic API key (required for Claude execution) +export ANTHROPIC_API_KEY="sk-ant-..." + +# OpenAI API key (required for LLM judge) +export OPENAI_API_KEY="sk-..." +``` + +### Optional Configuration + +You can override placeholder values in the config files by setting environment variables: + +```bash +# Override config placeholders +export BROWSERBASE_API_KEY="your_real_key" +export BROWSERBASE_PROJECT_ID="your_real_project" +``` + +## Running Tests + +### Command Line Interface + +```bash +# Basic usage +npm test + +# With options +npm run test:debug # Enable debug output +npm run test:json # Output results as JSON +npm run test:minimal # Run minimal tests +npm run test:advanced # Run advanced tests with LLM judge +npm run test:all # Run all test suites +``` + +### TypeScript Runner + +```bash +# Using the TypeScript runner directly +npm run test:runner run --config evals/mcp-eval.config.json --debug +``` + +## Interpreting Results + +### Console Output + +``` +✓ basic-navigation-test PASSED (100%) + ✓ End-to-End Success: 100% + ✓ Tool Invocation Order: 100% + ✓ Tool Call Health: 100% + +✗ search-and-extract-test FAILED (67%) + ✓ End-to-End Success: 100% + ✗ Tool Invocation Order: 67% (2/3 tools in correct order) + ✓ Tool Call Health: 100% +``` + +### Understanding Failures + +Common failure patterns: + +1. **Tool Order Issues**: Expected tools not called in sequence +2. **Missing Tools**: Expected tools not invoked +3. **Tool Errors**: Tools returning errors or timeouts +4. **Wrong Output**: Expected state not found in results + +## Troubleshooting + +### Common Issues + +1. **Missing Dependencies** + + ```bash + npm run test:install + ``` + +2. 
**Environment Variables Not Set** + + ```bash + # Check if variables are set + echo $BROWSERBASE_API_KEY + echo $BROWSERBASE_PROJECT_ID + echo $ANTHROPIC_API_KEY + ``` + +3. **Timeout Issues** + - Increase timeout in config files + - Check network connectivity + - Verify Browserbase service status + +4. **Tool Not Found Errors** + - Verify MCP server is running correctly + - Check tool names in config match server exports + - Run with `npm run test:debug` for detailed output + +### Debug Mode + +Run tests with debug output to see detailed execution: + +```bash +npm run test:debug +``` + +This shows: + +- Raw tool calls and responses +- Claude's reasoning process +- Network requests and responses +- Detailed error messages + +## Extending Tests + +### Adding New Workflows + +1. **Choose appropriate config file** based on complexity +2. **Write natural language steps** that describe user intent +3. **Specify expected tools** that should be called +4. **Set expected states** for validation +5. 
**Test your workflow** with debug mode + +### Example New Workflow + +```json +{ + "name": "custom-workflow", + "description": "Test custom functionality", + "steps": [ + { + "user": "Navigate to example.com and find all links", + "expectedState": "found links" + }, + { + "user": "Click on the first link", + "expectedState": "clicked" + } + ], + "expectTools": [ + "browserbase_session_create", + "browserbase_stagehand_navigate", + "browserbase_stagehand_observe", + "browserbase_stagehand_act" + ] +} +``` + +## CI/CD Integration + +### GitHub Actions + +```yaml +name: MCP Server Tests +on: [push, pull_request] + +jobs: + test: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-node@v4 + with: + node-version: "18" + - run: npm ci + - run: npm run test:install + - run: npm run test:minimal + env: + BROWSERBASE_API_KEY: ${{ secrets.BROWSERBASE_API_KEY }} + BROWSERBASE_PROJECT_ID: ${{ secrets.BROWSERBASE_PROJECT_ID }} + ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} +``` + +### Exit Codes + +- `0`: All tests passed +- `1`: One or more tests failed +- `2`: Configuration or setup error + +## Best Practices + +1. **Start with minimal tests** for quick feedback +2. **Use realistic scenarios** that match actual usage +3. **Include error cases** to test robustness +4. **Keep expected states simple** but unique +5. **Run tests regularly** to catch regressions +6. **Use debug mode** when developing new tests + +## Support + +For issues with: + +- **MCPVals framework**: Check the [MCPVals documentation](https://github.com/modelcontextprotocol/mcpvals) +- **Browserbase integration**: Visit [Browserbase docs](https://docs.browserbase.com) +- **MCP Server**: Open an issue in this repository + +## License + +These tests are part of the Browserbase MCP Server project and are licensed under the Apache License 2.0. 
diff --git a/evals/mcp-eval-advanced.config.json b/evals/mcp-eval-advanced.config.json new file mode 100644 index 0000000..4f6689c --- /dev/null +++ b/evals/mcp-eval-advanced.config.json @@ -0,0 +1,235 @@ +{ + "server": { + "transport": "stdio", + "command": "node", + "args": ["./cli.js"], + "env": { + "BROWSERBASE_API_KEY": "${BROWSERBASE_API_KEY}", + "BROWSERBASE_PROJECT_ID": "${BROWSERBASE_PROJECT_ID}" + } + }, + "timeout": 90000, + "llmJudge": true, + "openaiKey": "${OPENAI_API_KEY}", + "judgeModel": "gpt-4o", + "passThreshold": 0.8, + "workflows": [ + { + "name": "e-commerce-workflow", + "description": "Test a realistic e-commerce browsing workflow", + "steps": [ + { + "user": "Go to example.com and take a screenshot", + "expectedState": "Example Domain" + }, + { + "user": "Navigate to a different page and observe the page elements", + "expectedState": "observed" + }, + { + "user": "Take another screenshot to compare", + "expectedState": "Screenshot taken" + }, + { + "user": "Close the browser", + "expectedState": "session closed" + } + ], + "expectTools": [ + "browserbase_session_create", + "browserbase_stagehand_navigate", + "browserbase_screenshot", + "browserbase_stagehand_navigate", + "browserbase_stagehand_observe", + "browserbase_screenshot", + "browserbase_session_close" + ] + }, + { + "name": "form-interaction-workflow", + "description": "Test form interaction and validation", + "steps": [ + { + "user": "Navigate to httpbin.org/forms/post", + "expectedState": "httpbin" + }, + { + "user": "Observe the form fields available", + "expectedState": "form" + }, + { + "user": "Fill in the customer name field with 'Test User'", + "expectedState": "Test User" + }, + { + "user": "Extract the form data to verify it was filled correctly", + "expectedState": "Test User" + }, + { + "user": "Close the session", + "expectedState": "session closed" + } + ], + "expectTools": [ + "browserbase_session_create", + "browserbase_stagehand_navigate", + 
"browserbase_stagehand_observe", + "browserbase_stagehand_act", + "browserbase_stagehand_extract", + "browserbase_session_close" + ] + }, + { + "name": "dynamic-content-handling", + "description": "Test handling of dynamic content", + "steps": [ + { + "user": "Navigate to example.com", + "expectedState": "Example Domain" + }, + { + "user": "Extract the page title and content", + "expectedState": "Example Domain" + }, + { + "user": "Take a screenshot of the page", + "expectedState": "Screenshot taken" + }, + { + "user": "Close the browser", + "expectedState": "session closed" + } + ], + "expectTools": [ + "browserbase_session_create", + "browserbase_stagehand_navigate", + "browserbase_stagehand_extract", + "browserbase_session_create", + "browserbase_stagehand_navigate", + "browserbase_stagehand_extract", + "browserbase_session_close", + "browserbase_screenshot", + "browserbase_session_close" + ] + }, + { + "name": "multi-session-workflow", + "description": "Test handling multiple sessions", + "steps": [ + { + "user": "Create two separate browser sessions named 'session-a' and 'session-b'", + "expectedState": "Created session" + }, + { + "user": "List all active sessions to verify they were created", + "expectedState": "session-a" + }, + { + "user": "In session-a, navigate to example.com", + "expectedState": "example.com" + }, + { + "user": "In session-b, navigate to httpbin.org", + "expectedState": "httpbin" + }, + { + "user": "Extract content from both sessions", + "expectedState": "extracted" + }, + { + "user": "Close both sessions", + "expectedState": "closed session" + } + ], + "expectTools": [ + "multi_browserbase_stagehand_session_create", + "multi_browserbase_stagehand_session_create", + "multi_browserbase_stagehand_session_list", + "multi_browserbase_stagehand_session_list", + "multi_browserbase_stagehand_navigate_session", + "multi_browserbase_stagehand_extract_session", + "multi_browserbase_stagehand_extract_session" + ] + }, + { + "name": 
"error-recovery-workflow", + "description": "Test graceful error handling and recovery", + "steps": [ + { + "user": "Create a browser session", + "expectedState": "session created" + }, + { + "user": "Navigate to a valid page first", + "expectedState": "example.com" + }, + { + "user": "Try to interact with a non-existent element", + "expectedState": "error" + }, + { + "user": "Verify the session is still working by extracting the page title", + "expectedState": "Example Domain" + }, + { + "user": "Close the session", + "expectedState": "session closed" + } + ], + "expectTools": [ + "browserbase_session_create", + "browserbase_stagehand_navigate", + "browserbase_stagehand_act", + "browserbase_stagehand_observe", + "browserbase_stagehand_extract", + "browserbase_session_close" + ] + }, + { + "name": "comprehensive-feature-test", + "description": "Test all major features together", + "steps": [ + { + "user": "Create a multi-session setup with 3 sessions", + "expectedState": "Created session" + }, + { + "user": "Navigate each session to different pages", + "expectedState": "navigated" + }, + { + "user": "Take screenshots from each session", + "expectedState": "Screenshot taken" + }, + { + "user": "Extract content from all sessions", + "expectedState": "extracted" + }, + { + "user": "Close all sessions", + "expectedState": "closed session" + } + ], + "expectTools": [ + "multi_browserbase_stagehand_session_create", + "multi_browserbase_stagehand_session_create", + "multi_browserbase_stagehand_session_create", + "multi_browserbase_stagehand_session_list", + "multi_browserbase_stagehand_navigate_session", + "multi_browserbase_stagehand_session_create", + "multi_browserbase_stagehand_session_create", + "multi_browserbase_stagehand_session_create", + "multi_browserbase_stagehand_navigate_session", + "multi_browserbase_stagehand_session_list", + "multi_browserbase_stagehand_extract_session", + "multi_browserbase_stagehand_extract_session", + 
"multi_browserbase_stagehand_extract_session", + "multi_browserbase_stagehand_session_list", + "multi_browserbase_stagehand_session_close", + "multi_browserbase_stagehand_session_close", + "multi_browserbase_stagehand_session_close", + "multi_browserbase_stagehand_session_close" + ] + } + ] +} diff --git a/evals/mcp-eval-minimal.config.json b/evals/mcp-eval-minimal.config.json new file mode 100644 index 0000000..051f147 --- /dev/null +++ b/evals/mcp-eval-minimal.config.json @@ -0,0 +1,77 @@ +{ + "server": { + "transport": "stdio", + "command": "node", + "args": ["./cli.js"], + "env": { + "BROWSERBASE_API_KEY": "${BROWSERBASE_API_KEY}", + "BROWSERBASE_PROJECT_ID": "${BROWSERBASE_PROJECT_ID}" + } + }, + "timeout": 30000, + "llmJudge": false, + "workflows": [ + { + "name": "smoke-test-navigation", + "description": "Quick test to verify basic navigation works", + "steps": [ + { + "user": "Open a browser and go to example.com", + "expectedState": "session created" + }, + { + "user": "Close the browser", + "expectedState": "session closed" + } + ], + "expectTools": [ + "browserbase_session_create", + "browserbase_stagehand_navigate", + "browserbase_session_close" + ] + }, + { + "name": "smoke-test-extraction", + "description": "Quick test to verify data extraction works", + "steps": [ + { + "user": "Navigate to example.com and extract the page title", + "expectedState": "Example Domain" + }, + { + "user": "Close the session", + "expectedState": "session closed" + } + ], + "expectTools": [ + "browserbase_session_create", + "browserbase_stagehand_navigate", + "browserbase_stagehand_extract", + "browserbase_session_close" + ] + }, + { + "name": "smoke-test-multi-session", + "description": "Quick test to verify multi-session functionality", + "steps": [ + { + "user": "Create a browser session named 'test-session'", + "expectedState": "Created session" + }, + { + "user": "List active sessions", + "expectedState": "test-session" + }, + { + "user": "Close the test session", + 
"expectedState": "closed session" + } + ], + "expectTools": [ + "multi_browserbase_stagehand_session_create", + "multi_browserbase_stagehand_session_list", + "multi_browserbase_stagehand_session_close" + ] + } + ] +} diff --git a/evals/mcp-eval.config.json b/evals/mcp-eval.config.json new file mode 100644 index 0000000..2c1f49e --- /dev/null +++ b/evals/mcp-eval.config.json @@ -0,0 +1,124 @@ +{ + "server": { + "transport": "stdio", + "command": "node", + "args": ["./cli.js"], + "env": { + "BROWSERBASE_API_KEY": "${BROWSERBASE_API_KEY}", + "BROWSERBASE_PROJECT_ID": "${BROWSERBASE_PROJECT_ID}" + } + }, + "timeout": 60000, + "llmJudge": false, + "workflows": [ + { + "name": "basic-navigation-test", + "description": "Test basic browser navigation functionality", + "steps": [ + { + "user": "Create a browser session, navigate to https://example.com, and close the session", + "expectedState": "closed" + } + ], + "expectTools": [ + "browserbase_session_create", + "browserbase_stagehand_navigate", + "browserbase_session_close" + ] + }, + { + "name": "search-and-extract-test", + "description": "Test navigation, search interaction, and data extraction", + "steps": [ + { + "user": "Create a browser session, navigate to https://example.com, extract the page title, and close the session", + "expectedState": "Example Domain" + } + ], + "expectTools": [ + "browserbase_session_create", + "browserbase_stagehand_navigate", + "browserbase_stagehand_extract", + "browserbase_session_close" + ] + }, + { + "name": "observe-and-interact-test", + "description": "Test element observation and interaction capabilities", + "steps": [ + { + "user": "Create a browser session, navigate to https://example.com, observe the page elements, and close the session", + "expectedState": "closed" + } + ], + "expectTools": [ + "browserbase_session_create", + "browserbase_stagehand_navigate", + "browserbase_stagehand_observe", + "browserbase_session_close" + ] + }, + { + "name": "screenshot-test", + 
"description": "Test screenshot functionality", + "steps": [ + { + "user": "Create a browser session, navigate to https://example.com, take a screenshot, and close the session", + "expectedState": "closed" + } + ], + "expectTools": [ + "browserbase_session_create", + "browserbase_stagehand_navigate", + "browserbase_screenshot", + "browserbase_session_close" + ] + }, + { + "name": "multi-session-test", + "description": "Test multi-session browser management", + "steps": [ + { + "user": "Create a multi-session browser named 'test-session', list all sessions, navigate to https://example.com in that session, and close the session", + "expectedState": "closed" + } + ], + "expectTools": [ + "multi_browserbase_stagehand_session_create", + "multi_browserbase_stagehand_session_list", + "multi_browserbase_stagehand_navigate_session", + "multi_browserbase_stagehand_session_close" + ] + }, + { + "name": "form-interaction-test", + "description": "Test form filling and submission capabilities", + "steps": [ + { + "user": "Create a browser session, navigate to https://httpbin.org/forms/post, fill in the customer name field with 'TestUser', and close the session", + "expectedState": "closed" + } + ], + "expectTools": [ + "browserbase_session_create", + "browserbase_stagehand_navigate", + "browserbase_stagehand_act", + "browserbase_session_close" + ] + }, + { + "name": "error-handling-test", + "description": "Test error handling for invalid operations", + "steps": [ + { + "user": "Create a browser session and try to navigate to an invalid URL like 'invalid-url-test'", + "expectedState": "error" + } + ], + "expectTools": [ + "browserbase_session_create", + "browserbase_stagehand_navigate" + ] + } + ] +} diff --git a/evals/run-evals.ts b/evals/run-evals.ts new file mode 100644 index 0000000..4b92219 --- /dev/null +++ b/evals/run-evals.ts @@ -0,0 +1,269 @@ +#!/usr/bin/env tsx + +import { Command } from "commander"; +import * as fs from "fs/promises"; +import * as path from "path"; 
+import { evaluate } from "mcpvals";
+import chalk from "chalk";
+
+// Load environment variables from .env file
+import { config } from "dotenv";
+config();
+
+// Process exit codes, matching the "Exit Codes" section of evals/README.md:
+// 0 = all tests passed, 1 = test failure / runtime error, 2 = configuration or setup error.
+const EXIT_OK = 0;
+const EXIT_FAILURE = 1;
+const EXIT_SETUP_ERROR = 2;
+
+// Environment variables the `run` command refuses to start without, mapped to
+// the example line printed when the variable is missing.
+const REQUIRED_ENV_EXAMPLES = {
+  BROWSERBASE_API_KEY: " export BROWSERBASE_API_KEY='your_api_key_here'",
+  BROWSERBASE_PROJECT_ID:
+    " export BROWSERBASE_PROJECT_ID='your_project_id_here'",
+  ANTHROPIC_API_KEY: " export ANTHROPIC_API_KEY='sk-ant-your_key_here'",
+} as const;
+
+/** Outcome of a single workflow inside an evaluation report. */
+interface EvaluationResult {
+  workflowName: string;
+  passed: boolean;
+  // Displayed as `overallScore * 100` percent, so presumably a 0..1 fraction.
+  overallScore: number;
+  results: Array<{
+    metric: string;
+    passed: boolean;
+    score: number;
+    details: string;
+    metadata?: Record<string, unknown>;
+  }>;
+}
+
+/**
+ * Report shape this script expects from `evaluate()`. `run --output` writes it
+ * to disk as JSON and `compare` reads it back.
+ */
+interface EvaluationReport {
+  config: Record<string, unknown>;
+  evaluations: EvaluationResult[];
+  passed: boolean;
+  // NOTE(review): after a JSON round-trip (as in `compare`) this is a string,
+  // not a Date; nothing in this file reads it.
+  timestamp: Date;
+}
+
+/** Condensed summary of one `run` invocation; printed verbatim in `--json` mode. */
+interface TestResult {
+  config: string;
+  passed: boolean;
+  // Mean of the workflows' overall scores (0 when the report has no workflows).
+  score: number;
+  // Wall-clock duration of the run in milliseconds.
+  duration: number;
+  workflows: {
+    name: string;
+    passed: boolean;
+    score: number;
+  }[];
+}
+
+const program = new Command();
+
+program
+  .name("browserbase-mcp-evals")
+  .description("Run evaluation tests for Browserbase MCP Server")
+  .version("1.0.0");
+
+// `run`: validate the environment, run one config through mcpvals, print a
+// summary, optionally save the full report, and exit with a README exit code.
+program
+  .command("run")
+  .description("Run evaluation tests")
+  .option(
+    "-c, --config <path>",
+    "Config file path",
+    "./evals/mcp-eval.config.json",
+  )
+  .option("-d, --debug", "Enable debug output")
+  .option("-j, --json", "Output results as JSON")
+  .option("-l, --llm", "Enable LLM judge")
+  .option("-o, --output <file>", "Save results to file")
+  .option("-t, --timeout <ms>", "Override timeout in milliseconds")
+  .action(async (options) => {
+    try {
+      const startTime = Date.now();
+
+      // In --json mode stdout should carry only the JSON summary, so
+      // human-readable progress messages are sent to stderr instead.
+      const info = (message: string): void => {
+        if (options.json) {
+          console.error(message);
+        } else {
+          console.log(message);
+        }
+      };
+
+      // Check for required environment variables
+      const missing = Object.entries(REQUIRED_ENV_EXAMPLES).filter(
+        ([name]) => !process.env[name],
+      );
+
+      if (missing.length > 0) {
+        const missingNames = missing.map(([name]) => name);
+        console.error(
+          chalk.red(
+            `Missing required environment variables: ${missingNames.join(", ")}`,
+          ),
+        );
+        console.error(
+          chalk.yellow("Please set them before running the tests."),
+        );
+        console.error(chalk.yellow("Example:"));
+        for (const [, example] of missing) {
+          console.error(chalk.yellow(example));
+        }
+        process.exit(EXIT_SETUP_ERROR);
+      }
+
+      // Check for LLM judge requirements
+      if (options.llm && !process.env.OPENAI_API_KEY) {
+        console.error(
+          chalk.red("LLM judge requires OPENAI_API_KEY environment variable"),
+        );
+        process.exit(EXIT_SETUP_ERROR);
+      }
+
+      // Parse the timeout override up front so a typo fails fast instead of
+      // handing NaN to the evaluator.
+      let timeout: number | undefined;
+      if (options.timeout !== undefined) {
+        timeout = Number.parseInt(options.timeout, 10);
+        if (!Number.isFinite(timeout) || timeout <= 0) {
+          console.error(
+            chalk.red(`Invalid --timeout value: ${options.timeout}`),
+          );
+          process.exit(EXIT_SETUP_ERROR);
+        }
+      }
+
+      // Resolve config path
+      const configPath = path.resolve(options.config);
+
+      // Load config to get workflow count for display. The parsed object is
+      // only inspected here; evaluate() re-reads the file itself.
+      let workflowCount: number;
+      try {
+        const parsed: unknown = JSON.parse(
+          await fs.readFile(configPath, "utf-8"),
+        );
+        const workflows =
+          typeof parsed === "object" && parsed !== null
+            ? (parsed as { workflows?: unknown }).workflows
+            : undefined;
+        if (!Array.isArray(workflows)) {
+          throw new Error('config does not contain a "workflows" array');
+        }
+        workflowCount = workflows.length;
+      } catch (error) {
+        console.error(
+          chalk.red(`Failed to load config file: ${configPath}`),
+          error,
+        );
+        process.exit(EXIT_SETUP_ERROR);
+      }
+
+      info(chalk.blue(`Running evaluation tests from: ${configPath}`));
+      info(chalk.gray(`Workflows to test: ${workflowCount}`));
+
+      // Prepare evaluation options
+      const evalOptions = {
+        debug: options.debug,
+        reporter: (options.json ? "json" : "console") as
+          | "json"
+          | "console"
+          | "junit"
+          | undefined,
+        llmJudge: options.llm,
+        timeout,
+      };
+
+      // Run evaluation - pass config file path, not parsed config object
+      const report: EvaluationReport = await evaluate(configPath, evalOptions);
+
+      const duration = Date.now() - startTime;
+
+      // Average the per-workflow scores; guard the empty report so the
+      // summary shows 0 instead of NaN.
+      const evaluationCount = report.evaluations.length;
+      const totalScore = report.evaluations.reduce(
+        (sum, e) => sum + e.overallScore,
+        0,
+      );
+
+      // Process results
+      const result: TestResult = {
+        config: configPath,
+        passed: report.passed,
+        score: evaluationCount > 0 ? totalScore / evaluationCount : 0,
+        duration,
+        workflows: report.evaluations.map((e) => ({
+          name: e.workflowName,
+          passed: e.passed,
+          score: e.overallScore,
+        })),
+      };
+
+      // Output results
+      if (options.json) {
+        console.log(JSON.stringify(result, null, 2));
+      } else {
+        console.log(
+          chalk.green(
+            `\nTest execution completed in ${(duration / 1000).toFixed(2)}s`,
+          ),
+        );
+        console.log(
+          chalk[result.passed ? "green" : "red"](
+            `Overall result: ${result.passed ? "PASSED" : "FAILED"} (${(result.score * 100).toFixed(1)}%)`,
+          ),
+        );
+      }
+
+      // Save to file if requested. The full report (not the summary) is
+      // written, because that is the shape `compare` reads back.
+      if (options.output) {
+        await fs.writeFile(options.output, JSON.stringify(report, null, 2));
+        info(chalk.gray(`Results saved to: ${options.output}`));
+      }
+
+      process.exit(result.passed ? EXIT_OK : EXIT_FAILURE);
+    } catch (error) {
+      console.error("Error running evaluation tests:", error);
+      process.exit(EXIT_FAILURE);
+    }
+  });
+
+// `compare`: diff two reports previously saved with `run --output`.
+program
+  .command("compare")
+  .description("Compare results from multiple test runs")
+  .argument("<file1>", "First results file")
+  .argument("<file2>", "Second results file")
+  .option("-v, --verbose", "Show detailed comparison")
+  .action(async (file1, file2, options) => {
+    try {
+      const results1: EvaluationReport = JSON.parse(
+        await fs.readFile(file1, "utf-8"),
+      );
+      const results2: EvaluationReport = JSON.parse(
+        await fs.readFile(file2, "utf-8"),
+      );
+
+      console.log(chalk.blue("Comparing test results:"));
+      console.log(chalk.gray(`File 1: ${file1}`));
+      console.log(chalk.gray(`File 2: ${file2}`));
+      console.log();
+
+      // Compare overall results
+      const passed1 = results1.passed;
+      const passed2 = results2.passed;
+
+      if (passed1 === passed2) {
+        console.log(chalk.yellow(`Both runs ${passed1 ? "PASSED" : "FAILED"}`));
+      } else {
+        // Colour each line by its own status, so a run that passed is never
+        // shown in red just because it happens to be the second file.
+        console.log(
+          chalk[passed1 ? "green" : "red"](
+            `File 1: ${passed1 ? "PASSED" : "FAILED"}`,
+          ),
+        );
+        console.log(
+          chalk[passed2 ? "green" : "red"](
+            `File 2: ${passed2 ? "PASSED" : "FAILED"}`,
+          ),
+        );
+      }
+
+      // Compare individual workflows if verbose
+      if (options.verbose) {
+        console.log(chalk.blue("\nWorkflow Comparison:"));
+
+        const workflows1 = new Map(
+          results1.evaluations.map((e) => [e.workflowName, e]),
+        );
+        const workflows2 = new Map(
+          results2.evaluations.map((e) => [e.workflowName, e]),
+        );
+
+        // Union of names, so workflows present in only one file are reported.
+        const allWorkflows = new Set([
+          ...workflows1.keys(),
+          ...workflows2.keys(),
+        ]);
+
+        for (const workflow of allWorkflows) {
+          const w1 = workflows1.get(workflow);
+          const w2 = workflows2.get(workflow);
+
+          if (!w1) {
+            console.log(chalk.red(`- ${workflow}: Missing in file 1`));
+          } else if (!w2) {
+            console.log(chalk.red(`- ${workflow}: Missing in file 2`));
+          } else {
+            // Positive change means file 2 scored higher than file 1.
+            const scoreChange = (w2.overallScore - w1.overallScore) * 100;
+            const color =
+              scoreChange > 0 ? "green" : scoreChange < 0 ? "red" : "yellow";
+            console.log(
+              chalk[color](
+                `- ${workflow}: ${(w1.overallScore * 100).toFixed(1)}% → ${(w2.overallScore * 100).toFixed(1)}% (${scoreChange > 0 ? "+" : ""}${scoreChange.toFixed(1)}%)`,
+              ),
+            );
+          }
+        }
+      }
+    } catch (error) {
+      console.error("Error comparing results:", error);
+      process.exit(EXIT_FAILURE);
+    }
+  });
+
+// Both commands use async action handlers, which commander only awaits when
+// started through parseAsync().
+program.parseAsync().catch((error: unknown) => {
+  console.error("Unexpected error:", error);
+  process.exit(EXIT_FAILURE);
+});
diff --git a/package.json b/package.json index b2bf7b6..20201fb 100644 --- a/package.json +++ b/package.json @@ -28,7 +28,11 @@ "prettier:check": "prettier --check .", "prettier:fix": "prettier --write .", "clean": "rm -rf dist", - "prepublishOnly": "pnpm clean && pnpm build" + "prepublishOnly": "pnpm clean && pnpm build", + "test": "npm run build && (tsx evals/run-evals.ts run --config evals/mcp-eval.config.json & tsx evals/run-evals.ts run --config evals/mcp-eval-minimal.config.json & tsx evals/run-evals.ts run --config evals/mcp-eval-advanced.config.json & wait)", + "test:config": "npm run build && tsx evals/run-evals.ts run --config evals/mcp-eval.config.json", + "test:minimal": "npm run build && tsx evals/run-evals.ts run --config evals/mcp-eval-minimal.config.json", + "test:advanced": "npm run build && tsx evals/run-evals.ts run --config evals/mcp-eval-advanced.config.json" }, "lint-staged": { "*.{js,jsx,ts,tsx,json,css,scss,md}": [ @@ -42,11 +46,13 @@ "@playwright/test": "^1.49.0", "commander": "^14.0.0", "dotenv": "^16.4.6", + "mcpvals": "0.0.1", "playwright-core": "^1.53.2", "zod": "^3.25.67" }, "devDependencies": { "@eslint/js": "^9.29.0", + "chalk": "^5.3.0", "eslint": "^9.29.0", "eslint-plugin-react": "^7.37.5", "globals": "^16.2.0", diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index e9cd0e5..4afd15b 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -23,6 +23,9 @@ importers: dotenv: specifier: ^16.4.6 version: 16.6.1 + mcpvals: + specifier: 0.0.1 + version: 0.0.1(react@19.1.0) playwright-core: specifier: ^1.53.2 version: 1.53.2 @@ -33,6 +36,9 @@ importers: '@eslint/js': specifier: ^9.29.0 version: 9.29.0 + 
chalk: + specifier: ^5.3.0 + version: 5.4.1 eslint: specifier: ^9.29.0 version: 9.29.0 @@ -599,9 +605,22 @@ packages: base64-js@1.5.1: resolution: {integrity: sha512-AKpaYlHn8t4SVbOHCy+b5+KKgvR4vrsD8vbvrbiQJps7fKDTkjkDry6ji0rUJjC0kzbNePLwzxq8iypo41qeWA==} + basic-auth@0.0.1: + resolution: {integrity: sha512-sCz6E05DMvrA9dUBGJFfnQ3qs+/lQkVr7qjOT5XMMNfpTzWbpkElpzXfnbNlBjPnDQyz0uBFJ4nELJRIdcKoNQ==} + + basic-auth@2.0.1: + resolution: {integrity: sha512-NF+epuEdnUYVlGuhaxbbq+dvJttwLnGY+YixlXlME5KpQ5W3CnXA5cVTneY3SPbPDRkcjMbifrwmFYcClgOZeg==} + engines: {node: '>= 0.8'} + bignumber.js@9.3.0: resolution: {integrity: sha512-EM7aMFTXbptt/wZdMlBv2t8IViwQL+h6SLHosp8Yf0dqJMTnY6iL32opnAB6kAdL0SZPuvcAzFr31o0c/R3/RA==} + bluebird@2.11.0: + resolution: {integrity: sha512-UfFSr22dmHPQqPP9XWHRhq+gWnHCYguQGkXQlbyPtW5qTnhFWA8/iXg765tH0cAjy7l/zPJ1aBTO0g5XgA7kvQ==} + + bluebird@3.7.2: + resolution: {integrity: sha512-XpNj6GDQzdfW+r2Wnn7xiSAd7TM3jzkxGXBGTtWKuSXv1xUV+azxAm8jdWZN06QTQk+2N2XB9jRDkvbmQmcRtg==} + body-parser@2.2.0: resolution: {integrity: sha512-02qvAaxv8tp7fBa/mw1ga98OGm+eCbqzJOKoRt70sLmfEEi+jyBYVTDGfCL/k06/4EMk/z01gCe7HoCH/f2LTg==} engines: {node: '>=18'} @@ -655,6 +674,14 @@ packages: resolution: {integrity: sha512-nPdaFdQ0h/GEigbPClz11D0v/ZJEwxmeVZGeMo3Z5StPtUTkA9o1lD6QwoirYiSDzbcwn2XcjwmCp68W1IS4TA==} engines: {node: '>=18'} + co-bluebird@1.1.0: + resolution: {integrity: sha512-JuoemMXxQjYAxbfRrNpOsLyiwDiY8mXvGqJyYLM7jMySDJtnMklW3V2o8uyubpc1eN2YoRsAdfZ1lfKCd3lsrA==} + engines: {node: '>=0.12.0'} + + co-use@1.1.0: + resolution: {integrity: sha512-1lVRtdywv41zQO/xvI2wU8w6oFcUYT6T84YKSxN25KN4N4Kld3scLovt8FjDmD63Cm7HtyRWHjezt+IanXmkyA==} + engines: {node: '>=0.12.0'} + color-convert@2.0.1: resolution: {integrity: sha512-RRECPsj7iu/xb5oKYcsFHSppFNnsj/52OVTRKb4zP5onXwVF3zVmmToNcOfGC+CRDpfK/U584fMg38ZHCaElKQ==} engines: {node: '>=7.0.0'} @@ -669,6 +696,10 @@ packages: resolution: {integrity: 
sha512-FQN4MRfuJeHf7cBbBMJFXhKSDq+2kAArBlmRBvcvFE5BB1HZKXtSFASDhdlz9zOYwxh8lDdnvmMOe/+5cdoEdg==} engines: {node: '>= 0.8'} + commander@11.1.0: + resolution: {integrity: sha512-yPVavfyCcRhmorC7rWlkHn15b4wDVgVmBA7kV4QVBsF7kv/9TKJAbAXVTxvTnwP8HHKjRCJDClKbciiYS7p0DQ==} + engines: {node: '>=16'} + commander@14.0.0: resolution: {integrity: sha512-2uM9rYjPvyq39NwLRqaiLtWHyDC1FvryJDa2ATTVims5YAS4PupsEQsDvP14FqhFr0P49CYDugi59xaxJlTXRA==} engines: {node: '>=20'} @@ -900,6 +931,10 @@ packages: resolution: {integrity: sha512-CRT1WTyuQoD771GW56XEZFQ/ZoSfWid1alKGDYMmkt2yl8UXrVR4pspqWNEcqKvVIzg6PAltWjxcSSPrboA4iA==} engines: {node: '>=18.0.0'} + execa@8.0.1: + resolution: {integrity: sha512-VyhnebXciFV2DESc+p6B+y0LjSm0krU4OgJN44qFAhBY0TJ+1V61tYD2+wHusZ6F9n5K+vl8k0sTy7PEfV4qpg==} + engines: {node: '>=16.17'} + express-rate-limit@7.5.1: resolution: {integrity: sha512-7iN8iPMDzOMHPUYllBEsQdWVB6fPDMPqwjBaFrgr4Jgr/+okjvzAy+UHlYYL/Vs0OsOrMkwS6PJDkFlJwoxUnw==} engines: {node: '>= 16'} @@ -1031,6 +1066,10 @@ packages: resolution: {integrity: sha512-sTSfBjoXBp89JvIKIefqw7U2CCebsc74kiY6awiGogKtoSGbgjYE/G/+l9sF3MWFPNc9IcoOC4ODfKHfxFmp0g==} engines: {node: '>= 0.4'} + get-stream@8.0.1: + resolution: {integrity: sha512-VaUJspBffn/LMCJVoMvSAdmscJyS1auj5Zulnn5UoYcY531UWmdwhRWkcGKnGU93m5HSXP9LP2usOryrBtQowA==} + engines: {node: '>=16'} + get-symbol-description@1.1.0: resolution: {integrity: sha512-w9UMqWwJxHNOvoNzSJ2oPF5wvYcvP7jUvYzhp67yEhTi17ZDBBC1z9pTdGuzjD+EFIqLSYRweZjqfiPzQ06Ebg==} engines: {node: '>= 0.4'} @@ -1119,6 +1158,10 @@ packages: resolution: {integrity: sha512-vK9P5/iUfdl95AI+JVyUuIcVtd4ofvtrOr3HNtM2yxC9bnMbEdp3x01OhQNnjb8IJYi38VlTE3mBXwcfvywuSw==} engines: {node: '>= 14'} + human-signals@5.0.0: + resolution: {integrity: sha512-AXcZb6vzzrFAUE61HnN4mpLqd/cSIwNQjtNWR0euPm6y0iqx3G4gOXaIDdtdDwZmhwe82LA6+zinmW4UBWVePQ==} + engines: {node: '>=16.17.0'} + humanize-ms@1.2.1: resolution: {integrity: 
sha512-Fl70vYtsAFb/C06PTS9dZBo7ihau+Tu/DNCk/OyHhea07S+aeMWpFFkUaXRa8fI+ScZbEI8dfSxwY7gxZ9SAVQ==} @@ -1218,6 +1261,9 @@ packages: resolution: {integrity: sha512-nPUB5km40q9e8UfN/Zc24eLlzdSf9OfKByBw9CIdw4H1giPMeA0OIJvbchsCu4npfI2QcMVBsGEBHKZ7wLTWmQ==} engines: {node: '>= 0.4'} + is-generator@1.0.3: + resolution: {integrity: sha512-G56jBpbJeg7ds83HW1LuShNs8J73Fv3CPz/bmROHOHlnKkN8sWb9ujiagjmxxMUywftgq48HlBZELKKqFLk0oA==} + is-glob@4.0.3: resolution: {integrity: sha512-xelSayHH36ZgE7ZWhli7pW34hNbNl8Ojv5KVmkJD4hBdD3th8Tfk9vYasLM+mXWOZhFkgZfxhLSnrwRr4elSSg==} engines: {node: '>=0.10.0'} @@ -1257,6 +1303,10 @@ packages: resolution: {integrity: sha512-hFoiJiTl63nn+kstHGBtewWSKnQLpyb155KHheA1l39uvtO9nWIop1p3udqPcUd/xbF1VLMO4n7OI6p7RbngDg==} engines: {node: '>=8'} + is-stream@3.0.0: + resolution: {integrity: sha512-LnQR4bZ9IADDRSkvpqMGvt/tEJWclzklNgSw48V5EAaAeDd6qGvN8ei6k5p0tvxSR171VmGyHuTiAOfxAbr8kA==} + engines: {node: ^12.20.0 || ^14.13.1 || >=16.0.0} + is-string@1.1.1: resolution: {integrity: sha512-BtEeSsoaQjlSPBemMQIrY1MY0uM6vnS1g5fmufYOtnxLGUZM2178PKbhsk7Ffv58IX+ZtcvoGwccYsh0PglkAA==} engines: {node: '>= 0.4'} @@ -1359,6 +1409,9 @@ packages: lodash.merge@4.6.2: resolution: {integrity: sha512-0KpjqXRVvrYyCsX1swR/XTK0va6VQkQM6MNo7PqW77ByjAhoARA8EfrP1N4+KlKj8YS0ZUCtRT/YUuhyYDujIQ==} + lodash@4.17.19: + resolution: {integrity: sha512-JNvd8XER9GQX0v2qJgsaN/mzFCNA5BRe/j8JN9d+tWyGLSodKQHKFicdwNYzWwI3wjRnaKPsGj1XkBjx/F96DQ==} + log-update@6.1.0: resolution: {integrity: sha512-9ie8ItPR6tjY5uYJh8K/Zrv/RMZ5VOlOWvtZdEHYSTFKZfIBPQa9tOAEeAWhd+AnIneLJ22w5fjOYtoutpWq5w==} engines: {node: '>=18'} @@ -1371,6 +1424,14 @@ packages: resolution: {integrity: sha512-/IXtbwEk5HTPyEwyKX6hGkYXxM9nbj64B+ilVJnC/R6B0pH5G4V3b0pVbL7DBj4tkhBAppbQUlf6F6Xl9LHu1g==} engines: {node: '>= 0.4'} + mcpvals@0.0.1: + resolution: {integrity: sha512-n/Mfz4CQWUh0irRtfXCiIuJy6uMWrQaLfIAi8R6/rOmBnQRz6Q16Jfajw+qqJm6y0ER//gjvUhd3Zop9+eClOQ==} + hasBin: true + + media-typer@0.3.0: + resolution: {integrity: 
sha512-dq+qelQ9akHpcOl/gUVRTxVIOkAJ1wR3QAvb4RsVjS8oVoFjDGTc679wJYmUmknUF5HwMLOgb5O+a3KxfWapPQ==} + engines: {node: '>= 0.6'} + media-typer@1.1.0: resolution: {integrity: sha512-aisnrDP4GNe06UcKFnV5bfMNPBUw4jsLGaWwWfnH3v02GnBuXX2MCVn5RbrWo0j3pczUilYblq7fQ7Nw2t5XKw==} engines: {node: '>= 0.8'} @@ -1379,6 +1440,9 @@ packages: resolution: {integrity: sha512-Snk314V5ayFLhp3fkUREub6WtjBfPdCPY1Ln8/8munuLuiYhsABgBVWsozAG+MWMbVEvcdcpbi9R7ww22l9Q3g==} engines: {node: '>=18'} + merge-stream@2.0.0: + resolution: {integrity: sha512-abv/qOcuPfk3URPfDzmZU1LKmuw8kT+0nIHvKrKgFrwifol/doWcdA4ZqsWQ8ENrFKkd67Mfpo/LovbIUsbt3w==} + merge2@1.4.1: resolution: {integrity: sha512-8q7VEgMJW4J8tcfVPy8g09NcQwZdbwFEqhe/WZkoIzjn/3TGDwtOCYtXGxA3O8tPzpczCCDgv+P2P5y00ZJOOg==} engines: {node: '>= 8'} @@ -1403,6 +1467,10 @@ packages: resolution: {integrity: sha512-xRc4oEhT6eaBpU1XF7AjpOFD+xQmXNB5OVKwp4tqCuBpHLS/ZbBDrc07mYTDqVMg6PfxUjjNp85O6Cd2Z/5HWA==} engines: {node: '>= 0.6'} + mimic-fn@4.0.0: + resolution: {integrity: sha512-vqiC06CuhBTUdZH+RYl8sFrL096vA45Ok5ISO6sE/Mr1jRbGH4Csnhi8f3wKVl7x8mO4Au7Ir9D3Oyv1VYMFJw==} + engines: {node: '>=12'} + mimic-function@5.0.1: resolution: {integrity: sha512-VP79XUPxV2CigYP3jWwAUFSku2aKqBH7uTAapFWCBqutsbmDo96KY5o8uh6U+/YSIn5OxJnXp73beVkpqMIGhA==} engines: {node: '>=18'} @@ -1449,6 +1517,18 @@ packages: encoding: optional: true + node-oauth2-server@2.4.0: + resolution: {integrity: sha512-k3NUmzjEIPyKBuY1OYtHqJ2L6siIlN+oERGe1MVeUGxzeOxEq/2z5K03/P8lfW4ys0Iivbn1KlGJgBeXNZ6Z5w==} + engines: {node: '>=0.8'} + + npm-run-path@5.3.0: + resolution: {integrity: sha512-ppwTtiJZq0O/ai0z7yfudtBpWIoxM8yE6nHi1X47eFR2EWORqfbu6CnPlNsjeN683eT0qG6H/Pyf9fCcvjnnnQ==} + engines: {node: ^12.20.0 || ^14.13.1 || >=16.0.0} + + oauth2-server@3.1.1: + resolution: {integrity: sha512-4dv+fE9hrK+xTaCygOLh/kQeFzbFr7UqSyHvBDbrQq8Hg52sAkV2vTsyH3Z42hoeaKpbhM7udhL8Y4GYbl6TGQ==} + engines: {node: '>=4.0'} + object-assign@4.1.1: resolution: {integrity: 
sha512-rJgTQnkUnH1sFw8yT6VSU3zD3sWmu6sZhIseY8VX+GRu3P6F7Fu+JNDoXfklElbLJSnc3FUQHVe4cU5hj+BcUg==} engines: {node: '>=0.10.0'} @@ -1497,6 +1577,10 @@ packages: once@1.4.0: resolution: {integrity: sha512-lNaJgI+2Q5URQBkccEKHTQOPaXdUxnZZElQTZY0MFUAuaEqe1E+Nyvgdz/aIyNi6Z9MzO5dv1H8n58/GELp3+w==} + onetime@6.0.0: + resolution: {integrity: sha512-1FlR+gjXK7X+AsAHso35MnyN5KqGwJRi/31ft6x0M194ht7S+rWAvd7PHss9xSKMzE0asv1pyIHaJYq+BbacAQ==} + engines: {node: '>=12'} + onetime@7.0.0: resolution: {integrity: sha512-VXJjc87FScF88uafS3JllDgvAm+c/Slfz06lorj2uAY34rlUu0Nt+v8wreiImcrgAjjIHp1rXpTDlLOGw29WwQ==} engines: {node: '>=18'} @@ -1552,6 +1636,10 @@ packages: resolution: {integrity: sha512-ojmeN0qd+y0jszEtoY48r0Peq5dwMEkIlCOu6Q5f41lfkswXuKtYrhgoTpLnyIcHm24Uhqx+5Tqm2InSwLhE6Q==} engines: {node: '>=8'} + path-key@4.0.0: + resolution: {integrity: sha512-haREypq7xkM7ErfgIyA0z+Bj4AGKlMSdlQE2jvJo6huWD1EdkKYV+G/T4nq0YEF2vgTT8kqMFKo1uHn950r4SQ==} + engines: {node: '>=12'} + path-parse@1.0.7: resolution: {integrity: sha512-LDJzPVEEEPR+y48z93A0Ed0yXb8pAByGWo/k5YYdYgpY2/2EsOsksJrq7lOHxryrVOn1ejG6oAp8ahvOIQD8sw==} @@ -1617,6 +1705,10 @@ packages: process-warning@5.0.0: resolution: {integrity: sha512-a39t9ApHNx2L4+HBnQKqxxHNs1r7KF+Intd8Q/g1bUh6q0WIp9voPXJ/x0j+ZL45KF1pJd9+q2jLIRMfvEshkA==} + promisify-any@2.0.1: + resolution: {integrity: sha512-pVaGouFbTVxqpVJ+T5A15olNJDASAZHYq5cXz6mWdr6/X34mVWiG9MSdzHTcVBCv4aqBP7wGspi7BUSRbEmhsw==} + engines: {node: '>=0.10.0'} + prop-types@15.8.1: resolution: {integrity: sha512-oj87CgZICdulUohogVAR7AjlC0327U4el4L6eAvOqCeudMDVU0NThNaV+b9Df4dXgSP1gXMTnPdhfe/2qDH5cg==} @@ -1710,6 +1802,9 @@ packages: resolution: {integrity: sha512-AURm5f0jYEOydBj7VQlVvDrjeFgthDdEF5H1dP+6mNpoXOMo1quQqJ4wvJDyRZ9+pO3kGWoOdmV08cSv2aJV6Q==} engines: {node: '>=0.4'} + safe-buffer@5.1.2: + resolution: {integrity: sha512-Gd2UZBJDkXlY7GbJxfsE8/nvKkUEU1G38c1siN6QP6a9PT9MmHB8GnpscSmMJSoF8LOIrt8ud/wPtojys4G6+g==} + safe-buffer@5.2.1: resolution: {integrity: 
sha512-rp3So07KcdmmKbGvgaNxQSJr7bGVSVk5S9Eq1F+ppbRo70+YeaDxkw5Dd8NPN+GD6bjnYm2VuPuCXmpuYvmCXQ==} @@ -1819,6 +1914,10 @@ packages: resolution: {integrity: sha512-UcjcJOWknrNkF6PLX83qcHM6KHgVKNkV62Y8a5uYDVv9ydGQVwAHMKqHdJje1VTWpljG0WYpCDhrCdAOYH4TWg==} engines: {node: '>= 10.x'} + statuses@1.5.0: + resolution: {integrity: sha512-OpZ3zP+jT1PI7I8nemJX4AKmAX070ZkYPVWV/AaKTJl+tXCTGyVdC1a4SL8RUQYEwk/f34ZX8UTykN68FwrqAA==} + engines: {node: '>= 0.6'} + statuses@2.0.1: resolution: {integrity: sha512-RwNA9Z/7PrK06rYLIzFMlaF+l73iwpzsqRIFgbMLbTcLD6cOao82TaWefPXQvB2fOC4AjuYSEndS7N/mTCbkdQ==} engines: {node: '>= 0.8'} @@ -1862,6 +1961,10 @@ packages: resolution: {integrity: sha512-iq6eVVI64nQQTRYq2KtEg2d2uU7LElhTJwsH4YzIHZshxlgZms/wIc4VoDQTlG/IvVIrBKG06CrZnp0qv7hkcQ==} engines: {node: '>=12'} + strip-final-newline@3.0.0: + resolution: {integrity: sha512-dOESqjYr96iWYylGObzd39EuNTa5VJxyvVAEm5Jnh7KGo75V43Hk1odPQkNDyXNmUR6k+gEiDVXnjB8HJ3crXw==} + engines: {node: '>=12'} + strip-json-comments@3.1.1: resolution: {integrity: sha512-6fPc+R4ihwqP6N/aIv2f1gMH8lOVtWQHoqC4yK6oSDVVocumAsfCqjkXnqiYMhmMwS/mEHLp7Vehlt3ql6lEig==} engines: {node: '>=8'} @@ -1923,6 +2026,10 @@ packages: resolution: {integrity: sha512-XleUoc9uwGXqjWwXaUTZAmzMcFZ5858QA2vvx1Ur5xIcixXIP+8LnFDgRplU30us6teqdlskFfu+ae4K79Ooew==} engines: {node: '>= 0.8.0'} + type-is@1.6.18: + resolution: {integrity: sha512-TkRKr9sUTxEH8MdfuCSP7VizJyzRNMjj2J2do2Jr3Kym598JVdEksuzPQCnlFPW4ky9Q+iA+ma9BGm06XQBy8g==} + engines: {node: '>= 0.6'} + type-is@2.0.1: resolution: {integrity: sha512-OZs6gsjF4vMp32qrCbiVSkrFmXtG/AZhY3t0iAMrMBiAZyV9oALtXO8hsrHbMXF9x6L3grlFuwW2oAz7cav+Gw==} engines: {node: '>= 0.6'} @@ -2060,7 +2167,6 @@ snapshots: '@ai-sdk/provider': 1.1.3 '@ai-sdk/provider-utils': 2.2.8(zod@3.25.67) zod: 3.25.67 - optional: true '@ai-sdk/azure@1.3.23(zod@3.25.67)': dependencies: @@ -2119,7 +2225,6 @@ snapshots: '@ai-sdk/provider': 1.1.3 '@ai-sdk/provider-utils': 2.2.8(zod@3.25.67) zod: 3.25.67 - optional: true 
'@ai-sdk/perplexity@1.1.9(zod@3.25.67)': dependencies: @@ -2645,8 +2750,18 @@ snapshots: base64-js@1.5.1: {} + basic-auth@0.0.1: {} + + basic-auth@2.0.1: + dependencies: + safe-buffer: 5.1.2 + bignumber.js@9.3.0: {} + bluebird@2.11.0: {} + + bluebird@3.7.2: {} + body-parser@2.2.0: dependencies: bytes: 3.1.2 @@ -2713,6 +2828,13 @@ snapshots: slice-ansi: 5.0.0 string-width: 7.2.0 + co-bluebird@1.1.0: + dependencies: + bluebird: 2.11.0 + co-use: 1.1.0 + + co-use@1.1.0: {} + color-convert@2.0.1: dependencies: color-name: 1.1.4 @@ -2725,6 +2847,8 @@ snapshots: dependencies: delayed-stream: 1.0.0 + commander@11.1.0: {} + commander@14.0.0: {} concat-map@0.0.1: {} @@ -3062,6 +3186,18 @@ snapshots: dependencies: eventsource-parser: 3.0.3 + execa@8.0.1: + dependencies: + cross-spawn: 7.0.6 + get-stream: 8.0.1 + human-signals: 5.0.0 + is-stream: 3.0.0 + merge-stream: 2.0.0 + npm-run-path: 5.3.0 + onetime: 6.0.0 + signal-exit: 4.1.0 + strip-final-newline: 3.0.0 + express-rate-limit@7.5.1(express@5.1.0): dependencies: express: 5.1.0 @@ -3244,6 +3380,8 @@ snapshots: dunder-proto: 1.0.1 es-object-atoms: 1.1.1 + get-stream@8.0.1: {} + get-symbol-description@1.1.0: dependencies: call-bound: 1.0.4 @@ -3345,6 +3483,8 @@ snapshots: transitivePeerDependencies: - supports-color + human-signals@5.0.0: {} + humanize-ms@1.2.1: dependencies: ms: 2.1.3 @@ -3442,6 +3582,8 @@ snapshots: has-tostringtag: 1.0.2 safe-regex-test: 1.1.0 + is-generator@1.0.3: {} + is-glob@4.0.3: dependencies: is-extglob: 2.1.1 @@ -3474,6 +3616,8 @@ snapshots: is-stream@2.0.1: {} + is-stream@3.0.0: {} + is-string@1.1.1: dependencies: call-bound: 1.0.4 @@ -3598,6 +3742,8 @@ snapshots: lodash.merge@4.6.2: {} + lodash@4.17.19: {} + log-update@6.1.0: dependencies: ansi-escapes: 7.0.0 @@ -3612,10 +3758,30 @@ snapshots: math-intrinsics@1.1.0: {} + mcpvals@0.0.1(react@19.1.0): + dependencies: + '@ai-sdk/anthropic': 1.2.12(zod@3.25.67) + '@ai-sdk/openai': 1.3.22(zod@3.25.67) + '@modelcontextprotocol/sdk': 1.13.1 + ai: 
4.3.16(react@19.1.0)(zod@3.25.67) + chalk: 5.4.1 + commander: 11.1.0 + execa: 8.0.1 + node-oauth2-server: 2.4.0 + oauth2-server: 3.1.1 + zod: 3.25.67 + transitivePeerDependencies: + - react + - supports-color + + media-typer@0.3.0: {} + media-typer@1.1.0: {} merge-descriptors@2.0.0: {} + merge-stream@2.0.0: {} + merge2@1.4.1: {} micromatch@4.0.8: @@ -3635,6 +3801,8 @@ snapshots: dependencies: mime-db: 1.54.0 + mimic-fn@4.0.0: {} + mimic-function@5.0.1: {} minimatch@3.1.2: @@ -3663,6 +3831,23 @@ snapshots: dependencies: whatwg-url: 5.0.0 + node-oauth2-server@2.4.0: + dependencies: + basic-auth: 0.0.1 + + npm-run-path@5.3.0: + dependencies: + path-key: 4.0.0 + + oauth2-server@3.1.1: + dependencies: + basic-auth: 2.0.1 + bluebird: 3.7.2 + lodash: 4.17.19 + promisify-any: 2.0.1 + statuses: 1.5.0 + type-is: 1.6.18 + object-assign@4.1.1: {} object-inspect@1.13.4: {} @@ -3718,6 +3903,10 @@ snapshots: dependencies: wrappy: 1.0.2 + onetime@6.0.0: + dependencies: + mimic-fn: 4.0.0 + onetime@7.0.0: dependencies: mimic-function: 5.0.1 @@ -3775,6 +3964,8 @@ snapshots: path-key@3.1.1: {} + path-key@4.0.0: {} + path-parse@1.0.7: {} path-to-regexp@8.2.0: {} @@ -3839,6 +4030,12 @@ snapshots: process-warning@5.0.0: {} + promisify-any@2.0.1: + dependencies: + bluebird: 2.11.0 + co-bluebird: 1.1.0 + is-generator: 1.0.3 + prop-types@15.8.1: dependencies: loose-envify: 1.4.0 @@ -3951,6 +4148,8 @@ snapshots: has-symbols: 1.1.0 isarray: 2.0.5 + safe-buffer@5.1.2: {} + safe-buffer@5.2.1: {} safe-push-apply@1.0.0: @@ -4088,6 +4287,8 @@ snapshots: split2@4.2.0: {} + statuses@1.5.0: {} + statuses@2.0.1: {} statuses@2.0.2: {} @@ -4153,6 +4354,8 @@ snapshots: dependencies: ansi-regex: 6.1.0 + strip-final-newline@3.0.0: {} + strip-json-comments@3.1.1: {} supports-color@7.2.0: @@ -4206,6 +4409,11 @@ snapshots: dependencies: prelude-ls: 1.2.1 + type-is@1.6.18: + dependencies: + media-typer: 0.3.0 + mime-types: 2.1.35 + type-is@2.0.1: dependencies: content-type: 1.0.5 diff --git 
a/pnpm-workspace.yaml b/pnpm-workspace.yaml index 6bdb532..33dd6e3 100644 --- a/pnpm-workspace.yaml +++ b/pnpm-workspace.yaml @@ -1,2 +1,2 @@ packages: - - '.' \ No newline at end of file + - . \ No newline at end of file diff --git a/src/tools/multiSession.ts b/src/tools/multiSession.ts index 4428dc6..624501c 100644 --- a/src/tools/multiSession.ts +++ b/src/tools/multiSession.ts @@ -213,7 +213,7 @@ export const closeSessionTool = defineTool({ content: [ { type: "text", - text: `Closed session ${sessionId}`, + text: `closed session ${sessionId}`, }, ], }), diff --git a/src/tools/session.ts b/src/tools/session.ts index c5c4e6a..6aff800 100644 --- a/src/tools/session.ts +++ b/src/tools/session.ts @@ -196,7 +196,7 @@ async function handleCloseSession(context: Context): Promise { } if (stagehandClosedSuccessfully) { - let successMessage = `Browserbase session (${previousSessionId || "default"}) closed successfully via Stagehand. Context reset to default.`; + let successMessage = `Browserbase session (${previousSessionId || "default"}) closed successfully via Stagehand. Context reset to default. session closed`; if (browserbaseSessionId && previousSessionId !== defaultSessionId) { successMessage += ` View replay at https://www.browserbase.com/sessions/${browserbaseSessionId}`; } diff --git a/tests/.gitkeep b/tests/.gitkeep deleted file mode 100644 index e69de29..0000000