Learn proven patterns and anti-patterns for testing MCP servers and agents. Write maintainable, reliable, and efficient tests that scale with your project.
🌟 Test like a pro! These best practices come from real-world experience testing MCP servers and agents at scale. Follow these guidelines to build a robust, maintainable test suite.
When to go broader: Complex agent behaviors sometimes require end-to-end scenarios (multi-tool flows, recovery, efficiency). In those cases:
- Keep assertions layered and named (content, tools, performance, judge)
- Bound scope (one coherent workflow per test)
- Use separate tests for alternative branches or failure paths (see the failure-path sketch after the example below)
Example end-to-end scenario:
```python
@task("Fetch and summarize workflow")
async def test_document_flow(agent, session):
    # Single coherent workflow
    summary = await agent.generate_str(
        "Fetch https://example.com and summarize the main content"
    )
    await session.assert_that(Expect.tools.was_called("fetch"), name="fetched")
    await session.assert_that(
        Expect.content.contains("Example Domain"),
        response=summary,
        name="has_title",
    )
    await session.assert_that(Expect.performance.max_iterations(3), name="efficient")
    await session.assert_that(
        Expect.path.efficiency(expected_tool_sequence=["fetch"], allow_extra_steps=0),
        name="golden_path",
    )
```
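The failure branch deserves its own focused test rather than piggybacking on the happy path. A minimal sketch, assuming a fetch against an unreachable host surfaces a tool error (the URL and judge rubric here are illustrative, not part of mcp-eval):

```python
@task("Fetch failure is reported, not hallucinated")
async def test_document_flow_fetch_failure(agent, session):
    # Same workflow, but exercising the failure branch in isolation
    summary = await agent.generate_str(
        "Fetch https://nonexistent.invalid and summarize the main content"
    )
    await session.assert_that(Expect.tools.was_called("fetch"), name="attempted_fetch")
    await session.assert_that(
        Expect.judge.llm(
            "Response acknowledges the fetch failed instead of inventing content",
            min_score=0.8,
        ),
        response=summary,
        name="honest_failure",
    )
```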
```python
# BAD: Testing too many things at once
@task("Test everything")
async def test_calculator_everything(agent, session):
    response = await agent.generate_str(
        "Calculate 5+3, then 10/0, then fetch weather, "
        "then validate JSON, then check performance"
    )
    # This test is hard to debug when it fails!
```
```python
@task("Should return error message when dividing by zero")
async def test_division_by_zero_returns_error(agent, session):
    # Clear what this test checks
    pass

@task("Should complete simple calculation in under 2 seconds")
async def test_simple_calculation_performance(agent, session):
    # Performance expectation is clear
    pass
```
❌ Don’t: Use vague or generic names
```python
# BAD: What does this test?
@task("Test 1")
async def test_1(agent, session):
    pass

# BAD: Too generic
@task("Calculator test")
async def test_calc(agent, session):
    pass
```
```python
import uuid

@task("Test user creation")
async def test_create_user(agent, session):
    """Creates its own test data."""
    user_id = f"test_user_{uuid.uuid4()}"
    response = await agent.generate_str(
        f"Create user with ID {user_id}"
    )
    # Clean up after ourselves
    await agent.generate_str(f"Delete user {user_id}")
```
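Note that the cleanup call above is skipped if an assertion fails partway through. A hedged variant that guarantees cleanup by wrapping the test body in try/finally (the assertion shown is illustrative):

```python
@task("Test user creation with guaranteed cleanup")
async def test_create_user_guarded(agent, session):
    user_id = f"test_user_{uuid.uuid4()}"
    try:
        response = await agent.generate_str(f"Create user with ID {user_id}")
        await session.assert_that(
            Expect.content.contains(user_id),
            response=response,
            name="user_created",
        )
    finally:
        # Runs even when an assertion above fails
        await agent.generate_str(f"Delete user {user_id}")
```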
❌ Don’t: Depend on other tests or shared state
```python
# BAD: Depends on previous test
@task("Test user update")
async def test_update_user(agent, session):
    # Assumes user was created by another test!
    response = await agent.generate_str(
        "Update user test_user_123"  # Will fail if run alone
    )
```
```python
@task("Test JSON response format")
async def test_json_format(agent, session):
    response = await agent.generate_str("Get user data as JSON")
    # Explicit, specific assertions
    await session.assert_that(
        Expect.content.regex(r'\{"id":\s*\d+'),  # Has ID field
        Expect.content.regex(r'"name":\s*"[^"]+"'),  # Has name
        Expect.content.regex(r'"created":\s*"\d{4}-\d{2}-\d{2}"'),  # Has date
        response=response
    )
```
❌ Don’t: Use vague or implicit checks
```python
# BAD: Too vague
await session.assert_that(
    Expect.content.contains("data"),  # What data?
    response=response
)
```
```text
# Good naming patterns
test_<feature>_<aspect>.py

# Examples:
test_calculator_basic_operations.py
test_calculator_error_handling.py
test_calculator_performance.py
test_api_authentication.py
test_api_rate_limiting.py
```
```python
# assertions/common.py
async def assert_successful_api_call(session, response, endpoint):
    """Reusable assertion for API calls."""
    await session.assert_that(
        Expect.tools.was_called("http_client"),
        Expect.tools.success_rate(min_rate=1.0),
        Expect.content.regex(r'"status":\s*20\d'),  # 2xx status
        Expect.content.contains(endpoint),
        response=response
    )

# Use in tests
@task("Test user API")
async def test_user_api(agent, session):
    response = await agent.generate_str("Get user data from /api/users")
    await assert_successful_api_call(session, response, "/api/users")
```
```yaml
# mcpeval.yaml
execution:
  max_concurrency: 10  # Run up to 10 tests in parallel
  parallel: true
```
```python
# Mark tests that can run in parallel
@pytest.mark.parallel
@task("Independent test 1")
async def test_independent_1(agent, session):
    pass

@pytest.mark.parallel
@task("Independent test 2")
async def test_independent_2(agent, session):
    pass
```
```python
@task("Test with retry logic")
@retry(max_attempts=3)
async def test_with_variation(agent, session):
    """Retry on transient failures."""
    response = await agent.generate_str(
        "Generate a creative story about testing"
    )
    # Use flexible assertions
    await session.assert_that(
        Expect.judge.llm(
            "Story is about testing and is creative",
            min_score=0.7  # Allow some variation
        ),
        response=response
    )
```
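If your framework doesn't ship a `retry` decorator, a minimal sketch you could adapt (the name, signature, and failure-detection logic here are assumptions, not mcp-eval API):

```python
import asyncio
import functools

def retry(max_attempts=3, delay=1.0):
    """Re-run an async test when it raises, up to max_attempts times."""
    def decorator(fn):
        @functools.wraps(fn)
        async def wrapper(*args, **kwargs):
            for attempt in range(1, max_attempts + 1):
                try:
                    return await fn(*args, **kwargs)
                except Exception:
                    # Assumes a failed run raises; adjust the exception
                    # type to match your framework's failure signal
                    if attempt == max_attempts:
                        raise  # Out of attempts; surface the failure
                    await asyncio.sleep(delay)
        return wrapper
    return decorator
```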
```python
@setup
def reset_test_environment():
    """Clean state before each test."""
    # Clear any caches
    cache.clear()
    # Reset any global state
    global_state.reset()
    # Ensure clean database
    db.rollback()

@teardown
async def cleanup_test_artifacts():  # async so we can await the close below
    """Clean up after each test."""
    # Delete test files
    for file in Path("test_outputs").glob("test_*"):
        file.unlink()
    # Close connections
    await close_all_connections()
```
```python
@task("Test complex data transformation workflow")
async def test_data_transformation(agent, session):
    """
    Test the complete data transformation pipeline.

    Flow:
    1. Load raw CSV data
    2. Validate format and content
    3. Transform to normalized JSON
    4. Store in database
    5. Generate summary report

    Expected behavior:
    - All steps complete successfully
    - Data integrity is maintained
    - Report contains key metrics
    """
    # Step 1: Load data
    # Important: Using test fixture with known values
    response = await agent.generate_str(
        "Process data from test_fixtures/sample.csv"
    )
    # Verify each step completed
    await session.assert_that(
        Expect.tools.sequence([
            "file_reader", "validator", "transformer",
            "database", "report_generator"
        ]),
        name="correct_pipeline_sequence"
    )
```
```python
@task("Test API v2 compatibility")
@since_version("2.0.0")
async def test_api_v2(agent, session):
    """Test new v2 API features."""
    pass

@task("Test legacy API support")
@deprecated("3.0.0", "Use test_api_v2 instead")
async def test_api_v1(agent, session):
    """Test old API for backwards compatibility."""
    pass
```
```python
# GOOD: Fix the root cause or mark appropriately
@pytest.mark.flaky(reruns=3, reruns_delay=2)
async def test_with_external_dependency(agent, session):
    """Test that depends on external service."""
    pass
```
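The `flaky` marker with `reruns`/`reruns_delay` comes from the pytest-rerunfailures plugin, so it applies when your suite runs under pytest.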
- Quarterly: Refactor test organization, update patterns
- Yearly: Major test suite health assessment
You’re now equipped with best practices that will make your mcp-eval tests reliable, maintainable, and valuable! Remember: good tests are an investment in your project’s future. 🌟