from langgraph.checkpoint.memory import InMemorySaver

agent = create_agent(
    model,
    tools=[],
    checkpointer=InMemorySaver(),
)

# A thread_id is required so the checkpointer can persist state across invocations
config = {"configurable": {"thread_id": "1"}}

# First invocation
agent.invoke({"messages": [HumanMessage(content="I live in Sydney, Australia.")]}, config)

# Second invocation: the first message is persisted (Sydney location), so the model returns GMT+10 time
agent.invoke({"messages": [HumanMessage(content="What's my local time?")]}, config)
from langchain.agents import create_agent
from langchain.tools import tool
from langchain.messages import HumanMessage, AIMessage, ToolMessage
from agentevals.trajectory.match import create_trajectory_match_evaluator

@tool
def get_weather(city: str):
    """Get weather information for a city."""
    return f"It's 75 degrees and sunny in {city}."

agent = create_agent("gpt-4o", tools=[get_weather])

evaluator = create_trajectory_match_evaluator(
    trajectory_match_mode="strict",
)

def test_weather_tool_called_strict():
    result = agent.invoke({
        "messages": [HumanMessage(content="What's the weather in San Francisco?")]
    })
    reference_trajectory = [
        HumanMessage(content="What's the weather in San Francisco?"),
        AIMessage(content="", tool_calls=[
            {"id": "call_1", "name": "get_weather", "args": {"city": "San Francisco"}}
        ]),
        ToolMessage(content="It's 75 degrees and sunny in San Francisco.", tool_call_id="call_1"),
        AIMessage(content="The weather in San Francisco is 75 degrees and sunny."),
    ]
    evaluation = evaluator(
        outputs=result["messages"],
        reference_outputs=reference_trajectory,
    )
    # {
    #     'key': 'trajectory_strict_match',
    #     'score': True,
    #     'comment': None,
    # }
    assert evaluation["score"] is True
Unordered match
The unordered mode allows the same tool calls in any order, which is helpful when you want to verify that specific information was retrieved but don’t care about the sequence. For example, an agent might need to check both weather and events for a city, but the order doesn’t matter.
from langchain.agents import create_agent
from langchain.tools import tool
from langchain.messages import HumanMessage, AIMessage, ToolMessage
from agentevals.trajectory.match import create_trajectory_match_evaluator

@tool
def get_weather(city: str):
    """Get weather information for a city."""
    return f"It's 75 degrees and sunny in {city}."

@tool
def get_events(city: str):
    """Get events happening in a city."""
    return f"Concert at the park in {city} tonight."

agent = create_agent("gpt-4o", tools=[get_weather, get_events])

evaluator = create_trajectory_match_evaluator(
    trajectory_match_mode="unordered",
)

def test_multiple_tools_any_order():
    result = agent.invoke({
        "messages": [HumanMessage(content="What's happening in SF today?")]
    })
    # Reference shows tools called in different order than actual execution
    reference_trajectory = [
        HumanMessage(content="What's happening in SF today?"),
        AIMessage(content="", tool_calls=[
            {"id": "call_1", "name": "get_events", "args": {"city": "SF"}},
            {"id": "call_2", "name": "get_weather", "args": {"city": "SF"}},
        ]),
        ToolMessage(content="Concert at the park in SF tonight.", tool_call_id="call_1"),
        ToolMessage(content="It's 75 degrees and sunny in SF.", tool_call_id="call_2"),
        AIMessage(content="Today in SF: 75 degrees and sunny with a concert at the park tonight."),
    ]
    evaluation = evaluator(
        outputs=result["messages"],
        reference_outputs=reference_trajectory,
    )
    # {
    #     'key': 'trajectory_unordered_match',
    #     'score': True,
    # }
    assert evaluation["score"] is True
Subset and superset match
The superset and subset modes match partial trajectories. The superset mode verifies that the agent called at least the tools in the reference trajectory, allowing additional tool calls. The subset mode ensures the agent did not call any tools beyond those in the reference.
from langchain.agents import create_agent
from langchain.tools import tool
from langchain.messages import HumanMessage, AIMessage, ToolMessage
from agentevals.trajectory.match import create_trajectory_match_evaluator

@tool
def get_weather(city: str):
    """Get weather information for a city."""
    return f"It's 75 degrees and sunny in {city}."

@tool
def get_detailed_forecast(city: str):
    """Get detailed weather forecast for a city."""
    return f"Detailed forecast for {city}: sunny all week."

agent = create_agent("gpt-4o", tools=[get_weather, get_detailed_forecast])

evaluator = create_trajectory_match_evaluator(
    trajectory_match_mode="superset",
)

def test_agent_calls_required_tools_plus_extra():
    result = agent.invoke({
        "messages": [HumanMessage(content="What's the weather in Boston?")]
    })
    # Reference only requires get_weather, but agent may call additional tools
    reference_trajectory = [
        HumanMessage(content="What's the weather in Boston?"),
        AIMessage(content="", tool_calls=[
            {"id": "call_1", "name": "get_weather", "args": {"city": "Boston"}},
        ]),
        ToolMessage(content="It's 75 degrees and sunny in Boston.", tool_call_id="call_1"),
        AIMessage(content="The weather in Boston is 75 degrees and sunny."),
    ]
    evaluation = evaluator(
        outputs=result["messages"],
        reference_outputs=reference_trajectory,
    )
    # {
    #     'key': 'trajectory_superset_match',
    #     'score': True,
    #     'comment': None,
    # }
    assert evaluation["score"] is True
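The subset mode uses the same evaluator factory; a minimal sketch of the configuration, reusing the imports above:

# Subset mode: the agent must not call any tools beyond those in the reference trajectory.
subset_evaluator = create_trajectory_match_evaluator(
    trajectory_match_mode="subset",
)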
You can also set the tool_args_match_mode property and/or tool_args_match_overrides to customize how the evaluator determines equality between tool calls in the actual trajectory and the reference. By default, only calls to the same tool with identical arguments are considered equal. Visit the repository for more details.
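For example, a sketch of a customized evaluator. The "ignore" mode name used in the override is an assumption; consult the agentevals repository for the authoritative list of modes and override formats:

# A sketch, not the full option surface: match get_weather calls by tool name only,
# ignoring argument values, while other tools still require exact argument matches.
evaluator = create_trajectory_match_evaluator(
    trajectory_match_mode="unordered",
    tool_args_match_overrides={
        "get_weather": "ignore",  # assumed mode name; see the repository for options
    },
)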
You can also use an LLM to evaluate the agent’s execution path with the create_trajectory_llm_as_judge function. Unlike the trajectory match evaluators, it doesn’t require a reference trajectory, but one can be provided if available.
Without reference trajectory
from langchain.agents import create_agent
from langchain.tools import tool
from langchain.messages import HumanMessage, AIMessage, ToolMessage
from agentevals.trajectory.llm import create_trajectory_llm_as_judge, TRAJECTORY_ACCURACY_PROMPT

@tool
def get_weather(city: str):
    """Get weather information for a city."""
    return f"It's 75 degrees and sunny in {city}."

agent = create_agent("gpt-4o", tools=[get_weather])

evaluator = create_trajectory_llm_as_judge(
    model="openai:o3-mini",
    prompt=TRAJECTORY_ACCURACY_PROMPT,
)

def test_trajectory_quality():
    result = agent.invoke({
        "messages": [HumanMessage(content="What's the weather in Seattle?")]
    })
    evaluation = evaluator(
        outputs=result["messages"],
    )
    # {
    #     'key': 'trajectory_accuracy',
    #     'score': True,
    #     'comment': 'The provided agent trajectory is reasonable...'
    # }
    assert evaluation["score"] is True
With reference trajectory
If you have a reference trajectory, you can add an extra variable to your prompt and pass in the reference trajectory. Below, we use the prebuilt TRAJECTORY_ACCURACY_PROMPT_WITH_REFERENCE prompt and configure the reference_outputs variable:
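A minimal sketch, assuming the same get_weather agent and message imports as the previous example; the reference trajectory shown is illustrative:

from agentevals.trajectory.llm import (
    create_trajectory_llm_as_judge,
    TRAJECTORY_ACCURACY_PROMPT_WITH_REFERENCE,
)

evaluator = create_trajectory_llm_as_judge(
    model="openai:o3-mini",
    prompt=TRAJECTORY_ACCURACY_PROMPT_WITH_REFERENCE,
)

def test_trajectory_quality_with_reference():
    result = agent.invoke({
        "messages": [HumanMessage(content="What's the weather in Seattle?")]
    })
    # Illustrative reference trajectory for the judge to compare against
    reference_trajectory = [
        HumanMessage(content="What's the weather in Seattle?"),
        AIMessage(content="", tool_calls=[
            {"id": "call_1", "name": "get_weather", "args": {"city": "Seattle"}},
        ]),
        ToolMessage(content="It's 75 degrees and sunny in Seattle.", tool_call_id="call_1"),
        AIMessage(content="The weather in Seattle is 75 degrees and sunny."),
    ]
    evaluation = evaluator(
        outputs=result["messages"],
        reference_outputs=reference_trajectory,
    )
    assert evaluation["score"] is True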
Integration tests that call real LLM APIs can be slow and expensive, especially when run frequently in CI/CD pipelines. We recommend using a library that records HTTP requests and responses, then replays them on subsequent runs without making actual network calls. You can use vcrpy for this. If you use pytest, the pytest-recording plugin provides a simple way to enable this with minimal configuration. Requests and responses are recorded in cassettes, which are then used on subsequent runs to mock the real network calls. Set up your conftest.py file to filter out sensitive information from the cassettes:
conftest.py
import pytest

@pytest.fixture(scope="session")
def vcr_config():
    return {
        "filter_headers": [
            ("authorization", "XXXX"),
            ("x-api-key", "XXXX"),
            # ... other headers you want to mask
        ],
        "filter_query_parameters": [
            ("api_key", "XXXX"),
            ("key", "XXXX"),
        ],
    }
Then configure your project to recognise the vcr marker:
[pytest]
markers =
    vcr: record/replay HTTP via VCR
addopts = --record-mode=once
The --record-mode=once option records HTTP interactions on the first run and replays them on subsequent runs.
Now, simply decorate your tests with the vcr marker:
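A sketch of what that looks like, assuming the agent and evaluator defined in the trajectory examples above:

import pytest

@pytest.mark.vcr
def test_agent_trajectory():
    # On the first run this hits the real API and records a cassette;
    # subsequent runs replay the recorded responses instead.
    result = agent.invoke({
        "messages": [HumanMessage(content="What's the weather in Seattle?")]
    })
    evaluation = evaluator(outputs=result["messages"])
    assert evaluation["score"] is True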
The first time you run this test, your agent will make real network calls and pytest will generate a cassette file test_agent_trajectory.yaml in the tests/cassettes directory. Subsequent runs will use that cassette to mock the real network calls, provided the agent's requests don't change from the previous run. If they do change, the test will fail and you'll need to delete the cassette and rerun the test to record fresh interactions.
When you modify prompts, add new tools, or change expected trajectories, your saved cassettes will become outdated and your existing tests will fail. You should delete the corresponding cassette files and rerun the tests to record fresh interactions.