rds=100)
assert len(result.split()) <= 100
assert len(provider.calls) == 1
def test_summarizer_retries_on_empty_response():
    """An empty first reply should make the summarizer ask the provider again."""
    scripted_replies = ["", "Here is the summary: short and done."]
    fake = FakeProvider(scripted_replies)

    summary = summarize_document("some text", client=fake)

    # The usable text comes from the second scripted reply, so two calls are expected.
    assert "short and done" in summary
    assert len(fake.calls) == 2
Your agent function needs to accept `client` as a parameter. If it creates the client internally, you cannot swap in the fake. Dependency injection is not optional here.
## [](#snapshot-testing-with-agentsnap)Snapshot Testing with agentsnap
The FakeProvider tests behavior. agentsnap tests regression: you run the real agent once, capture the full trace of tool calls and outputs, then replay that trace forever without hitting the API again.
pip install agentsnap
from agentsnap import Snap, record_session, replay_session
Step 1: record once (costs API tokens, run locally)
AGENTSNAP_RECORD=1 pytest tests/snapshots/test_research_agent.py
Step 2: replay forever in CI (free, deterministic)
tests/snapshots/test_research_agent.py
import pytest
from agentsnap import Snap
from myagent.research import run_research_agent
SNAP = Snap("snapshots/research_agent_v1.json")
@SNAP.test
def test_research_agent_finds_sources(snap_client):
    """Ask the research agent a fixed question and check its answer and sources.

    Record mode runs the real agent and saves the trace; replay mode feeds
    the saved tool outputs back and checks that the final output matches.
    """
    question = "What is the capital of France?"

    outcome = run_research_agent(query=question, client=snap_client)

    assert "Paris" in outcome.answer
    # At least one source must accompany the answer.
    assert len(outcome.sources) >= 1
The snapshot file stores:
- model inputs (messages, tools, temperature)
- tool call sequences
- tool outputs
- final model response
On replay, snap_client intercepts tool calls and feeds back the recorded outputs.
The snapshot file goes in version control. When you update the agent, you regenerate the snapshot on your machine and commit it. CI uses the committed snapshot. You spend tokens once, not on every push.
## [](#retry-logic-testing-with-fakeprovider-llmretrypy)Retry Logic Testing with FakeProvider + llm-retry-py
Testing retry logic against a real API is painful. The rate limit has to actually fire. With FakeProvider, you control exactly when it fails.
from fake_provider import FakeProvider
from llm_retry import with_retry, RateLimitError
from myagent.caller import call_with_retry
def test_retries_on_rate_limit():
    """Check that ``call_with_retry`` retries after a single rate-limit error.

    ``provider.completions.create`` is replaced with a stub that raises
    ``RateLimitError`` on its first invocation and returns a completion on
    every later one, so the test decides exactly when the failure happens.
    The test passes when the retried call's text is returned and the stub
    was invoked exactly twice.
    """
    call_count = 0

    def side_effect(**kwargs):
        # Fail only the first attempt; every later attempt succeeds.
        nonlocal call_count
        call_count += 1
        if call_count == 1:
            raise RateLimitError("rate limited")
        # NOTE(review): FakeCompletion is not imported in the visible snippet;
        # confirm which module provides it.
        return FakeCompletion("Success after retry")

    provider = FakeProvider([])
    provider.completions.create = side_effect

    result = call_with_retry(client=provider, prompt="hello")

    assert result == "Success after retry"
    assert call_count == 2
## [](#what-these-patterns-do-not-do)What These Patterns Do NOT Do
FakeProvider and agentsnap cannot test reasoning quality. If your agent produces bad output because of a subtle prompt issue, no amount of unit testing will catch it. They also cannot catch emergent failures: behaviors that only appear when the model sees a specific combination of context.
Snapshot tests go stale. When you change tools or update the system prompt, old snapshots no longer reflect real behavior. You need to regenerate them. That is not a bug, that is the workflow.
HTTP-level mocking libraries (httpretty, responses) work for tools that make HTTP calls directly. They do not work well for structured tool-use flows where the model decides which tool to call.
## [](#structuring-the-test-suite)Structuring the Test Suite
Three tiers:
**Smoke tests** run on every commit. Use FakeProvider. Fast, no API tokens, covers happy paths and error branches.
**Regression tests** run on every PR. Use agentsnap. Replay real traces, confirm the agent still produces the same output structure. Regenerate snapshots when behavior intentionally changes.
**Integration tests** run before release or on a schedule. Use the real API with a dedicated low-quota key. Cover end-to-end flows that snapshots cannot capture.
Keep the tiers separate. Do not let integration tests sneak into the smoke tier. Mark them explicitly.
pyproject.toml
[tool.pytest.ini_options]
markers = [
"smoke: fast, no API calls",
"regression: agentsnap replay, no API calls",
"integration: real API, slow, run before release",
]
Run smoke only on every commit:
pytest -m smoke
Run smoke + regression on every PR (and locally before pushing):
pytest -m "smoke or regression"
Run all before release:
pytest -m "smoke or regression or integration"
## [](#quickstart-snippet)Quick-Start Snippet
pip install agentsnap llm-retry-py
Create your FakeProvider (copy the pattern above, it has no dependencies)
Write smoke tests with FakeProvider first
When smoke tests pass, record a snapshot:
AGENTSNAP_RECORD=1 pytest tests/snapshots/ -m regression
Commit the snapshot file
git add snapshots/
git commit -m "add agentsnap regression snapshot for research agent"
CI runs replay automatically (no AGENTSNAP_RECORD set)
pytest -m "smoke or regression"
## [](#related-libraries)Related Libraries
| Library | What It Does |
| --- | --- |
| [agentsnap](https://github.com/MukundaKatta/agentsnap) | Record and replay tool-call traces for regression tests |
| [agenttrace](https://github.com/MukundaKatta/agenttrace) | Structured trace export (cost, latency, tool calls per run) |
| [agentvet](https://github.com/MukundaKatta/agentvet) | Static checks for agent configuration before deploy |
| [llm-retry-py](https://pypi.org/project/llm-retry-py/) | Exponential backoff retry for LLM calls |
| [prompt-eval-rubric](https://pypi.org/project/prompt-eval-rubric/) | 0.0-1.0 scoring rubrics for output quality checks |
| [llm-output-validator](https://pypi.org/project/llm-output-validator/) | Rule-based string validation for model outputs |
## [](#whats-next)What's Next
Once you have smoke and regression tests running in CI, the next gap is output quality. Snapshot tests tell you the structure did not change. They do not tell you the content is good.
That is where prompt-eval-rubric and llm-output-validator fill in. You define a rubric (relevance, completeness, no hallucinated citations) and score outputs against it. You can run rubric checks against your agentsnap replays without touching the API.
The pattern after that is A/B prompt testing: keep the snapshot, change the prompt template, regenerate the snapshot, diff the outputs. That workflow is covered in post 114 on prompt engineering hygiene.