number[] };
}
### Step 2: Implement Tool-Based Builders
Each tool corresponds to a schema segment. Validation occurs at the tool boundary, providing immediate feedback to the model.
```typescript
/**
 * Holds extraction results outside the agent's conversation history.
 *
 * Each public method backs one tool: it validates its arguments, mutates the
 * private state on success, and returns a human-readable `Success:` /
 * `Error:` string that is fed back to the model. Methods never throw for
 * invalid input; they report the problem in the returned string instead.
 */
class StateAccumulator {
  /** Step indices that `markStepComplete` accepts and reports as remaining. */
  private static readonly PIPELINE_STEPS: readonly number[] = [1, 2, 3, 4];

  /** Categories accepted by `recordFinancial`. */
  private static readonly FINANCIAL_CATEGORIES: readonly string[] = [
    'revenue',
    'expense',
    'liability',
    'asset'
  ];

  private state: ExtractionState;

  constructor() {
    this.state = {
      entities: [],
      timeline: [],
      financials: [],
      metadata: { status: 'processing', stepsCompleted: [] }
    };
  }

  /**
   * Adds an entity and assigns it the next sequential `ent_<n>` identifier.
   *
   * @param name - Display name; surrounding whitespace is stripped before storing.
   * @param type - Whether the entity is a person or an organization.
   * @returns A success message containing the new ID, or an error message when
   *   the name is empty after trimming.
   */
  registerEntity(name: string, type: 'individual' | 'organization'): string {
    // Store the same normalized value that was validated, so padded input
    // cannot produce entries that differ only by whitespace.
    const cleanName = name.trim();
    if (!cleanName) return 'Error: entity name cannot be empty.';
    const id = `ent_${this.state.entities.length + 1}`;
    this.state.entities.push({ id, name: cleanName, type });
    return `Success: Registered ${type} "${cleanName}" with ID ${id}.`;
  }

  /**
   * Appends an event to the timeline.
   *
   * @param timestamp - Calendar date in `YYYY-MM-DD` form; must be a real date.
   * @param description - Free-text description of the event (stored as given).
   * @param sourceRef - Non-blank reference to where the event was found.
   * @returns A success message with the running event count, or an error
   *   message describing which argument was rejected.
   */
  logEvent(timestamp: string, description: string, sourceRef: string): string {
    if (!/^\d{4}-\d{2}-\d{2}$/.test(timestamp)) {
      return 'Error: timestamp must follow YYYY-MM-DD format.';
    }
    // The pattern alone lets through impossible dates such as 2024-02-30.
    if (!StateAccumulator.isCalendarDate(timestamp)) {
      return `Error: timestamp ${timestamp} is not a valid calendar date.`;
    }
    if (!sourceRef || !sourceRef.trim()) {
      return 'Error: sourceRef is required for auditability.';
    }
    this.state.timeline.push({ timestamp, description, sourceRef: sourceRef.trim() });
    return `Success: Logged event on ${timestamp}. Total events: ${this.state.timeline.length}.`;
  }

  /**
   * Appends a monetary value to the financial ledger.
   *
   * @param category - One of `revenue`, `expense`, `liability`, `asset`.
   * @param amount - Strictly positive, finite amount.
   * @param currency - Currency label stored with the entry.
   * @returns A success message including the running total of all entries
   *   recorded in the same currency, or an error message.
   */
  recordFinancial(category: string, amount: number, currency: string): string {
    const categories = StateAccumulator.FINANCIAL_CATEGORIES;
    if (!categories.includes(category)) {
      return `Error: invalid category. Must be one of: ${categories.join(', ')}`;
    }
    // `NaN <= 0` is false, so non-finite values need their own check.
    if (!Number.isFinite(amount)) return 'Error: amount must be a finite number.';
    if (amount <= 0) return 'Error: amount must be positive.';
    this.state.financials.push({ category, amount, currency, note: '' });

    // Only add up entries in the same currency; a total labelled with one
    // currency must not include amounts recorded in another.
    let total = 0;
    for (const entry of this.state.financials) {
      if (entry.currency === currency) total += entry.amount;
    }
    return `Success: Recorded ${category} (${currency} ${amount}). Running total: ${currency} ${total.toFixed(2)}.`;
  }

  /**
   * Marks a pipeline step as done. Repeating a step is a no-op.
   *
   * @param stepIndex - One of the known pipeline steps (1 through 4).
   * @returns A message listing the steps still remaining, or an error message
   *   when `stepIndex` is not a known step.
   */
  markStepComplete(stepIndex: number): string {
    const steps = StateAccumulator.PIPELINE_STEPS;
    // Reject unknown steps so the completed list can never outgrow the pipeline.
    if (!steps.includes(stepIndex)) {
      return `Error: stepIndex must be one of: ${steps.join(', ')}.`;
    }
    const completed = this.state.metadata.stepsCompleted;
    if (!completed.includes(stepIndex)) {
      completed.push(stepIndex);
    }
    const remaining = steps.filter(s => !completed.includes(s));
    return `Step ${stepIndex} complete. Remaining: ${remaining.length > 0 ? remaining.join(', ') : 'None'}.`;
  }

  /**
   * Returns a snapshot of the accumulated state.
   *
   * The arrays, their entries, and the metadata object are copied, so
   * modifying the snapshot does not alter the accumulator.
   */
  getState(): ExtractionState {
    const { entities, timeline, financials, metadata } = this.state;
    return {
      ...this.state,
      entities: entities.map(entity => ({ ...entity })),
      timeline: timeline.map(event => ({ ...event })),
      financials: financials.map(entry => ({ ...entry })),
      metadata: { ...metadata, stepsCompleted: [...metadata.stepsCompleted] }
    };
  }

  /**
   * Checks that a `YYYY-MM-DD` string names a real calendar date by parsing it
   * as UTC midnight and confirming it formats back to the same date.
   */
  private static isCalendarDate(isoDate: string): boolean {
    const parsed = new Date(`${isoDate}T00:00:00Z`);
    // Engines may either reject an impossible day or roll it into the next
    // month; the round-trip comparison catches both outcomes.
    return !Number.isNaN(parsed.getTime()) && parsed.toISOString().slice(0, 10) === isoDate;
  }
}
```
### Step 3: Decouple Read and Write Operations
Agents naturally interleave information retrieval with state mutation. By separating tools, you prevent the model from conflating exploration with serialization.
// Tool catalogue exposed to the agent, keyed by tool name with its description.
// Entries keep their written order in the resulting array: exploration tools
// first, then the accumulator's write tools.
const agentTools = Object.entries({
  // Read/Exploration
  fetch_document_section: 'Retrieve specific pages from source files',
  query_external_registry: 'Cross-reference entity data with public databases',
  // Write/Accumulation (Builder methods)
  register_entity: 'Add a person or organization to the extraction state',
  log_event: 'Record a chronological occurrence with source attribution',
  record_financial: 'Append monetary values to the financial ledger',
  mark_step_complete: 'Track pipeline progression'
}).map(([name, description]) => ({ name, description }));
### Step 4: Context Compression with State Injection
Because the accumulator lives externally, you can aggressively truncate conversation history. When compressing, serialize a compact summary of the current state and inject it as a system message immediately after the system prompt. This preserves awareness of the extracted data while freeing tokens for reasoning.
/**
 * Shrinks the conversation to the system prompt, a fresh state summary, and
 * the most recent messages.
 *
 * @param agentMessages - Conversation so far; the first element is treated as
 *   the system prompt. The input array is not modified.
 * @param accumulator - Source of the state counts written into the summary.
 * @param keepRecent - How many trailing messages to retain (default 3).
 *   Values that are not positive retain none.
 * @returns A new array: `[systemPrompt, summaryMessage, ...recentMessages]`,
 *   or just `[summaryMessage]` when `agentMessages` is empty.
 */
function compressContextWithState(
  agentMessages: any[],
  accumulator: StateAccumulator,
  keepRecent: number = 3
): any[] {
  const summaryMarker = '[STATE SUMMARY]';
  const stateSnapshot = accumulator.getState();
  // Leading and trailing empty items produce the surrounding newlines.
  const summary = [
    '',
    summaryMarker,
    `Entities: ${stateSnapshot.entities.length}`,
    `Events: ${stateSnapshot.timeline.length}`,
    `Financials: ${stateSnapshot.financials.length}`,
    `Progress: ${stateSnapshot.metadata.stepsCompleted.length}/4 steps`,
    ''
  ].join('\n');
  const summaryMessage = { role: 'system', content: summary };

  // With no history there is no system prompt to keep.
  if (agentMessages.length === 0) return [summaryMessage];

  // Take the tail from the messages AFTER the system prompt; slicing the full
  // array would repeat the system prompt whenever the history is short.
  const [systemPrompt, ...history] = agentMessages;

  // Drop summaries injected by earlier compressions so only the fresh one remains.
  const liveHistory = history.filter(
    message =>
      !(
        message != null &&
        message.role === 'system' &&
        typeof message.content === 'string' &&
        message.content.includes(summaryMarker)
      )
  );

  // slice(-0) would return the whole array, so a zero tail is handled explicitly.
  const tailSize = Math.floor(keepRecent);
  const recent = tailSize > 0 ? liveHistory.slice(-tailSize) : [];

  // Keep system prompt, inject summary, retain the most recent messages for immediate context
  return [systemPrompt, summaryMessage, ...recent];
}
Architecture Rationale:
- External state prevents context window truncation from destroying partial work.
- Tool boundaries enforce validation before data enters the pipeline, eliminating post-generation parsing failures.
- Decoupled read/write tools align with how agents actually process information: explore, verify, then commit.
- State injection during compression maintains continuity without consuming tokens on raw conversation history.
Pitfall Guide
1. Overly Coarse Tool Payloads
Explanation: Passing nested objects or entire schema segments in a single tool call defeats the purpose of incremental accumulation. The model still faces the same generation pressure.
Fix: Decompose every schema node into atomic operations. One tool per data type. One tool per relationship.
2. Idempotency Blind Spots
Explanation: LLMs occasionally repeat tool calls or generate duplicate entries when context shifts. Without deduplication, your accumulator grows with redundant data.
Fix: Implement unique key generation (hashes, UUIDs, or composite keys) inside each tool. Check for existence before appending. Return a warning instead of duplicating.
3. State-Prompt Desynchronization
Explanation: After context compression, the model loses awareness of what has already been extracted, leading to redundant tool calls or skipped steps.
Fix: Always inject a structured state summary into the system prompt after compression. Include counts, completed steps, and pending validation flags.
4. Over-Engineered Validation Gates
Explanation: Rejecting inputs for minor formatting deviations (e.g., extra whitespace, case sensitivity) causes the model to loop or hallucinate workarounds.
Fix: Normalize at the tool boundary. Strip whitespace, coerce types, and accept flexible formats. Log normalization actions for auditability.
5. Context Compression Without State Injection
Explanation: Truncating history to save tokens while leaving the model blind to accumulated state causes regression. The agent restarts extraction from scratch.
Fix: Compression must always be paired with state serialization. The summary should be concise but contain enough metadata to guide next-step decisions.
6. Parallel Tool Call Race Conditions
Explanation: When agents execute multiple tool calls in parallel, race conditions can corrupt the accumulator if state mutations aren't thread-safe.
Fix: Use synchronous state updates or implement a locking mechanism. For parallel calls, batch mutations and apply them atomically after all tools return.
7. Tool Signature Drift
Explanation: Modifying tool parameters without updating the system prompt or validation logic causes silent failures or type mismatches.
Fix: Treat tool signatures as contract interfaces. Version them. Generate system prompts dynamically from tool definitions to ensure alignment.
Production Bundle
Action Checklist
Decision Matrix
| Scenario | Recommended Approach | Why | Cost Impact |
|---|---|---|---|
| Small, static forms (<5 fields) | Monolithic JSON with schema enforcement | Simpler implementation, low context usage | Low |
| Long documents, multi-step extraction | Incremental Tool Accumulation | Prevents context exhaustion, enables crash recovery | Medium (more tool calls) |
| Real-time streaming pipelines | Incremental Tool Accumulation + WebSocket sync | State updates propagate immediately to downstream systems | Medium-High |
| High-throughput batch processing | Incremental Tool Accumulation + Batched tool calls | Reduces API latency while preserving state integrity | Low-Medium |
Configuration Template
// agent-config.ts
import { AgentOrchestrator } from './orchestrator';
import { StateAccumulator } from './accumulator';
/**
 * Creates an AgentOrchestrator wired to a fresh StateAccumulator.
 *
 * Every call constructs its own accumulator. Each tool pairs a JSON-Schema
 * style `parameters` object with a handler that forwards the parsed arguments
 * to the matching accumulator method and returns that method's result string.
 *
 * NOTE(review): `compressContextWithState` is not among the imports visible
 * next to this function; confirm it is in scope where this template is used.
 */
export function createExtractionAgent() {
  const accumulator = new StateAccumulator();

  // Registers a person or organization.
  const registerEntityTool = {
    name: 'register_entity',
    parameters: {
      type: 'object',
      properties: {
        name: { type: 'string' },
        type: { type: 'string', enum: ['individual', 'organization'] }
      },
      required: ['name', 'type']
    },
    handler: (input: any) => accumulator.registerEntity(input.name, input.type)
  };

  // Logs a dated event; the schema pattern mirrors the YYYY-MM-DD check in logEvent.
  const logEventTool = {
    name: 'log_event',
    parameters: {
      type: 'object',
      properties: {
        timestamp: { type: 'string', pattern: '^\\d{4}-\\d{2}-\\d{2}$' },
        description: { type: 'string' },
        sourceRef: { type: 'string' }
      },
      required: ['timestamp', 'description', 'sourceRef']
    },
    handler: (input: any) =>
      accumulator.logEvent(input.timestamp, input.description, input.sourceRef)
  };

  // Records a monetary amount under one of the four ledger categories.
  const recordFinancialTool = {
    name: 'record_financial',
    parameters: {
      type: 'object',
      properties: {
        category: { type: 'string', enum: ['revenue', 'expense', 'liability', 'asset'] },
        amount: { type: 'number', minimum: 0.01 },
        currency: { type: 'string' }
      },
      required: ['category', 'amount', 'currency']
    },
    handler: (input: any) =>
      accumulator.recordFinancial(input.category, input.amount, input.currency)
  };

  // Tracks which of the pipeline steps (1 through 4) have been finished.
  const markStepCompleteTool = {
    name: 'mark_step_complete',
    parameters: {
      type: 'object',
      properties: { stepIndex: { type: 'integer', minimum: 1, maximum: 4 } },
      required: ['stepIndex']
    },
    handler: (input: any) => accumulator.markStepComplete(input.stepIndex)
  };

  const systemPrompt = `You are a data extraction agent. Use tools to build the extraction state incrementally. Validate inputs at tool boundaries. Track progress using mark_step_complete.`;

  return new AgentOrchestrator({
    model: 'claude-sonnet-4-20250514', // or openai/gpt-4o
    systemPrompt,
    tools: [registerEntityTool, logEventTool, recordFinancialTool, markStepCompleteTool],
    contextManager: {
      maxTokens: 120000,
      // Presumably the fraction of maxTokens that triggers onCompress — confirm
      // against AgentOrchestrator.
      compressThreshold: 0.85,
      // Kept inline so `messages` takes its type from the orchestrator's config.
      onCompress: (messages) => compressContextWithState(messages, accumulator)
    }
  });
}
Quick Start Guide
- Initialize the accumulator: Create a state object that mirrors your target schema. Keep it outside the agent's conversation history.
- Define atomic tools: Map each schema field to a dedicated tool. Implement validation and idempotency inside the handler.
- Wire the orchestrator: Attach tools to your agent framework. Configure context compression to inject state summaries when token usage exceeds 85% (the `compressThreshold: 0.85` used in the configuration template).
- Run extraction: Invoke the agent with your source document. Monitor tool calls and state mutations in real-time.
- Retrieve final output: Call `accumulator.getState()` after completion. The structured data is ready for downstream pipelines without parsing or cleanup.