tState {
messages: string[];
executionPlan: string[];
toolResults: Record<string, unknown>;
metadata: { step: number; status: 'running' | 'paused' | 'completed' | 'failed' };
}
/**
 * Channel definitions for every top-level key of `AgentState`.
 *
 * Each channel carries its own name, the value it starts from, and the rule
 * used to fold an incoming update into the current value.
 */
const stateSchema: Record<keyof AgentState, StateChannel<any>> = {
  // Conversation history only ever grows: new entries go after existing ones.
  messages: {
    name: 'messages',
    initialValue: [],
    reducer(previous, update) {
      return [...previous, ...update];
    }
  },

  // A new plan supersedes the old one entirely (overwrite on replanning).
  executionPlan: {
    name: 'executionPlan',
    initialValue: [],
    reducer(previous, update) {
      return update;
    }
  },

  // Tool outputs are shallow-merged; an incoming key wins over an existing one.
  toolResults: {
    name: 'toolResults',
    initialValue: {},
    reducer(previous, update) {
      return { ...previous, ...update };
    }
  },

  // Run metadata is shallow-merged so partial updates keep untouched fields.
  metadata: {
    name: 'metadata',
    initialValue: { step: 0, status: 'running' },
    reducer(previous, update) {
      return { ...previous, ...update };
    }
  }
};
**Why this matters:** Linear state accumulation causes context drift. Typed reducers ensure that message history appends, plans overwrite, and tool results merge predictably. This structure enables deterministic state reconstruction during recovery.
### Step 2: Implement Node-Boundary Checkpointing with Tree Versioning
Checkpoints must occur at execution node boundaries, not mid-stream. Each checkpoint references a parent, forming a version tree that supports branching and rollback.
```typescript
/**
 * A saved snapshot of agent state, linked to the checkpoint it was derived from
 * so that checkpoints form a version tree.
 */
interface Checkpoint {
// Unique identifier; the key the checkpoint is stored and loaded under.
id: string;
// Id of the checkpoint this one descends from; null for a checkpoint with no parent.
parentId: string | null;
// The agent state captured by this checkpoint.
state: AgentState;
// Creation time as produced by Date.now() (milliseconds since the Unix epoch).
timestamp: number;
// Identifier of the branch this checkpoint belongs to.
branchId: string;
}
/**
 * In-memory checkpoint store keyed by checkpoint id.
 *
 * Checkpoints are deep-copied both when saved and when loaded. Without the
 * copy, the map would hold a reference to the caller's live state object, and
 * any later mutation of that object would silently rewrite history. Checkpoint
 * state must therefore be structured-cloneable (plain data: no functions or
 * class instances with behavior).
 */
class CheckpointStore {
  private readonly storage: Map<string, Checkpoint> = new Map();

  /**
   * Stores an isolated copy of `checkpoint`, replacing any entry with the same id.
   * @throws if the checkpoint contains values `structuredClone` cannot copy.
   */
  async save(checkpoint: Checkpoint): Promise<void> {
    this.storage.set(checkpoint.id, structuredClone(checkpoint));
  }

  /**
   * Returns a fresh copy of the checkpoint stored under `id`, or `undefined`
   * when nothing is stored under that id. Mutating the result does not affect
   * the stored snapshot.
   */
  async load(id: string): Promise<Checkpoint | undefined> {
    const stored = this.storage.get(id);
    return stored === undefined ? undefined : structuredClone(stored);
  }

  /**
   * Creates and stores a new checkpoint that descends from `parentId` and
   * starts a new branch (fresh `branchId`).
   * @param parentId id of an existing checkpoint to fork from
   * @param newState state the new branch starts with
   * @returns the newly created checkpoint
   * @throws Error('Parent checkpoint not found') when `parentId` is unknown
   */
  async branchFrom(parentId: string, newState: AgentState): Promise<Checkpoint> {
    const parent = await this.load(parentId);
    if (!parent) throw new Error('Parent checkpoint not found');

    const newCheckpoint: Checkpoint = {
      id: crypto.randomUUID(),
      parentId,
      state: newState,
      timestamp: Date.now(),
      branchId: crypto.randomUUID()
    };
    await this.save(newCheckpoint);
    return newCheckpoint;
  }
}
**Why this matters:** Mid-stream checkpointing during LLM token generation creates inconsistent states. Node-boundary saves guarantee that each step completes or rolls back cleanly. The tree structure allows you to fork execution paths (e.g., retry with different tools, skip approval, adjust parameters) without losing historical context.
### Step 3: Build Interrupt/Pause Mechanism
Human-in-the-loop and external signals should reuse the checkpoint system. Pausing execution is simply saving a checkpoint and emitting a control signal.
// Signal an execution node returns alongside its next state to tell the runtime how to proceed.
type ControlSignal = 'CONTINUE' | 'PAUSE' | 'ABORT' | 'REPLAN';
/** One step of an agent run, executed as a unit by the orchestrator. */
interface ExecutionNode {
// Identifier of this node.
id: string;
// Receives the current state and resolves to the state after this step plus a control signal.
execute: (state: AgentState) => Promise<{ nextState: AgentState; signal: ControlSignal }>;
// When true, the orchestrator pauses after this node even if the node did not signal PAUSE.
requiresApproval?: boolean;
}
/**
 * Runs execution nodes in order and persists a checkpoint at every node
 * boundary. Checkpoints form a parent-linked chain: the root has
 * `parentId: null`, and each later checkpoint points at the one saved
 * immediately before it.
 */
class AgentOrchestrator {
  constructor(
    private store: CheckpointStore,
    private nodes: ExecutionNode[]
  ) {}

  /**
   * Executes the configured nodes starting from `initialState`.
   *
   * - `PAUSE` (or a node with `requiresApproval`) stops the run with status
   *   `'paused'` after checkpointing, yielding control to the caller.
   * - `ABORT` stops the run with status `'failed'` after checkpointing.
   * - `CONTINUE` and `REPLAN` proceed to the next node; this class has no
   *   replanning hook, so `REPLAN` is treated like `CONTINUE`.
   *
   * Neither `initialState` nor the state objects returned by nodes are
   * mutated; bookkeeping fields are written onto fresh copies.
   *
   * @param initialState state the run starts from
   * @param runId used as the `branchId` of every checkpoint of this run
   * @returns the state at the point the run completed, paused, or aborted
   */
  async run(initialState: AgentState, runId: string): Promise<AgentState> {
    let currentState = initialState;
    let currentCheckpointId = await this.saveCheckpoint(currentState, null, runId);

    for (const node of this.nodes) {
      // `step` is numeric: count node boundaries rather than storing the node id.
      const step = currentState.metadata.step + 1;
      const { nextState, signal } = await node.execute(currentState);

      const aborted = signal === 'ABORT';
      const paused = !aborted && (signal === 'PAUSE' || Boolean(node.requiresApproval));

      let status = nextState.metadata.status;
      if (aborted) status = 'failed';
      else if (paused) status = 'paused';

      // Copy instead of mutating: nodes commonly return `{ ...state }`, whose
      // nested `metadata` would otherwise alias earlier snapshots.
      currentState = { ...nextState, metadata: { ...nextState.metadata, step, status } };

      // The previous id must be passed as parent *before* it is replaced,
      // otherwise every checkpoint would list itself as its own parent.
      currentCheckpointId = await this.saveCheckpoint(currentState, currentCheckpointId, runId);

      if (aborted || paused) return currentState; // Yield control to runtime
    }

    // Persist the terminal status so recovery does not see a finished run as 'running'.
    currentState = { ...currentState, metadata: { ...currentState.metadata, status: 'completed' } };
    await this.saveCheckpoint(currentState, currentCheckpointId, runId);
    return currentState;
  }

  /** Saves `state` as a new checkpoint under `parentId` and returns the new checkpoint's id. */
  private async saveCheckpoint(state: AgentState, parentId: string | null, runId: string): Promise<string> {
    const id = crypto.randomUUID();
    await this.store.save({ id, parentId, state, timestamp: Date.now(), branchId: runId });
    return id;
  }
}
**Why this matters:** HITL is not a UI concern. By treating pauses as checkpoint saves, the runtime survives process restarts, container scaling events, and network partitions. The agent resumes exactly where it left off, with full context intact.
### Step 4: Abstract Persistence for Pluggable Backends
Development and production have different durability requirements. The storage layer must be swappable without changing execution logic.
/** Storage contract for checkpoints, so backends can be swapped without touching execution logic. */
interface PersistenceAdapter {
// Persists the given checkpoint.
save(checkpoint: Checkpoint): Promise<void>;
// Resolves to the checkpoint with this id, or undefined (presumably when none is stored — confirm per adapter).
load(id: string): Promise<Checkpoint | undefined>;
// Resolves to the checkpoints whose branchId equals the given branch id.
listByBranch(branchId: string): Promise<Checkpoint[]>;
}
// Example: Redis adapter for high-throughput production
/**
 * Stores each checkpoint as a JSON string under `cp:<id>` with a 24h TTL.
 *
 * Uses plain string commands (SET/GET/MGET): a checkpoint is one serialized
 * value, not a field map, so hash commands are the wrong fit. Calls follow the
 * ioredis command signatures.
 */
class RedisPersistenceAdapter implements PersistenceAdapter {
  private static readonly KEY_PREFIX = 'cp:';
  private static readonly TTL_SECONDS = 86400; // TTL for cost control
  private static readonly SCAN_BATCH = 100;

  constructor(private client: any) {} // ioredis or similar

  /** Writes the checkpoint and its expiry in a single atomic SET ... EX command. */
  async save(checkpoint: Checkpoint): Promise<void> {
    await this.client.set(
      RedisPersistenceAdapter.KEY_PREFIX + checkpoint.id,
      JSON.stringify(checkpoint),
      'EX',
      RedisPersistenceAdapter.TTL_SECONDS
    );
  }

  /** Returns the checkpoint stored under `id`, or `undefined` if it is absent or expired. */
  async load(id: string): Promise<Checkpoint | undefined> {
    const raw: string | null | undefined = await this.client.get(RedisPersistenceAdapter.KEY_PREFIX + id);
    return raw == null ? undefined : (JSON.parse(raw) as Checkpoint);
  }

  /**
   * Returns all stored checkpoints of `branchId`, oldest first.
   *
   * Iterates the keyspace with SCAN rather than KEYS, because KEYS blocks the
   * server for the whole keyspace walk. SCAN may report a key more than once,
   * so keys are de-duplicated before fetching.
   */
  async listByBranch(branchId: string): Promise<Checkpoint[]> {
    const keys = new Set<string>();
    let cursor = '0';
    do {
      const [nextCursor, batch]: [string | number, string[]] = await this.client.scan(
        cursor,
        'MATCH',
        `${RedisPersistenceAdapter.KEY_PREFIX}*`,
        'COUNT',
        RedisPersistenceAdapter.SCAN_BATCH
      );
      for (const key of batch) keys.add(key);
      cursor = String(nextCursor);
    } while (cursor !== '0');

    if (keys.size === 0) return [];

    const values: Array<string | null> = await this.client.mget([...keys]);
    const checkpoints: Checkpoint[] = [];
    for (const raw of values) {
      if (raw == null) continue; // key expired between SCAN and MGET
      const checkpoint = JSON.parse(raw) as Checkpoint;
      if (checkpoint.branchId === branchId) checkpoints.push(checkpoint);
    }
    // SCAN order is arbitrary; sort so callers get a stable, chronological list.
    return checkpoints.sort((a, b) => a.timestamp - b.timestamp);
  }
}
**Why this matters:** Hardcoding storage ties your agent to a specific infrastructure tier. Pluggable adapters let you start with in-memory or SQLite for local testing, then migrate to Redis or Postgres for production without rewriting orchestration logic.
Pitfall Guide
1. Monolithic State Storage
Explanation: Storing all context (messages, tool outputs, user preferences, business records) in a single JSON blob or conversation table.
Fix: Enforce strict state layering. Short-term execution state lives in checkpoints. Long-term memory resides in a namespaced vector or key-value store. Business data remains in domain systems and is accessed exclusively through permission-scoped tools.
2. Mid-Stream Checkpointing
Explanation: Attempting to save state while an LLM is still generating tokens or a tool is mid-execution.
Fix: Checkpoint only at node boundaries. If a model call fails, the entire node re-executes. This guarantees atomic state transitions and prevents partial context corruption.
3. Treating HITL as Frontend Logic
Explanation: Implementing human approval as a UI spinner or polling endpoint that loses state on page refresh or server restart.
Fix: Move HITL to the runtime. A pause signal triggers a checkpoint save and yields control. The frontend simply queries the runtime for pending approvals. Resume sends a CONTINUE signal with the checkpoint ID.
4. Ignoring Tool Idempotency
Explanation: Assuming that because checkpointing prevents re-runs, tools don't need idempotency guarantees.
Fix: Design every external tool call with idempotency keys. Checkpoints prevent duplicate execution in the same run, but network retries, manual replays, or branch merges can still trigger duplicate calls. Idempotency is the safety net.
5. Hardcoding Persistence Backends
Explanation: Tying checkpoint storage to a specific database driver or in-memory map without abstraction.
Fix: Define a PersistenceAdapter interface. Implement environment-specific adapters. Use configuration flags or dependency injection to swap backends. This prevents infrastructure lock-in and simplifies scaling.
6. Over-Relying on Model Self-Correction
Explanation: Expecting the LLM to detect and recover from tool failures, permission denials, or malformed outputs through prompt engineering alone.
Fix: Implement deterministic fallback chains at the runtime level. If a tool fails twice, route to a secondary tool or trigger a human pause. Let the model focus on reasoning, not infrastructure error handling.
7. Blurring Business Data with Agent Context
Explanation: Injecting raw business records (orders, customer profiles, financial data) directly into the agent's working memory.
Fix: Keep business data in domain systems. Agents access it through read-only tools with explicit permission checks. This prevents context pollution, reduces token costs, and maintains audit compliance.
Production Bundle
Action Checklist
Decision Matrix
| Scenario | Recommended Approach | Why | Cost Impact |
|---|---|---|---|
| Rapid Prototype / Internal Tool | In-memory or SQLite checkpoint store | Fast iteration, zero infrastructure overhead | Low (dev time only) |
| Enterprise SaaS with HITL | Postgres-backed checkpoint tree with TTL | ACID compliance, audit trails, reliable pause/resume | Medium (managed DB costs) |
| High-Throughput Batch Processing | Redis or DynamoDB with async checkpointing | Low latency, horizontal scaling, high concurrency | High (provisioned throughput) |
| Multi-Tenant Platform | Namespaced state channels + row-level security | Tenant isolation, compliance, predictable retrieval | Medium-High (schema complexity) |
Configuration Template
// runtime.config.ts
import { CheckpointStore } from './checkpoint-store';
import { RedisPersistenceAdapter } from './adapters/redis';
import { PostgresPersistenceAdapter } from './adapters/postgres';
/**
 * Builds the runtime configuration for the given environment.
 *
 * @param env deployment environment selecting the persistence backend and TTL
 * @returns the checkpoint store plus retry, TTL, branching, and HITL settings
 */
export function createRuntimeConfig(env: 'dev' | 'staging' | 'prod') {
  // Factories rather than instances: only the adapter for the requested
  // environment is constructed. Building all three eagerly would create
  // staging/prod database adapters even in a local dev process.
  const adapterFactories = {
    dev: () => new RedisPersistenceAdapter({ host: 'localhost', port: 6379 }),
    staging: () => new PostgresPersistenceAdapter({ connectionString: process.env.STAGING_DB_URL }),
    prod: () => new PostgresPersistenceAdapter({ connectionString: process.env.PROD_DB_URL })
  };

  return {
    checkpointStore: new CheckpointStore(adapterFactories[env]()),
    maxRetriesPerNode: 2,
    checkpointTTL: env === 'prod' ? 86400 : 3600, // seconds: 24h in prod, 1h elsewhere
    enableBranching: true,
    hitlTimeout: 1800 // 30 minutes
  };
}
Quick Start Guide
- Initialize the runtime: Import `createRuntimeConfig` and instantiate the orchestrator with your execution nodes.
- Define state schema: Map your workflow's data requirements to typed channels with appropriate reducers.
- Build execution nodes: Wrap each logical step (model call, tool invocation, conditional branch) in an `ExecutionNode` with explicit return signals.
- Deploy with checkpointing: Start the orchestrator. Simulate a failure mid-run, restart the process, and resume using the latest checkpoint ID. Verify state continuity and side-effect safety.
The gap between a working demo and a production agent isn't measured in model parameters or prompt tokens. It's measured in checkpoint reliability, state isolation, and execution durability. Build the runtime first. The intelligence will follow.