umber;
maxDelayMs: number;
jitter: boolean;
retryableErrors: string[];
}
/**
 * Retry policy that executeWithRetry falls back to when the caller does not
 * supply one.
 */
const defaultRetryPolicy: RetryPolicy = {
// Upper bound on calls to the wrapped function, counting the first call.
maxAttempts: 3,
// Wait before the second attempt; the wait doubles for each later attempt.
baseDelayMs: 1000,
// Ceiling applied to the doubled wait (30 seconds).
maxDelayMs: 30000,
// When true, each wait is scaled by a random factor between 0.5 and 1.
jitter: true,
// Only errors whose `code` (or `message`) appears in this list are retried.
retryableErrors: ['TIMEOUT', 'RATE_LIMIT', 'INTERNAL_ERROR'],
};
/**
 * Runs `fn`, retrying with exponential backoff while it fails with a
 * retryable error.
 *
 * An error is retryable when it is an object whose `code` (or, when `code` is
 * empty, `message`) is a string listed in `policy.retryableErrors`. Anything
 * else, including thrown primitives, `null` and `undefined`, is rethrown
 * immediately and unchanged.
 *
 * The wait before attempt `k + 1` is `baseDelayMs * 2^(k - 1)`, capped at
 * `maxDelayMs`, and scaled by a random factor in `[0.5, 1)` when `jitter` is on.
 *
 * @param fn - Operation to attempt; invoked once per attempt.
 * @param policy - Retry settings; defaults to `defaultRetryPolicy`.
 * @returns The value resolved by the first successful attempt.
 * @throws RangeError when `policy.maxAttempts` is not a number >= 1, so that a
 *   misconfigured policy fails loudly instead of throwing `undefined`.
 * @throws The error from `fn` itself when it is not retryable or when the
 *   final attempt fails.
 */
async function executeWithRetry<T>(
  fn: () => Promise<T>,
  policy: RetryPolicy = defaultRetryPolicy
): Promise<T> {
  // Written as a negated >= so that NaN is rejected as well.
  if (!(policy.maxAttempts >= 1)) {
    throw new RangeError(
      `maxAttempts must be at least 1, received ${policy.maxAttempts}`
    );
  }

  let lastError: unknown;
  for (let attempt = 1; attempt <= policy.maxAttempts; attempt++) {
    try {
      return await fn();
    } catch (error: unknown) {
      lastError = error;

      // Narrow before touching properties: `throw null` must not turn into a
      // TypeError that hides the original failure.
      let errorCode: unknown;
      if (typeof error === 'object' && error !== null) {
        const { code, message } = error as { code?: unknown; message?: unknown };
        errorCode = code || message;
      }
      if (
        typeof errorCode !== 'string' ||
        !policy.retryableErrors.includes(errorCode)
      ) {
        throw error; // Non-retryable error
      }

      // No attempt left: skip the pointless sleep and surface the error.
      if (attempt + 1 > policy.maxAttempts) break;

      const cappedDelay = Math.min(
        policy.baseDelayMs * 2 ** (attempt - 1),
        policy.maxDelayMs
      );
      const waitMs = policy.jitter
        ? cappedDelay * (0.5 + Math.random() * 0.5)
        : cappedDelay;
      await new Promise<void>((resolve) => setTimeout(resolve, waitMs));
    }
  }
  throw lastError;
}
```
**Rationale:** Jitter randomizes retry timing across concurrent requests, preventing synchronized retries that overwhelm recovering services. The `retryableErrors` list ensures that deterministic failures (e.g., schema validation errors) do not waste resources on retries.
#### Layer 2: Circuit Breaking and Degraded Responses
When an agent fails repeatedly, the circuit breaker opens to stop traffic. Instead of returning an error, the system should invoke a fallback strategy, such as returning cached data or a static default, while flagging the response as degraded.
```typescript
/**
 * Breaker lifecycle: CLOSED passes calls through, OPEN answers with the
 * fallback without calling the protected function, HALF_OPEN lets calls
 * through again to probe whether the dependency has recovered.
 */
type CircuitState = 'CLOSED' | 'OPEN' | 'HALF_OPEN';

/** Tuning knobs for CircuitBreaker. */
interface CircuitBreakerConfig {
  /** Number of counted failures that opens the circuit. */
  failureThreshold: number;
  /** How long the circuit stays OPEN before the next call is used as a probe. */
  recoveryTimeoutMs: number;
  /**
   * Only failures at most this many milliseconds old count toward the
   * threshold. A missing or non-positive value disables expiry, so every
   * failure since the last success counts.
   */
  monitoringWindowMs: number;
}

/**
 * Wraps an async operation and degrades to a fallback once the operation has
 * failed `failureThreshold` times within the monitoring window.
 */
class CircuitBreaker {
  private state: CircuitState = 'CLOSED';
  /** Timestamps (ms) of the failures currently counted toward the threshold. */
  private failureTimes: number[] = [];
  private lastFailureTime = 0;

  constructor(private config: CircuitBreakerConfig) {}

  /**
   * Runs `fn` unless the circuit is open, in which case `fallback` is used.
   *
   * @param fn - Protected operation.
   * @param fallback - Produces the degraded value; may be sync or async.
   * @returns The result of `fn`, or the fallback value when `fn` fails or the
   *   circuit is open. Errors from `fn` are never rethrown; errors thrown by
   *   `fallback` propagate to the caller.
   */
  async execute<T>(
    fn: () => Promise<T>,
    fallback: () => T | Promise<T>
  ): Promise<T> {
    if (this.state === 'OPEN') {
      const openFor = Date.now() - this.lastFailureTime;
      if (openFor <= this.config.recoveryTimeoutMs) {
        return fallback();
      }
      // Recovery timeout elapsed: let this call through as a probe.
      this.state = 'HALF_OPEN';
    }

    try {
      const result = await fn();
      this.onSuccess();
      return result;
    } catch {
      // Deliberately not rethrown: the breaker's contract is to degrade.
      this.onFailure();
      return fallback();
    }
  }

  /** Any success closes the circuit and forgets earlier failures. */
  private onSuccess(): void {
    this.failureTimes = [];
    this.state = 'CLOSED';
  }

  /** Records a failure and opens the circuit when the threshold is reached. */
  private onFailure(): void {
    const now = Date.now();
    this.lastFailureTime = now;

    // A failed probe re-opens immediately, regardless of the window.
    if (this.state === 'HALF_OPEN') {
      this.state = 'OPEN';
      return;
    }

    const windowMs = this.config.monitoringWindowMs;
    const counted =
      windowMs > 0
        ? this.failureTimes.filter((at) => now - at <= windowMs)
        : this.failureTimes;
    counted.push(now);
    this.failureTimes = counted;

    if (counted.length >= this.config.failureThreshold) {
      this.state = 'OPEN';
    }
  }
}
// Usage with degradation flag
/**
 * Envelope that pairs a payload with a flag telling consumers whether the
 * payload is a normal result or a degraded (fallback) one.
 */
interface DegradedResult<T> {
// The payload being returned.
data: T;
// 'degraded' is the value createDegradedFallback sets for cache-served data.
status: 'success' | 'degraded';
// Optional human-readable note; createDegradedFallback copies its `warning` argument here.
warning?: string;
}
/**
 * Builds a fallback function that serves a cached value flagged as degraded.
 *
 * The cache is read when the returned function is invoked, not when it is
 * created, so entries added later are picked up.
 *
 * @param cache - Store to read the stale value from.
 * @param key - Cache key of the value to serve.
 * @param warning - Message attached to the degraded result.
 * @returns A function producing `{ data, status: 'degraded', warning }`.
 *   That function throws an `Error` when `key` is absent from `cache`,
 *   because there is no value of type `T` to return; an entry explicitly
 *   stored as `undefined` is still served.
 */
function createDegradedFallback<T>(
  cache: Map<string, T>,
  key: string,
  warning: string
): () => DegradedResult<T> {
  return () => {
    // `has` (not a truthiness check on `get`) so falsy cached values are kept.
    if (!cache.has(key)) {
      throw new Error(
        `No cached value for key "${key}"; cannot build a degraded response.`
      );
    }
    return {
      // Safe assertion: the entry was just confirmed to exist.
      data: cache.get(key) as T,
      status: 'degraded',
      warning,
    };
  };
}
```
**Rationale:** The circuit breaker protects the system from cascading latency. The `HALF_OPEN` state allows probing for recovery without risking a full traffic spike. The `DegradedResult` interface ensures that downstream consumers are aware of data staleness, enabling them to adjust their behavior or warn end-users.
#### Layer 3: Dynamic Graph Re-Planning
For critical failures where fallbacks are insufficient, the orchestrator must re-plan the execution graph. This involves analyzing dependencies, identifying alternative agents, and skipping non-critical steps.
/**
 * One agent in the orchestrator's execution graph.
 */
interface AgentNode {
// Identifier of the agent; GraphOrchestrator.registerAgent uses it as the registry key.
id: string;
// IDs of the agents this node depends on.
dependencies: string[];
// When false, GraphOrchestrator.replanAndExecute may drop this node after a failure instead of halting.
critical: boolean;
// The agent's asynchronous work; the resolved value is untyped (any).
execute: () => Promise<any>;
alternatives?: string[]; // IDs of backup agents
}
/**
 * Plan returned by GraphOrchestrator.replanAndExecute.
 */
interface ExecutionPlan {
// Agent nodes belonging to the plan (presumably the ones named in `order`; confirm against generatePlan).
nodes: AgentNode[];
order: string[]; // Topological sort order
}
/**
 * Registry of agents that can rebuild the execution plan after one of them
 * fails: first by substituting a backup agent, then by dropping the node if
 * it is non-critical, and otherwise by halting.
 */
class GraphOrchestrator {
  private agents: Map<string, AgentNode> = new Map();
  /** Per node, the alternatives already substituted, so each is used once. */
  private triedAlternatives: Map<string, Set<string>> = new Map();

  /** Adds (or replaces) an agent under its `id`. */
  registerAgent(agent: AgentNode): void {
    this.agents.set(agent.id, agent);
  }

  /**
   * Produces a new plan after `failedNodeId` has failed.
   *
   * Despite its name this method only re-plans; running the returned plan is
   * left to the caller.
   *
   * @param failedNodeId - ID of the agent that failed.
   * @param context - Currently unused; kept so existing call sites compile.
   * @returns A plan covering every registered agent in dependency order.
   * @throws Error when the agent is unknown, when a critical agent has no
   *   untried healthy alternative, or when the remaining graph cannot be
   *   ordered (cycle or dependency on an unregistered agent).
   */
  async replanAndExecute(
    failedNodeId: string,
    context: unknown
  ): Promise<ExecutionPlan> {
    const failedAgent = this.agents.get(failedNodeId);
    if (!failedAgent) throw new Error('Unknown agent');

    // Strategy: Try alternatives first
    const substitute = this.pickAlternative(failedAgent);
    if (substitute) {
      // Bind so the backup's implementation keeps its own `this`, and capture
      // the function now so mutually-listed alternatives cannot recurse.
      failedAgent.execute = substitute.execute.bind(substitute);
      return this.generatePlan();
    }

    // Strategy: Skip if non-critical
    if (!failedAgent.critical) {
      this.removeNode(failedNodeId);
      return this.generatePlan();
    }

    // Strategy: Halt if critical and no alternatives
    throw new Error(`Critical agent ${failedNodeId} failed with no recovery path.`);
  }

  /**
   * Returns the first registered, healthy alternative that has not yet been
   * substituted for this node, and marks it as used. A second failure of the
   * same node therefore moves on to the next backup instead of re-selecting
   * the one that just failed.
   */
  private pickAlternative(failed: AgentNode): AgentNode | undefined {
    let tried = this.triedAlternatives.get(failed.id);
    if (!tried) {
      tried = new Set<string>();
      this.triedAlternatives.set(failed.id, tried);
    }
    for (const altId of failed.alternatives ?? []) {
      // A node cannot back itself up.
      if (altId === failed.id || tried.has(altId)) continue;
      const candidate = this.agents.get(altId);
      if (candidate && this.isHealthy(altId)) {
        tried.add(altId);
        return candidate;
      }
    }
    return undefined;
  }

  private isHealthy(_id: string): boolean {
    // Check circuit breaker state or health endpoint
    return true; // Simplified for example
  }

  /** Deletes the node and strips it from every other node's dependencies. */
  private removeNode(id: string): void {
    this.agents.delete(id);
    this.triedAlternatives.delete(id);
    this.agents.forEach((node) => {
      node.dependencies = node.dependencies.filter((dep) => dep !== id);
    });
  }

  /**
   * Topologically sorts the registered agents (Kahn's algorithm). Agents that
   * are ready at the same time keep their registration order.
   */
  private generatePlan(): ExecutionPlan {
    const pending = new Map<string, number>(); // unmet dependency count
    const dependents = new Map<string, string[]>();

    for (const [id, node] of this.agents) {
      const uniqueDeps = new Set(node.dependencies);
      pending.set(id, uniqueDeps.size);
      for (const dep of uniqueDeps) {
        if (!this.agents.has(dep)) {
          throw new Error(`Agent ${id} depends on unregistered agent ${dep}.`);
        }
        const waiting = dependents.get(dep);
        if (waiting) {
          waiting.push(id);
        } else {
          dependents.set(dep, [id]);
        }
      }
    }

    const order: string[] = [];
    for (const [id, unmet] of pending) {
      if (unmet === 0) order.push(id);
    }
    // `order` doubles as the work queue: for...of also visits ids appended
    // while iterating.
    for (const id of order) {
      for (const child of dependents.get(id) ?? []) {
        const unmet = (pending.get(child) ?? 0) - 1;
        pending.set(child, unmet);
        if (unmet === 0) order.push(child);
      }
    }

    if (order.length !== this.agents.size) {
      const stuck = [...pending]
        .filter(([, unmet]) => unmet > 0)
        .map(([id]) => id);
      throw new Error(`Dependency cycle detected among agents: ${stuck.join(', ')}.`);
    }

    const nodes: AgentNode[] = [];
    for (const id of order) {
      const node = this.agents.get(id);
      if (node) nodes.push(node);
    }
    return { nodes, order };
  }
}
**Rationale:** Re-planning provides the highest level of resilience. By maintaining a registry of agents and their dependencies, the orchestrator can dynamically substitute failed components or prune non-essential paths. This requires agents to be loosely coupled and for the graph to be defined declaratively.
Pitfall Guide
- **Retry Storms Without Jitter**
  - Explanation: Retrying multiple agents simultaneously with fixed delays causes synchronized traffic spikes that overwhelm recovering services.
  - Fix: Always implement jitter in retry policies to randomize delay intervals.
- **Ignoring Idempotency**
  - Explanation: Retrying non-idempotent agent actions (e.g., "create order") can lead to duplicate side effects.
  - Fix: Enforce idempotency keys for all agent calls. Ensure agents check for existing results before executing.
- **Silent Degradation**
  - Explanation: Returning cached or fallback data without metadata flags causes downstream agents to treat stale data as fresh, leading to incorrect conclusions.
  - Fix: Wrap all fallback responses in a `DegradedResult` structure that includes `status` and `warning` fields. Downstream agents must inspect this metadata.
- **Hardcoded Dependencies**
  - Explanation: If agent dependencies are hardcoded, the orchestrator cannot re-plan or substitute agents dynamically.
  - Fix: Define agent graphs using configuration files or a registry. Use dependency injection to resolve agents at runtime.
- **Circuit Breaker Threshold Misconfiguration**
  - Explanation: Thresholds that are too sensitive cause unnecessary circuit openings; thresholds that are too loose allow prolonged failure states.
  - Fix: Tune thresholds based on historical error rates and latency distributions. Use adaptive thresholds that adjust based on system load.
- **State Loss During Re-Planning**
  - Explanation: When substituting agents or skipping steps, context accumulated from previous steps may be lost or incompatible.
  - Fix: Maintain a shared context store that is accessible to all agents. Ensure alternative agents accept the same input schema.
- **Testing Only Happy Paths**
  - Explanation: Systems that are only tested with successful responses fail unpredictably in production when errors occur.
  - Fix: Implement chaos engineering practices. Inject timeouts, errors, and latency into agent calls during testing to validate recovery logic.
Production Bundle
Action Checklist
Decision Matrix
| Scenario | Recommended Approach | Why | Cost Impact |
|---|---|---|---|
| High-Value Transaction | Halt and Alert | Data integrity is paramount; degraded output is unacceptable. | High (Lost transaction opportunity). |
| Real-Time Dashboard | Degraded/Cached Response | Latency is critical; users prefer stale data over errors. | Low (Minimal token cost for fallback). |
| Batch Processing | Retry + Re-Plan | Time is flexible; accuracy is required. Re-planning ensures completion. | Medium (Additional compute for retries). |
| User-Facing Chat | Fallback to Simpler Model | User experience must be maintained; switch to cheaper/faster model. | Low (Reduced latency, lower model cost). |
Configuration Template
// resilience.config.ts
/**
 * Per-agent resilience settings, keyed by agent name. Each entry groups retry
 * settings, circuit-breaker thresholds, a fallback description, a criticality
 * flag and, optionally, the IDs of backup agents.
 *
 * NOTE(review): the value type is `any`, so entries are not checked against
 * any interface. The `retry` objects below omit `maxDelayMs` and
 * `retryableErrors`, and the `circuitBreaker` objects omit
 * `monitoringWindowMs`; presumably the consumer merges these with defaults.
 * Confirm before passing them directly as RetryPolicy / CircuitBreakerConfig.
 */
export const agentResilienceConfig: Record<string, any> = {
marketDataAgent: {
retry: { maxAttempts: 3, baseDelayMs: 1000, jitter: true },
circuitBreaker: { failureThreshold: 5, recoveryTimeoutMs: 60000 },
// Serve cached data on failure; 900 seconds is 15 minutes.
fallback: { type: 'cache', ttlSeconds: 900 },
critical: true,
alternatives: ['marketDataBackupAgent'],
},
sentimentAnalysisAgent: {
// NOTE(review): no `jitter` key here, unlike marketDataAgent; confirm whether that is intentional.
retry: { maxAttempts: 2, baseDelayMs: 500 },
circuitBreaker: { failureThreshold: 10, recoveryTimeoutMs: 30000 },
// Serve a fixed neutral value on failure.
fallback: { type: 'default', value: { score: 0, label: 'neutral' } },
critical: false,
},
};
Quick Start Guide
- Define Agent Graph: Create a registry of agents with their dependencies, criticality flags, and alternative agents.
- Attach Resilience Policies: Apply retry, circuit breaker, and fallback configurations to each agent based on the decision matrix.
- Implement Fallbacks: Code fallback functions for each agent, ensuring they return data in the expected schema with degradation metadata.
- Deploy with Observability: Instrument the orchestrator to log recovery events. Set up alerts for circuit breaker openings and re-planning triggers.
- Validate with Chaos Testing: Introduce synthetic failures to verify that the system handles errors gracefully and maintains output integrity.