cific failure modes.
// Base diagnostic contract
/**
 * Root of the agent-ensemble error hierarchy.
 *
 * Every instance records the id of the pipeline it belongs to and the moment
 * the exception object was constructed. `name` is set to the concrete
 * subclass name so logs show the specific failure type instead of "Error".
 */
abstract class AgentEnsembleException extends Error {
  /** Time at which this exception object was constructed. */
  public readonly timestamp: Date;
  /** Identifier of the pipeline the failure is attributed to. */
  public readonly pipelineId: string;

  /**
   * @param message Human-readable description of the failure.
   * @param pipelineId Identifier of the pipeline the failure is attributed to.
   */
  constructor(message: string, pipelineId: string) {
    super(message);
    // When compiled to ES5, calling the built-in Error constructor returns a
    // plain Error and drops the subclass prototype, which would break every
    // `instanceof` check on this hierarchy. Restoring the prototype is a
    // no-op on ES2015+ targets.
    Object.setPrototypeOf(this, new.target.prototype);
    this.name = this.constructor.name;
    this.timestamp = new Date();
    this.pipelineId = pipelineId;
  }
}
// Configuration and build-time failures
/**
 * Marker subtype in the configuration/build-time failure group.
 * Adds no fields of its own; it exists so callers can tell it apart
 * from other failures with `instanceof`.
 */
class ValidationException extends AgentEnsembleException {
}
/**
 * Configuration/build-time failure raised for a prompt template,
 * carrying the list of variables reported as missing.
 */
class PromptTemplateException extends AgentEnsembleException {
  /**
   * @param message Human-readable description of the failure.
   * @param pipelineId Identifier of the pipeline the failure is attributed to.
   * @param missingVariables Variables reported as missing; stored as given (not copied).
   */
  constructor(
    message: string,
    pipelineId: string,
    public readonly missingVariables: string[],
  ) {
    super(message, pipelineId);
  }
}
// Execution and runtime failures
/**
 * Runtime failure of a single task within a pipeline.
 *
 * Carries the outputs that had already completed so the caller can persist
 * them, plus the id of the task that failed.
 */
class TaskExecutionException extends AgentEnsembleException {
  /**
   * @param message Human-readable description of the failure.
   * @param pipelineId Identifier of the pipeline the failure is attributed to.
   * @param completedOutputs Outputs finished before the failure; stored as given (not copied).
   * @param failedTaskId Identifier of the task that failed.
   */
  constructor(
    message: string,
    pipelineId: string,
    public readonly completedOutputs: TaskOutput[],
    public readonly failedTaskId: string,
  ) {
    super(message, pipelineId);
  }
}
/**
 * Runtime failure of an agent call, optionally annotated with the HTTP
 * status and a retry-after hint.
 */
class AgentExecutionException extends AgentEnsembleException {
  /**
   * @param message Human-readable description of the failure.
   * @param pipelineId Identifier of the pipeline the failure is attributed to.
   * @param httpStatus HTTP status associated with the failure, when one is known.
   * @param retryAfter Retry-after hint, when one is known.
   *   NOTE(review): unit (seconds vs. milliseconds) is not established here; confirm with the producer.
   */
  constructor(
    message: string,
    pipelineId: string,
    public readonly httpStatus?: number,
    public readonly retryAfter?: number,
  ) {
    super(message, pipelineId);
  }
}
/**
 * Runtime failure raised when an iteration limit is exceeded.
 * Records both the configured limit and the count that was observed.
 */
class MaxIterationsExceededException extends AgentEnsembleException {
  /**
   * @param message Human-readable description of the failure.
   * @param pipelineId Identifier of the pipeline the failure is attributed to.
   * @param configuredLimit The iteration limit that was configured.
   * @param actualCount The iteration count reported with the failure.
   */
  constructor(
    message: string,
    pipelineId: string,
    public readonly configuredLimit: number,
    public readonly actualCount: number,
  ) {
    super(message, pipelineId);
  }
}
/**
 * Runtime failure raised when a guardrail rejects content.
 * Records which side of the exchange was rejected and the associated payload.
 */
class GuardrailViolationException extends AgentEnsembleException {
  /**
   * @param message Human-readable description of the failure.
   * @param pipelineId Identifier of the pipeline the failure is attributed to.
   * @param direction Whether the violation concerns the 'INPUT' or the 'OUTPUT' side.
   * @param violationPayload Free-form details of the violation; stored as given (not copied).
   */
  constructor(
    message: string,
    pipelineId: string,
    public readonly direction: 'INPUT' | 'OUTPUT',
    public readonly violationPayload: Record<string, unknown>,
  ) {
    super(message, pipelineId);
  }
}
Architecture Rationale: Using a strict inheritance chain ensures that `instanceof` checks inside a catch clause can be ordered from most specific to most general (TypeScript has a single, untyped catch clause rather than multiple typed catch blocks). Unchecked exceptions (or runtime errors in TypeScript) are preferred here because agent failures are expected control flow, not exceptional programming errors. This prevents try/catch bloat and keeps the orchestration layer clean.
Step 2: Model Terminal States Explicitly
Not every pipeline termination is an error. Exit reasons must be modeled as a discriminated union to separate system failures from intentional stops.
/**
 * Discriminated union (tagged by `type`) describing why a pipeline stopped.
 * Only the 'ERROR' variant represents a failure; the others are non-error
 * terminations.
 */
type PipelineExitReason =
// Pipeline ran to completion; carries the raw output.
| { type: 'COMPLETED'; rawOutput: string }
// Pipeline was stopped early by a user; carries how many tasks had completed.
| { type: 'USER_EXIT_EARLY'; completedTaskCount: number }
// Pipeline stopped on a timeout; carries the name of the gate involved.
| { type: 'TIMEOUT'; gateName: string }
// Pipeline stopped on a failure; carries the exception describing it.
| { type: 'ERROR'; exception: AgentEnsembleException };
/**
 * Outcome of one pipeline run: why it stopped, plus usage metrics.
 */
interface PipelineResult {
// Why the pipeline stopped (success, intentional stop, timeout, or error).
exitReason: PipelineExitReason;
// Metrics for the run.
metadata: {
// Run duration in milliseconds.
durationMs: number;
// Token usage attributed to the run. NOTE(review): presumably a token count; confirm with the metrics producer.
tokenUsage: number;
// Number of tool calls attributed to the run.
toolCalls: number;
};
}
Architecture Rationale: Decoupling exit reasons from exceptions prevents false-positive alerting. A USER_EXIT_EARLY or TIMEOUT should trigger workflow state updates, not PagerDuty incidents. This separation enables accurate dashboarding and cost attribution.
Step 3: Implement the Orchestration Handler
The execution layer routes failures based on type, preserves partial state, and delegates retry logic to a resilience policy.
/**
 * Concrete carrier for thrown values that are not part of the
 * AgentEnsembleException hierarchy (for example a TypeError or a thrown
 * string). Keeps `PipelineExitReason`'s `exception` field truthful without
 * losing the original value.
 */
class UnclassifiedPipelineException extends AgentEnsembleException {
  /**
   * @param pipelineId Identifier of the pipeline that was executing.
   * @param originalError The value that was actually thrown, kept as-is.
   */
  constructor(pipelineId: string, public readonly originalError: unknown) {
    super(
      originalError instanceof Error ? originalError.message : String(originalError),
      pipelineId,
    );
  }
}

/**
 * Runs the ensemble for `pipelineId` and converts the outcome, success or
 * failure, into a `PipelineResult`.
 *
 * Failures are routed by type: task failures persist partial outputs and
 * notify, guardrail violations are audited, agent failures are returned
 * untouched so an outer resilience layer can decide on retries. No retry is
 * attempted here.
 *
 * @param pipelineId Identifier of the pipeline to execute.
 * @param inputs Inputs forwarded to the ensemble run.
 * @returns The exit reason plus run metrics. Errors thrown by the run itself
 *   are reported through the 'ERROR' exit reason rather than rethrown.
 *   NOTE(review): errors thrown by the persistence, notification, or audit
 *   side effects still propagate to the caller.
 */
async function executeAgentPipeline(
  pipelineId: string,
  inputs: Record<string, unknown>
): Promise<PipelineResult> {
  const startTime = performance.now();
  try {
    // Framework execution (e.g., AgentEnsemble.run())
    const result = await runEnsemble(pipelineId, inputs);
    return {
      exitReason: { type: 'COMPLETED', rawOutput: result.raw },
      metadata: extractMetrics(result)
    };
  } catch (error) {
    const duration = performance.now() - startTime;

    if (error instanceof TaskExecutionException) {
      // Preserve intermediate work before alerting
      await persistPartialState(pipelineId, error.completedOutputs);
      await notifyPipelineBreakpoint(error.failedTaskId, error.message);
      return {
        exitReason: { type: 'ERROR', exception: error },
        metadata: { ...extractMetrics(error), durationMs: duration }
      };
    }

    if (error instanceof AgentExecutionException) {
      // Delegate to resilience layer; do not retry here
      return {
        exitReason: { type: 'ERROR', exception: error },
        metadata: { ...extractMetrics(error), durationMs: duration }
      };
    }

    if (error instanceof GuardrailViolationException) {
      await auditGuardrailBreach(error.direction, error.violationPayload);
      return {
        exitReason: { type: 'ERROR', exception: error },
        metadata: { ...extractMetrics(error), durationMs: duration }
      };
    }

    // Fallback. Other members of the hierarchy pass through unchanged;
    // anything else is wrapped instead of being cast, so consumers can rely
    // on `pipelineId` and `timestamp` actually being present.
    const exception =
      error instanceof AgentEnsembleException
        ? error
        : new UnclassifiedPipelineException(pipelineId, error);
    return {
      exitReason: { type: 'ERROR', exception },
      metadata: { durationMs: duration, tokenUsage: 0, toolCalls: 0 }
    };
  }
}
Architecture Rationale: The handler acts as a routing switch, not a retry engine. Each exception type triggers a specific operational action: partial state persistence, audit logging, or resilience delegation. This keeps the execution path deterministic and observable.
Pitfall Guide
1. Flat Exception Catching
Explanation: Catching only the base AgentEnsembleException discards type-specific metadata. Operators lose the ability to distinguish between a missing template variable and a rate-limited API call.
Fix: Order `instanceof` checks inside the catch clause from most specific to most general. Extract subtype properties before logging or routing.
2. Discarding Partial Outputs on Task Failure
Explanation: Treating a TaskExecutionException as a total failure wastes completed LLM calls and tool invocations. This forces full pipeline replays, increasing latency and cost.
Fix: Always serialize completedOutputs to durable storage before triggering alerts. Use these outputs to seed resumable checkpoints.
3. Blind Retry on Non-Transient Errors
Explanation: Applying exponential backoff to ValidationException, GuardrailViolationException, or PromptTemplateException burns API quota and delays human intervention.
Fix: Classify error transience before retrying. Only retry AgentExecutionException with HTTP 429/5xx status codes. Fail fast on configuration and policy errors.
4. Conflating Timeouts with System Failures
Explanation: Treating TIMEOUT or USER_EXIT_EARLY as errors triggers false-positive monitoring alerts and skews success rate metrics.
Fix: Route exit reasons to separate operational workflows. Timeouts should trigger escalation queues; user exits should update task status to ABORTED_BY_REVIEWER.
5. Hardcoding Retry Policies in the Agent Runner
Explanation: Embedding backoff logic inside the execution loop couples resilience to business logic. It prevents dynamic policy updates and makes testing difficult.
Fix: Delegate retry, circuit breaking, and rate limiting to a dedicated resilience library (e.g., Resilience4j, Polly, or custom middleware). The agent runner should only expose failure signals.
6. Ignoring Guardrail Context in Logs
Explanation: Logging only "guardrail triggered" without the input/output payload or violation type makes compliance audits impossible and obscures model behavior patterns.
Fix: Capture direction, violationPayload, and taskDescription in structured logs. Route to a compliance sink for regulatory tracking.
7. Masking ReAct Loops with Higher Iteration Limits
Explanation: Increasing maxIterations to bypass MaxIterationsExceededException hides inefficient tool usage and increases token consumption without solving the root cause.
Fix: Analyze tool call sequences before adjusting limits. Refine tool descriptions, remove redundant tools, or adjust system prompts to guide the agent toward convergence.
Production Bundle
Action Checklist
Decision Matrix
| Scenario | Recommended Approach | Why | Cost Impact |
|---|---|---|---|
| HTTP 429 / 5xx from LLM provider | Resilience4j retry with exponential backoff | Transient infrastructure failure; retry succeeds >85% of the time | Low; adds ~10-20% latency, saves full pipeline replay cost |
| Guardrail violation (INPUT) | Fail fast + audit log | Policy breach; retry violates compliance and wastes tokens | None; immediate termination prevents downstream waste |
| Human reviewer abandons gate | Update state to USER_EXIT_EARLY + notify workflow | Intentional stop; not a system failure | None; accurate metrics prevent false alerting costs |
| ReAct loop exceeds limit | Analyze tool sequence + refine prompts | Iteration cap is a symptom, not a fix; masking increases token spend | High if ignored; optimization reduces token cost by 30-50% |
| Missing template variable | Fail fast + CI/CD gate | Build-time configuration error; runtime retry is impossible | None; catches error before deployment |
Configuration Template
// resilience-config.ts
import { RetryPolicy, CircuitBreaker, RateLimiter } from 'resilience-framework';
/**
 * Retry policy for agent calls: up to 3 attempts with exponential backoff
 * between 1000 ms and 10000 ms. Only AgentExecutionException instances with
 * an HTTP status of 429 or 5xx are treated as retryable.
 */
export const agentRetryPolicy = new RetryPolicy({
  maxAttempts: 3,
  backoffStrategy: 'exponential',
  initialDelayMs: 1000,
  maxDelayMs: 10000,
  retryableExceptions: [AgentExecutionException],
  shouldRetry: (error: unknown) => {
    if (!(error instanceof AgentExecutionException)) {
      return false;
    }
    // `httpStatus` is optional; a failure without a status is not retried.
    // The explicit guard keeps this valid under `strictNullChecks`.
    const status = error.httpStatus;
    if (status === undefined) {
      return false;
    }
    return status === 429 || (status >= 500 && status < 600);
  }
});
/**
 * Circuit breaker settings for the pipeline: a failure threshold of 5 and a
 * reset timeout of 30000 ms, monitoring agent and tool execution failures.
 * NOTE(review): ToolExecutionException is not declared alongside the other
 * exception classes shown with this config; confirm where it is defined.
 */
export const pipelineCircuitBreaker = new CircuitBreaker({
  monitoredExceptions: [AgentExecutionException, ToolExecutionException],
  failureThreshold: 5,
  resetTimeoutMs: 30_000
});
/**
 * Sink that forwards a guardrail violation to the compliance store as a flat
 * record: pipeline id, direction, payload, and an ISO-8601 timestamp.
 */
export const guardrailAuditSink = {
  async write(violation: GuardrailViolationException) {
    const { pipelineId, direction, violationPayload, timestamp } = violation;
    const record = {
      pipelineId,
      direction,
      payload: violationPayload,
      timestamp: timestamp.toISOString()
    };
    await logToComplianceStore(record);
  }
};
Quick Start Guide
- Install resilience dependencies: Add your preferred resilience library (Resilience4j, Polly, or custom middleware) to handle transient failures outside the agent runner.
- Wire the exception hierarchy: Ensure your pipeline runner catches `AgentEnsembleException` subtypes in order of specificity, routing each to its designated handler.
- Implement partial state persistence: Before any alert or retry, serialize `completedOutputs` to durable storage. Tag the checkpoint with `pipelineId` and `failedTaskId`.
- Configure exit reason routing: Create separate handlers for `COMPLETED`, `USER_EXIT_EARLY`, `TIMEOUT`, and `ERROR`. Attach exit reasons to all telemetry and dashboard metrics.
- Validate with synthetic failures: Inject controlled failures (rate limits, guardrail triggers, template gaps) to verify routing, persistence, and alerting paths before production deployment.