tring;
inputPricePerM: number;
outputPricePerM: number;
maxContextWindow: number;
}
/**
 * Per-tier model configuration.
 *
 * Every tier maps to the provider model that serves it, the price charged per
 * one million input and output tokens, and the model's maximum context window
 * in tokens. Entry order is COMPLEX, STANDARD, STRUCTURED, BULK.
 */
export const TIER_REGISTRY: Record<TaskTier, ModelConfig> = {
  // Most expensive entry in the table.
  [TaskTier.COMPLEX]: {
    providerId: 'claude-opus-4-7',
    inputPricePerM: 5,
    outputPricePerM: 25,
    maxContextWindow: 200_000,
  },

  // Returned by the classifier when no other signal matches.
  [TaskTier.STANDARD]: {
    providerId: 'claude-sonnet-4-6',
    inputPricePerM: 3,
    outputPricePerM: 15,
    maxContextWindow: 200_000,
  },

  // Used for requests that ask for machine-readable output.
  [TaskTier.STRUCTURED]: {
    providerId: 'gpt-5.5',
    inputPricePerM: 3,
    outputPricePerM: 12,
    maxContextWindow: 128_000,
  },

  // Cheapest entry, and the smallest context window.
  [TaskTier.BULK]: {
    providerId: 'deepseek-chat',
    inputPricePerM: 0.27,
    outputPricePerM: 1.1,
    maxContextWindow: 64_000,
  },
};
**Why this structure:** Decoupling tier definitions from execution logic allows hot-swapping models without touching business code. Pricing metadata embedded at the tier level enables real-time cost calculation without external lookups.
### 2. Intelligent Classification Engine
Classification should be deterministic, fast, and extensible. Relying solely on LLM-based classification creates a recursive cost problem. Instead, use a hybrid heuristic approach: explicit metadata hints, semantic keyword matching, and payload size analysis.
```typescript
/**
 * Deterministic, LLM-free classifier that maps a request onto a routing tier.
 *
 * Checks run in a fixed order and the first match wins:
 *   1. a valid `metadata.tier` hint,
 *   2. `metadata.jsonOutput === true`,
 *   3. structured-output keywords,
 *   4. complex-reasoning keywords or a payload longer than 1000 words,
 *   5. bulk-work keywords,
 *   6. otherwise the standard tier.
 *
 * Keyword checks are case-insensitive substring matches against the payload.
 */
export class RequestClassifier {
  private readonly complexSignals = [
    'refactor', 'architect', 'design system', 'debug',
    'race condition', 'security audit', 'performance optimize',
    'explain trade-offs', 'compare approaches', 'root cause',
    'memory leak', 'deadlock', 'distributed consensus'
  ];

  private readonly bulkSignals = [
    'generate tests', 'add docstrings', 'translate all',
    'add comments', 'rename variable', 'format code',
    'boilerplate', 'template', 'placeholder', 'stub'
  ];

  private readonly structuredSignals = [
    'json', 'csv', 'xml', 'schema', 'extract',
    'parse', 'format as', 'return as', 'structured', 'array of objects'
  ];

  /**
   * Picks the tier for a request.
   *
   * @param payload - Raw user prompt text.
   * @param metadata - Optional caller hints. `tier` forces a tier when it is
   *   one of the known tiers; any other value is ignored rather than trusted,
   *   so a malformed hint can never produce a tier that has no registry entry.
   *   `jsonOutput: true` forces the structured tier.
   * @returns The selected tier.
   */
  classify(payload: string, metadata?: Record<string, unknown>): TaskTier {
    const hintedTier = metadata?.tier;
    if (this.isKnownTier(hintedTier)) {
      return hintedTier;
    }
    if (metadata?.jsonOutput === true) {
      return TaskTier.STRUCTURED;
    }

    const normalized = payload.toLowerCase();
    const mentionsAny = (signals: readonly string[]): boolean =>
      signals.some((signal) => normalized.includes(signal));

    if (mentionsAny(this.structuredSignals)) {
      return TaskTier.STRUCTURED;
    }
    if (mentionsAny(this.complexSignals) || this.estimateTokens(payload) > 1000) {
      return TaskTier.COMPLEX;
    }
    if (mentionsAny(this.bulkSignals)) {
      return TaskTier.BULK;
    }
    return TaskTier.STANDARD;
  }

  /**
   * Type guard for untrusted tier hints. Comparing against the members
   * directly (instead of a truthiness test plus a cast) also accepts an enum
   * member whose runtime value happens to be falsy.
   */
  private isKnownTier(value: unknown): value is TaskTier {
    return (
      value === TaskTier.COMPLEX ||
      value === TaskTier.STANDARD ||
      value === TaskTier.STRUCTURED ||
      value === TaskTier.BULK
    );
  }

  /**
   * Rough token estimate: the number of whitespace-separated words.
   * The text is trimmed first so leading/trailing whitespace does not add
   * phantom empty "words", and an empty payload counts as zero.
   */
  private estimateTokens(text: string): number {
    const trimmed = text.trim();
    return trimmed === '' ? 0 : trimmed.split(/\s+/).length;
  }
}
```

**Why this structure:** Keyword matching runs in time linear in the payload length, with negligible overhead. Metadata hints allow upstream services to override classification when domain knowledge exists. The 1,000-token threshold (estimated by counting whitespace-separated words) acts as a proxy for context-heavy reasoning tasks that typically require higher-capability models.
### 3. Execution Layer with Fallback Chains
The execution engine must handle provider volatility. Fallback chains should be explicit, non-circular, and bounded by retry limits.
/** Outcome of one routed request, as returned by the execution engine. */
export interface ExecutionResult {
  /** Text of the model's reply. */
  content: string;

  /** Identifier of the model that produced the reply. */
  modelUsed: string;

  /** Tier the request was classified into. */
  tier: TaskTier;

  /** Number of provider calls made, counting the successful one. */
  attempts: number;

  /** Token usage reported for the successful call. */
  tokens: {
    input: number;
    output: number;
  };

  /** Estimated cost derived from token usage and per-million-token prices. */
  costEstimate: number;
}
/**
 * Classifies a request, sends it to the model configured for its tier, and
 * walks a fallback chain when the provider call fails.
 */
export class ExecutionEngine {
  /** Next model to try when the keyed model fails. */
  private readonly fallbackMap: Record<string, string> = {
    'claude-opus-4-7': 'claude-sonnet-4-6',
    'claude-sonnet-4-6': 'gpt-5.5',
    'gpt-5.5': 'claude-sonnet-4-6',
    'deepseek-chat': 'gpt-5.5'
  };

  /** The classifier holds no per-request state, so one instance is reused. */
  private readonly classifier = new RequestClassifier();

  /**
   * @param client - Chat client exposing `chat.completions.create(...)`.
   */
  // eslint-disable-next-line @typescript-eslint/no-explicit-any -- provider SDK type is not available here
  constructor(private readonly client: any) {}

  /**
   * Routes one prompt and returns the reply with usage and cost data.
   *
   * @param userPrompt - The user message; also the text that is classified.
   * @param systemPrompt - Optional system message sent before the user message.
   * @param metadata - Optional hints forwarded to the classifier.
   * @param maxRetries - Maximum number of fallback attempts after the first
   *   call. Negative or NaN values are treated as 0.
   * @returns The reply, the model that produced it, and the cost estimate.
   * @throws Error when no model is configured for the tier, or when the
   *   current model fails and no untried fallback (or retry budget) remains.
   */
  async dispatch(
    userPrompt: string,
    systemPrompt?: string,
    metadata?: Record<string, unknown>,
    maxRetries = 2
  ): Promise<ExecutionResult> {
    const tier = this.classifier.classify(userPrompt, metadata);
    const tierConfig = TIER_REGISTRY[tier];
    if (!tierConfig) {
      throw new Error(`No model configured for tier "${String(tier)}"`);
    }

    const messages: Array<{ role: 'system' | 'user'; content: string }> = [];
    if (systemPrompt) messages.push({ role: 'system', content: systemPrompt });
    messages.push({ role: 'user', content: userPrompt });

    const retryBudget = Number.isNaN(maxRetries) ? 0 : Math.max(0, Math.floor(maxRetries));
    const attemptLimit = retryBudget + 1;

    // The fallback map contains a cycle (claude-sonnet-4-6 <-> gpt-5.5).
    // Remembering every model already tried keeps the chain acyclic, so a
    // model that just failed is never called again within this dispatch.
    const attempted = new Set<string>();
    let currentModel = tierConfig.providerId;

    for (let attempt = 1; attempt <= attemptLimit; attempt++) {
      const model = currentModel;
      attempted.add(model);
      try {
        const response = await this.client.chat.completions.create({
          model,
          messages,
          max_tokens: 4096
        });

        // Some responses carry no usage block; report zero instead of failing.
        const inputTokens: number = response.usage?.prompt_tokens ?? 0;
        const outputTokens: number = response.usage?.completion_tokens ?? 0;

        // Price with the rates of the model that actually answered. After a
        // fallback that is not the tier's primary model; if the model has no
        // registry entry, the tier's own rates are the best available guess.
        const pricing =
          Object.values(TIER_REGISTRY).find((entry) => entry.providerId === model) ?? tierConfig;
        const cost =
          (inputTokens * pricing.inputPricePerM) / 1_000_000 +
          (outputTokens * pricing.outputPricePerM) / 1_000_000;

        return {
          // A null reply body is normalised to '' so `content` is always a string.
          content: response.choices[0].message.content ?? '',
          modelUsed: model,
          tier,
          attempts: attempt,
          tokens: { input: inputTokens, output: outputTokens },
          costEstimate: cost
        };
      } catch (error) {
        const fallback = this.fallbackMap[model];
        const canFallBack =
          fallback !== undefined && !attempted.has(fallback) && attempt < attemptLimit;
        if (canFallBack) {
          currentModel = fallback;
          continue;
        }
        throw new Error(`Routing execution failed after ${attempt} attempts: ${error}`);
      }
    }

    // The loop always runs at least once and every path returns or throws.
    throw new Error('Unreachable fallback state');
  }
}
**Why this structure:** The bounded retry count prevents infinite retry loops, and the fallback map ensures traffic degrades gracefully to another model. Cost estimation happens synchronously post-response, enabling real-time budget tracking. Separating classification from execution allows independent scaling and testing.
### 4. Quality Verification & Async Batching
For non-critical workflows, a verification step prevents cheap models from producing subtly degraded output. For high-volume tasks, parallel execution reduces wall-clock time.
/**
 * Runs a prompt, asks a standard-tier model to grade the answer, and re-runs
 * the prompt on the complex tier when the grade is below `threshold`.
 *
 * Answers that were already produced by the complex tier are returned without
 * grading. A grade that does not parse as a number leaves the first answer
 * in place.
 *
 * @param engine - Engine used for the initial, grading and rerouted calls.
 * @param prompt - The user prompt to answer.
 * @param threshold - Minimum acceptable grade; defaults to 0.8.
 * @returns The first answer, or the complex-tier answer after a reroute.
 */
export async function verifyAndReroute(
  engine: ExecutionEngine,
  prompt: string,
  threshold = 0.8
): Promise<ExecutionResult> {
  const draft = await engine.dispatch(prompt);

  const alreadyTopTier = draft.tier === TaskTier.COMPLEX;
  if (alreadyTopTier) {
    return draft;
  }

  const gradingPrompt = `Rate the quality of this response on a scale of 0-1. Return only the number.\n\nPrompt: ${prompt}\n\nResponse: ${draft.content}`;
  const graded = await engine.dispatch(gradingPrompt, undefined, { tier: TaskTier.STANDARD });

  const rating = Number.parseFloat(graded.content.trim());
  const belowThreshold = !Number.isNaN(rating) && rating < threshold;

  return belowThreshold
    ? engine.dispatch(prompt, undefined, { tier: TaskTier.COMPLEX })
    : draft;
}
/**
 * Dispatches one prompt per item and returns the results in item order.
 *
 * @param engine - Engine used to dispatch every prompt.
 * @param items - Values substituted into the template, one prompt per value.
 * @param template - Prompt text; every literal `{{item}}` is replaced.
 * @param concurrency - Maximum number of dispatches in flight at once
 *   (default 10). Values below 1 or NaN are treated as 1.
 * @returns One result per item, at the same index as its item.
 * @throws Rejects with the first dispatch error; no further items are started
 *   after a failure.
 */
export async function processBatch(
  engine: ExecutionEngine,
  items: string[],
  template: string,
  concurrency = 10
): Promise<ExecutionResult[]> {
  // split/join substitutes the item verbatim. String.replace with a string
  // replacement would interpret `$&`, `$$`, `$1`... inside the item and would
  // only touch the first placeholder.
  const prompts = items.map((item) => template.split('{{item}}').join(item));

  const results = new Array<ExecutionResult>(prompts.length);
  const workerCount = Math.max(1, Math.min(Math.floor(concurrency) || 1, prompts.length));

  let nextIndex = 0;
  let failed = false;

  // Each worker pulls the next unclaimed index until the queue is drained.
  // Bounding the number of workers keeps large batches from firing every
  // request at the provider simultaneously.
  const runWorker = async (): Promise<void> => {
    while (!failed && nextIndex < prompts.length) {
      const index = nextIndex++;
      try {
        results[index] = await engine.dispatch(prompts[index]);
      } catch (error) {
        failed = true;
        throw error;
      }
    }
  };

  await Promise.all(Array.from({ length: workerCount }, () => runWorker()));
  return results;
}
**Why this structure:** Verification runs only when the initial tier is below complex, preventing unnecessary spend. Batching leverages async concurrency while maintaining individual fallback chains per item.
## Pitfall Guide
### 1. Over-Engineering the Classifier
**Explanation:** Teams often attempt to train custom ML models or use LLMs to classify incoming requests before routing. This adds latency, introduces recursive token costs, and rarely outperforms well-tuned heuristics for standard workloads.
**Fix:** Start with deterministic keyword matching and metadata hints. Only introduce ML classification when false-positive rates exceed 15% after manual tuning.
### 2. Ignoring Context Window Constraints
**Explanation:** Routing logic frequently overlooks model-specific context limits. Sending a 90,000-token payload to DeepSeek V3 (64K window) causes silent truncation or API errors.
**Fix:** Validate payload size against `maxContextWindow` in the tier registry before dispatch. Implement automatic chunking or fallback to a larger-window model when limits are breached.
### 3. Circular or Dead-End Fallback Chains
**Explanation:** Poorly mapped fallbacks can create infinite retry loops or route traffic to models that also fail under the same conditions.
**Fix:** Audit fallback maps as directed acyclic graphs. Ensure every chain terminates at a highly available baseline model. Log fallback transitions for post-mortem analysis.
### 4. Prompt Contamination Across Tiers
**Explanation:** System prompts optimized for Claude Opus often rely on advanced reasoning patterns that degrade or fail on cheaper models. Routing the same prompt across tiers without adaptation causes inconsistent outputs.
**Fix:** Implement prompt templating per tier. Strip complex chain-of-thought instructions for bulk/structured tiers. Maintain a prompt registry that maps system instructions to capability levels.
### 5. Missing Observability & Cost Attribution
**Explanation:** Without tracking tier distribution, fallback rates, and per-request costs, routing becomes a black box. Teams cannot validate whether the 70% savings projection holds in production.
**Fix:** Emit structured metrics on every dispatch: tier hit rate, model used, attempt count, token consumption, and estimated cost. Aggregate into dashboards with alerting on fallback spikes.
### 6. Rate Limit Blind Spots in Batching
**Explanation:** Parallel execution without concurrency control triggers provider rate limits, causing cascading failures that bypass fallback logic.
**Fix:** Implement token-aware concurrency limits or use a queue-based dispatcher with exponential backoff. Respect provider-specific RPM/TPM quotas before dispatching batches.
### 7. Verification Overhead Exceeding Task Value
**Explanation:** Running a quality check on every request doubles token consumption for simple tasks, negating routing savings.
**Fix:** Gate verification behind explicit flags or cost thresholds. Only verify when the initial tier is bulk/structured and the task impacts downstream data integrity.
## Production Bundle
### Action Checklist
### Decision Matrix
| Scenario | Recommended Approach | Why | Cost Impact |
|---|---|---|---|
| High-stakes architectural reasoning | Route to Complex tier (Opus) | Multi-step logic requires advanced reasoning; fallback degrades quality | High per-token, but prevents costly rework |
| Bulk code transformation / test generation | Route to Bulk tier (DeepSeek V3) | Repetitive patterns execute efficiently on lightweight models | ~90% reduction vs monolithic routing |
| Real-time user chat interface | Route to Standard tier (Sonnet) | Balances latency, cost, and conversational coherence | Moderate, predictable baseline |
| API response parsing / data extraction | Route to Structured tier (GPT-5.5) | Optimized JSON/schema adherence reduces parsing errors | Lower output pricing, higher reliability |
| Unknown / mixed intent requests | Default to Standard tier with verification flag | Safe baseline that triggers quality checks only when needed | Minimal overhead, prevents misrouting |
### Configuration Template
// routing.config.ts
/**
 * Routing configuration template: per-tier model selection, classification
 * and batching thresholds, and observability settings.
 */
export const ROUTING_CONFIG = {
  // Primary model, fallback model and retry budget for each tier.
  // Note: `standard` and `structured` name each other as fallback.
  tiers: {
    complex: {
      model: 'claude-opus-4-7',
      fallback: 'claude-sonnet-4-6',
      maxRetries: 2,
    },
    standard: {
      model: 'claude-sonnet-4-6',
      fallback: 'gpt-5.5',
      maxRetries: 2,
    },
    structured: {
      model: 'gpt-5.5',
      fallback: 'claude-sonnet-4-6',
      maxRetries: 2,
    },
    bulk: {
      model: 'deepseek-chat',
      fallback: 'gpt-5.5',
      maxRetries: 2,
    },
  },

  // Numeric limits used when classifying, verifying and batching.
  thresholds: {
    complexTokenLimit: 1_000,
    verificationScore: 0.8,
    batchConcurrency: 10,
    contextWindowBuffer: 0.9,
  },

  // Metrics and alerting settings.
  observability: {
    enableCostTracking: true,
    fallbackAlertThreshold: 0.15,
    metricsEndpoint: '/api/v1/metrics/routing',
  },
};
### Quick Start Guide
- Initialize the registry: Copy the tier configuration and pricing data into your project. Ensure model IDs match your provider's current naming conventions.
- Deploy the classifier: Integrate the `RequestClassifier` into your API gateway or service layer. Pass explicit metadata from upstream clients when task intent is known.
- Wire the execution engine: Replace direct model calls with `ExecutionEngine.dispatch()`. Configure your HTTP client with appropriate timeouts and retry policies.
- Enable metrics collection: Attach a middleware or interceptor to log tier distribution, fallback events, and token consumption. Verify routing decisions against expected workload patterns within 7 days.
- Tune thresholds: Adjust keyword signals, context limits, and verification gates based on production telemetry. Disable verification for bulk tasks once baseline quality stabilizes.