letions',
modelId: 'gpt-4o',
costPerMillionTokens: 5000,
expectedLatencyMs: 2100,
supportedTasks: ['complex_reasoning', 'creative_writing', 'deep_analysis'],
authHeader: 'Authorization',
},
anthropic: {
endpoint: 'https://api.anthropic.com/v1/messages',
modelId: 'claude-sonnet-4-20250514',
costPerMillionTokens: 3000,
expectedLatencyMs: 1800,
supportedTasks: ['complex_reasoning', 'deep_analysis', 'structured_output'],
authHeader: 'x-api-key',
},
};
**Architecture Rationale:** Storing provider metadata in a typed dictionary enables compile-time validation and simplifies runtime lookups. Pricing is normalized to per-million tokens to align with industry billing standards. Expected latency serves as a heuristic for fallback ordering, ensuring that slower providers are deprioritized for latency-sensitive paths.
### Step 2: Heuristic Query Classifier
A lightweight classifier determines task complexity without invoking additional model calls. This implementation uses signal scoring based on lexical patterns, prompt length, and explicit context hints.
```typescript
/** Routing tier a prompt can be assigned to by the heuristic classifier. */
type TaskCategory = 'simple' | 'code' | 'medium' | 'complex';
/** Outcome of classifying a single prompt. */
interface ClassificationResult {
// Tier chosen for the prompt.
category: TaskCategory;
// Accumulated keyword score divided by 8 and capped at 1, as computed by classifyInferenceTask.
confidence: number;
// One entry ('code', 'simple' or 'complex') per keyword pattern that matched; labels repeat
// when several patterns of the same group match.
signals: string[];
}
/**
 * Heuristically classifies a prompt into a routing tier without calling a model.
 *
 * Each keyword group contributes a fixed weight per matched pattern (substring
 * match on the lower-cased prompt): code = 2, simple = 1, complex = 3. A
 * `context.domain` of `'coding'` adds 2 and a prompt longer than 600 characters
 * adds 1.
 *
 * Tier selection, first match wins:
 *  - `complex`: score >= 6, or prompt longer than 800 characters;
 *  - `code`:    score >= 3 AND there is code evidence (a code keyword matched or
 *               the caller declared the coding domain);
 *  - `simple`:  score <= 1 and prompt shorter than 250 characters;
 *  - `medium`:  everything else.
 *
 * @param prompt  Raw user prompt.
 * @param context Optional hints from the application layer; only `domain` is read.
 * @returns The chosen tier, a confidence of `min(score / 8, 1)`, and one signal
 *          label per matched pattern (labels repeat when several patterns match).
 */
function classifyInferenceTask(prompt: string, context?: Record<string, string>): ClassificationResult {
  const normalized = prompt.toLowerCase();
  const signals: string[] = [];
  let score = 0;

  // Evaluated in this order so `signals` lists code hits, then simple, then complex.
  const signalGroups: Array<{ label: string; weight: number; patterns: string[] }> = [
    { label: 'code', weight: 2, patterns: ['function', 'debug', 'implement', 'refactor', 'async', 'class', 'import'] },
    { label: 'simple', weight: 1, patterns: ['what is', 'how to', 'define', 'list', 'convert', 'format'] },
    { label: 'complex', weight: 3, patterns: ['analyze', 'compare', 'evaluate', 'architect', 'trade-off', 'optimize', 'strategy'] },
  ];

  for (const { label, weight, patterns } of signalGroups) {
    for (const pattern of patterns) {
      if (normalized.includes(pattern)) {
        signals.push(label);
        score += weight;
      }
    }
  }

  const codingDomain = context?.domain === 'coding';
  if (codingDomain) score += 2;
  if (prompt.length > 600) score += 1;

  // A mid-range score alone does not mean the prompt is about code: a single
  // complex keyword (3 points) or a handful of simple keywords reaches the same
  // range. Require actual code evidence before choosing the code tier;
  // otherwise the prompt stays on the medium tier.
  const hasCodeEvidence = codingDomain || signals.includes('code');

  let category: TaskCategory = 'medium';
  if (score >= 6 || prompt.length > 800) category = 'complex';
  else if (score >= 3 && hasCodeEvidence) category = 'code';
  else if (score <= 1 && prompt.length < 250) category = 'simple';

  // Note: confidence grows with the score, so prompts classified as 'simple'
  // (score <= 1) always report a confidence of at most 0.125.
  const confidence = Math.min(score / 8, 1);
  return { category, confidence, signals };
}
```
**Architecture Rationale:** Keyword scoring provides deterministic classification with minimal overhead. Confidence scoring allows the router to apply conservative fallbacks when classification certainty drops. Context hints let the application layer declare a known workflow explicitly (here, `domain: 'coding'` adds to the score), reducing heuristic ambiguity.
### Step 3: Routing Engine with Circuit Breaking & Telemetry
The routing engine orchestrates provider selection, executes requests, handles failures, and records telemetry. This implementation includes a sliding-window failure counter, failure-aware fallback ordering, and per-request cost tracking.
/** Telemetry record the router stores for each successful inference request. */
interface RoutingMetrics {
// Registry key of the provider that served the request.
provider: string;
// Token count taken from the provider response's usage data; 0 when the response reports none.
tokensUsed: number;
// Computed by the router as (tokensUsed / 1_000_000) * the provider's costPerMillionTokens.
costUsd: number;
// Elapsed time from just before the request was sent until the response body was parsed, rounded to whole ms.
latencyMs: number;
// Date.now() value (epoch milliseconds) captured when the record was created.
timestamp: number;
}
/**
 * Routes inference requests to a provider chosen by task category, retrying on
 * failure and recording per-request telemetry.
 *
 * Providers that failed within the last minute are deprioritised; a successful
 * call clears that provider's failure history.
 *
 * NOTE(review): `metricsLog` grows without bound for the lifetime of the
 * instance; long-running processes may want to cap or flush it.
 */
class InferenceRouter {
  /** Length of the sliding window used to count recent provider failures. */
  private static readonly FAILURE_WINDOW_MS = 60_000;

  private registry: Record<string, ProviderConfig>;
  private failureWindow: Map<string, number[]>;
  private metricsLog: RoutingMetrics[];
  private maxRetries: number;

  /**
   * @param registry Provider configurations keyed by provider name. The key also
   *                 determines the API-key environment variable: `<KEY>_API_KEY`
   *                 with the key upper-cased.
   * @param options  `maxRetries` is the number of additional attempts after the
   *                 first one (default 2, i.e. up to three attempts).
   */
  constructor(registry: Record<string, ProviderConfig>, options?: { maxRetries?: number }) {
    this.registry = registry;
    this.failureWindow = new Map();
    this.metricsLog = [];
    this.maxRetries = options?.maxRetries ?? 2;
  }

  /** Records a failure timestamp for `provider` and drops entries older than the window. */
  private recordFailure(provider: string): void {
    const now = Date.now();
    const window = this.failureWindow.get(provider) ?? [];
    window.push(now);
    this.failureWindow.set(
      provider,
      window.filter(t => now - t < InferenceRouter.FAILURE_WINDOW_MS),
    );
  }

  /** Returns how many failures `provider` has had inside the sliding window. */
  private getRecentFailures(provider: string): number {
    const now = Date.now();
    const window = this.failureWindow.get(provider) ?? [];
    return window.filter(t => now - t < InferenceRouter.FAILURE_WINDOW_MS).length;
  }

  /**
   * Returns the provider keys to try for `category`, in static priority order.
   *
   * Only providers present in the registry are returned, so retry attempts are
   * never spent on a provider that cannot be called. When none of the preferred
   * providers is registered, every registered provider becomes a candidate so
   * the request can still be served.
   */
  private selectCandidates(category: TaskCategory): string[] {
    const priorityMap: Record<TaskCategory, string[]> = {
      simple: ['groq', 'cerebras'],
      code: ['cerebras', 'groq'],
      medium: ['groq', 'anthropic', 'openai'],
      complex: ['openai', 'anthropic'],
    };
    const isRegistered = (key: string): boolean =>
      Object.prototype.hasOwnProperty.call(this.registry, key);

    const preferred = (priorityMap[category] ?? []).filter(isRegistered);
    return preferred.length > 0 ? preferred : Object.keys(this.registry);
  }

  /**
   * Builds the request headers for a provider.
   *
   * The API key is read from `<PROVIDER>_API_KEY`. When the provider
   * authenticates through the standard `Authorization` header and the key
   * carries no scheme of its own, the `Bearer` scheme is added, because a bare
   * token is not a valid `Authorization` value. Other header names (for example
   * `x-api-key`) receive the key verbatim.
   */
  private buildHeaders(providerKey: string, config: ProviderConfig): Record<string, string> {
    const apiKey = process.env[`${providerKey.toUpperCase()}_API_KEY`] ?? '';
    const needsBearer =
      config.authHeader.toLowerCase() === 'authorization' && apiKey !== '' && !/\s/.test(apiKey);
    return {
      'Content-Type': 'application/json',
      [config.authHeader]: needsBearer ? `Bearer ${apiKey}` : apiKey,
    };
  }

  /**
   * Sends `prompt` to the best available provider for `category`.
   *
   * Candidates are ordered by recent failure count (fewest first; ties keep the
   * static priority order) and the retry loop rotates through them.
   *
   * @param prompt   User prompt, sent as a single user message.
   * @param category Task tier used to pick the candidate providers.
   * @returns The generated text (empty string when the response has no
   *          recognised content field) and the telemetry record for the call.
   * @throws Error when no provider is registered or every attempt failed.
   */
  async executeInference(prompt: string, category: TaskCategory): Promise<{ output: string; metrics: RoutingMetrics }> {
    // selectCandidates returns a fresh array, so sorting in place is safe.
    const orderedProviders = this.selectCandidates(category).sort(
      (a, b) => this.getRecentFailures(a) - this.getRecentFailures(b),
    );
    if (orderedProviders.length === 0) {
      throw new Error('Routing exhausted: no providers are registered');
    }

    for (let attempt = 0; attempt <= this.maxRetries; attempt++) {
      const providerKey = orderedProviders[attempt % orderedProviders.length];
      const config = this.registry[providerKey];
      const startTime = performance.now();
      try {
        const payload = {
          model: config.modelId,
          messages: [{ role: 'user', content: prompt }],
          max_tokens: 1024,
        };
        const response = await fetch(config.endpoint, {
          method: 'POST',
          headers: this.buildHeaders(providerKey, config),
          body: JSON.stringify(payload),
        });
        if (!response.ok) throw new Error(`HTTP ${response.status}`);

        const data = await response.json();
        const latency = performance.now() - startTime;

        // Chat-completions style responses report `total_tokens`; messages-style
        // responses report `input_tokens` and `output_tokens` separately.
        const usage = data.usage ?? {};
        const tokens: number =
          usage.total_tokens ?? ((usage.input_tokens ?? 0) + (usage.output_tokens ?? 0));

        // NOTE(review): the registry values visible in this file (e.g. 5000 for
        // gpt-4o) look like thousandths of a dollar per million tokens; if so,
        // this formula overstates costUsd by 1000x. Confirm the unit.
        const cost = (tokens / 1_000_000) * config.costPerMillionTokens;

        const metrics: RoutingMetrics = {
          provider: providerKey,
          tokensUsed: tokens,
          costUsd: cost,
          latencyMs: Math.round(latency),
          timestamp: Date.now(),
        };
        this.metricsLog.push(metrics);
        this.failureWindow.delete(providerKey); // Reset on success
        return {
          output: data.choices?.[0]?.message?.content ?? data.content?.[0]?.text ?? '',
          metrics,
        };
      } catch (err: unknown) {
        this.recordFailure(providerKey);
        const message = err instanceof Error ? err.message : String(err);
        console.warn(`[${providerKey}] attempt ${attempt + 1} failed: ${message}`);
      }
    }
    throw new Error('Routing exhausted: all candidates failed within retry window');
  }

  /**
   * Summarises all successful requests recorded so far.
   *
   * @returns Total cost rounded to 4 decimals, the request count, and the mean
   *          latency rounded to whole milliseconds (0 when nothing was recorded).
   */
  getAggregatedStats(): { totalCost: number; totalRequests: number; avgLatency: number } {
    let cost = 0;
    let latency = 0;
    for (const entry of this.metricsLog) {
      cost += entry.costUsd;
      latency += entry.latencyMs;
    }
    const count = this.metricsLog.length;
    return {
      totalCost: parseFloat(cost.toFixed(4)),
      totalRequests: count,
      // Avoid 0 / 0 = NaN before the first successful request.
      avgLatency: count === 0 ? 0 : Math.round(latency / count),
    };
  }
}
Architecture Rationale: The router separates candidate selection from execution, enabling dynamic reordering based on real-time failure rates. The sliding-window failure counter prevents cascading outages by temporarily deprioritizing unstable endpoints. Telemetry is captured synchronously to ensure cost and latency tracking aligns with actual execution paths. The retry loop rotates through candidates rather than repeating the same provider, maximizing fault tolerance.
Pitfall Guide
1. Keyword Fragility in Classification
Explanation: Relying solely on lexical matches causes misclassification when domain-specific terms overlap with unrelated categories (e.g., "function" in biology vs programming).
Fix: Implement confidence scoring and allow explicit context hints from the application layer. When confidence drops below a threshold, default to a mid-tier provider rather than forcing a low-cost route.
2. Ignoring Provider Latency Variance
Explanation: Expected latency values are averages. Real-world performance fluctuates based on region, queue depth, and model load. Hardcoding latency assumptions causes timeout errors during peak hours.
Fix: Implement adaptive timeout thresholds that scale with provider tier. Use circuit breakers that temporarily route traffic away from endpoints exhibiting >2x expected latency.
3. Hardcoded Fallback Chains
Explanation: Static fallback orders (A -> B -> C) fail when multiple providers share the same underlying infrastructure or experience correlated outages.
Fix: Dynamically reorder fallbacks based on real-time health checks and geographic distribution. Maintain provider diversity in fallback chains to avoid correlated failure modes.
4. Missing Quality Feedback Loops
Explanation: Routing decisions based purely on cost and latency ignore output quality. A cheap provider may return technically correct but contextually poor responses, degrading user trust.
Fix: Implement lightweight quality scoring (e.g., user thumbs up/down, validation against known answers, or secondary model evaluation). Feed these scores back into the routing weights to gradually optimize for quality-cost balance.
5. Token Counting & Pricing Mismatches
Explanation: Different providers report token usage differently (input vs output vs total). Misaligned counting leads to budget overruns and inaccurate cost attribution.
Fix: Normalize all pricing to per-million tokens and explicitly track input/output splits when available. Apply a 10-15% buffer to budget calculations to account for tokenizer variance and system prompt overhead.
6. Rate Limit & Quota Blindness
Explanation: Routing logic rarely accounts for provider rate limits. Burst traffic can trigger 429 errors across multiple endpoints simultaneously, causing cascading failures.
Fix: Implement token bucket rate limiting at the router level. Track X-RateLimit-Remaining headers when available and throttle requests before hitting provider limits.
7. Budget Overrun Without Throttling
Explanation: Cost tracking is often retrospective. Without proactive budget enforcement, a sudden spike in complex queries can exhaust monthly inference budgets within hours.
Fix: Implement a rolling budget window with automatic degradation. When spend approaches 80% of the threshold, automatically route borderline queries to cheaper providers or enable response caching.
Production Bundle
Action Checklist
Decision Matrix
| Scenario | Recommended Approach | Why | Cost Impact |
|---|---|---|---|
| High-throughput simple queries | Route to Groq/Cerebras with aggressive caching | Low complexity tasks benefit from sub-500ms inference and sub-$1/M pricing | 85-90% reduction vs frontier models |
| Complex reasoning / architecture design | Route to GPT-4o or Claude Sonnet | Requires deep context window, multi-step planning, and structured output | Baseline cost, justified by task criticality |
| Strict monthly budget cap | Enable budget throttling + fallback to GLM-4/Gemini for medium tasks | Prevents overrun by dynamically downgrading non-critical paths | Predictable spend, ~40-60% savings |
| Low-latency SLA (<600ms) | Prioritize Cerebras/Groq, disable slow fallbacks | Speed-sensitive workflows cannot tolerate frontier model queue times | Slight cost increase for premium speed tier |
| Multilingual / localization | Route to Gemini or provider with strong translation weights | Specialized tokenizers and training data reduce hallucination in non-English | 70% cheaper than forcing English-tuned models |
Configuration Template
import { InferenceRouter, classifyInferenceTask } from './inference-router';
// Shared router instance. maxRetries: 2 allows up to three attempts per request
// (the initial attempt plus two retries).
const router = new InferenceRouter(INFERENCE_REGISTRY, { maxRetries: 2 });
/**
 * Classifies a user prompt, routes it through the shared router, and returns a
 * flat result object instead of throwing.
 *
 * @param userPrompt Prompt text to classify and send.
 * @param domain     Optional domain hint forwarded to the classifier as `{ domain }`.
 * @returns `{ success: true, output, provider, cost, latency }` on success, or
 *          `{ success: false, error: 'Inference unavailable' }` when routing fails.
 */
async function handleUserQuery(userPrompt: string, domain?: string) {
  const classification = classifyInferenceTask(userPrompt, domain ? { domain } : undefined);

  // Low-confidence guard: when the classifier is unsure, use the medium tier
  // rather than trusting a specialised route. The 'simple' tier is exempt
  // because the classifier only assigns it to prompts with a score of at most 1,
  // which caps their confidence at 0.125; applying the threshold to it would
  // make the simple route unreachable. No budget state is consulted here.
  const isLowConfidence = classification.confidence < 0.4 && classification.category !== 'simple';
  const effectiveCategory = isLowConfidence ? 'medium' : classification.category;

  try {
    const result = await router.executeInference(userPrompt, effectiveCategory);
    return {
      success: true,
      output: result.output,
      provider: result.metrics.provider,
      cost: result.metrics.costUsd,
      latency: result.metrics.latencyMs,
    };
  } catch (err) {
    // Log the underlying cause but keep the caller-facing error generic.
    console.error('Routing failure:', err);
    return { success: false, error: 'Inference unavailable' };
  }
}
// Example invocation. This uses top-level await, so it must run in an ES module
// (or be wrapped in an async function). The second argument is the optional
// domain hint that handleUserQuery forwards to the classifier.
const response = await handleUserQuery('Refactor this async loop to prevent race conditions', 'coding');
console.log(response);
Quick Start Guide
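- Initialize the registry: Copy the `INFERENCE_REGISTRY` structure into your configuration module. Supply API keys through environment variables named `<PROVIDER>_API_KEY`, where `<PROVIDER>` is the upper-cased registry key (for example `OPENAI_API_KEY` or `ANTHROPIC_API_KEY`); the router sends each key in the header named by that provider's `authHeader` field.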
- Deploy the router: Instantiate `InferenceRouter` at application startup. Attach telemetry hooks to your existing logging or metrics pipeline (Prometheus, Datadog, or OpenTelemetry).
- Classify incoming requests: Pass user prompts through `classifyInferenceTask()` before routing. Supply explicit domain hints when available to improve classification accuracy.
- Execute with fallback: Call `executeInference()` with the classified category. The router automatically handles retries, failure tracking, and cost attribution.
- Monitor & tune: Review `getAggregatedStats()` daily. Adjust fallback priorities, confidence thresholds, and budget limits based on actual traffic patterns and quality feedback.