adonly baseUrl: string;
private readonly apiKey: string;
/**
 * Captures the connection settings that later requests rely on.
 *
 * @param baseUrl - Root URL of the API; stored unchanged.
 * @param apiKey - Credential for the API; stored unchanged.
 */
constructor(baseUrl: string, apiKey: string) {
  this.apiKey = apiKey;
  this.baseUrl = baseUrl;
}
/**
 * Reports whether this adapter recognises the requested model.
 *
 * @param request - Request whose `model` is compared against a fixed list of ids.
 * @returns `true` only when `request.model` exactly equals one of the listed ids.
 */
isCompatible(request: ModelRequest): boolean {
  const supportedModels = ['gpt-4o', 'claude-3-5-sonnet-20241022', 'claude-3-opus'];
  return supportedModels.some((modelId) => modelId === request.model);
}
/**
 * Posts the prompt to `<baseUrl>/chat/completions` and maps the reply onto
 * the shared `ModelResponse` shape.
 *
 * The prompt is sent as a single `user` message. `maxTokens` falls back to
 * 1024 and `temperature` to 0.7 when the request leaves them unset.
 *
 * @param request - Model id, prompt and optional sampling settings.
 * @returns The first choice's message content, the reported token usage
 *   (0 when the payload carries no usage block) and the elapsed time in ms.
 * @throws Error when the HTTP status is not OK, or when a successful
 *   response contains no choice to read from.
 */
async execute(request: ModelRequest): Promise<ModelResponse> {
  const start = Date.now();
  const response = await fetch(`${this.baseUrl}/chat/completions`, {
    method: 'POST',
    headers: {
      'Authorization': `Bearer ${this.apiKey}`,
      'Content-Type': 'application/json',
    },
    body: JSON.stringify({
      model: request.model,
      messages: [{ role: 'user', content: request.prompt }],
      max_tokens: request.maxTokens ?? 1024,
      temperature: request.temperature ?? 0.7,
    }),
  });
  if (!response.ok) {
    // The error body may not be JSON; fall back to the HTTP status text.
    const err = await response.json().catch(() => ({}));
    throw new Error(`[${this.name}] ${response.status}: ${err.error?.message || response.statusText}`);
  }
  const data = await response.json();
  // Fail with a labelled error instead of an opaque TypeError when the
  // payload has no choices, so callers can tell which provider misbehaved.
  const message = data.choices?.[0]?.message;
  if (!message) {
    throw new Error(`[${this.name}] response contained no choices`);
  }
  return {
    content: message.content,
    provider: this.name,
    model: request.model,
    inputTokens: data.usage?.prompt_tokens ?? 0,
    outputTokens: data.usage?.completion_tokens ?? 0,
    latencyMs: Date.now() - start,
  };
}
}
```typescript
/**
 * Provider adapter for the Gemini `generateContent` REST endpoint.
 *
 * Accepts any model whose id starts with `gemini-` and normalises the reply
 * into the shared `ModelResponse` shape.
 */
class GeminiAdapter implements ProviderAdapter {
  readonly name = 'gemini';
  private readonly baseUrl: string;
  private readonly apiKey: string;

  /**
   * @param baseUrl - API root; `/models/<model>:generateContent` is appended to it.
   * @param apiKey - Sent with every request in the `x-goog-api-key` header.
   */
  constructor(baseUrl: string, apiKey: string) {
    this.baseUrl = baseUrl;
    this.apiKey = apiKey;
  }

  /** Returns `true` when the requested model id starts with `gemini-`. */
  isCompatible(request: ModelRequest): boolean {
    return request.model.startsWith('gemini-');
  }

  /**
   * Sends the prompt as a single text part and returns the first candidate's text.
   *
   * `maxTokens` falls back to 1024 and `temperature` to 0.7 when unset.
   *
   * @param request - Model id, prompt and optional sampling settings.
   * @returns Normalised response; `content` is `''` and the token counts are
   *   0 when the payload omits the corresponding fields.
   * @throws Error when the HTTP status is not OK.
   */
  async execute(request: ModelRequest): Promise<ModelResponse> {
    const start = Date.now();
    // The key travels in a header rather than the query string so it does not
    // end up in URLs captured by logs, proxies or error traces. The model id
    // is encoded because it becomes part of the URL path.
    const url = `${this.baseUrl}/models/${encodeURIComponent(request.model)}:generateContent`;
    const response = await fetch(url, {
      method: 'POST',
      headers: {
        'Content-Type': 'application/json',
        'x-goog-api-key': this.apiKey,
      },
      body: JSON.stringify({
        contents: [{ parts: [{ text: request.prompt }] }],
        generationConfig: {
          maxOutputTokens: request.maxTokens ?? 1024,
          temperature: request.temperature ?? 0.7,
        },
      }),
    });
    if (!response.ok) {
      // The error body may not be JSON; fall back to the HTTP status text.
      const err = await response.json().catch(() => ({}));
      throw new Error(`[${this.name}] ${response.status}: ${err.error?.message || response.statusText}`);
    }
    const data = await response.json();
    const content = data.candidates?.[0]?.content?.parts?.[0]?.text ?? '';
    return {
      content,
      provider: this.name,
      model: request.model,
      inputTokens: data.usageMetadata?.promptTokenCount ?? 0,
      outputTokens: data.usageMetadata?.candidatesTokenCount ?? 0,
      latencyMs: Date.now() - start,
    };
  }
}
```
Step 3: Build the Dispatch Engine with Fallback & Smart Routing
The router coordinates adapters, enforces fallback chains, and applies task-based routing rules.
/**
 * Dispatches requests to registered provider adapters.
 *
 * A request is sent to its target provider first (from the request itself or
 * from a task route), then to the remaining providers of the fallback order,
 * one at a time, until one of them succeeds.
 */
export class ModelRouter {
  private adapters: ProviderAdapter[] = [];
  private fallbackOrder: string[] = [];
  private taskRoutes: Record<string, { provider: string; model: string }> = {};

  /** Adds an adapter; adapters are looked up by their `name` when routing. */
  register(adapter: ProviderAdapter) {
    this.adapters.push(adapter);
  }

  /**
   * Sets the provider names to try, in order, when routing.
   *
   * The list is copied so later changes to the caller's array do not alter
   * the router's behaviour.
   */
  setFallbackOrder(providers: string[]) {
    this.fallbackOrder = [...providers];
  }

  /** Replaces the task-type to `{ provider, model }` routing table. */
  defineTaskRoutes(routes: Record<string, { provider: string; model: string }>) {
    this.taskRoutes = routes;
  }

  /**
   * Executes a request against the first provider that succeeds.
   *
   * @param request - The request; its `provider` and `model` are used unless
   *   a task route overrides them.
   * @param taskType - Optional key into the task routes. When a route exists
   *   for it, the route's provider and model replace the request's.
   * @returns The response of the first adapter whose `execute` resolves.
   * @throws Error when no provider produced a response. The message lists
   *   the failure reason of every provider that was attempted.
   */
  async route(request: ModelRequest, taskType?: string): Promise<ModelResponse> {
    let targetModel = request.model;
    let targetProvider = request.provider;

    // Own-property check so task types such as 'constructor' cannot resolve
    // to members inherited from Object.prototype.
    const taskRoute =
      taskType && Object.prototype.hasOwnProperty.call(this.taskRoutes, taskType)
        ? this.taskRoutes[taskType]
        : undefined;
    if (taskRoute) {
      targetProvider = taskRoute.provider;
      targetModel = taskRoute.model;
    }

    // The target provider goes first; the fallback order follows without it.
    const orderedProviders = targetProvider
      ? [targetProvider, ...this.fallbackOrder.filter(p => p !== targetProvider)]
      : this.fallbackOrder;

    const failures: string[] = [];
    for (const providerName of orderedProviders) {
      const adapter = this.adapters.find(a => a.name === providerName);
      if (!adapter) continue; // No adapter registered under this name.
      try {
        return await adapter.execute({ ...request, model: targetModel, provider: providerName });
      } catch (err: unknown) {
        // Adapters may throw non-Error values; never let reading the reason
        // throw, or the remaining providers would not be tried.
        const reason = err instanceof Error ? err.message : String(err);
        console.warn(`[${providerName}] execution failed: ${reason}`);
        failures.push(`[${providerName}] ${reason}`);
      }
    }

    const summary = 'All configured providers failed to execute the request';
    throw new Error(failures.length > 0 ? `${summary}: ${failures.join('; ')}` : summary);
  }
}
Step 4: Integrate Cost Metering
Token pricing varies significantly across providers. A dedicated metering utility normalizes spend calculation and attaches it to request telemetry.
/**
 * Converts the token usage of a response into a cost figure using a fixed
 * per-provider rate table.
 */
export class CostMeter {
  /** Per-token rates, keyed by the provider name found on the response. */
  private static readonly rates: Record<string, { input: number; output: number }> = {
    'openai-compatible': { input: 0.000005, output: 0.000025 },
    'gemini': { input: 0.000000125, output: 0.0000005 },
  };

  /**
   * Computes the cost of a single response.
   *
   * @param response - Response whose `provider`, `inputTokens` and
   *   `outputTokens` are read.
   * @returns `inputTokens * input rate + outputTokens * output rate`, or 0
   *   when the provider has no entry in the rate table.
   */
  static calculate(response: ModelResponse): number {
    const { provider, inputTokens, outputTokens } = response;
    // Referencing the class by name (not `this`) keeps the method usable when
    // passed around detached. The own-property check stops provider names
    // such as 'toString' from resolving to inherited Object.prototype members.
    const rates = Object.prototype.hasOwnProperty.call(CostMeter.rates, provider)
      ? CostMeter.rates[provider]
      : undefined;
    if (!rates) return 0;
    return (inputTokens * rates.input) + (outputTokens * rates.output);
  }
}
Architecture Rationale
- Adapter Pattern: Isolates vendor-specific HTTP logic. Adding a new provider requires only implementing `ProviderAdapter`, leaving the router untouched.
- Sequential Fallback: Guarantees deterministic degradation. The router attempts the primary provider first, then cycles through the fallback list, preventing parallel waste during outages.
- Task-Based Routing: Decouples business intent from model selection. The application declares `taskType: 'code'`, and the router maps it to the optimal model without hardcoding vendor names in business logic.
- Explicit Cost Metering: Centralizes pricing logic. Token rates change frequently; isolating them in a single module prevents drift and simplifies audits.
Pitfall Guide
1. Ignoring Tokenization Differences
Explanation: Each provider tokenizes text differently. A 1000-character prompt may yield 200 tokens on one model and 350 on another. Hardcoding token limits causes silent truncation or budget overruns.
Fix: Always read usage.prompt_tokens and usage.completion_tokens from the response. Never assume character-to-token ratios. Implement dynamic truncation based on actual token counts.
2. Hardcoding Fallback Sequences Without Health Checks
Explanation: Blindly cycling through providers during an outage wastes compute and increases latency. If a provider is regionally degraded, repeated attempts will fail identically.
Fix: Implement a lightweight circuit breaker or health cache. Track consecutive failures per provider and temporarily remove them from the fallback chain until a recovery window passes.
3. Normalizing Errors Incorrectly
Explanation: Providers return different HTTP status codes and error payloads. OpenAI uses 429 for rate limits, Gemini also uses 429 but with a different JSON structure, and Claude may return 503 during overload. Generic error handling masks root causes.
Fix: Parse error responses per adapter. Map vendor codes to a unified AIError enum (RATE_LIMIT, CONTEXT_EXCEEDED, SERVICE_UNAVAILABLE, AUTH_FAILURE). Log the raw payload for debugging.
4. Overlooking Context Window & Rate Limit Disparities
Explanation: Models have vastly different context limits (e.g., 128K vs 1M tokens) and RPM/TPM quotas. Routing a long-document summary to a model with a 4K window causes silent truncation.
Fix: Maintain a model capability registry. Validate request.prompt length against the target model's context window before execution. Implement token-aware chunking for oversized inputs.
5. Cost Attribution Drift
Explanation: Pricing tiers change without notice. Hardcoded rates in business logic cause budget miscalculations. Multi-tenant systems struggle to attribute spend to specific features or users.
Fix: Externalize pricing to a configuration service or environment variable. Attach a requestId to every call and log cost alongside telemetry. Reconcile spend weekly against provider invoices.
6. Streaming Protocol Mismatch
Explanation: The examples above use non-streaming completions that return the whole response in a single payload. Production systems often require streaming. OpenAI and Gemini use different SSE formats and chunk structures. Mixing them breaks UI rendering.
Fix: Abstract streaming into a separate StreamAdapter interface. Normalize chunks into a unified delta: string format. Handle done signals and error termination consistently across providers.
7. Missing Request Correlation IDs
Explanation: Without correlation IDs, debugging multi-provider failures becomes impossible. You cannot trace which provider handled a request, how much it cost, or where latency spiked.
Fix: Generate a traceId at the router level. Propagate it through adapters, cost metering, and logging. Use it in observability dashboards to correlate AI calls with downstream business events.
Production Bundle
Action Checklist
Decision Matrix
| Scenario | Recommended Approach | Why | Cost Impact |
|---|---|---|---|
| High-volume summarization | Route to Gemini Flash via fallback chain | Lowest per-token pricing, fast inference | ~80% reduction vs premium models |
| Complex code generation | Primary: Claude 3.5 Sonnet, Fallback: GPT-4o | Superior syntax reasoning, fewer hallucinations | Moderate (premium pricing justified by output quality) |
| Creative/marketing copy | Primary: GPT-4o, Fallback: Claude Opus | Stronger tone control and stylistic flexibility | Higher, but reduces manual editing overhead |
| Real-time chat with strict latency | Route to fastest available provider + circuit breaker | Minimizes TTFB, maintains UX during outages | Slight increase due to fallback attempts, offset by reduced timeout retries |
| Budget-constrained MVP | Single provider + explicit fallback to Gemini | Simplifies initial setup while preserving resilience | Lowest baseline, scales cost only when primary fails |
Configuration Template
// config/ai-routing.config.ts
import { ModelRouter, CostMeter } from './model-router';
import { OpenAICompatibleAdapter } from './adapters/openai-compatible';
import { GeminiAdapter } from './adapters/gemini';
/**
 * Builds a `ModelRouter` wired with three adapters, a fallback order and a
 * task-routing table.
 *
 * Base URLs and API keys come from environment variables. A missing or empty
 * base URL falls back to the default shown below, and a missing API key
 * becomes an empty string.
 *
 * @returns The configured router.
 */
export function initializeAIRouter() {
  const env = process.env;
  const router = new ModelRouter();

  // Adapters, in registration order.
  // NOTE(review): 'claude-compatible' is used in the fallback order and task
  // routes below, but both OpenAICompatibleAdapter instances are built from
  // the same class with only a URL and key. If that class reports a single
  // fixed `name`, nothing is registered as 'claude-compatible' and those
  // routes fall straight through to the fallbacks. Confirm against the
  // adapter's `name` declaration.
  const adapters = [
    new OpenAICompatibleAdapter(
      env.OPENAI_BASE_URL || 'https://api.openai.com/v1',
      env.OPENAI_API_KEY || ''
    ),
    new OpenAICompatibleAdapter(
      env.CLAUDE_COMPAT_BASE_URL || 'https://api.ofox.ai/v1',
      env.CLAUDE_API_KEY || ''
    ),
    new GeminiAdapter(
      env.GEMINI_BASE_URL || 'https://generativelanguage.googleapis.com/v1beta',
      env.GEMINI_API_KEY || ''
    ),
  ];
  for (const adapter of adapters) {
    router.register(adapter);
  }

  // Providers are tried in this order once the target provider has failed.
  const fallbackChain = ['claude-compatible', 'openai-compatible', 'gemini'];
  router.setFallbackOrder(fallbackChain);

  // Task type -> provider/model pair that overrides the request's own choice.
  const taskRoutes = {
    code: { provider: 'claude-compatible', model: 'claude-3-5-sonnet-20241022' },
    creative: { provider: 'openai-compatible', model: 'gpt-4o' },
    fast: { provider: 'gemini', model: 'gemini-1.5-flash' },
    analysis: { provider: 'claude-compatible', model: 'claude-3-opus' },
  };
  router.defineTaskRoutes(taskRoutes);

  return router;
}
export { CostMeter };
Quick Start Guide
- Install dependencies: Ensure your project supports `fetch` (Node 18+ or a modern bundler). No external HTTP client is required.
- Create a `.env` file: Populate `OPENAI_API_KEY`, `CLAUDE_API_KEY`, `GEMINI_API_KEY`, and the optional base URLs. Never commit this file.
- Initialize the router: Import `initializeAIRouter` in your application entry point and attach the router it returns to your dependency injection container or global service registry.
- Execute a routed request: Call `router.route(request, 'code')`, passing the task type as the second argument rather than as a field of the request. The engine handles provider selection, fallback, and response normalization automatically.
- Monitor costs & latency: Wrap execution in a try/catch block, pass the response to `CostMeter.calculate()`, and log the result alongside your `traceId` for observability.
This architecture transforms AI integration from a fragile vendor dependency into a controllable, observable infrastructure layer. Start with two providers, enforce explicit fallback chains, and expand routing rules as workload diversity grows. The routing layer pays for itself in reduced downtime, predictable spend, and the ability to swap models without rewriting business logic.