fallbackOrder: string[];
maxContextTokens: number;
}
/**
 * One upstream provider endpoint that the gateway can dispatch a request to.
 */
interface ProviderRoute {
/** Provider name; used to label the error raised when this route fails. */
vendor: string;
/** URL the request is POSTed to. */
endpoint: string;
/** Name of the environment variable that holds the API key (not the key itself). */
apiKeyRef: string;
/** Provider rate ceiling. NOTE(review): units are not established by the code shown here — confirm. */
rateLimit: number;
/** Relative routing weight. NOTE(review): presumably for traffic splitting; not consumed by the code shown here — confirm. */
weight: number;
}
### Step 2: Implement Request Interception & Normalization
The gateway must normalize payloads across different provider schemas. Most providers support OpenAI-compatible endpoints, but subtle differences in parameter naming, streaming flags, and error formats require normalization.
```typescript
/**
 * Converts loosely-typed inbound payloads into the gateway's canonical
 * request shape, substituting defaults for fields that are absent or unusable.
 */
class RequestNormalizer {
  private static readonly DEFAULT_MODEL = 'default';
  private static readonly DEFAULT_TEMPERATURE = 0.7;
  private static readonly DEFAULT_MAX_TOKENS = 1024;

  /**
   * Builds a normalized request from an arbitrary payload.
   *
   * Never throws on malformed input: a payload that is not an object
   * (including `null`/`undefined`) is treated as empty, and every field falls
   * back to its default when missing or not coercible.
   *
   * @param payload Untrusted request body, typically parsed JSON.
   * @returns The normalized request with a fresh trace id and timestamp.
   */
  static standardize(payload: unknown): NormalizedAIRequest {
    const raw = RequestNormalizer.asRecord(payload);
    return {
      model: RequestNormalizer.toModel(raw.model),
      messages: Array.isArray(raw.messages) ? raw.messages : [],
      temperature: RequestNormalizer.toFiniteNumber(
        raw.temperature,
        RequestNormalizer.DEFAULT_TEMPERATURE
      ),
      // The snake_case spelling wins when both spellings are present.
      maxTokens: RequestNormalizer.toFiniteNumber(
        raw.max_tokens ?? raw.maxTokens,
        RequestNormalizer.DEFAULT_MAX_TOKENS
      ),
      stream: RequestNormalizer.toBoolean(raw.stream),
      metadata: {
        traceId: crypto.randomUUID(),
        timestamp: Date.now(),
        source: String(raw._source ?? 'unknown')
      }
    };
  }

  /** Returns the payload as a key/value record, or an empty record when it is not an object. */
  private static asRecord(payload: unknown): Record<string, unknown> {
    return typeof payload === 'object' && payload !== null
      ? (payload as Record<string, unknown>)
      : {};
  }

  /** Accepts only non-empty string/number model names so objects never become "[object Object]". */
  private static toModel(value: unknown): string {
    const usable = typeof value === 'string' || typeof value === 'number';
    return usable && value ? String(value) : RequestNormalizer.DEFAULT_MODEL;
  }

  /** Coerces to a number, using the fallback when the value is nullish or coerces to NaN/Infinity. */
  private static toFiniteNumber(value: unknown, fallback: number): number {
    const parsed = Number(value ?? fallback);
    return Number.isFinite(parsed) ? parsed : fallback;
  }

  /** Boolean coercion that reads the strings "false", "0" and blank as false rather than truthy. */
  private static toBoolean(value: unknown): boolean {
    if (typeof value === 'string') {
      const text = value.trim().toLowerCase();
      return text !== '' && text !== 'false' && text !== '0';
    }
    return Boolean(value);
  }
}
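```
### Step 3: Build Async Fallback Chains
Sequential retry loops do not block the event loop, but they serialize provider latency and inflate tail latency: a hung primary delays every fallback queued behind it. Instead, race the providers against a strict timeout boundary. The executor below dispatches the request to every route in the chain concurrently, resolves with the first provider that succeeds, and rejects if none succeeds before the timeout. Note that concurrent dispatch trades extra provider spend for lower latency.
```typescript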
/**
 * Dispatches a normalized request to every route in a chain concurrently and
 * returns the first successful provider response, bounded by one overall
 * timeout.
 */
class FallbackExecutor {
  /**
   * Runs the request against all routes at once and resolves with the first
   * route that succeeds. A route that fails quickly does not end the attempt
   * while other routes are still in flight.
   *
   * @param request   Normalized request sent to each provider.
   * @param chain     Candidate routes; all are attempted concurrently.
   * @param timeoutMs Deadline in milliseconds for the whole chain.
   * @returns The first successful provider response.
   * @throws Error with message `Fallback timeout` when no route succeeds
   *         before the deadline.
   * @throws Error with message `All fallback routes exhausted` when the chain
   *         is empty or every route fails; the underlying failure is attached
   *         as the `cause` property.
   */
  async execute(
    request: NormalizedAIRequest,
    chain: ProviderRoute[],
    timeoutMs: number = 5000
  ): Promise<AIResponse> {
    if (chain.length === 0) {
      throw new Error('All fallback routes exhausted');
    }

    const controller = new AbortController();
    let timer: ReturnType<typeof setTimeout> | undefined;
    const deadline = new Promise<never>((_, reject) => {
      timer = setTimeout(() => reject(new Error('Fallback timeout')), timeoutMs);
    });

    // Rejections must reach Promise.any untouched: converting them into
    // fulfilled nulls would let the fastest *failure* win the race.
    const firstSuccess = Promise.any(
      chain.map(route => this.attemptRoute(request, route, controller.signal))
    ).catch((failure: unknown) => {
      throw Object.assign(new Error('All fallback routes exhausted'), { cause: failure });
    });

    try {
      return await Promise.race([firstSuccess, deadline]);
    } finally {
      // Release the timer so it cannot keep the process alive, and cancel
      // requests that lost the race or outlived the deadline.
      clearTimeout(timer);
      controller.abort();
    }
  }

  /**
   * Sends the request to a single provider.
   *
   * @param req    Normalized request, serialized as the JSON body.
   * @param route  Provider endpoint and credential reference.
   * @param signal Optional abort signal used to cancel the in-flight request.
   * @returns The provider's parsed JSON body.
   * @throws Error when the API key variable is unset or the provider responds
   *         with a non-2xx status.
   */
  private async attemptRoute(
    req: NormalizedAIRequest,
    route: ProviderRoute,
    signal?: AbortSignal
  ): Promise<AIResponse> {
    const apiKey = process.env[route.apiKeyRef];
    if (!apiKey) {
      // Fail locally rather than sending "Bearer undefined" upstream.
      throw new Error(
        `Missing API key for provider ${route.vendor}: environment variable ${route.apiKeyRef} is not set`
      );
    }

    // NOTE(review): the normalized request is serialized as-is; confirm each
    // provider accepts this schema or add a per-vendor mapping.
    const response = await fetch(route.endpoint, {
      method: 'POST',
      headers: {
        'Authorization': `Bearer ${apiKey}`,
        'Content-Type': 'application/json'
      },
      body: JSON.stringify(req),
      signal
    });
    if (!response.ok) throw new Error(`Provider ${route.vendor} returned ${response.status}`);
    return response.json();
  }
}
```
### Step 4: Integrate Real-Time Token Accounting
Cost overruns occur when token consumption is measured only after the fact. The gateway must stream token counts alongside responses, updating a ledger in real time. This enables immediate budget enforcement and prevents runaway generation; the ledger below raises an alert as soon as an alias exceeds its budget, which is the hook for a hard stop.
```typescript
/**
 * In-memory running total of tokens consumed per model alias, with an alert
 * emitted whenever a total exceeds that alias's budget.
 */
class TokenLedger {
  /** Budget applied when no usable `BUDGET_<ALIAS>` environment variable exists. */
  private static readonly DEFAULT_BUDGET = 1000000;

  private counters: Map<string, number> = new Map();

  /**
   * Adds tokens to the alias's running total and alerts when the new total
   * exceeds the budget.
   *
   * @param alias  Model alias the usage is attributed to.
   * @param tokens Number of tokens to add. Non-finite values (NaN, Infinity)
   *               are ignored with a warning, because adding them would make
   *               every later budget comparison for this alias meaningless.
   */
  increment(alias: string, tokens: number): void {
    if (!Number.isFinite(tokens)) {
      console.warn(`[LEDGER] Ignoring invalid token count for ${alias}: ${tokens}`);
      return;
    }

    const total = (this.counters.get(alias) ?? 0) + tokens;
    this.counters.set(alias, total);
    if (total > this.getBudget(alias)) {
      this.emitBudgetAlert(alias, total);
    }
  }

  /**
   * Reads the budget from `BUDGET_<ALIAS>` (alias upper-cased).
   *
   * Falls back to the default when the variable is unset, blank, or not a
   * finite number; otherwise a typo would yield NaN and silently disable the
   * budget check.
   */
  private getBudget(alias: string): number {
    const raw = process.env[`BUDGET_${alias.toUpperCase()}`];
    if (raw === undefined || raw.trim() === '') {
      return TokenLedger.DEFAULT_BUDGET;
    }
    const budget = Number(raw);
    return Number.isFinite(budget) ? budget : TokenLedger.DEFAULT_BUDGET;
  }

  /** Logs a warning that the alias has gone over budget. */
  private emitBudgetAlert(alias: string, consumed: number): void {
    console.warn(`[LEDGER] Budget exceeded for ${alias}: ${consumed} tokens`);
    // Integrate with monitoring system (Datadog, Prometheus, etc.)
  }
}
```
### Step 5: Wire OpenTelemetry at the Routing Layer
Telemetry injected at the gateway level captures 100% of AI traffic without requiring instrumentation in every downstream service. Use OpenTelemetry to trace request routing, fallback decisions, token consumption, and provider latency.
```typescript
import { trace } from '@opentelemetry/api';
/**
 * Wraps a routed AI call in an OpenTelemetry span named `ai.route`.
 */
class TelemetryInjector {
  // Numeric values of OpenTelemetry's SpanStatusCode.OK and SpanStatusCode.ERROR.
  private static readonly STATUS_OK = 1;
  private static readonly STATUS_ERROR = 2;

  /**
   * Runs `fn` inside an active span, recording the model alias, trace id,
   * token usage and final status. The span is always ended.
   *
   * @param request Normalized request; supplies the alias and trace id attributes.
   * @param fn      The routed call to execute and measure.
   * @returns Whatever `fn` resolves with.
   * @throws Re-throws whatever `fn` throws, unchanged, after recording it on the span.
   */
  static async span(request: NormalizedAIRequest, fn: () => Promise<AIResponse>): Promise<AIResponse> {
    const tracer = trace.getTracer('ai-gateway');
    return tracer.startActiveSpan('ai.route', async (span) => {
      span.setAttribute('ai.model.alias', request.model);
      span.setAttribute('ai.trace.id', request.metadata.traceId);
      try {
        const result = await fn();
        span.setAttribute('ai.tokens.used', result.usage?.total_tokens ?? 0);
        span.setStatus({ code: TelemetryInjector.STATUS_OK });
        return result;
      } catch (err: unknown) {
        // Anything can be thrown in JavaScript; normalize so the span always
        // gets a message and the handler cannot itself fail on `throw null`.
        const failure = err instanceof Error ? err : new Error(String(err));
        span.recordException(failure);
        span.setStatus({ code: TelemetryInjector.STATUS_ERROR, message: failure.message });
        throw err;
      } finally {
        span.end();
      }
    });
  }
}
```
**Architecture Rationale:**
- Concurrent fallback chains avoid serializing provider latency behind a slow or hung primary, reducing tail latency compared to sequential retries.
- Real-time token accounting shifts cost control from post-mortem analysis to active enforcement.
- OpenTelemetry at the proxy layer eliminates instrumentation duplication and guarantees consistent trace context across provider boundaries.
- Semantic aliases decouple business logic from vendor-specific identifiers, enabling zero-downtime provider migrations.
Pitfall Guide
1. Hardcoded Model Identifiers
Explanation: Embedding vendor-specific strings like gpt-4o or claude-sonnet-4-20250514 directly in application code creates tight coupling. When providers rename models or deprecate endpoints, every service requires coordinated deployments.
Fix: Route all requests through semantic aliases (primary-chat, fast-reasoning, cost-optimized). Maintain alias mappings in external configuration files that can be updated without code changes.
2. Ignoring Context Window Drift
Explanation: Providers frequently adjust context limits without breaking API contracts. An application that assumes a fixed token budget may silently truncate prompts or trigger provider-side validation errors.
Fix: Implement dynamic context validation before dispatch. Compare incoming message token counts against the alias's maxContextTokens threshold and truncate or reject requests proactively.
3. Synchronous Fallback Chains
Explanation: Awaiting fallback providers one at a time in a for...of loop does not block the event loop, but it serializes provider latency. If the primary provider hangs for 10 seconds, the request stalls for those 10 seconds before the next provider is even tried, inflating p99 latency.
Fix: Use Promise.any() or Promise.race() with timeout boundaries. Fire all eligible routes concurrently and resolve with the first successful response, canceling pending requests via AbortController.
4. Unbounded Streaming Buffers
Explanation: Streaming responses accumulate chunks in memory until completion. Long generations or misbehaving providers can cause heap exhaustion and OOM crashes.
Fix: Implement backpressure-aware chunk processing. Pipe streaming data directly to the client or message queue without buffering the full payload. Use ReadableStream with controlled pull() cycles.
5. Missing PII/Content Filtering
Explanation: Raw user prompts often contain sensitive data. Forwarding unfiltered payloads to external providers violates compliance requirements (GDPR, HIPAA, SOC2) and exposes organizations to data leakage.
Fix: Inject pre-flight sanitization middleware. Use regex patterns, NER models, or dedicated redaction services to strip or tokenize sensitive fields before routing to external endpoints.
6. Rate Limit Blindness
Explanation: Assuming provider rate limits are static leads to sudden 429 errors during traffic spikes. Different providers enforce limits per minute, per token, or per concurrent request.
Fix: Implement a token bucket algorithm with provider-specific ceilings. Track request velocity locally and throttle dispatch before hitting provider thresholds. Cache rate limit headers from responses to adjust local buckets dynamically.
7. Cost Accounting After the Fact
Explanation: Measuring token consumption only after response completion delays budget enforcement. By the time overages are detected, the cost has already been incurred.
Fix: Stream token counts alongside generation. Update the ledger incrementally and enforce hard stops when approaching budget thresholds. Integrate with cloud billing APIs for real-time cost projection.
Production Bundle
Action Checklist
Decision Matrix
| Scenario | Recommended Approach | Why | Cost Impact |
|---|---|---|---|
| Startup MVP | Single-provider with lightweight fallback | Minimizes infrastructure overhead while maintaining basic resilience | Low (~$50/mo proxy + provider costs) |
| High-Scale Enterprise | Multi-provider routing with real-time cost optimization | Distributes load, prevents vendor lock-in, and dynamically selects cheapest capable model | Medium (+15% infra, -30% token spend) |
| Compliance-Heavy (Healthcare/Finance) | On-premise gateway with PII redaction and audit logging | Ensures data never leaves controlled boundaries; meets regulatory requirements | High (dedicated infra + compliance tooling) |
| Multi-Region Global | Geo-aware routing with latency-based provider selection | Reduces network hops, improves p95 latency, and complies with data residency laws | Medium (+regional endpoints, -latency penalties) |
Configuration Template
```yaml
ai_gateway:
  version: "2.1"
  routing:
    aliases:
      primary_chat:
        providers:
          - vendor: "openai"
            endpoint: "https://api.openai.com/v1/chat/completions"
            api_key_ref: "OPENAI_API_KEY"
            rate_limit: 5000
            weight: 70
          - vendor: "anthropic"
            endpoint: "https://api.anthropic.com/v1/messages"
            api_key_ref: "ANTHROPIC_API_KEY"
            rate_limit: 3000
            weight: 30
        fallback_order: ["openai", "anthropic"]
        max_context_tokens: 128000
        timeout_ms: 4500
      fast_reasoning:
        providers:
          - vendor: "google"
            endpoint: "https://generativelanguage.googleapis.com/v1beta/models"
            api_key_ref: "GOOGLE_API_KEY"
            rate_limit: 4000
            weight: 100
        fallback_order: ["google"]
        max_context_tokens: 32768
        timeout_ms: 3000
  telemetry:
    provider: "opentelemetry"
    export_interval_ms: 5000
    trace_sampling: 0.1
  budget:
    primary_chat: 2000000
    fast_reasoning: 500000
    alert_threshold: 0.85
  security:
    pii_redaction: true
    prompt_max_length: 10000
    allowed_origins: ["*.yourdomain.com"]
```
Quick Start Guide
- Initialize the proxy service: Clone the gateway repository, install dependencies (`npm install`), and copy the configuration template to `config/gateway.yaml`.
- Inject environment variables: Set provider API keys and budget thresholds in your runtime environment. Ensure `OPENAI_API_KEY`, `ANTHROPIC_API_KEY`, and `GOOGLE_API_KEY` are populated.
- Start the routing engine: Run `npm run start:gateway`. The service will bind to `localhost:3000` and load routing policies from the YAML configuration.
- Redirect application traffic: Update your application's AI client to point to `http://localhost:3000/v1/chat/completions`. Replace vendor-specific model names with semantic aliases (`primary_chat`, `fast_reasoning`).
- Verify telemetry and fallback: Trigger a test request, check OpenTelemetry traces for routing decisions, and simulate a provider failure by temporarily invalidating an API key. Confirm automatic fallback and budget alerting.