ack modeling.
3. Bidirectional Guardrails: Safety controls must inspect both inputs (prompt injection, PII leakage) and outputs (data exfiltration, policy violations). Placing guardrails only on requests leaves generated content unvetted.
4. OpenTelemetry Standardization: Proprietary dashboards create vendor lock-in and fragment incident response. Exporting traces, metrics, and logs to OpenTelemetry ensures compatibility with Grafana, Datadog, and Prometheus.
Implementation (TypeScript)
The following implementation demonstrates a modular routing engine with cost tracking, safety validation, and distributed tracing.
import { Span, trace, context } from '@opentelemetry/api';
/**
 * Unified request envelope carrying routing context.
 *
 * One envelope describes a single AI request together with the
 * identifiers and constraints the gateway needs to route it.
 */
interface AIRequestEnvelope {
  /** Identifier of this individual request. */
  requestId: string;

  /** Identifier of the team issuing the request. */
  teamId: string;

  /** Identifier of the application issuing the request. */
  appId: string;

  /** Kind of work requested; restricted to the three literals below. */
  taskType:
    | 'classification'
    | 'reasoning'
    | 'tool_execution';

  /** Request body; values are untyped and must be narrowed before use. */
  payload: Record<string, unknown>;

  /** Per-request routing constraints. */
  metadata: {
    /** Latency budget in milliseconds. */
    maxLatencyMs: number;

    /** Cost ceiling, expressed per token. */
    costThresholdPerToken: number;

    /** Compliance labels attached to the request. */
    complianceTags: string[];
  };
}
/**
 * Provider health snapshot for dynamic routing.
 *
 * A point-in-time record of one provider's observed latency, error rate,
 * price, and availability.
 */
interface ProviderHealth {
  /** Name of the provider this snapshot describes. */
  provider: string;

  /** 99th-percentile latency in milliseconds. */
  p99LatencyMs: number;

  /** Observed error rate (scale not specified here — TODO confirm fraction vs. percent). */
  errorRate: number;

  /** Price per token. */
  costPerToken: number;

  /** Whether the provider can currently accept traffic. */
  available: boolean;
}
/**
 * Core router with fallback and cost-aware selection.
 *
 * Providers are registered as health snapshots. For each request the router
 * keeps the providers that are available and inside the request's latency
 * budget, ranks them for the task type, calls the best one, and fails over to
 * the remaining ranked candidates if a call throws. Every routed request is
 * wrapped in a `route-request` span.
 */
class ModelOrchestrator {
  /** Latest health snapshot per provider, keyed by the name given at registration. */
  private readonly providerRegistry = new Map<string, ProviderHealth>();

  // `trace` is a value, not a namespace, so `trace.Tracer` is not a valid
  // type; derive the tracer type from the API call instead.
  private readonly telemetry: ReturnType<typeof trace.getTracer>;

  constructor() {
    this.telemetry = trace.getTracer('ai-gateway-router');
  }

  /**
   * Adds or replaces the health snapshot stored under `name`.
   *
   * @param name   Registry key for the provider.
   * @param health Snapshot to store; `health.provider` is the name later
   *               passed to the provider call.
   */
  registerProvider(name: string, health: ProviderHealth): void {
    this.providerRegistry.set(name, health);
  }

  /**
   * Routes one request to the best-ranked viable provider, failing over to
   * the other ranked candidates when a provider call throws.
   *
   * @param envelope Request plus its routing constraints.
   * @returns Whatever the serving provider returned.
   * @throws Error when no provider is available within the latency budget,
   *         when no candidate is within the cost threshold (reasoning tasks),
   *         or when every candidate fails (`Fallback exhausted`).
   */
  async routeRequest(envelope: AIRequestEnvelope): Promise<unknown> {
    const span = this.telemetry.startSpan('route-request', {
      attributes: {
        'request.id': envelope.requestId,
        'team.id': envelope.teamId,
        'task.type': envelope.taskType,
      },
    });

    try {
      // Filter providers by availability and latency budget.
      const viableProviders = Array.from(this.providerRegistry.values()).filter(
        (p) => p.available && p.p99LatencyMs <= envelope.metadata.maxLatencyMs,
      );
      if (viableProviders.length === 0) {
        throw new Error('No providers meet latency or availability constraints');
      }

      // Rank by task complexity and cost threshold; the head is the primary,
      // the tail is the failover order.
      const [primary, ...fallbacks] = this.rankProviders(
        viableProviders,
        envelope.taskType,
        envelope.metadata.costThresholdPerToken,
      );
      if (primary === undefined) {
        // Previously this surfaced as a TypeError on `undefined.provider`.
        throw new Error(`No providers within cost threshold for task type: ${envelope.taskType}`);
      }

      span.setAttribute('selected.provider', primary.provider);
      return await this.executeWithFallback(envelope, primary, fallbacks);
    } catch (err) {
      // A thrown value is not guaranteed to be an Error; narrow instead of casting.
      span.recordException(err instanceof Error ? err : String(err));
      throw err;
    } finally {
      span.end();
    }
  }

  /**
   * Orders the candidates from most to least preferred for a task type.
   *
   * - `reasoning`: only providers at or below `costLimit`, fastest first.
   * - anything else: all candidates, lowest `costPerToken + p99LatencyMs` first.
   *
   * The input array is never mutated.
   *
   * @returns A new array; empty when no candidate qualifies.
   */
  private rankProviders(
    providers: readonly ProviderHealth[],
    taskType: string,
    costLimit: number,
  ): ProviderHealth[] {
    if (taskType === 'reasoning') {
      // `filter` already yields a fresh array, so sorting it is safe.
      return providers
        .filter((p) => p.costPerToken <= costLimit)
        .sort((a, b) => a.p99LatencyMs - b.p99LatencyMs);
    }
    // Copy before sorting: Array.prototype.sort works in place.
    return [...providers].sort(
      (a, b) => a.costPerToken + a.p99LatencyMs - (b.costPerToken + b.p99LatencyMs),
    );
  }

  /**
   * Calls `primary`, then each entry of `fallbacks` in order, returning the
   * first successful result.
   *
   * Fallbacks come from the same ranked list as the primary, so failover
   * never leaves the request's latency and cost constraints.
   *
   * @throws Error `Fallback exhausted` when every candidate throws; the last
   *         provider error is attached as `cause`.
   */
  private async executeWithFallback(
    envelope: AIRequestEnvelope,
    primary: ProviderHealth,
    fallbacks: readonly ProviderHealth[] = [],
  ): Promise<unknown> {
    let lastError: unknown;
    for (const candidate of [primary, ...fallbacks]) {
      try {
        return await this.invokeProvider(candidate.provider, envelope);
      } catch (err) {
        // Keep the failure so it is not silently lost, then try the next one.
        lastError = err;
      }
    }
    // Attach the cause as a property; the two-argument Error constructor
    // needs a newer lib target than this file otherwise requires.
    throw Object.assign(new Error('Fallback exhausted'), { cause: lastError });
  }

  /**
   * Simulated provider call; replace with an actual SDK/HTTP client.
   *
   * @returns A fixed success payload echoing the provider name.
   */
  private async invokeProvider(provider: string, envelope: AIRequestEnvelope): Promise<unknown> {
    return { provider, status: 'success', tokensUsed: { input: 120, output: 45 } };
  }
}
/**
 * Token-level cost ledger with quota enforcement.
 *
 * Tracks cumulative token usage per team against a configured monthly limit.
 * The ledger is in-memory only and has no notion of calendar months itself.
 */
class TokenLedger {
  /** Per-team limit and running usage, keyed by team id. */
  private readonly quotas = new Map<string, { monthlyLimit: number; currentUsage: number }>();

  /**
   * Configures (or reconfigures) a team's quota.
   *
   * Note: this always starts the team's usage at 0, so calling it again for
   * an existing team discards the usage recorded so far.
   *
   * @param teamId Team to configure.
   * @param limit  Monthly token limit.
   */
  setQuota(teamId: string, limit: number): void {
    this.quotas.set(teamId, { monthlyLimit: limit, currentUsage: 0 });
  }

  /**
   * Adds consumed tokens to a team's usage and enforces the limit.
   *
   * The usage is recorded before the limit check, so the ledger reflects
   * the overage when the quota error is thrown.
   *
   * @param teamId Team that consumed the tokens.
   * @param tokens Number of tokens consumed; must be finite and >= 0.
   * @throws Error      when the team has no quota, or the quota is exceeded.
   * @throws RangeError when `tokens` is NaN, infinite, or negative.
   */
  recordUsage(teamId: string, tokens: number): void {
    const quota = this.requireQuota(teamId);

    // NaN would poison the counter (every later `>` check is false) and a
    // negative count would quietly hand quota back, so reject both.
    if (!Number.isFinite(tokens) || tokens < 0) {
      throw new RangeError(`Invalid token count for team ${teamId}: ${tokens}`);
    }

    quota.currentUsage += tokens;
    if (quota.currentUsage > quota.monthlyLimit) {
      throw new Error(`Quota exceeded for team ${teamId}: ${quota.currentUsage}/${quota.monthlyLimit}`);
    }
  }

  /**
   * Summarises a team's usage.
   *
   * @param teamId Team to report on.
   * @returns Used tokens, configured limit, and `limit - used` (negative
   *          once the team is over quota).
   * @throws Error when the team has no quota configured.
   */
  getUsageReport(teamId: string): { used: number; limit: number; remaining: number } {
    const quota = this.requireQuota(teamId);
    return {
      used: quota.currentUsage,
      limit: quota.monthlyLimit,
      remaining: quota.monthlyLimit - quota.currentUsage,
    };
  }

  /** Looks up a team's quota entry, failing with a descriptive error if absent. */
  private requireQuota(teamId: string): { monthlyLimit: number; currentUsage: number } {
    const quota = this.quotas.get(teamId);
    if (quota === undefined) {
      throw new Error(`No quota configured for team: ${teamId}`);
    }
    return quota;
  }
}
/**
 * Bidirectional safety pipeline.
 *
 * Holds one validator for inbound payloads and one for outbound responses.
 * Both are simulated placeholder checks that work on the JSON-serialised
 * form of their argument.
 */
class SafetyPipeline {
  /**
   * Screens an inbound payload.
   *
   * @param payload Request payload; inspected via its JSON serialisation.
   * @param tags    Compliance tags accompanying the request.
   * @returns `false` when the serialised payload contains the
   *          `SYSTEM_OVERRIDE` marker or the tags include
   *          `restricted_content`; otherwise `true`.
   */
  async validateInput(payload: Record<string, unknown>, tags: string[]): Promise<boolean> {
    // Stand-in for prompt-injection, PII, and policy checks.
    const serializedPayload = JSON.stringify(payload);
    if (serializedPayload.includes('SYSTEM_OVERRIDE')) {
      return false;
    }
    if (tags.includes('restricted_content')) {
      return false;
    }
    return true;
  }

  /**
   * Screens an outbound response.
   *
   * @param response Provider response; inspected via its JSON serialisation.
   * @param tags     Compliance tags accompanying the request (not consulted
   *                 by the current simulated check).
   * @returns `false` when the serialised response contains text shaped like
   *          a US SSN (`ddd-dd-dddd`); otherwise `true`.
   */
  async validateOutput(response: unknown, tags: string[]): Promise<boolean> {
    // Stand-in for data-leakage and compliance filtering.
    const serializedResponse = JSON.stringify(response);
    const ssnShape = /\b\d{3}-\d{2}-\d{4}\b/; // SSN pattern example
    if (ssnShape.test(serializedResponse)) {
      return false;
    }
    return true;
  }
}
Why This Architecture Works
- Separation of Concerns: Routing, cost tracking, safety, and telemetry operate independently. This prevents cascading failures and allows teams to update guardrail policies without touching routing logic.
- Context Propagation: The `AIRequestEnvelope` carries team, application, and compliance metadata across all pipeline stages. This enables precise attribution and policy enforcement without relying on external lookups.
- Dynamic Fallback: Providers are evaluated against real-time health snapshots. When a provider degrades, the router automatically shifts traffic to the next viable option, maintaining SLA compliance.
- OpenTelemetry Integration: Every routing decision, fallback, and quota check emits structured spans. This feeds directly into Grafana, Datadog, or Prometheus, eliminating proprietary dashboard dependencies.
Pitfall Guide
1. Deployment-First Blindness
Explanation: Teams evaluate routing features before confirming data residency, VPC compatibility, or air-gapped support. This leads to late-stage disqualification when legal or security teams intervene.
Fix: Map infrastructure constraints (on-prem, multi-cloud, regional isolation) before reviewing feature matrices. Eliminate SaaS-only options immediately if compliance mandates internal hosting.
2. Latency Compounding in Agent Chains
Explanation: Sub-3ms routing overhead seems negligible until multiplied across 20+ sequential tool calls in an agent workflow. The cumulative delay degrades user experience and breaks timeout budgets.
Fix: Benchmark guardrail and routing latency under load. Set explicit p99 budgets per hop and implement async parallelization where tool calls are independent.
3. Request-Level Cost Tracking
Explanation: Billing by request obscures which models, teams, or workflows drive spend. Finance teams cannot allocate costs accurately, leading to uncontrolled budget overruns.
Fix: Implement token-level attribution tagged with team, application, and workflow identifiers. Enforce monthly quotas and route around cost thresholds dynamically.
4. Stateless Proxy Assumption for Agents
Explanation: Treating multi-step agents like simple chat requests ignores session state, tool permissions, and workflow tracing. Proxies cannot enforce per-agent RBAC or reconstruct execution graphs.
Fix: Deploy a gateway that maintains session context, tracks tool invocation chains, and supports MCP-style orchestration with explicit permission boundaries.
5. Proprietary Observability Lock-in
Explanation: Relying on vendor-specific dashboards fragments incident response. Operations teams cannot correlate AI gateway traces with backend services, slowing root-cause analysis.
Fix: Export all traces, metrics, and logs via OpenTelemetry. Configure standard receivers for Grafana, Datadog, or Prometheus to maintain a unified observability stack.
6. Hardcoded Fallback Chains
Explanation: Static fallback sequences (e.g., OpenAI → Anthropic → Local) ignore real-time provider health and cost fluctuations. Traffic may route to degraded or expensive endpoints unnecessarily.
Fix: Implement dynamic routing that evaluates live health snapshots, latency budgets, and cost thresholds. Update provider status via health checks every 30-60 seconds.
7. Input-Only Guardrails
Explanation: Filtering prompts but ignoring generated outputs leaves systems vulnerable to data leakage, policy violations, and unsafe content reaching end users or external tools.
Fix: Deploy bidirectional guardrails. Validate inputs for injection and PII, then validate outputs for compliance, data exfiltration, and restricted information before delivery.
Production Bundle
Action Checklist
Decision Matrix
| Scenario | Recommended Approach | Why | Cost Impact |
|---|---|---|---|
| Early-stage startup with public data | SaaS-hosted dynamic gateway | Fast deployment, managed scaling, lower operational overhead | Low upfront, pay-as-you-go token pricing |
| Regulated enterprise (healthcare/finance) | VPC or on-prem orchestration gateway | Data residency compliance, audit trails, legal approval | Higher infrastructure cost, predictable monthly spend |
| Multi-cloud AI platform | Hybrid routing with regional isolation | Avoid vendor lock-in, optimize latency per region, maintain governance | Moderate infrastructure, reduced egress fees via smart routing |
| Air-gapped or classified environment | Self-hosted gateway with local model routing | Zero external data exposure, full control over updates and policies | Highest CapEx, requires dedicated DevOps/MLOps team |
Configuration Template
# Gateway configuration template.
# NOTE(review): the nesting below was reconstructed from the key names after
# the original indentation was lost — confirm it against the gateway's schema.
gateway:
  version: "2.1"

  # Provider selection and failover.
  routing:
    strategy: dynamic
    health_check_interval: 30s
    p99_latency_budget_ms: 45
    fallback_policy: automatic

  # Token-level spend attribution and per-team quotas.
  cost_attribution:
    granularity: token_level
    tagging:
      - team_id
      - app_id
      - workflow_id
    quotas:
      - team: "data-science"
        monthly_limit_tokens: 50000000
        action_on_exceed: "throttle_and_alert"

  # Safety rules applied to both requests and responses.
  guardrails:
    direction: bidirectional
    input_rules:
      - type: prompt_injection
        action: block
      - type: pii_detection
        action: mask
    output_rules:
      - type: data_leakage
        action: block
      - type: compliance_filter
        policy: "eu_ai_act_v2"

  # Telemetry export.
  observability:
    exporter: opentelemetry
    endpoints:
      - type: trace
        url: "http://otel-collector:4317"
      - type: metrics
        url: "http://otel-collector:4318"
    retention_days: 90

  # Hosting model and compliance posture.
  deployment:
    mode: vpc
    region_isolation: true
    compliance_certifications:
      - SOC2
      - HIPAA
      - GDPR
Quick Start Guide
- Initialize the routing engine: Deploy the `ModelOrchestrator` and register provider health endpoints. Configure health checks to update provider status every 30 seconds.
- Attach cost tracking: Instantiate `TokenLedger`, set monthly token quotas per team, and integrate it into the routing pipeline to block requests exceeding limits.
- Enable bidirectional guardrails: Deploy `SafetyPipeline` before and after model invocation. Configure input rules for injection/PII and output rules for compliance/data leakage.
- Connect observability: Export spans and metrics via OpenTelemetry to your existing Grafana/Datadog instance. Verify that agent workflows produce complete trace graphs across all tool calls.
- Validate under load: Run synthetic traffic simulating 350+ RPS per vCPU. Confirm p99 latency remains under 3ms for routing, guardrails execute within budget, and cost attribution matches token consumption.