?: number;
}
/**
 * Describes one selectable model tier: its identity, its cost/latency/accuracy
 * profile, and the endpoint that serves it.
 */
interface ModelTier {
  /** Short tier identifier (e.g. 't1'). */
  id: string;
  /** Human-readable model name (e.g. 'fast-7b'). */
  name: string;
  /** Price charged per token; presumably USD — confirm against billing data. */
  costPerToken: number;
  /** Average response latency in milliseconds, per the field name. */
  avgLatencyMs: number;
  /** Highest accuracy the tier is expected to reach; sample values are fractions such as 0.82. */
  maxAccuracy: number;
  /** Endpoint that serves this tier (e.g. '/v1/fast'). */
  endpoint: string;
}
/**
 * Picks a model tier for an inference request.
 *
 * Routing order:
 *   1. An explicit accuracy requirement above 0.95 always selects the premium tier.
 *   2. An explicit latency budget below 100 ms selects the fast tier.
 *   3. Otherwise the payload's estimated complexity decides.
 *
 * NOTE(review): an accuracy requirement at or below 0.95 does not influence the
 * choice, so a request asking for e.g. 0.90 can still land on a tier whose
 * `maxAccuracy` is lower. Confirm whether that is intended.
 */
class InferenceRouter {
  /** Available tiers, ordered from cheapest/fastest to most capable. */
  private tiers: ModelTier[] = [
    { id: 't1', name: 'fast-7b', costPerToken: 0.0002, avgLatencyMs: 45, maxAccuracy: 0.82, endpoint: '/v1/fast' },
    { id: 't2', name: 'balanced-13b', costPerToken: 0.0008, avgLatencyMs: 120, maxAccuracy: 0.91, endpoint: '/v1/balanced' },
    { id: 't3', name: 'premium-70b', costPerToken: 0.0035, avgLatencyMs: 280, maxAccuracy: 0.98, endpoint: '/v1/premium' }
  ];

  /**
   * Selects the tier that should serve `request`.
   *
   * @param request - Inference request; `requiredAccuracy` and `maxLatency` are optional hints.
   * @returns The chosen tier.
   */
  async route(request: InferenceRequest): Promise<ModelTier> {
    // Compare against null/undefined explicitly: a truthiness check would
    // silently ignore a legitimate value of 0.
    if (request.requiredAccuracy != null && request.requiredAccuracy > 0.95) {
      return this.tiers[2];
    }
    if (request.maxLatency != null && request.maxLatency < 100) {
      return this.tiers[0];
    }

    const complexity = await this.estimateComplexity(request.payload);
    if (complexity < 0.4) return this.tiers[0];
    if (complexity < 0.75) return this.tiers[1];
    return this.tiers[2];
  }

  /**
   * Lightweight complexity heuristic in the range [0, 1].
   *
   * Score = up to 0.5 scaled by token count (tokens / 500), plus 0.3 when the
   * text contains a math-like character, plus 0.2 when it contains a question mark.
   *
   * @param text - Raw request payload.
   * @returns Complexity score clamped to at most 1.
   */
  private async estimateComplexity(text: string): Promise<number> {
    // Trim first so an empty or whitespace-only payload counts as zero tokens
    // instead of one, and surrounding whitespace does not add phantom tokens.
    const trimmed = text.trim();
    const tokens = trimmed === '' ? 0 : trimmed.split(/\s+/).length;

    // '-' and '/' are escaped so they are literals rather than a range or the
    // end of the regex literal. The four \u escapes are √ π ∑ ∫; the source
    // characters were mis-encoded, so this set is a reconstruction — confirm.
    const hasMath = /[=+\-*\/^\u221A\u03C0\u2211\u222B]/.test(text);
    const isQuestion = text.includes('?');

    return Math.min(1, (tokens / 500) * 0.5 + (hasMath ? 0.3 : 0) + (isQuestion ? 0.2 : 0));
  }
}
### Step 2: Add Semantic Caching
LLM responses for identical or semantically similar prompts are cacheable. A vector-based cache reduces redundant compute by 40–60% for repetitive workflows.
```typescript
import { createHash } from 'node:crypto';
import { createClient } from 'redis';
/**
 * Redis-backed response cache keyed by a hash of the prompt's embedding.
 *
 * The key is a truncated SHA-256 of the embedding rounded to two decimals, so
 * a lookup hits only when two prompts quantize to the exact same vector; no
 * nearest-neighbour search is performed.
 */
class SemanticCache {
  private redis = createClient({ url: process.env.REDIS_URL });
  private embeddingModel = new EmbeddingClient(); // abstraction for text embedding
  /** Memoized `connect()` promise so concurrent first calls share one connection attempt. */
  private connecting?: Promise<unknown>;

  /**
   * Looks up a cached response for the request's payload.
   *
   * @param request - Inference request whose `payload` is embedded to form the key.
   * @returns The cached response, or `null` when nothing is stored under the key.
   */
  async get(request: InferenceRequest): Promise<string | null> {
    const key = await this.cacheKey(request);
    await this.ensureConnected();
    return await this.redis.get(key);
  }

  /**
   * Stores a response under the request's cache key.
   *
   * @param request - Inference request whose `payload` is embedded to form the key.
   * @param response - Response text to cache.
   * @param ttlSeconds - Expiry in seconds (default one hour).
   */
  async set(request: InferenceRequest, response: string, ttlSeconds = 3600): Promise<void> {
    const key = await this.cacheKey(request);
    await this.ensureConnected();
    await this.redis.set(key, response, { EX: ttlSeconds });
  }

  /** Connects the Redis client on first use; the client rejects commands until it is open. */
  private ensureConnected(): Promise<unknown> {
    if (this.connecting === undefined) {
      this.connecting = this.redis.connect().catch((err: unknown) => {
        // Forget the failed attempt so the next call can retry.
        this.connecting = undefined;
        throw err;
      });
    }
    return this.connecting;
  }

  /** Embeds the payload and derives the Redis key shared by `get` and `set`. */
  private async cacheKey(request: InferenceRequest): Promise<string> {
    const embedding = await this.embeddingModel.embed(request.payload);
    return `cache:${this.hashEmbedding(embedding)}`;
  }

  /**
   * Quantizes the vector to two decimals and hashes it.
   *
   * @param vec - Embedding vector.
   * @returns First 16 hex characters of the SHA-256 of the comma-joined quantized values.
   */
  private hashEmbedding(vec: number[]): string {
    const quantized = vec.map(v => Math.round(v * 100) / 100).join(',');
    return createHash('sha256').update(quantized).digest('hex').slice(0, 16);
  }
}
```

### Step 3: Integrate Real-Time Cost Telemetry
Cost tracking must be request-level, not aggregate. Emit metrics to your observability stack for cost-per-tenant, cost-per-feature, and utilization tracking.
/**
 * Accumulates request counts, token counts and spend per model tier, and
 * emits the totals as JSON lines when flushed.
 */
class CostTelemetry {
  private metrics = new Map<string, { requests: number; tokens: number; cost: number }>();

  /**
   * Adds one completed request to the running totals of its tier.
   *
   * @param tier - Tier that served the request; its `id` is the aggregation key.
   * @param inputTokens - Tokens consumed by the prompt.
   * @param outputTokens - Tokens produced in the response.
   */
  record(tier: ModelTier, inputTokens: number, outputTokens: number): void {
    const tierKey = String(tier.id);
    const spend = (inputTokens + outputTokens) * tier.costPerToken;

    let bucket = this.metrics.get(tierKey);
    if (!bucket) {
      bucket = { requests: 0, tokens: 0, cost: 0 };
      this.metrics.set(tierKey, bucket);
    }

    bucket.requests = bucket.requests + 1;
    bucket.tokens = bucket.tokens + inputTokens + outputTokens;
    bucket.cost = bucket.cost + spend;
  }

  /**
   * Writes one JSON line per tier to stdout, then resets all totals.
   * The log call stands in for a Prometheus / Datadog / CloudWatch exporter.
   */
  flush(): void {
    for (const [tierId, totals] of this.metrics) {
      const payload = {
        metric: 'ai_inference_cost',
        tags: { tier: tierId },
        values: { requests: totals.requests, tokens: totals.tokens, cost_usd: totals.cost }
      };
      console.log(JSON.stringify(payload));
    }
    this.metrics.clear();
  }
}
Architecture Decisions & Rationale
- Router at the API Gateway: Decouples cost logic from business services. Enables consistent routing across microservices without code duplication.
- Tiered Model Registry: Models are versioned and tagged with cost/latency/accuracy profiles. Enables hot-swapping without deployment cycles.
- Semantic Cache with TTL: Vector similarity prevents cache collisions while maximizing hit rates. TTL prevents stale responses in dynamic domains.
- Async Batching for High Throughput: Non-critical requests are batched server-side to maximize GPU utilization. Reduces per-token overhead by 30–50%.
- Observability-First Design: Cost metrics are emitted at request completion. Enables automated scaling, budget alerts, and feature-level ROI tracking.
Pitfall Guide
- Aggressive Quantization Without Validation: Quantizing to INT8 or FP4 without task-specific validation causes silent accuracy degradation. Always benchmark quantized models against a golden dataset before production rollout. Use per-channel quantization for vision models and per-token for LLMs.
- Caching Without Context Awareness: Caching raw prompts ignores user state, session context, and dynamic variables. Implement context-aware cache keys that include user ID, session tokens, and feature flags. Never cache PII or regulated data.
- Static Routing Instead of Dynamic: Hardcoded routing rules fail under traffic shifts. Implement dynamic routing with fallback chains. If the optimal model exceeds latency thresholds, automatically degrade to the next tier without dropping the request.
- Ignoring Egress and Data Transfer Costs: GPU clusters in one region serving global users incur massive egress fees. Deploy inference close to users via edge nodes or regional replicas. Compress payloads and use streaming where applicable.
- Over-Provisioning for Peak vs Average Load: Provisioning for 99th percentile latency wastes compute during off-peak hours. Use autoscaling with predictive scaling based on historical traffic patterns. Combine with spot/preemptible instances for non-critical inference.
- Missing Cost-Per-Request Metrics: Aggregate dashboards hide inefficient endpoints. Track cost per request, cost per token, and cost per successful completion. Tie metrics to business outcomes to identify which features justify their compute spend.
- Neglecting Cold Start Optimization: Model loading and GPU warm-up add 2–8 seconds to first requests. Pre-warm endpoints during traffic ramps, use model sharding to keep active layers in memory, and implement connection pooling to reduce initialization overhead.
Best Practices from Production:
- Run cost-aware routing in shadow mode for 14 days before enforcing.
- Implement feature flags to toggle optimization layers without deployments.
- Automate model retirement when cost/accuracy ratios degrade beyond thresholds.
- Align engineering KPIs with unit economics, not just latency/accuracy.
Production Bundle
Action Checklist
Decision Matrix
| Scenario | Recommended Approach | Why | Cost Impact |
|---|---|---|---|
| High-volume, low-complexity queries | Semantic cache + fast-7b tier | 60%+ hit rate eliminates redundant compute; small model handles simple intent | -72% vs baseline |
| Real-time chat with strict latency | Edge routing + dynamic degradation | Keeps p95 latency <150ms; falls back to smaller model if primary exceeds threshold | -45% vs baseline |
| Batch document processing | Async queue + FP8 quantization | Maximizes GPU throughput; quantization reduces memory bandwidth pressure | -68% vs baseline |
| Budget-constrained startup | Multi-tier routing + spot instances | Balances accuracy with cost; spot pricing cuts infrastructure spend by 60% | -81% vs baseline |
Configuration Template
# Router, model registry, cache and telemetry settings.
# NOTE(review): the original indentation was lost, which left duplicate
# top-level keys (enabled, model, endpoint, ...). The nesting below is a
# reconstruction — confirm in particular that model_registry, cache,
# telemetry and budget_alerts are top-level sections.
ai_router:
  enabled: true
  fallback_chain: ["premium-70b", "balanced-13b", "fast-7b"]
  thresholds:
    max_latency_ms: 200
    min_accuracy: 0.85
    cost_per_request_limit: 0.05

model_registry:
  tiers:
    fast:
      model: "meta-llama-3-8b-instruct"
      quantization: "int8"
      cost_per_token: 0.0002
      endpoint: "http://gpu-pool-fast:8080/v1"
    balanced:
      model: "mistral-7b-instruct"
      quantization: "fp16"
      cost_per_token: 0.0008
      endpoint: "http://gpu-pool-balanced:8080/v1"
    premium:
      model: "llama-3-70b-instruct"
      quantization: "fp16"
      cost_per_token: 0.0035
      endpoint: "http://gpu-pool-premium:8080/v1"

cache:
  provider: "redis"
  ttl_seconds: 3600
  embedding_model: "text-embedding-3-small"
  similarity_threshold: 0.88
  exclude_patterns:
    - "api_key"
    - "user_pii"
    - "session_token"

telemetry:
  enabled: true
  exporter: "prometheus"
  labels: ["tenant_id", "feature", "model_tier"]
  flush_interval_ms: 5000

budget_alerts:
  daily_limit_usd: 500
  action: "scale_down"
Quick Start Guide
- Clone the router template: `git clone https://github.com/codcompass/ai-cost-router && cd ai-cost-router`
- Configure environment variables: Copy `.env.example` to `.env`, then set `REDIS_URL`, `MODEL_ENDPOINTS`, and `TELEMETRY_ENDPOINT`.
- Deploy the routing service: `docker compose up -d router cache telemetry`
- Verify routing logic: `curl -X POST http://localhost:3000/inference -H "Content-Type: application/json" -d '{"payload":"Explain quantum computing simply","requiredAccuracy":0.85}'`
- Monitor cost metrics: Open the Grafana/Datadog dashboard at `http://localhost:3001/metrics` to confirm request-level cost tracking and tier selection.