ied interface. This prevents model-specific logic from leaking into business code.
// types/inference.ts
/**
 * Provider-agnostic description of one inference call.
 *
 * Business code builds this shape; each provider adapter translates it into
 * the provider's own request format.
 */
export interface InferenceRequest {
  /** Identifier of the model to run, as understood by the target provider. */
  modelId: string;

  /** Model input. Its keys are provider/model specific and are not validated here. */
  payload: Record<string, unknown>;

  /** Optional per-request routing constraints. */
  options?: {
    /** Latency budget for the call, in milliseconds. */
    maxLatencyMs?: number;

    /** Upper bound on the cost of the call, in cents. */
    maxCostCents?: number;

    /** Provider names to try, in order of preference. */
    fallbackProviders?: string[];
  };
}
/**
 * Normalized result of one inference call, independent of which provider
 * produced it.
 */
export interface InferenceResponse {
  /** Name of the provider adapter that served the request. */
  provider: string;

  /** Model identifier the request was made with. */
  modelId: string;

  /** Provider output; its concrete shape depends on the adapter. */
  output: unknown;

  /** Measured duration of the call, in milliseconds. */
  latencyMs: number;

  /** Cost attributed to the call, in cents. */
  costCents: number;

  /** Adapter-specific extras (for example token counts). */
  metadata: Record<string, string | number>;
}
/**
 * Contract every provider integration implements so callers can treat all
 * providers uniformly.
 */
export interface ProviderAdapter {
  /** Unique provider name; used as the lookup key when adapters are registered. */
  readonly name: string;

  /**
   * Reports whether this provider can serve the given modality.
   *
   * @param modality - Modality label such as `'text'`, `'chat'` or `'image'`.
   */
  supportsModality(modality: string): boolean;

  /**
   * Executes the request against the provider.
   *
   * @param request - The provider-agnostic request to translate and send.
   * @returns The provider's result in the normalized response shape.
   */
  invoke(request: InferenceRequest): Promise<InferenceResponse>;

  /**
   * Reports whether the provider is currently usable.
   *
   * @returns `ready`, plus an optional `latencyP95` figure when the adapter
   *   can supply one.
   */
  getHealthStatus(): Promise<{ ready: boolean; latencyP95?: number }>;
}
Step 2: Implement Provider Adapters
Each adapter translates the unified schema into provider-specific calls. Here's a template for a token-based LLM provider and an output-based media provider.
// adapters/togetherAdapter.ts
import { ProviderAdapter, InferenceRequest, InferenceResponse } from '../types/inference';
/**
 * Adapter that sends unified inference requests to Together's
 * `/chat/completions` endpoint and prices them per token.
 *
 * Accepts either a `messages` array (chat) or a plain `prompt` string (text);
 * a prompt is wrapped into a single user message so that both modalities this
 * adapter advertises can actually be served.
 */
export class TogetherAdapter implements ProviderAdapter {
  /** Example rate from the tutorial: $0.20 per 1M tokens, expressed in USD per token. */
  private static readonly USD_PER_TOKEN = 0.0000002;

  /** `max_tokens` sent when the payload does not carry a usable value. */
  private static readonly DEFAULT_MAX_TOKENS = 1024;

  readonly name = 'together';
  private readonly apiKey: string;
  private readonly baseUrl = 'https://api.together.xyz/v1';

  /**
   * @param apiKey - Bearer token sent in the `Authorization` header.
   */
  constructor(apiKey: string) {
    this.apiKey = apiKey;
  }

  /** Returns `true` for the `'text'` and `'chat'` modalities only. */
  supportsModality(modality: string): boolean {
    return modality === 'text' || modality === 'chat';
  }

  /**
   * Sends the request to the chat-completions endpoint and normalizes the reply.
   *
   * @param request - Unified request; `payload.messages` or `payload.prompt`
   *   is required, `payload.max_tokens` is optional.
   * @returns The first completion's content plus latency, cost and token metadata.
   * @throws Error if the payload has no usable input, the HTTP status is not
   *   OK, or the response contains no completion.
   */
  async invoke(request: InferenceRequest): Promise<InferenceResponse> {
    // Validate locally so an unusable payload fails before a network round trip.
    const messages = this.buildMessages(request.payload);
    const maxTokens = this.resolveMaxTokens(request.payload.max_tokens);

    const start = performance.now();
    const response = await fetch(`${this.baseUrl}/chat/completions`, {
      method: 'POST',
      headers: {
        'Authorization': `Bearer ${this.apiKey}`,
        'Content-Type': 'application/json'
      },
      body: JSON.stringify({
        model: request.modelId,
        messages,
        max_tokens: maxTokens
      })
    });
    if (!response.ok) throw new Error(`Together API error: ${response.status}`);

    const data = await response.json();
    const latency = performance.now() - start;

    // Guard the whole access path: an empty `choices` array would otherwise
    // surface as an opaque TypeError instead of a provider error.
    const content = data?.choices?.[0]?.message?.content;
    if (content === undefined || content === null) {
      throw new Error('Together API error: response contained no completion choices');
    }

    // Token-based cost estimation, converted from USD to cents.
    const inputTokens: number = data.usage?.prompt_tokens ?? 0;
    const outputTokens: number = data.usage?.completion_tokens ?? 0;
    const cost = ((inputTokens + outputTokens) * TogetherAdapter.USD_PER_TOKEN) * 100;

    return {
      provider: this.name,
      modelId: request.modelId,
      output: content,
      latencyMs: latency,
      costCents: cost,
      metadata: {
        inputTokens,
        outputTokens,
        // Fall back to the requested id so metadata never holds `undefined`.
        model: typeof data.model === 'string' ? data.model : request.modelId
      }
    };
  }

  /**
   * Probes the `/models` endpoint with the configured key.
   *
   * @returns `{ ready: true }` when the probe returns an OK status;
   *   `{ ready: false }` on a non-OK status or a network failure.
   */
  async getHealthStatus(): Promise<{ ready: boolean; latencyP95?: number }> {
    try {
      const res = await fetch(`${this.baseUrl}/models`, {
        headers: { 'Authorization': `Bearer ${this.apiKey}` }
      });
      return { ready: res.ok };
    } catch {
      return { ready: false };
    }
  }

  /** Picks the chat messages to send: `messages` if usable, else a wrapped `prompt`. */
  private buildMessages(payload: Record<string, unknown>): unknown[] {
    const { messages, prompt } = payload;
    if (Array.isArray(messages) && messages.length > 0) return messages;
    if (typeof prompt === 'string' && prompt.length > 0) {
      return [{ role: 'user', content: prompt }];
    }
    throw new Error(
      'Together API error: payload must include a non-empty "messages" array or a "prompt" string'
    );
  }

  /** Returns the caller's `max_tokens` when it is a positive finite number, else the default. */
  private resolveMaxTokens(value: unknown): number {
    return typeof value === 'number' && Number.isFinite(value) && value > 0
      ? value
      : TogetherAdapter.DEFAULT_MAX_TOKENS;
  }
}
// adapters/falAdapter.ts
import { ProviderAdapter, InferenceRequest, InferenceResponse } from '../types/inference';
/**
 * Adapter that posts unified inference requests to a fal model endpoint and
 * prices them per generated output.
 *
 * The request payload is forwarded as-is; the first image URL (or, failing
 * that, the video URL) of the reply becomes the normalized output.
 */
export class FalAdapter implements ProviderAdapter {
  /** Example rate from the tutorial: $0.008 per generated output, in cents. */
  private static readonly CENTS_PER_OUTPUT = 0.008 * 100;

  readonly name = 'fal';
  private readonly apiKey: string;
  private readonly baseUrl = 'https://fal.run';

  /**
   * @param apiKey - Key sent in the `Authorization: Key …` header.
   */
  constructor(apiKey: string) {
    this.apiKey = apiKey;
  }

  /** Returns `true` for the `'image'` and `'video'` modalities only. */
  supportsModality(modality: string): boolean {
    return modality === 'image' || modality === 'video';
  }

  /**
   * Posts the payload to `<baseUrl>/<modelId>` and normalizes the reply.
   *
   * @param request - Unified request; `modelId` forms the endpoint path and
   *   `payload` is sent verbatim as the JSON body.
   * @returns The output URL plus latency, cost and metadata.
   * @throws Error if the HTTP status is not OK or the reply carries neither an
   *   image URL nor a video URL.
   */
  async invoke(request: InferenceRequest): Promise<InferenceResponse> {
    const start = performance.now();
    // Build the endpoint from the configured base URL rather than a second literal.
    const endpoint = `${this.baseUrl}/${request.modelId}`;
    const response = await fetch(endpoint, {
      method: 'POST',
      headers: {
        'Authorization': `Key ${this.apiKey}`,
        'Content-Type': 'application/json'
      },
      body: JSON.stringify(request.payload)
    });
    if (!response.ok) throw new Error(`Fal API error: ${response.status}`);

    const data = await response.json();
    const latency = performance.now() - start;

    const images: Array<{ url?: string; width?: number | string }> =
      Array.isArray(data?.images) ? data.images : [];
    const imageUrl = images[0]?.url;
    const output = imageUrl || data?.video?.url;
    if (!output) {
      // Fail loudly instead of returning a "successful" response with no output.
      throw new Error('Fal API error: response contained no image or video URL');
    }

    // Output-based pricing: one unit per returned image, or one unit for a video.
    const outputCount = imageUrl ? images.length : 1;
    const cost = outputCount * FalAdapter.CENTS_PER_OUTPUT;

    const metadata: Record<string, string | number> = {
      format: 'output_based',
      outputCount
    };
    // Only report a resolution when the reply has one, so metadata never holds `undefined`.
    const width = images[0]?.width;
    if (typeof width === 'number' || typeof width === 'string') {
      metadata.resolution = width;
    }

    return {
      provider: this.name,
      modelId: request.modelId,
      output,
      latencyMs: latency,
      costCents: cost,
      metadata
    };
  }

  /**
   * Always reports ready; no probe request is made.
   *
   * @returns `{ ready: true }`.
   */
  async getHealthStatus(): Promise<{ ready: boolean }> {
    return { ready: true }; // Media endpoints are typically warm-pooled
  }
}
Step 3: Build the Routing Engine with Fallback Chains
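The router orchestrates providers, enforces SLAs (in this example, the per-request cost cap; `options.maxLatencyMs` is declared on the request but not yet enforced by the router), and tracks cumulative costs per provider.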
// core/inferenceRouter.ts
import { ProviderAdapter, InferenceRequest, InferenceResponse } from '../types/inference';
/**
 * Routes inference requests across registered provider adapters.
 *
 * For each request the router builds an ordered provider chain, tries each
 * provider in turn, applies the optional per-request cost cap and accumulates
 * spend per provider. `options.maxLatencyMs` is not read by this class.
 */
export class InferenceRouter {
  private adapters: Map<string, ProviderAdapter> = new Map();
  private costTracker: Map<string, number> = new Map();

  /**
   * Registers an adapter under its `name`, replacing any adapter already
   * registered under the same name.
   */
  register(adapter: ProviderAdapter): void {
    this.adapters.set(adapter.name, adapter);
  }

  /**
   * Serves the request with the first provider in the chain that succeeds.
   *
   * A provider is skipped when it is not registered, reports not ready, throws,
   * or returns a response whose cost exceeds `options.maxCostCents`.
   *
   * @param request - The unified inference request.
   * @returns The first response that passes the cost cap.
   * @throws Error when every provider in the chain was skipped or failed; the
   *   message carries the reason recorded for the last provider tried. Also
   *   throws when no registered provider supports the detected modality and no
   *   explicit chain was given.
   */
  async route(request: InferenceRequest): Promise<InferenceResponse> {
    const chain = this.resolveChain(request);
    const cap = request.options?.maxCostCents;
    let lastError: Error | null = null;

    for (const providerName of chain) {
      const adapter = this.adapters.get(providerName);
      if (!adapter) {
        // Record why the provider was skipped so the final error is actionable.
        lastError = new Error(`Provider "${providerName}" is not registered`);
        continue;
      }
      try {
        const health = await adapter.getHealthStatus();
        if (!health.ready) {
          lastError = new Error(`Provider "${providerName}" reported not ready`);
          continue;
        }
        const response = await adapter.invoke(request);

        // Record spend before the cap check: the call has already been made, so
        // its cost counts even if the response is rejected for being over cap.
        this.costTracker.set(
          providerName,
          (this.costTracker.get(providerName) ?? 0) + response.costCents
        );

        // Enforce cost cap. An explicit cap of 0 is a real cap, not "no cap".
        if (typeof cap === 'number' && response.costCents > cap) {
          throw new Error(`Cost cap exceeded: ${response.costCents} > ${cap}`);
        }
        return response;
      } catch (err) {
        // Adapters may throw non-Error values; normalize so `.message` is usable.
        lastError = err instanceof Error ? err : new Error(String(err));
      }
    }

    throw new Error(
      `All providers failed. Last error: ${lastError?.message ?? 'no providers were attempted'}`
    );
  }

  /**
   * Builds the ordered provider chain: the caller's `fallbackProviders` when
   * given, otherwise a single default provider chosen by modality.
   */
  private resolveChain(request: InferenceRequest): string[] {
    const fallbacks = request.options?.fallbackProviders || [];
    const primary = fallbacks[0] || this.selectDefaultProvider(request);
    return [primary, ...fallbacks.slice(1)];
  }

  /**
   * Returns the first registered adapter (in registration order) that supports
   * the request's detected modality.
   *
   * @throws Error when no registered adapter supports it.
   */
  private selectDefaultProvider(request: InferenceRequest): string {
    const modality = this.detectModality(request.payload);
    for (const [name, adapter] of this.adapters) {
      if (adapter.supportsModality(modality)) return name;
    }
    throw new Error('No provider supports requested modality');
  }

  /**
   * Infers the modality from payload keys: `messages` means chat; `prompt`
   * with `width` or `height` means image; anything else is treated as text.
   */
  private detectModality(payload: Record<string, unknown>): string {
    if ('messages' in payload) return 'chat';
    if ('prompt' in payload && ('width' in payload || 'height' in payload)) return 'image';
    return 'text';
  }

  /** Returns the accumulated cost in cents per provider name, as a plain object. */
  getCostSummary(): Record<string, number> {
    return Object.fromEntries(this.costTracker);
  }
}
Architecture Rationale:
- Adapter Pattern: Isolates provider-specific authentication, payload transformation, and error handling. Adding a new provider requires implementing one interface, not rewriting business logic.
- Explicit Fallback Chains: Prevents silent degradation. Teams define primary, secondary, and tertiary providers per workload, ensuring predictable failover behavior.
- Unit-Level Cost Tracking: Normalizes disparate pricing models (tokens, outputs, compute seconds) into a single currency metric. This enables budget caps and real-time spend alerts.
- Modality Detection: Routes requests to providers optimized for the workload type, avoiding unnecessary latency from general-purpose gateways.
Pitfall Guide
1. Cold Start Blindness
Explanation: Assuming all providers maintain warm containers. Marketplace APIs frequently spin down idle instances, causing 2–8 second delays on first invocation.
Fix: Implement health checks with latency thresholds. Route latency-sensitive requests to providers with guaranteed warm pools or use provisioned concurrency where available.
2. Pricing Model Mismatch
Explanation: Treating per-second compute pricing as equivalent to per-token pricing. Sustained throughput on runtime-based platforms can exceed budget by 300% compared to token/output models.
Fix: Normalize all costs to a per-unit metric in your routing layer. Enforce hard caps and alert when projected spend exceeds thresholds.
3. API Contract Assumption
Explanation: Assuming OpenAI-compatible endpoints behave identically across providers. Parameter naming, streaming behavior, and error codes often diverge.
Fix: Validate contracts in staging. Use adapter-level schema validation and mock responses that mirror production edge cases.
4. Over-Abstraction Debt
Explanation: Building a universal router that tries to support every provider simultaneously. This creates configuration bloat and obscures failure modes.
Fix: Scope routers to workload categories (text, media, custom). Use separate routing instances per product domain to maintain clarity and reduce blast radius.
5. Observability Gap
Explanation: Logging only success responses. Production failures, retries, and cost anomalies go undetected until customers report issues.
Fix: Instrument every adapter call with latency, cost, provider, and error code. Export metrics to your observability stack with alerting on P95 latency spikes and cost drift.
6. Hardcoded Model Routing
Explanation: Embedding model IDs directly in business logic. When providers deprecate models or change pricing, deployments break.
Fix: Externalize model routing to configuration files or feature flags. Map business intents (e.g., high_quality_image) to provider-specific model IDs at runtime.
7. Rate Limit Neglect
Explanation: Assuming provider rate limits are static. Limits often vary by tier, model, and time of day. Unhandled 429 responses cause cascading failures.
Fix: Implement exponential backoff with jitter. Track remaining quota headers and throttle requests proactively before hitting limits.
Production Bundle
Action Checklist
Decision Matrix
| Scenario | Recommended Approach | Why | Cost Impact |
|---|---|---|---|
| High-throughput text generation | Together AI + token-based router | Predictable per-token pricing, optimized LLM clusters | Low (scales linearly with usage) |
| Bursty media generation | fal.ai + output-based router | Warm pools eliminate cold starts, output pricing aligns with creative workflows | Medium (spikes during traffic bursts) |
| Custom Python inference pipelines | Modal + serverless GPU | Full code control, dependency isolation, batch job support | High (requires engineering overhead) |
| Cost-constrained batch processing | RunPod + dedicated instances | Spot/preemptible GPUs, hourly billing, full infra control | Low-Medium (requires capacity planning) |
| Multi-modal product roadmap | WisGate + unified gateway | OpenAI-compatible layer, cross-modality routing, simplified integration | Medium (gateway markup vs. direct APIs) |
Configuration Template
{
"routing": {
"defaultModality": "text",
"fallbackTimeoutMs": 3000,
"costCapCents": 500,
"providers": {
"together": {
"enabled": true,
"modality": ["text", "chat"],
"pricingModel": "token",
"rateLimit": { "requestsPerMinute": 60, "tokensPerMinute": 100000 }
},
"fal": {
"enabled": true,
"modality": ["image", "video"],
"pricingModel": "output",
"rateLimit": { "requestsPerMinute": 30, "outputsPerMinute": 15 }
},
"wisgate": {
"enabled": false,
"modality": ["text", "image", "embedding"],
"pricingModel": "unified",
"rateLimit": { "requestsPerMinute": 120 }
}
},
"fallbackChains": {
"chat_completion": ["together", "wisgate"],
"image_generation": ["fal", "wisgate"],
"embedding": ["wisgate", "together"]
}
},
"observability": {
"metricsEndpoint": "/metrics",
"costTracking": true,
"latencyThresholds": { "p50": 800, "p95": 2000, "p99": 4000 }
}
}
Quick Start Guide
- Initialize the router: Install dependencies, create an `InferenceRouter` instance, and register adapters for your target providers.
- Load configuration: Parse the JSON template into your application's config store. Map business intents to provider-specific model IDs.
- Wire fallback chains: Define primary and secondary providers per modality. Set latency and cost thresholds in the routing options.
- Deploy with monitoring: Route production traffic through the adapter layer. Verify metrics export to your observability stack. Validate fallback behavior by simulating provider outages in staging.