ient } from '@supabase/supabase-js';
import { z } from 'zod';
/**
 * Runtime schema for one model interaction submitted for evaluation.
 * Every numeric field is a plain `z.number()`; no range or integer
 * constraints are applied here.
 */
const EvalSchema = z.object({
// Prompt text; persisted unchanged by ProductionEvalHarness.evaluate.
prompt: z.string(),
// Model output; scanned by the loop/refusal heuristic and persisted.
response: z.string(),
// Request latency; compared against the harness latency threshold.
latencyMs: z.number(),
// Token count; feeds the cost estimate and is persisted.
tokenCount: z.number(),
// Passed to `new Date(...)` when persisting, so it is read as epoch milliseconds.
timestamp: z.number(),
});
/** Static input type inferred from EvalSchema. */
type EvalInput = z.infer<typeof EvalSchema>;
/**
 * Scores a single model interaction against rule-based thresholds and
 * persists the outcome to the `ai_evals` table for dashboarding.
 */
export class ProductionEvalHarness {
  private supabase: ReturnType<typeof createClient>;
  private threshold: { hallucination: number; latency: number; costPer1k: number };

  /**
   * @param config Supabase project URL and API key used to create the client.
   */
  constructor(config: { supabaseUrl: string; supabaseKey: string }) {
    this.supabase = createClient(config.supabaseUrl, config.supabaseKey);
    // NOTE: `hallucination` is stored here but is not read by `evaluate`.
    this.threshold = { hallucination: 0.08, latency: 1200, costPer1k: 2.5 };
  }

  /**
   * Runs the latency, loop/refusal and cost checks, stores the record, and
   * reports which checks were breached.
   *
   * A failed insert does not change the returned result; it is logged via
   * `console.error` so persistence problems are visible instead of silent.
   *
   * @param input The interaction to evaluate.
   * @returns `pass` is true only when no flag was raised; `flags` lists the
   *   breached checks in the order they were evaluated.
   * @throws RangeError if `input.timestamp` does not map to a valid date
   *   (raised by `Date.prototype.toISOString`).
   */
  async evaluate(input: EvalInput): Promise<{ pass: boolean; flags: string[] }> {
    const flags: string[] = [];

    // Rule-based latency check
    if (input.latencyMs > this.threshold.latency) {
      flags.push('LATENCY_BREACH');
    }

    // Lightweight hallucination proxy: check for known refusal/loop patterns
    if (this.detectLoopOrRefusal(input.response)) {
      flags.push('HALLUCINATION_PROXY');
    }

    // Estimated spend for this request at the example rate of 0.002 per
    // 1,000 tokens; compared against `threshold.costPer1k`.
    const estimatedCost = (input.tokenCount / 1000) * 0.002; // Example pricing
    if (estimatedCost > this.threshold.costPer1k) {
      flags.push('COST_BREACH');
    }

    // Persist for dashboarding and drift detection. The client reports
    // failures through the returned `error` field rather than by throwing,
    // so it has to be inspected explicitly.
    const { error } = await this.supabase.from('ai_evals').insert({
      prompt: input.prompt,
      response: input.response,
      latency_ms: input.latencyMs,
      token_count: input.tokenCount,
      flags,
      created_at: new Date(input.timestamp).toISOString(),
    });
    if (error) {
      console.error('[ProductionEvalHarness] failed to persist eval record:', error.message);
    }

    return { pass: flags.length === 0, flags };
  }

  /**
   * Heuristic text scan used as a hallucination proxy.
   *
   * @param text Model response to scan.
   * @returns true when any of the loop/refusal/error patterns matches.
   */
  private detectLoopOrRefusal(text: string): boolean {
    const patterns = [/again.\s*again./i, /i cannot/i, /error/i, /undefined/i];
    return patterns.some(p => p.test(text));
  }
}
Architecture decision: Separate evaluation from request routing. This prevents eval overhead from blocking user responses. Use asynchronous insertion to maintain p95 latency. Store flags for drift detection and rollback triggers.
### Step 2: Progressive Canary Router
Never route 100% of traffic to a new model or prompt version. Implement a weight-based router that splits traffic by tenant, session, or feature flag.
```typescript
import { createClient } from '@supabase/supabase-js';
/**
 * Assigns each tenant/feature pair to the `stable` or `canary` version using
 * a rollout weight read from the `ai_rollout_config` table.
 */
export class CanaryRouter {
  private supabase: ReturnType<typeof createClient>;

  /**
   * @param config Supabase project URL and API key used to create the client.
   */
  constructor(config: { supabaseUrl: string; supabaseKey: string }) {
    this.supabase = createClient(config.supabaseUrl, config.supabaseKey);
  }

  /**
   * Picks the version for a tenant. The decision depends only on the
   * tenant/feature pair and the current weight, so repeated calls with an
   * unchanged weight return the same answer.
   *
   * @param tenantId Tenant identifier mixed into the routing hash.
   * @param feature Feature name; selects the config row and is mixed into the hash.
   * @returns `'canary'` when the tenant's bucket (0-99) is below `weight * 100`,
   *   otherwise `'stable'`.
   */
  async selectVersion(tenantId: string, feature: string): Promise<'stable' | 'canary'> {
    // Fetch current rollout weight from config table
    const { data: config, error } = await this.supabase
      .from('ai_rollout_config')
      .select('canary_weight')
      .eq('feature', feature)
      .single();

    // The client returns lookup failures in `error` instead of throwing.
    // Surface them so a fallback to the default weight is never silent.
    if (error) {
      console.warn(
        `[CanaryRouter] could not load rollout config for feature "${feature}"; using default weight:`,
        error.message,
      );
    }

    const weight = config?.canary_weight ?? 0.1; // Default 10% canary
    const bucket = this.hashString(`${tenantId}:${feature}`) % 100;
    return bucket < weight * 100 ? 'canary' : 'stable';
  }

  /**
   * Deterministic string hash: for each UTF-16 code unit computes
   * `hash * 31 + unit`, truncated to a signed 32-bit integer.
   *
   * @param str Text to hash.
   * @returns The absolute value of the final 32-bit hash.
   */
  private hashString(str: string): number {
    let hash = 0;
    for (let i = 0; i < str.length; i++) {
      const char = str.charCodeAt(i);
      hash = ((hash << 5) - hash) + char; // (hash << 5) - hash === hash * 31
      hash |= 0; // truncate to signed 32-bit
    }
    return Math.abs(hash);
  }
}
```
Architecture decision: Use deterministic hashing based on tenant ID to ensure session consistency. Users should not experience version flickering. Store rollout weights in a low-latency config store (Supabase, Redis, or feature flag service). This enables hot-adjustment without redeployment.
### Step 3: Observability & Feedback Ingestion
User feedback must be captured, normalized, and routed to evaluation pipelines. Decouple feedback collection from core request handling.
/** Posts user feedback to the ingestion endpoint without waiting for the result. */
export class FeedbackIngestor {
  /**
   * Serialises one feedback record and sends it to `/api/feedback/ingest`.
   * The request is not awaited and any rejection is discarded, so the
   * returned promise settles as soon as the request has been dispatched.
   *
   * @param sessionId Identifier sent as `session_id`.
   * @param rating Numeric rating supplied by the user.
   * @param comment Optional free text; a missing or empty comment is sent as `null`.
   */
  async recordFeedback(sessionId: string, rating: number, comment?: string) {
    const body = JSON.stringify({
      session_id: sessionId,
      rating,
      comment: comment ? comment : null,
      ingested_at: Date.now(),
    });

    const requestOptions = {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body,
      keepalive: true,
    };

    // Fire-and-forget: never block the caller, and ignore network failures on the client.
    const pending = fetch('/api/feedback/ingest', requestOptions);
    pending.catch(() => {});
  }
}
Architecture decision: Use keepalive or background workers for feedback ingestion. Never block the response cycle. Correlate feedback with eval flags using session_id to identify which prompts or model versions drive negative sentiment.
### Step 4: Cost-Aware Fallback Chain
AI requests can exceed token budgets or trigger rate limits. Implement a deterministic fallback that preserves user experience without incurring model costs.
/**
 * Wraps a model call with deterministic fallbacks so callers always receive
 * a text response plus the name of the path that produced it.
 */
export class FallbackChain {
  /**
   * Calls the model and, on failure, degrades to a deterministic answer.
   *
   * @param request Prompt text and the token budget passed to the model call.
   * @returns `source` is `'model'` on success, `'fallback_summary'` when the
   *   failure is a token-limit error, and `'circuit_breaker'` for any other error.
   */
  async execute(request: { prompt: string; maxTokens: number }): Promise<{ text: string; source: string }> {
    try {
      const response = await this.callModel(request.prompt, request.maxTokens);
      return { text: response, source: 'model' };
    } catch (err) {
      if (this.isTokenLimitError(err)) {
        return { text: this.generateDeterministicSummary(request.prompt), source: 'fallback_summary' };
      }
      return { text: 'Service temporarily unavailable. Please retry.', source: 'circuit_breaker' };
    }
  }

  /** True when `err` is an `Error` whose message contains `context_length_exceeded`. */
  private isTokenLimitError(err: unknown): boolean {
    return err instanceof Error && err.message.includes('context_length_exceeded');
  }

  /**
   * Builds the fallback text from the first 100 UTF-16 code units of the prompt.
   *
   * If that cut lands between the two halves of a surrogate pair, the dangling
   * high surrogate is dropped so the result never contains a broken character.
   */
  private generateDeterministicSummary(prompt: string): string {
    const limit = 100;
    let excerpt = prompt.slice(0, limit);

    const lastUnit = prompt.charCodeAt(limit - 1);
    const nextUnit = prompt.charCodeAt(limit);
    const splitsSurrogatePair =
      lastUnit >= 0xd800 && lastUnit <= 0xdbff && nextUnit >= 0xdc00 && nextUnit <= 0xdfff;
    if (splitsSurrogatePair) {
      excerpt = excerpt.slice(0, -1);
    }

    return `Processed request: ${excerpt}... (summary generated via deterministic pipeline)`;
  }

  private async callModel(prompt: string, maxTokens: number): Promise<string> {
    // Placeholder for model invocation
    throw new Error('context_length_exceeded');
  }
}
Architecture decision: Fallback chains must be deterministic, fast, and cost-neutral. Use them for token overflow, rate limiting, or model degradation. Log fallback triggers separately to measure chain effectiveness.
Pitfall Guide
- **Relying exclusively on static benchmark scores**
Benchmarks measure capability, not reliability. Production traffic contains adversarial inputs, edge cases, and distribution shifts that holdout sets never capture. Always pair static evals with continuous production scoring.
- **Ignoring token cost volatility**
Token count scales non-linearly with prompt complexity. Retry loops, long-tail user inputs, and fallback triggers can multiply costs 4x. Implement per-request token budgeting and cost-per-success tracking.
- **Skipping progressive rollout**
Shipping to 100% of users on day one eliminates rollback safety. Use canary routing with tenant-based hashing. Start at 5-10%, monitor eval flags, and incrementally increase only when p95 latency and hallucination proxies remain stable.
- **Hardcoding prompts without versioning**
Prompt changes are deployment events. Without version control, you cannot correlate user feedback with specific prompt iterations. Store prompts in a registry with semantic versioning and rollout timestamps.
- **No deterministic fallback mechanism**
Model failures, rate limits, and token overflows are inevitable. Without a fallback chain, users experience hard errors. Implement summary generators, cached responses, or rule-based routing as safety nets.
- **Treating user feedback as post-launch metadata**
Feedback delayed by days cannot drive immediate rollback decisions. Ingest feedback in real time, correlate with eval flags, and trigger automated canary reduction when negative sentiment exceeds thresholds.
- **Over-provisioning without autoscaling triggers**
AI inference workloads spike unpredictably. Static instance counts waste budget during quiet periods and fail during load. Use latency p99 and queue depth as autoscaling triggers, not CPU/memory.
Best practices from production:
- Run shadow mode for 48 hours before canary routing. Log predictions without serving them to validate eval pipeline accuracy.
- Implement token budgeting at the API gateway level. Reject or truncate requests that exceed configured limits before they reach the model.
- Use circuit breakers for model endpoints. When failure rate exceeds 15% over 60 seconds, route to fallback automatically.
- Maintain a prompt drift dashboard. Track average token count, latency distribution, and hallucination proxy rate per prompt version.
- Separate eval storage from production databases. Use time-series or columnar storage for eval metrics to avoid query contention.
Production Bundle
Action Checklist
Decision Matrix
| Scenario | Recommended Approach | Why | Cost Impact |
|---|---|---|---|
| Internal analytics dashboard | Static eval + 20% canary | Low user-facing risk, allows faster iteration | Low: minimal fallback overhead |
| Customer-facing SaaS feature | Continuous eval + 5% canary → 100% over 7 days | High retention sensitivity requires strict hallucination control | Medium: eval storage + fallback chain |
| High-volume API (10k+ req/min) | Token budgeting + circuit breakers + deterministic fallback | Prevents cost blowouts and cascading failures | High upfront, but prevents 400%+ cost spikes |
| Regulated domain (healthcare/finance) | Shadow mode → 1% canary → manual approval gates | Compliance requires audit trails and controlled exposure | High: extended validation period, dedicated eval infra |
Configuration Template
# ai-launch-config.yaml
# Launch settings grouped by concern: evaluation thresholds and storage,
# canary rollout, fallback behaviour, and observability.
evaluation:
  thresholds:
    latency_p95_ms: 1200
    hallucination_proxy_rate: 0.08
    cost_per_1k_success: 2.5
  storage:
    type: supabase
    table: ai_evals
    retention_days: 30
rollout:
  feature: chat_assistant_v2
  initial_canary_weight: 0.05
  max_canary_weight: 1.0
  increment_interval_hours: 24
  tenant_hashing: true
  consistency_window_minutes: 1440
fallback:
  enabled: true
  triggers:
    - context_length_exceeded
    - rate_limit_exceeded
    - model_timeout_ms: 5000
  strategy: deterministic_summary
  circuit_breaker:
    failure_threshold: 0.15
    window_seconds: 60
observability:
  feedback_ingestion: async
  correlation_field: session_id
  dashboard:
    metrics:
      - latency_distribution
      - token_cost_per_request
      - hallucination_proxy_rate
      - fallback_trigger_count
Quick Start Guide
- Initialize the evaluation harness: Deploy the `ProductionEvalHarness` class to your API gateway or middleware layer. Configure Supabase or equivalent time-series storage for the `ai_evals` table.
- Enable canary routing: Replace direct model calls with `CanaryRouter.selectVersion()`. Set the initial weight to 0.05 in your config store. Verify that tenant hashing produces consistent routing across requests.
- Wire feedback ingestion: Add the `FeedbackIngestor` to your client-side rating component. Ensure `session_id` matches the inference request ID for correlation.
- Deploy fallback chain: Wrap model invocation in `FallbackChain.execute()`. Configure circuit breaker thresholds and deterministic fallback logic. Test with simulated token overflow and timeout errors.
- Validate with shadow mode: Route 100% of traffic through eval harness without serving canary responses. Monitor dashboard for 48 hours. If hallucination proxy rate stays below threshold, begin progressive canary rollout.