o',
format: format.combine(format.timestamp(), format.json()),
transports: [new transports.File({ filename: config.logPath })]
});
this.requestCounter = new Counter({
name: 'ai_inference_requests_total',
help: 'Total AI inference requests by model and status',
labelNames: ['model', 'status']
});
this.latencyHistogram = new Histogram({
name: 'ai_inference_latency_ms',
help: 'AI inference latency distribution',
buckets: [100, 500, 1000, 2500, 5000, 10000]
});
this.tokenCounter = new Counter({
name: 'ai_tokens_consumed_total',
help: 'Total tokens consumed by type and model',
labelNames: ['model', 'type']
});
this.costCounter = new Counter({
name: 'ai_cost_cents_total',
help: 'Cumulative API cost in cents',
labelNames: ['model']
});
}
/**
 * Runs `operation` inside an OpenTelemetry span and records latency, token
 * usage, estimated cost and outcome for the call.
 *
 * Telemetry is best-effort: a failure while emitting metrics or logs is
 * recorded on the span and never replaces the operation's own result or error.
 *
 * @param model - Model identifier; used as span attribute, metric label and pricing key.
 * @param operation - The model call to execute and measure.
 * @param context - Caller identifiers. Only `userId` is read here; it is attached
 *   to the span, falling back to `'anonymous'` when absent.
 * @returns Whatever `operation` resolves to.
 * @throws Rethrows, unchanged, whatever `operation` rejects with.
 */
async instrument<T>(
  model: string,
  operation: () => Promise<T>,
  context: { userId?: string; sessionId?: string }
): Promise<T> {
  const tracer = trace.getTracer('ai-observability');
  const span = tracer.startSpan('llm_inference', {
    attributes: { 'ai.model': model, 'ai.user_id': context.userId ?? 'anonymous' }
  });
  const startTime = performance.now();
  // Stays undefined on the failure path, in which case token usage and cost are reported as 0.
  let result: T | undefined;
  let status: InferenceResult['status'] = 'success';
  let errorType: string | undefined;
  try {
    const value = await operation();
    result = value;
    span.setStatus({ code: SpanStatusCode.OK });
    return value;
  } catch (err: unknown) {
    status = this.classifyError(err);
    errorType = status;
    // Thrown values are not guaranteed to be Error instances; narrow before reading `.message`.
    const message = err instanceof Error ? err.message : String(err);
    span.recordException(err instanceof Error ? err : message);
    span.setStatus({ code: SpanStatusCode.ERROR, message });
    throw err;
  } finally {
    try {
      const latency = performance.now() - startTime;
      const tokens = this.extractTokenUsage(result);
      const cost = this.calculateCost(model, tokens.input, tokens.output);
      const telemetry: InferenceResult = {
        model,
        inputTokens: tokens.input,
        outputTokens: tokens.output,
        latencyMs: latency,
        status,
        errorType,
        costCents: cost
      };
      this.emitMetrics(telemetry);
      this.logger.info('ai_inference_complete', telemetry);
      span.setAttributes({
        'ai.latency_ms': latency,
        'ai.tokens.input': tokens.input,
        'ai.tokens.output': tokens.output,
        'ai.cost_cents': cost
      });
    } catch (telemetryErr: unknown) {
      // A throw escaping this `finally` would discard the operation's result or
      // original error, so telemetry failures are only noted on the span.
      span.recordException(
        telemetryErr instanceof Error ? telemetryErr : String(telemetryErr)
      );
    } finally {
      // Always close the span, even when metric or log emission failed.
      span.end();
    }
  }
}
/**
 * Maps a thrown value to a coarse telemetry status by inspecting its message text.
 *
 * Accepts any thrown value: non-objects, `null`/`undefined` and objects whose
 * `message` is not a string are treated as having an empty message and fall
 * through to `'error'`.
 *
 * @param err - The value caught from the failed operation.
 * @returns `'timeout'`, `'filtered'` or `'error'` depending on the message.
 */
private classifyError(err: unknown): InferenceResult['status'] {
  let rawMessage: unknown;
  if (typeof err === 'string') {
    rawMessage = err;
  } else if (typeof err === 'object' && err !== null) {
    rawMessage = (err as { message?: unknown }).message;
  }
  const msg = typeof rawMessage === 'string' ? rawMessage.toLowerCase() : '';

  // Rate limits are reported as a plain 'error'. The check runs first so that a
  // message mentioning both a rate limit and a timeout is not counted as a timeout.
  if (msg.includes('429') || msg.includes('rate limit')) return 'error';
  if (msg.includes('timeout') || msg.includes('aborted')) return 'timeout';
  if (msg.includes('filtered') || msg.includes('content policy')) return 'filtered';
  return 'error';
}
/**
 * Reads token counts from a provider response's `usage` object.
 *
 * Understands both the `prompt_tokens` / `completion_tokens` field names and
 * the `input_tokens` / `output_tokens` field names, so responses from either
 * family of APIs are counted. Missing, non-numeric, non-finite or negative
 * values are reported as 0 so they can be passed to counters safely.
 *
 * @param response - The raw provider response, or `undefined` when the call failed.
 * @returns Input and output token counts, each defaulting to 0.
 */
private extractTokenUsage(response: any) {
  const usage = response?.usage;
  // First candidate that is a usable count wins; anything else collapses to 0.
  const firstValidCount = (...candidates: unknown[]): number => {
    for (const candidate of candidates) {
      if (typeof candidate === 'number' && Number.isFinite(candidate) && candidate >= 0) {
        return candidate;
      }
    }
    return 0;
  };
  return {
    input: firstValidCount(usage?.prompt_tokens, usage?.input_tokens),
    output: firstValidCount(usage?.completion_tokens, usage?.output_tokens)
  };
}
/**
 * Estimates the cost of a call in cents from its token counts.
 *
 * @param model - Model identifier used to look up per-token rates; unknown
 *   models use the fallback rate.
 * @param input - Number of input tokens.
 * @param output - Number of output tokens.
 * @returns Estimated cost in cents.
 */
private calculateCost(model: string, input: number, output: number): number {
  // Per-token rates; the result is multiplied by 100 to yield cents, so these
  // are presumably expressed in dollars per token (confirm against provider pricing).
  const rates: Record<string, { input: number; output: number }> = {
    'gpt-5.4': { input: 0.000005, output: 0.000015 },
    'claude-sonnet-4': { input: 0.000003, output: 0.000015 }
  };
  const fallbackRate = { input: 0.00001, output: 0.00003 };
  // Own-property check: a plain lookup would resolve inherited keys such as
  // 'constructor' or 'toString', skip the fallback and produce NaN.
  const knownRate = Object.prototype.hasOwnProperty.call(rates, model)
    ? rates[model]
    : undefined;
  const rate = knownRate ?? fallbackRate;
  return (input * rate.input + output * rate.output) * 100;
}
/**
 * Pushes one inference record into the metric instruments: the request
 * counter (by model and status), the latency histogram, the token counter
 * (once for input, once for output) and the cost counter.
 *
 * @param data - The telemetry record for a single inference call.
 */
private emitMetrics(data: InferenceResult) {
  const { model, status, latencyMs, inputTokens, outputTokens, costCents } = data;

  this.requestCounter.labels(model, status).inc();
  this.latencyHistogram.observe(latencyMs);

  const tokensByType = [
    ['input', inputTokens],
    ['output', outputTokens]
  ] as const;
  for (const [type, count] of tokensByType) {
    this.tokenCounter.labels(model, type).inc(count);
  }

  this.costCounter.labels(model).inc(costCents);
}
}
**Architecture Rationale**:
- The interceptor pattern ensures zero business logic coupling. Every model call passes through a single instrumentation boundary.
- OpenTelemetry spans are created synchronously but exported asynchronously via batch processors, preventing latency injection.
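- Prometheus metrics use fixed label sets (`model`, `status`, `type`) to avoid cardinality explosion. Dynamic identifiers (user IDs, session tokens) are kept out of metric labels entirely; when they are attached to spans or logs they should be hashed or truncated first. Note that the sample above attaches `context.userId` to the span unhashed.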
- Cost calculation runs client-side using published pricing tiers, enabling real-time budget tracking without waiting for provider billing cycles.
### Step 2: Asynchronous Quality Evaluation
Running LLM-as-judge evaluations in the critical path destroys throughput. The evaluation pipeline must operate asynchronously, consuming inference logs and producing semantic scores without blocking user requests.
```typescript
import { EventEmitter } from 'events';
/** One inference exchange submitted for asynchronous quality scoring. */
interface EvaluationPayload {
// Identifier of the request being evaluated; reported back with evaluation errors.
requestId: string;
// The prompt that was sent to the model.
prompt: string;
// The model output to be scored.
response: string;
// Optional grounding documents; joined with newlines into the evaluator prompt ('None' when absent).
context?: string[];
// Model that produced the response. Not read by the evaluator code shown here.
model: string;
}
/** Scores produced by the evaluator for a single request. */
interface EvaluationResult {
// Identifier of the request that was evaluated.
requestId: string;
// The evaluator is asked for a 1-5 scale on each score; 0 marks an unparseable evaluator response.
relevance: number;
accuracy: number;
completeness: number;
safety: number;
// True when the evaluator flags a hallucination risk, and also when its response could not be parsed.
hallucinationRisk: boolean;
// Optional list of flagged items; contains 'parse_failure' when the evaluator response could not be parsed.
flaggedContent?: string[];
}
/**
 * In-process, sequential evaluation queue that scores model responses off the
 * request path.
 *
 * Events:
 * - `'evaluation_complete'` with an `EvaluationResult` for every scored payload.
 * - `'evaluation_error'` with `{ requestId, error }` when scoring fails, or when
 *   an `'evaluation_complete'` listener throws.
 */
export class SemanticEvaluator extends EventEmitter {
  private queue: EvaluationPayload[] = [];
  private isProcessing = false;

  /**
   * Enqueues a payload and starts the worker loop if it is idle.
   *
   * The returned promise resolves once the payload is queued, not once it has
   * been evaluated; subscribe to the events for results.
   */
  async submit(payload: EvaluationPayload): Promise<void> {
    this.queue.push(payload);
    if (!this.isProcessing) {
      // Fire-and-forget by design; processQueue handles its own failures.
      void this.processQueue();
    }
  }

  /** Drains the queue one payload at a time until it is empty. */
  private async processQueue(): Promise<void> {
    this.isProcessing = true;
    try {
      for (;;) {
        const item = this.queue.shift();
        if (item === undefined) break;
        await this.evaluateAndEmit(item);
      }
    } finally {
      // Always release the flag; otherwise one failure would stop every later
      // submit() from restarting the worker.
      this.isProcessing = false;
    }
  }

  /** Scores one payload and publishes the outcome; never rejects. */
  private async evaluateAndEmit(item: EvaluationPayload): Promise<void> {
    try {
      const result = await this.runEvaluation(item);
      this.emit('evaluation_complete', result);
    } catch (err) {
      try {
        this.emit('evaluation_error', { requestId: item.requestId, error: err });
      } catch (listenerErr) {
        // A throwing error-listener must not take the whole queue down with it.
        console.error('[SemanticEvaluator] evaluation_error listener threw', listenerErr);
      }
    }
  }

  /** Builds the judge prompts for a payload and returns the parsed scores. */
  private async runEvaluation(payload: EvaluationPayload): Promise<EvaluationResult> {
    const systemPrompt = [
      '',
      'Evaluate the AI response against the following criteria (1-5 scale):',
      '- Relevance: Directly addresses the prompt',
      '- Accuracy: Factually consistent with provided context',
      '- Completeness: Covers all requested aspects',
      '- Safety: Contains no policy violations',
      'Return JSON with scores and a hallucinationRisk boolean.',
      ''
    ].join('\n');
    const evaluationPrompt = [
      '',
      `Context: ${payload.context?.join('\n') ?? 'None'}`,
      `Prompt: ${payload.prompt}`,
      `Response: ${payload.response}`,
      ''
    ].join('\n');
    const response = await this.callEvaluatorModel(systemPrompt, evaluationPrompt);
    return this.parseEvaluationResponse(response, payload.requestId);
  }

  private async callEvaluatorModel(system: string, user: string): Promise<string> {
    // Placeholder for evaluator LLM client (e.g., smaller/cheaper model)
    return JSON.stringify({
      relevance: 4, accuracy: 5, completeness: 4, safety: 5,
      hallucinationRisk: false, flaggedContent: []
    });
  }

  /**
   * Converts the evaluator's raw JSON into an `EvaluationResult` for `requestId`.
   *
   * The evaluator's JSON carries no request identifier, so it is stamped on
   * here. Anything that is not a JSON object with four finite numeric scores
   * and a boolean `hallucinationRisk` yields the fail-closed result: zero
   * scores, `hallucinationRisk: true` and `flaggedContent: ['parse_failure']`.
   */
  private parseEvaluationResponse(raw: string, requestId: string): EvaluationResult {
    const parseFailure: EvaluationResult = {
      requestId, relevance: 0, accuracy: 0,
      completeness: 0, safety: 0, hallucinationRisk: true, flaggedContent: ['parse_failure']
    };

    let parsed: unknown;
    try {
      parsed = JSON.parse(raw);
    } catch {
      return parseFailure;
    }
    if (typeof parsed !== 'object' || parsed === null || Array.isArray(parsed)) {
      return parseFailure;
    }

    const fields = parsed as Record<string, unknown>;
    const { relevance, accuracy, completeness, safety, hallucinationRisk, flaggedContent } = fields;
    const isScore = (value: unknown): value is number =>
      typeof value === 'number' && Number.isFinite(value);
    if (
      !isScore(relevance) || !isScore(accuracy) ||
      !isScore(completeness) || !isScore(safety) ||
      typeof hallucinationRisk !== 'boolean'
    ) {
      return parseFailure;
    }

    const result: EvaluationResult = {
      requestId, relevance, accuracy, completeness, safety, hallucinationRisk
    };
    if (Array.isArray(flaggedContent)) {
      result.flaggedContent = flaggedContent.filter(
        (entry): entry is string => typeof entry === 'string'
      );
    }
    return result;
  }
}
```
**Architecture Rationale**:
- The evaluator uses an event-driven queue to decouple scoring from inference. This prevents evaluation latency from impacting user-facing response times.
- A dedicated, lower-cost model handles evaluation tasks. Running GPT-5.4 or Claude Opus for self-evaluation is economically unsustainable at scale.
- Hallucination detection requires explicit context injection. Without source documents, the evaluator cannot distinguish between model creativity and factual fabrication.
- Results are emitted as events, allowing downstream consumers (alerting systems, dashboards, feedback loops) to subscribe without tight coupling.
Pitfall Guide
1. Streaming Latency Blindness
Explanation: Measuring only total request duration ignores time-to-first-token (TTFT; this guide uses the more familiar web term TTFB, time-to-first-byte, for it) and streaming throughput. Users perceive responsiveness based on how quickly the first token arrives, not on final completion time.
Fix: Instrument both time_to_first_token and time_to_last_token. Alert when time-to-first-token exceeds 800ms, as this directly impacts perceived UX.
2. Metric Cardinality Explosion
Explanation: Tagging Prometheus metrics or OpenTelemetry spans with raw prompts, full user emails, or unbounded session IDs causes storage bloat and query degradation.
Fix: Hash dynamic identifiers, truncate prompts to 50 characters, and use predefined label buckets. Enforce cardinality limits at the instrumentation layer.
3. Blind Retry Logic
Explanation: Automatically retrying all failed requests wastes tokens and amplifies costs. Retrying on content filters, authentication failures, or invalid request schemas violates provider policies.
Fix: Implement error classification before retry decisions. Only retry on transient network failures, 429 rate limits, or 5xx model errors. Apply exponential backoff with jitter.
4. Context-Free Hallucination Checks
Explanation: Evaluating model output without retrieval context produces false positives. The evaluator cannot verify factual claims against source material.
Fix: Always pass retrieved documents, knowledge base IDs, or grounding references to the evaluation pipeline. Use vector similarity scores as a secondary hallucination signal.
5. Synchronous Evaluation Blocking
Explanation: Running LLM-as-judge scoring in the request path adds 1–3 seconds to every inference call, destroying throughput and increasing latency SLO violations.
Fix: Offload evaluation to a background worker, message queue (Redis Streams, Kafka), or serverless function. Process scores asynchronously and update dashboards via webhooks.
6. Ignoring Token Budget Drift
Explanation: Monitoring total API spend without tracking input/output ratios masks prompt inefficiency. A 20% cost increase might stem from verbose system prompts rather than traffic growth.
Fix: Track input_tokens / output_tokens ratio per endpoint. Alert when ratios deviate >15% from baseline, indicating prompt drift or model version changes.
7. Over-Reliance on LLM-as-Judge
Explanation: Using another LLM for evaluation introduces circular bias and additional cost. Models often grade leniently on self-generated content.
Fix: Combine LLM evaluation with deterministic checks: JSON schema validation, regex pattern matching, keyword filtering, and embedding similarity against ground truth. Use LLM scoring only for semantic nuance.
Production Bundle
Action Checklist
Decision Matrix
| Scenario | Recommended Approach | Why | Cost Impact |
|---|---|---|---|
| High-throughput chatbot | Async evaluation + rule-based guards | Prevents latency injection, maintains 200ms TTFB | Low (background workers) |
| Compliance/medical AI | Sync evaluation + strict schema validation | Zero tolerance for hallucination or policy drift | High (evaluator model calls) |
| Multi-model routing | Centralized cost ledger + dynamic fallback | Enables real-time budget switching between tiers | Medium (routing overhead) |
| Internal developer tools | Lightweight logging + Prometheus metrics | Reduces complexity, focuses on developer UX | Low (minimal eval) |
| Customer-facing generative UI | Streaming metrics + TTFB alerting | Directly correlates with perceived responsiveness | Low (client-side instrumentation) |
Configuration Template
// otel-config.ts
import { NodeSDK } from '@opentelemetry/sdk-node';
import { getNodeAutoInstrumentations } from '@opentelemetry/auto-instrumentations-node';
import { OTLPTraceExporter } from '@opentelemetry/exporter-trace-otlp-http';
import { PrometheusExporter } from '@opentelemetry/exporter-prometheus';
// Process-wide OpenTelemetry setup: traces go to an OTLP/HTTP endpoint, metrics
// are served for Prometheus to scrape.
const sdk = new NodeSDK({
  traceExporter: new OTLPTraceExporter({
    url: process.env.OTEL_EXPORTER_OTLP_ENDPOINT ?? 'http://localhost:4318/v1/traces',
    timeoutMillis: 5000
  }),
  // PrometheusExporter is a pull-based metric reader, so it is registered via
  // `metricReader`; NodeSDK has no `metricExporter` option and would ignore it.
  // NOTE(review): instruments created with prom-client live in prom-client's own
  // registry and are presumably not served by this endpoint - confirm how the
  // ai_* metrics are exposed.
  metricReader: new PrometheusExporter({
    port: 9464,
    endpoint: '/metrics'
  }),
  instrumentations: [getNodeAutoInstrumentations()],
  serviceName: 'ai-production-pipeline'
});
sdk.start();
// Registering a SIGTERM listener removes Node's default "exit on SIGTERM"
// behavior, so the handler must end the process itself once telemetry is flushed.
process.on('SIGTERM', () => {
  sdk
    .shutdown()
    .catch(console.error)
    .finally(() => process.exit(0));
});
// telemetry-bootstrap.ts
import { LLMTelemetryManager } from './LLMTelemetryManager';
import { SemanticEvaluator } from './SemanticEvaluator';
/**
 * Creates the telemetry manager and the semantic evaluator, and wires console
 * reporting onto the evaluator's events.
 *
 * @returns The constructed `telemetry` manager and `evaluator`.
 */
export function bootstrapObservability() {
  const telemetry = new LLMTelemetryManager({
    serviceName: 'ai-pipeline',
    logPath: '/var/log/ai/telemetry.jsonl',
    enableEvaluation: true
  });

  const evaluator = new SemanticEvaluator();

  evaluator
    .on('evaluation_complete', (result) => {
      // Nothing to report unless the evaluator flagged a hallucination risk or a safety score below 3.
      const degraded = result.hallucinationRisk || result.safety < 3;
      if (!degraded) return;
      // Trigger alerting webhook or Slack notification
      console.warn(`[ALERT] Semantic degradation detected: ${result.requestId}`);
    })
    .on('evaluation_error', (failure) => {
      const { requestId, error } = failure;
      console.error(`[EVAL_FAIL] ${requestId}: ${error.message}`);
    });

  return { telemetry, evaluator };
}
Quick Start Guide
- Install dependencies: `npm install @opentelemetry/api @opentelemetry/sdk-node @opentelemetry/auto-instrumentations-node @opentelemetry/exporter-trace-otlp-http @opentelemetry/exporter-prometheus prom-client winston`
- Initialize the telemetry manager: Import `LLMTelemetryManager` and instantiate it with your service name and log path (model pricing tiers are defined in its `calculateCost` rate table).
- Wrap model calls: Replace direct LLM client invocations with `telemetry.instrument(model, async () => client.generate(...), context)`.
- Start the evaluation worker: Instantiate `SemanticEvaluator`, attach event listeners for alerts, and submit payloads after each inference completes.
- Expose metrics: Run the OpenTelemetry SDK in your application entrypoint. Access Prometheus metrics at `http://localhost:9464/metrics` and configure Grafana dashboards using the provided metric names.