user_id?: string;
timestamp: number;
// Context
feature_id: string;
model_id: string;
model_version: string;
routing_strategy?: 'primary' | 'fallback' | 'cache';
// Performance
ttft_ms: number;
total_latency_ms: number;
stream_chunks_received: number;
// Token Accounting
prompt_tokens: number;
completion_tokens: number;
cached_tokens?: number;
estimated_cost_usd: number;
// Quality Signals
user_regenerated: boolean;
user_edited_output: boolean;
explicit_rating?: 'up' | 'down' | null;
implicit_quality_score: number; // 0-1, derived from behavior
}
This schema ensures every AI call carries deterministic identifiers, resource consumption metrics, and behavioral quality indicators. Recording `model_version` and `routing_strategy` on every event enables drift detection and fallback analysis.
### Step 2: Instrument Server-Side Token & Latency Accounting
Client-side tracking cannot reliably capture token consumption or streaming latency. Server-side instrumentation must intercept the model call, measure TTFT, count tokens, and calculate cost before emitting the event.
```typescript
import { createHash, randomUUID } from 'crypto';
/**
 * Builds one `ai_interaction` event from a completed model call and sends it
 * through `analyticsClient.capture`.
 *
 * The raw prompt and completion are never placed on the event; only timing,
 * token, cost, and behavioral fields are emitted.
 *
 * @param analyticsClient - Client whose `capture(name, event)` delivers the event.
 * @param payload - Measurements and user-behavior flags for a single model call.
 * @returns A promise that settles when `analyticsClient.capture` settles.
 */
export async function trackAIInteraction(
  analyticsClient: AnalyticsClient,
  payload: {
    sessionId: string;
    userId?: string;
    featureId: string;
    modelId: string;
    modelVersion: string;
    prompt: string;
    completion: string;
    ttftMs: number;
    totalLatencyMs: number;
    promptTokens: number;
    completionTokens: number;
    cachedTokens?: number;
    costUsd: number;
    userRegenerated: boolean;
    userEditedOutput: boolean;
    explicitRating?: 'up' | 'down' | null;
  }
): Promise<void> {
  // PII mitigation: redact e-mail addresses, then hash only the first 128 chars.
  // The TLD class is `[a-z]` (case-insensitive via the `i` flag); the previous
  // `[A-Z|a-z]` also accepted a literal `|` as part of the TLD.
  const sanitizedPrompt = payload.prompt.replace(
    /\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[a-z]{2,}\b/gi,
    '[EMAIL]'
  );
  // NOTE(review): promptHash is computed but not attached to the event below.
  // Either add a field for it on AIInteractionEvent or drop this computation.
  const promptHash = createHash('sha256').update(sanitizedPrompt.slice(0, 128)).digest('hex');

  // Implicit quality scoring (0-1)
  const implicitQuality = calculateImplicitQuality({
    regenerated: payload.userRegenerated,
    edited: payload.userEditedOutput,
    // Calls slower than 3 seconds have their score scaled by 0.6.
    latencyPenalty: payload.totalLatencyMs > 3000 ? 0.6 : 1.0,
    // Guard against division by zero when the prompt token count is 0.
    tokenEfficiency: payload.completionTokens / Math.max(payload.promptTokens, 1)
  });

  const event: AIInteractionEvent = {
    // Use the function imported from the crypto module rather than relying on
    // a global `crypto` object, which is not defined in every Node.js runtime.
    event_id: randomUUID(),
    session_id: payload.sessionId,
    user_id: payload.userId,
    timestamp: Date.now(),
    feature_id: payload.featureId,
    model_id: payload.modelId,
    model_version: payload.modelVersion,
    // Always reported as 'primary'; the payload carries no routing information.
    routing_strategy: 'primary',
    ttft_ms: payload.ttftMs,
    total_latency_ms: payload.totalLatencyMs,
    // Estimated from the token count (4 tokens per chunk), not measured.
    stream_chunks_received: Math.ceil(payload.completionTokens / 4),
    prompt_tokens: payload.promptTokens,
    completion_tokens: payload.completionTokens,
    cached_tokens: payload.cachedTokens,
    estimated_cost_usd: payload.costUsd,
    user_regenerated: payload.userRegenerated,
    user_edited_output: payload.userEditedOutput,
    explicit_rating: payload.explicitRating,
    implicit_quality_score: implicitQuality
  };

  await analyticsClient.capture('ai_interaction', event);
}
/**
 * Derives a 0-1 quality score from behavioral signals.
 *
 * Starts at 1.0, subtracts 0.3 for a regeneration and 0.15 for an edit,
 * multiplies by `latencyPenalty`, subtracts 0.1 when `tokenEfficiency`
 * exceeds 2.5, and finally clamps the result to the range [0, 1].
 *
 * @param signals - Behavioral inputs for one interaction.
 * @returns The clamped score.
 */
function calculateImplicitQuality(signals: {
  regenerated: boolean;
  edited: boolean;
  latencyPenalty: number;
  tokenEfficiency: number;
}): number {
  const { regenerated, edited, latencyPenalty, tokenEfficiency } = signals;

  const afterRegeneration = regenerated ? 1.0 - 0.3 : 1.0;
  const afterEdit = edited ? afterRegeneration - 0.15 : afterRegeneration;
  const afterLatency = afterEdit * latencyPenalty;

  // Penalize excessive completion tokens relative to prompt
  const isVerbose = tokenEfficiency > 2.5;
  const rawScore = isVerbose ? afterLatency - 0.1 : afterLatency;

  // Clamp to [0, 1]
  return Math.min(1, Math.max(0, rawScore));
}
```
### Step 3: Implement Streaming-Aware Client Instrumentation
For streaming AI features, the client must track chunk arrival rates, regeneration triggers, and edit events without blocking the UI thread.
```typescript
/**
 * Client-side timer and chunk counter for a single streamed response.
 *
 * The clock starts when the tracker is constructed. Call `onChunk()` for every
 * chunk received and `onComplete()` once the stream ends to obtain the metrics.
 */
export class AIStreamTracker {
  private chunkCount = 0;
  private firstChunkDelayMs = 0;
  private readonly startedAt: number;

  constructor(
    private readonly sessionId: string,
    private readonly featureId: string
  ) {
    this.startedAt = performance.now();
  }

  /** Counts one chunk; the first call also records the time to first token. */
  onChunk() {
    if (this.chunkCount === 0) {
      this.firstChunkDelayMs = performance.now() - this.startedAt;
    }
    this.chunkCount += 1;
  }

  /**
   * Produces the metrics for the finished stream.
   *
   * Token counts are estimated as one token per 4 characters, rounded up.
   * `ttft_ms` is 0 when `onChunk()` was never called.
   *
   * @param completion - Full completion text.
   * @param prompt - Prompt text.
   * @returns Rounded timings, the chunk count, and estimated token counts.
   */
  onComplete(completion: string, prompt: string) {
    const elapsedMs = performance.now() - this.startedAt;
    const estimateTokens = (text: string) => Math.ceil(text.length / 4);

    return {
      ttft_ms: Math.round(this.firstChunkDelayMs),
      total_latency_ms: Math.round(elapsedMs),
      stream_chunks_received: this.chunkCount,
      prompt_tokens: estimateTokens(prompt),
      completion_tokens: estimateTokens(completion)
    };
  }
}
```
### Step 4: Route to Analytics Warehouse with Batching
AI events generate high volume. Batch emission with an interval-based flush (or an immediate flush once a batch fills) and re-queueing of failed batches prevents pipeline saturation.
```typescript
/**
 * Buffers AI interaction events and delivers them to `transport` in batches of
 * at most `MAX_BATCH` events.
 *
 * A flush is triggered every `FLUSH_MS` milliseconds and whenever the buffer
 * reaches `MAX_BATCH`. Only one flush runs at a time, so batches reach the
 * transport in the order the events were pushed. A batch whose delivery fails
 * is put back at the front of the buffer and retried on the next trigger.
 */
export class AIBatchEmitter {
  private buffer: AIInteractionEvent[] = [];
  // ReturnType<typeof setInterval> is correct under both Node.js and DOM typings.
  private flushInterval: ReturnType<typeof setInterval>;
  private readonly MAX_BATCH = 100;
  private readonly FLUSH_MS = 5000;
  // The flush currently running, if any; used to serialize transport calls.
  private inFlight: Promise<void> | null = null;

  /**
   * @param transport - Delivers one batch; a rejection re-queues that batch.
   */
  constructor(private transport: (events: AIInteractionEvent[]) => Promise<void>) {
    this.flushInterval = setInterval(() => {
      void this.flush();
    }, this.FLUSH_MS);
  }

  /** Queues an event and starts a flush once a full batch is buffered. */
  push(event: AIInteractionEvent): void {
    this.buffer.push(event);
    if (this.buffer.length >= this.MAX_BATCH) {
      void this.flush();
    }
  }

  /**
   * Starts a flush unless one is already running, in which case the running
   * flush's promise is returned. The returned promise never rejects.
   *
   * @param minPending - After the first batch, keep sending while at least this
   *   many events remain buffered. Pass 1 to drain the buffer completely.
   */
  private flush(minPending: number = this.MAX_BATCH): Promise<void> {
    if (this.inFlight) return this.inFlight;
    if (this.buffer.length === 0) return Promise.resolve();

    const run = this.sendBatches(minPending).finally(() => {
      this.inFlight = null;
    });
    this.inFlight = run;
    return run;
  }

  /** Sends batches sequentially; stops at the first transport failure. */
  private async sendBatches(minPending: number): Promise<void> {
    do {
      const batch = this.buffer.splice(0, this.MAX_BATCH);
      try {
        await this.transport(batch);
      } catch (err: unknown) {
        // Re-queue failed batch with exponential backoff in production
        this.buffer.unshift(...batch);
        console.error('AI analytics flush failed:', err);
        return;
      }
    } while (this.buffer.length >= minPending);
  }

  /**
   * Stops the periodic flush and drains the buffer.
   *
   * Waits for any running flush, then sends every remaining event (not just
   * one batch). Draining stops at the first transport failure, leaving the
   * undelivered events in the buffer.
   *
   * @returns A promise that resolves when draining has finished; callers that
   *   do not need to wait may ignore it.
   */
  async destroy(): Promise<void> {
    clearInterval(this.flushInterval);
    while (this.inFlight) {
      await this.inFlight;
    }
    await this.flush(1);
  }
}
```
### Architecture Decisions & Rationale
- Server-side token accounting: Client environments cannot reliably count tokens or access model provider billing APIs. Server-side interception ensures accuracy and enables cost attribution per session.
- Implicit quality scoring: Explicit ratings suffer from low participation rates (~3-5%). Implicit signals (regeneration, editing, latency tolerance, token efficiency) provide continuous, high-signal feedback without UX friction.
- Schema versioning: AI models and routing strategies change frequently. Embedding `model_version` and `routing_strategy` in every event enables cohort analysis across iterations without pipeline breaks.
- Batching with TTL: Streaming AI features can emit hundreds of events per minute. Batching reduces network overhead and warehouse write costs while maintaining sub-10-second dashboard freshness.
Pitfall Guide
- Logging raw prompts and responses: Storing unredacted AI I/O violates privacy policies, inflates storage costs, and creates compliance liabilities. Always sanitize PII, truncate sensitive prefixes, and hash content identifiers before emission.
- Treating all AI calls as identical: Different models, versions, and routing strategies have distinct cost and performance profiles. Failing to tag `model_id` and `model_version` prevents drift detection and makes cost attribution impossible.
- Ignoring implicit feedback signals: Relying solely on explicit thumbs-up/down ratings yields sparse data. Regeneration clicks, output edits, scroll abandonment, and retry patterns are continuous quality indicators that correlate strongly with retention.
- Over-tracing without sampling: Instrumenting every internal retry, cache lookup, and fallback call creates noise and skews metrics. Sample non-critical paths, but guarantee 100% coverage for user-facing model calls.
- Missing cost-per-session attribution: Token consumption alone is meaningless without session linkage. Failing to aggregate `estimated_cost_usd` per `session_id` obscures margin erosion and prevents budget forecasting.
- Schema drift without versioning: Adding fields to events without backward compatibility breaks historical dashboards. Always include a `schema_version` field and maintain a migration contract in your analytics pipeline.
- Synchronous event emission on the critical path: Blocking model responses to await analytics delivery increases latency and degrades UX. Emit events asynchronously via a worker thread or background queue with backpressure handling.
Best Practices from Production:
- Use deterministic session IDs that persist across page reloads and model retries.
- Implement automatic cost alerts when `estimated_cost_usd` exceeds session thresholds.
- Cache model response hashes to detect duplicate generations and reduce redundant token spend.
- Align analytics schema with OpenTelemetry semantic conventions for AI/ML telemetry to enable cross-tool compatibility.
Production Bundle
Action Checklist
Decision Matrix
| Scenario | Recommended Approach | Why | Cost Impact |
|---|---|---|---|
| Early-stage AI feature (<10k MAU) | Client-assisted + server token accounting | Low infrastructure overhead, fast iteration, accurate cost tracking | Low initial, scales linearly |
| Production generative UI (>100k MAU) | Server-only emission + OpenTelemetry pipeline | Reduces client payload, enables sampling, aligns with observability standards | Medium initial, high ROI on cost control |
| Enterprise/compliance-heavy | On-prem warehouse + PII-hardened schema + audit trails | Meets data residency, enables legal hold, prevents leakage | High initial, mitigates regulatory risk |
### Configuration Template
```typescript
// ai-analytics.config.ts
// NOTE(review): none of the code shown alongside this template reads these
// values; the emitter and the quality scorer hard-code their own constants.
// Confirm where this object is consumed before relying on it.

/** Event schema contract: version tag plus required/optional field names. */
const schemaSettings = {
  version: '1.0',
  // NOTE(review): the event also always carries `timestamp`,
  // `stream_chunks_received`, `user_regenerated`, and `user_edited_output`,
  // none of which is listed here — confirm whether the omission is intended.
  requiredFields: [
    'event_id', 'session_id', 'feature_id', 'model_id',
    'model_version', 'ttft_ms', 'total_latency_ms',
    'prompt_tokens', 'completion_tokens', 'estimated_cost_usd',
    'implicit_quality_score'
  ],
  optionalFields: ['user_id', 'cached_tokens', 'explicit_rating', 'routing_strategy']
};

/** Batch size, flush cadence, and retry tuning for event delivery. */
const batchingSettings = {
  maxBatchSize: 100,
  flushIntervalMs: 5000,
  backoffMultiplier: 1.5,
  maxRetries: 3
};

/** PII redaction settings applied to prompts before anything is emitted. */
const piiSettings = {
  enabled: true,
  patterns: ['email', 'phone', 'ssn', 'credit_card'],
  maxPromptLength: 256,
  hashAlgorithm: 'sha256'
};

/** Weights for the implicit quality score and the explicit-rating baseline. */
const qualitySettings = {
  implicitWeights: {
    regenerated: -0.3,
    edited: -0.15,
    latencyPenalty: 0.6,
    tokenEfficiencyThreshold: 2.5
  },
  explicitParticipationTarget: 0.05 // 5% rating rate baseline
};

/** Cost alerting and pricing adjustments (units presumably USD — confirm). */
const costSettings = {
  alertThresholdPerSessionUsd: 0.50,
  cacheHitDiscount: 0.15,
  fallbackModelMultiplier: 1.2
};

/** Aggregated analytics configuration, grouped by concern. */
export const AI_ANALYTICS_CONFIG = {
  schema: schemaSettings,
  batching: batchingSettings,
  pii: piiSettings,
  quality: qualitySettings,
  cost: costSettings
};
```
### Quick Start Guide
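- Install dependencies: `npm install @codcompass/ai-analytics`. Do not install `crypto` from npm: it is a built-in Node.js module, and the npm package of that name is a deprecated placeholder. The `uuid` package is not needed either, because the code in this guide generates event IDs with the built-in `randomUUID`.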
- Initialize the emitter: Import `AIBatchEmitter` and `AI_ANALYTICS_CONFIG`, then configure your transport endpoint (PostHog, Amplitude, or a custom warehouse).
- Wrap your model call: Call `trackAIInteraction` once your LLM invocation completes, passing TTFT, token counts, and user behavior flags.
- Verify in dashboard: Query `ai_interaction` events, validate the `implicit_quality_score` distribution, and confirm that `estimated_cost_usd` aggregates per session.
- Set alerts: Configure budget thresholds and latency SLOs to trigger on `ttft_ms > 1500` or `session_cost > 0.50`.