export class OutputValidator {
static evaluate(rawOutput: string, constraints: OutputConstraint[]): ValidationReport {
const violations: string[] = [];
for (const constraint of constraints) {
switch (constraint.type) {
case 'contains':
if (!rawOutput.includes(constraint.value as string)) {
violations.push(`Missing required substring: "${constraint.value}"`);
}
break;
case 'excludes':
if (rawOutput.includes(constraint.value as string)) {
violations.push(`Contains forbidden substring: "${constraint.value}"`);
}
break;
case 'length':
if (rawOutput.length > (constraint.value as number)) {
violations.push(`Output exceeds max length: ${rawOutput.length} > ${constraint.value}`);
}
break;
case 'json':
try {
JSON.parse(rawOutput);
} catch {
violations.push('Invalid JSON structure');
}
break;
}
}
return { passed: violations.length === 0, violations };
}
}
```
**Architecture Rationale:** Constraints are evaluated independently, allowing parallel validation and granular failure reporting. This design prevents a single malformed field from masking other structural issues.
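For reference, a quick usage sketch. The `OutputConstraint` and `ValidationReport` shapes below are assumptions inferred from how `evaluate` consumes them; swap in your actual definitions if they differ.
```typescript
// Assumed shapes, inferred from how evaluate() consumes them above.
export interface OutputConstraint {
  type: 'contains' | 'excludes' | 'length' | 'json';
  value?: string | number;
}
export interface ValidationReport {
  passed: boolean;
  violations: string[];
}

// Example: layered constraints on a hypothetical summarizer response.
const rawModelOutput = '{"summary": "Quarterly revenue grew 12%."}';
const report = OutputValidator.evaluate(rawModelOutput, [
  { type: 'json' },                        // must parse as JSON
  { type: 'contains', value: 'summary' },  // required field name present
  { type: 'excludes', value: 'I cannot' }, // no refusal boilerplate
  { type: 'length', value: 2000 }          // max character budget
]);
if (!report.passed) console.error(report.violations);
```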
### Step 2: Prompt Versioning & Regression Thresholds
Prompt changes should never break existing functionality silently. A versioned registry with threshold-based regression testing ensures backward compatibility.
```typescript
import { createHash } from 'crypto';
export interface PromptEntry {
version: string;
template: string;
hash: string;
registeredAt: Date;
}
export class PromptVersionControl {
private store: Map<string, PromptEntry> = new Map();
register(name: string, version: string, template: string): void {
const key = `${name}::${version}`;
this.store.set(key, {
version,
template,
hash: createHash('sha256').update(template).digest('hex'),
registeredAt: new Date()
});
}
async runRegression(
name: string,
baselineVersion: string,
candidateVersion: string,
executor: (prompt: string) => Promise<string>,
threshold: number = 0.85
): Promise<boolean> {
const baseline = this.store.get(`${name}::${baselineVersion}`);
const candidate = this.store.get(`${name}::${candidateVersion}`);
if (!baseline || !candidate) throw new Error('Version not found');
const testCases = this.loadTestCases(name);
let baselinePasses = 0;
let candidatePasses = 0;
for (const tc of testCases) {
      const baselineOutput = await executor(`${baseline.template}\n${tc.input}`);
      const candidateOutput = await executor(`${candidate.template}\n${tc.input}`);
if (OutputValidator.evaluate(baselineOutput, tc.constraints).passed) baselinePasses++;
if (OutputValidator.evaluate(candidateOutput, tc.constraints).passed) candidatePasses++;
}
    if (testCases.length === 0) throw new Error(`No test cases registered for "${name}"`);
    const candidateRate = candidatePasses / testCases.length;
    // Candidate must clear the absolute floor and must not regress below the baseline.
    return candidateRate >= threshold && candidatePasses >= baselinePasses;
}
private loadTestCases(name: string): Array<{ input: string; constraints: OutputConstraint[] }> {
// Load from external JSON/TS config
return [];
}
}
```
**Architecture Rationale:** SHA-256 hashing prevents accidental prompt drift. The threshold parameter (default 0.85) acknowledges probabilistic variance while enforcing a minimum reliability floor, and the baseline comparison blocks candidates that regress even when they clear that floor. This replaces brittle pass/fail gates with statistically meaningful regression boundaries.
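As a sketch of how the gate wires together in an async CI script, using the mock `FixtureEngine` from Step 4 as the executor (shadow mode would swap in a real API call). Note that `loadTestCases` must be wired to real cases first, or the guard above will throw.
```typescript
// Sketch: gating a prompt change in CI. The executor here is the mock
// FixtureEngine from Step 4; assumes loadTestCases() returns real cases.
const registry = new PromptVersionControl();
registry.register('summarizer', 'v1', 'Summarize the following text concisely:');
registry.register('summarizer', 'v2', 'Provide a brief summary of:');

const engine = new FixtureEngine(new Map([
  [/summar/i, JSON.stringify({ summary: 'Fixture summary', confidence: 0.9 })]
]));

const safeToMerge = await registry.runRegression(
  'summarizer', 'v1', 'v2',
  (prompt) => engine.complete(prompt),
  0.85 // minimum candidate pass rate
);
if (!safeToMerge) process.exit(1); // block the merge
```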
### Step 3: Structured Output Enforcement
For production systems, raw text is insufficient. Zod schemas provide runtime validation that catches structural violations before they propagate.
```typescript
import { z } from 'zod';
export const AnalysisSchema = z.object({
confidence: z.number().min(0).max(1),
tags: z.array(z.string()).max(5),
summary: z.string().min(20).max(500),
metadata: z.object({
model: z.string(),
latency_ms: z.number().optional()
}).passthrough()
});
export type AnalysisResult = z.infer<typeof AnalysisSchema>;
export async function validateStructuredOutput(raw: string): Promise<AnalysisResult> {
const cleaned = raw.replace(/```json\n?|\n?```/g, '').trim();
const parsed = JSON.parse(cleaned);
return AnalysisSchema.parse(parsed);
}
```
**Architecture Rationale:** `.passthrough()` on metadata allows provider-specific fields without breaking validation. Regex cleanup handles common markdown wrapping from LLMs. This pattern ensures business logic only receives contract-compliant data.
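A usage sketch, assuming an async context; the payload, model name, and extra metadata field are illustrative.
```typescript
// Sketch: consuming a markdown-wrapped model response. The fence string is
// built programmatically only to avoid nesting backticks in this article.
const fence = '`'.repeat(3);
const raw = fence + 'json\n' + JSON.stringify({
  confidence: 0.87,
  tags: ['finance'],
  summary: 'Revenue grew 12% quarter over quarter, driven by subscriptions.',
  metadata: { model: 'gpt-4o', provider_request_id: 'abc123' } // illustrative
}) + '\n' + fence;

try {
  const result = await validateStructuredOutput(raw);
  console.log(result.confidence); // typed number, guaranteed 0..1
} catch (err) {
  // ZodError for contract violations, SyntaxError for unparseable JSON
  console.error('Contract violation:', err);
}
```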
### Step 4: Deterministic Mocking for Unit Tests
Unit tests must run fast and offline. A fixture engine decouples test execution from API costs and network latency.
```typescript
export type FixtureMap = Map<RegExp, string>;
export class FixtureEngine {
constructor(private fixtures: FixtureMap) {}
async complete(prompt: string): Promise<string> {
for (const [pattern, response] of this.fixtures) {
if (pattern.test(prompt)) return response;
}
return JSON.stringify({ error: 'No matching fixture' });
}
async *stream(prompt: string): AsyncGenerator<string> {
const full = await this.complete(prompt);
for (const char of full) {
yield char;
}
}
}
```
**Architecture Rationale:** Regex-based matching allows flexible prompt patterns without exact string coupling. Streaming simulation validates consumer-side chunk handling. This enables deterministic unit testing while preserving async iterator semantics.
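A sketch of the engine inside a vitest suite, exercising the streaming path end to end.
```typescript
import { describe, expect, it } from 'vitest';

describe('report generator', () => {
  const engine = new FixtureEngine(new Map([
    [/summarize/i, JSON.stringify({ summary: 'Fixture summary', confidence: 0.92 })]
  ]));

  it('reassembles chunks from a simulated stream', async () => {
    const chunks: string[] = [];
    for await (const chunk of engine.stream('Please summarize this report.')) {
      chunks.push(chunk);
    }
    const output = chunks.join('');
    expect(OutputValidator.evaluate(output, [{ type: 'json' }]).passed).toBe(true);
  });
});
```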
### Pitfall Guide
**1. Exact String Assertion Trap**
**Explanation:** Asserting `expect(output).toBe(expected)` fails on minor phrasing changes, temperature variance, or model updates.
**Fix:** Replace it with constraint validation, or with semantic similarity scoring (e.g., cosine similarity on embeddings) for critical paths, as sketched below.
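A minimal similarity scorer; the `embed()` call in the trailing comment is a placeholder for whatever embedding client you use.
```typescript
// Sketch: cosine similarity over embedding vectors. How you obtain the
// vectors (OpenAI, local model, etc.) is up to your provider client.
function cosineSimilarity(a: number[], b: number[]): number {
  let dot = 0, normA = 0, normB = 0;
  for (let i = 0; i < a.length; i++) {
    dot += a[i] * b[i];
    normA += a[i] * a[i];
    normB += b[i] * b[i];
  }
  return dot / (Math.sqrt(normA) * Math.sqrt(normB));
}

// Assert semantic closeness instead of string equality:
// expect(cosineSimilarity(embed(output), embed(expected))).toBeGreaterThan(0.9);
```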
**2. Prompt Hardcoding in Test Logic**
**Explanation:** Embedding prompts directly in test files couples validation to implementation, making prompt iteration impossible without test rewrites.
**Fix:** Externalize prompts to versioned configuration files or a dedicated prompt registry (see Step 2). Tests should reference only prompt keys.
**3. Ignoring Token Budget Enforcement**
**Explanation:** Providers honor `max_tokens` differently, and unvalidated truncation can cut off JSON mid-stream or drop critical fields.
**Fix:** Assert output length against token estimates, and implement fallback parsing for truncated responses; one repair heuristic is sketched below.
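One possible fallback heuristic: close the dangling scopes a truncated payload leaves open. This is a salvage path for logging and degraded handling, not a license to ignore token budgets.
```typescript
// Sketch: heuristic repair for truncated JSON. Appends the closing brackets
// implied by unclosed scopes; still throws if the payload is beyond repair.
function parseWithTruncationFallback(raw: string): unknown {
  try {
    return JSON.parse(raw);
  } catch {
    const stack: string[] = [];
    let inString = false;
    for (let i = 0; i < raw.length; i++) {
      const ch = raw[i];
      if (inString) {
        if (ch === '\\') i++;                 // skip escaped character
        else if (ch === '"') inString = false;
      } else if (ch === '"') inString = true;
      else if (ch === '{') stack.push('}');
      else if (ch === '[') stack.push(']');
      else if (ch === '}' || ch === ']') stack.pop();
    }
    const repaired = (inString ? raw + '"' : raw) + stack.reverse().join('');
    return JSON.parse(repaired);
  }
}
```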
**4. Mock-Production Reality Gap**
**Explanation:** Fixtures return perfectly formatted JSON, but production APIs return markdown-wrapped text, rate limit errors, or partial streams.
**Fix:** Run shadow tests against real endpoints in CI, and inject realistic failure modes (429s, 500s, malformed chunks) into mock suites, as in the wrapper sketched below.
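A sketch of failure injection layered over the fixture engine; the rates and error shapes here are illustrative, not provider-accurate.
```typescript
// Sketch: wrapping the fixture engine with injected failure modes.
export class ChaosEngine {
  constructor(
    private inner: FixtureEngine,
    private failureRate = 0.1,
    private random: () => number = Math.random // injectable for determinism
  ) {}

  async complete(prompt: string): Promise<string> {
    const roll = this.random();
    if (roll < this.failureRate / 2) {
      throw Object.assign(new Error('Rate limited'), { status: 429 });
    }
    if (roll < this.failureRate) {
      // simulate mid-stream truncation of the payload
      const full = await this.inner.complete(prompt);
      return full.slice(0, Math.floor(full.length / 2));
    }
    return this.inner.complete(prompt);
  }
}
```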
**5. Schema Over-Constraint**
**Explanation:** Zod schemas that treat every field as strictly required cause false failures when models omit optional metadata or reorder keys.
**Fix:** Use `.optional()`, `.catch()`, and `.passthrough()` strategically: validate critical fields strictly and allow graceful degradation for auxiliary data, as sketched below.
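A sketch of the strict/lenient split applied to the analysis schema from Step 3.
```typescript
import { z } from 'zod';

// Strict where it matters, forgiving elsewhere.
const LenientAnalysisSchema = z.object({
  confidence: z.number().min(0).max(1),       // critical: validated strictly
  summary: z.string().min(20).max(500),       // critical: validated strictly
  tags: z.array(z.string()).max(5).catch([]), // auxiliary: bad value degrades to []
  metadata: z.object({ model: z.string().optional() })
    .passthrough()                            // tolerate unknown provider fields
    .optional()                               // tolerate omission entirely
});
```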
**6. Context Window Blindness**
**Explanation:** Tests pass with short inputs but fail in production when conversation history exceeds context limits, causing silent truncation.
**Fix:** Inject realistic context padding into test cases (a helper is sketched below), and validate that critical instructions appear inside the retained window.
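A sketch of a padding helper, using the rough 4-characters-per-token estimate; swap in a real tokenizer for precision.
```typescript
// Sketch: pad a test input toward a target context size.
function withContextPadding(input: string, targetTokens: number): string {
  const unit = 'Prior conversation turn. ';
  const unitTokens = Math.ceil(unit.length / 4);   // rough estimate
  const inputTokens = Math.ceil(input.length / 4);
  const repeats = Math.max(0, Math.ceil((targetTokens - inputTokens) / unitTokens));
  // Filler goes first: providers usually drop the oldest turns, so the
  // critical instruction should survive at the end of the window.
  return unit.repeat(repeats) + input;
}
```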
7. Latency & Retry Neglect
Explanation: AI endpoints exhibit variable latency. Tests that assume instant responses miss timeout handling, retry logic, and circuit breaker behavior.
Fix: Simulate network jitter, enforce timeout assertions, and verify exponential backoff implementations in integration suites.
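A sketch of backoff verification using vitest fake timers; `withRetries` is a hypothetical helper standing in for your production retry logic.
```typescript
import { expect, it, vi } from 'vitest';

// Hypothetical retry helper with exponential backoff: 100ms, 200ms, 400ms...
async function withRetries<T>(fn: () => Promise<T>, maxAttempts = 3, baseMs = 100): Promise<T> {
  for (let attempt = 0; ; attempt++) {
    try {
      return await fn();
    } catch (err) {
      if (attempt + 1 >= maxAttempts) throw err;
      await new Promise((r) => setTimeout(r, baseMs * 2 ** attempt));
    }
  }
}

it('retries with exponential backoff on 429s', async () => {
  vi.useFakeTimers();
  const fn = vi.fn()
    .mockRejectedValueOnce(Object.assign(new Error('429'), { status: 429 }))
    .mockResolvedValueOnce('ok');
  const pending = withRetries(fn);
  await vi.runAllTimersAsync(); // flushes the scheduled backoff delay
  await expect(pending).resolves.toBe('ok');
  expect(fn).toHaveBeenCalledTimes(2);
  vi.useRealTimers();
});
```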
### Production Bundle
**Decision Matrix**
| Scenario | Recommended Approach | Why | Cost Impact |
|---|---|---|---|
| Internal dashboard with low user impact | Property-based constraints + mock fixtures | Fast iteration, low API cost, acceptable variance | Minimal |
| Customer-facing chatbot with strict compliance | Structured schema validation + regression thresholds | Prevents hallucination propagation, enforces contractual output | Moderate (validation overhead) |
| High-volume batch processing | Constraint validation + token budget enforcement | Optimizes throughput, prevents truncation failures | Low (reduces retry costs) |
| Multi-model routing system | Hybrid regression suite + shadow testing | Ensures consistent behavior across provider updates | High (requires parallel execution) |
**Configuration Template**
```typescript
// test/ai-harness.config.ts
import { defineConfig } from 'vitest/config';
export default defineConfig({
test: {
globals: true,
environment: 'node',
setupFiles: ['./test/setup.ts'],
coverage: {
provider: 'v8',
include: ['src/ai/**']
}
},
define: {
'AI_TEST_MODE': JSON.stringify(process.env.AI_TEST_MODE || 'mock'),
'AI_REGRESSION_THRESHOLD': JSON.stringify(0.85)
}
});
```
```typescript
// test/setup.ts
import { beforeAll } from 'vitest';
import { PromptVersionControl } from '../src/prompt-registry';
import { FixtureEngine } from '../src/fixture-engine';
beforeAll(async () => {
const registry = new PromptVersionControl();
registry.register('summarizer', 'v1', 'Summarize the following text concisely:');
registry.register('summarizer', 'v2', 'Provide a brief summary of:');
  // Untyped globals for brevity; declare them in a global.d.ts for type safety.
  (globalThis as any).promptRegistry = registry;
  (globalThis as any).fixtureEngine = new FixtureEngine(new Map([
[/summarize/i, JSON.stringify({ summary: 'Test summary', confidence: 0.92 })],
[/extract/i, JSON.stringify({ entities: ['John', 'Doe'], format: 'json' })]
]));
});
```
**Quick Start Guide**
- **Initialize the harness:** Install `zod` and `vitest`. Create `prompt-registry.ts`, `fixture-engine.ts`, and `output-validator.ts` using the templates above.
- **Configure environment toggles:** Set `AI_TEST_MODE=mock` for local runs and `AI_TEST_MODE=shadow` for CI validation against real endpoints.
- **Write constraint tests:** Define `OutputConstraint[]` for each feature. Replace `expect().toBe()` with `OutputValidator.evaluate()` (see the sketch after this list).
- **Enable regression gates:** Run `PromptVersionControl.runRegression()` in your CI pipeline before merging prompt changes. Block merges if the pass rate drops below the threshold.
- **Deploy with monitoring:** Instrument validation failure rates, latency percentiles, and token truncation events. Alert on schema parse failures exceeding 2% of requests.
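To close the loop, a minimal constraint test, assuming the `setup.ts` globals above; the module paths mirror the templates, with `output-validator.ts` placed alongside them.
```typescript
import { describe, expect, it } from 'vitest';
import { FixtureEngine } from '../src/fixture-engine';     // path follows setup.ts above
import { OutputValidator } from '../src/output-validator'; // assumed module layout

describe('summarizer feature', () => {
  it('meets output constraints under mock fixtures', async () => {
    // setup.ts registered this engine globally; cast since the global is untyped
    const engine = (globalThis as any).fixtureEngine as FixtureEngine;
    const output = await engine.complete('Please summarize this document.');
    const report = OutputValidator.evaluate(output, [
      { type: 'json' },
      { type: 'contains', value: 'summary' },
      { type: 'length', value: 4000 }
    ]);
    expect(report.passed).toBe(true); // granular failures live in report.violations
  });
});
```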