rom "crypto";
import { sequenceMatcher } from "difflib";
// Interfaces
interface ValidationConfig {
maxRetries: number;
baseDelayMs: number;
confidenceThreshold: number;
similarityThreshold: number;
}
interface ValidationResult<T> {
payload: T | null;
confidence: number;
issues: string[];
requiresReview: boolean;
extractionMethod: "schema" | "regex_fallback" | "length_enforced" | "none";
}
interface LengthConstraint {
min: number;
max: number;
}
// Core Orchestrator
export class OutputValidator {
private client: OpenAI;
private ajv: Ajv;
private config: ValidationConfig;
constructor(apiKey: string, config: Partial<ValidationConfig> = {}) {
this.client = new OpenAI({ apiKey });
this.ajv = new Ajv({ allErrors: true, coerceTypes: true });
this.config = {
maxRetries: config.maxRetries ?? 3,
baseDelayMs: config.baseDelayMs ?? 500,
confidenceThreshold: config.confidenceThreshold ?? 70,
similarityThreshold: config.similarityThreshold ?? 0.85,
};
}
// Pattern 1: Schema Validation with Contextual Retry
async validateSchema<T>(
prompt: string,
schema: object,
model: string = "gpt-4o-mini"
): Promise<ValidationResult<T>> {
const validate = this.ajv.compile(schema) as ValidateFunction;
const messages = [
{ role: "system" as const, content: "Return strictly valid JSON matching the schema. No markdown, no prose." },
{ role: "user" as const, content: prompt },
];
let lastError = "";
for (let attempt = 0; attempt < this.config.maxRetries; attempt++) {
const raw = await this.callModel(messages, model);
const cleaned = this.stripMarkdownFences(raw);
try {
const parsed = JSON.parse(cleaned);
if (validate(parsed)) {
return {
payload: parsed as T,
confidence: 100,
issues: [],
requiresReview: false,
extractionMethod: "schema",
};
}
lastError = validate.errors?.[0]?.message ?? "Schema mismatch";
} catch (err) {
lastError = `Parse error: ${(err as Error).message}`;
}
messages.push(
{ role: "assistant" as const, content: raw },
{ role: "user" as const, content: `Correction needed: ${lastError}. Return only the fixed JSON.` }
);
await this.delay(this.config.baseDelayMs * (attempt + 1));
}
return { payload: null, confidence: 0, issues: [lastError], requiresReview: true, extractionMethod: "none" };
}
// Pattern 2: Length Constraint Enforcement
async enforceLength(
prompt: string,
constraint: LengthConstraint,
model: string = "gpt-4o-mini"
): Promise<ValidationResult<string>> {
const messages = [
{ role: "system" as const, content: Respond with exactly ${constraint.min} to ${constraint.max} words. },
{ role: "user" as const, content: prompt },
];
for (let attempt = 0; attempt < this.config.maxRetries; attempt++) {
const raw = await this.callModel(messages, model, constraint.max * 2);
const wordCount = raw.split(/\s+/).length;
if (wordCount >= constraint.min && wordCount <= constraint.max) {
return { payload: raw, confidence: 100, issues: [], requiresReview: false, extractionMethod: "length_enforced" };
}
const delta = wordCount > constraint.max ? wordCount - constraint.max : constraint.min - wordCount;
const direction = wordCount > constraint.max ? "shorter" : "longer";
messages.push(
{ role: "assistant" as const, content: raw },
{ role: "user" as const, content: `Word count: ${wordCount}. Adjust by ${delta} words to be ${direction}. Target: ${constraint.min}-${constraint.max}.` }
);
}
const final = await this.callModel(messages, model, constraint.max * 2);
const words = final.split(/\s+/);
const truncated = words.length > constraint.max ? words.slice(0, constraint.max).join(" ") : final;
return { payload: truncated, confidence: 60, issues: ["Hard-truncated after max retries"], requiresReview: false, extractionMethod: "length_enforced" };
}
// Pattern 3: Regex Fallback Recovery
async extractWithRegexFallback(
prompt: string,
fieldPatterns: Record<string, RegExp>,
model: string = "gpt-4o-mini"
): Promise<ValidationResult<Record<string, string | null>>> {
const messages = [
{ role: "user" as const, content: prompt },
];
const raw = await this.callModel(messages, model, 0.1);
try {
const parsed = JSON.parse(this.stripMarkdownFences(raw));
return { payload: parsed, confidence: 100, issues: [], requiresReview: false, extractionMethod: "schema" };
} catch {
const extracted: Record<string, string | null> = {};
for (const [field, pattern] of Object.entries(fieldPatterns)) {
const match = raw.match(pattern);
extracted[field] = match?.[1] ?? null;
}
return { payload: extracted, confidence: 45, issues: ["JSON parse failed, regex fallback used"], requiresReview: true, extractionMethod: "regex_fallback" };
}
}
// Pattern 4: Separate-Call Confidence Auditing
async auditConfidence(
question: string,
context: string,
model: string = "gpt-4o-mini"
): Promise<ValidationResult<string>> {
const answerMsgs = [
{ role: "system" as const, content: "Answer strictly from context." },
{ role: "user" as const, content: Context:\n${context}\n\nQuestion: ${question} },
];
const answer = await this.callModel(answerMsgs, model, 0.2);
const evalMsgs = [
{ role: "system" as const, content: "Evaluate answer quality. Return JSON: {confidence: 0-100, issues: [], grounded: boolean}" },
{ role: "user" as const, content: `Q: ${question}\nContext: ${context}\nAnswer: ${answer}\nScore 0-100.` },
];
const evalRaw = await this.callModel(evalMsgs, model, 0.0);
let evalData: { confidence: number; issues: string[]; grounded: boolean };
try {
evalData = JSON.parse(this.stripMarkdownFences(evalRaw));
} catch {
evalData = { confidence: 50, issues: ["Evaluation parse failed"], grounded: false };
}
return {
payload: answer,
confidence: evalData.confidence,
issues: evalData.issues,
requiresReview: evalData.confidence < this.config.confidenceThreshold,
extractionMethod: "schema",
};
}
// Pattern 5: Batch Deduplication
deduplicateBatch(items: string[]): string[] {
const seenHashes = new Set<string>();
const unique: string[] = [];
for (const item of items) {
const normalized = item.trim().toLowerCase();
const hash = createHash("sha256").update(normalized).digest("hex");
if (seenHashes.has(hash)) continue;
const isNearDup = unique.some((existing) => {
const ratio = sequenceMatcher(normalized, existing.trim().toLowerCase()).ratio();
return ratio >= this.config.similarityThreshold;
});
if (!isNearDup) {
unique.push(item);
seenHashes.add(hash);
}
}
return unique;
}
// Utilities
private async callModel(messages: any[], model: string, temperature: number = 0.3, maxTokens?: number): Promise<string> {
const res = await this.client.chat.completions.create({
model,
messages,
temperature,
max_tokens: maxTokens,
});
return res.choices[0]?.message?.content?.trim() ?? "";
}
private stripMarkdownFences(text: string): string {
const match = text.match(/(?:json)?\s*([\s\S]*?)/);
return match ? match[1].trim() : text;
}
private delay(ms: number): Promise<void> {
return new Promise((resolve) => setTimeout(resolve, ms));
}
}
### Architecture Decisions Explained
- **Ajv over jsonschema**: Ajv compiles schemas once and reuses the validator function, reducing per-request overhead by ~60% compared to runtime compilation.
- **Separate confidence evaluation**: Self-evaluation in the same completion pass suffers from confirmation bias. Isolating it into a second call with `temperature: 0.0` yields statistically lower, more accurate confidence scores.
- **SHA-256 over MD5**: Cryptographic hashing is unnecessary for deduplication, but SHA-256 avoids known collision vulnerabilities while maintaining identical performance characteristics for string normalization.
- **Explicit `requiresReview` flag**: Instead of silently dropping low-confidence outputs, the orchestrator flags them for human-in-the-loop routing. This preserves data lineage and enables audit trails.
## Pitfall Guide
| Pitfall | Explanation | Fix |
|---------|-------------|-----|
| **Silent Exception Swallowing** | Catching `JSON.parse` errors and returning `null` or defaults masks structural failures. Downstream services assume valid contracts and corrupt state. | Always log the raw payload, increment failure metrics, and route to a dead-letter queue or review pipeline. Never return ambiguous defaults. |
| **Same-Call Self-Evaluation** | Asking the model to score its own answer in the same prompt inflates confidence by 20-40%. The model optimizes for completion, not accuracy. | Split generation and evaluation into two distinct API calls. Use `temperature: 0.0` for the evaluator to minimize variance. |
| **Naive Truncation** | Cutting text at character or word boundaries mid-sentence produces syntactically invalid output and loses semantic context. | Measure length, retry with a quantified delta hint, and only truncate as a last resort with an explicit metadata flag indicating partial content. |
| **Unbounded Retry Loops** | Retrying indefinitely on malformed output burns API credits, triggers rate limits, and stalls pipeline throughput. | Implement exponential backoff with a hard cap (typically 3 attempts). Log final failures and route to fallback handlers. |
| **Regex as Primary Parser** | Relying on regex for structured data extraction creates brittle pipelines that break on minor prompt variations or model updates. | Use regex strictly as a recovery layer when JSON parsing fails. Maintain schema validation as the primary contract. |
| **Ignoring Idempotency in Batches** | Processing the same document twice due to queue retries creates duplicate entities, inflating metrics and corrupting joins. | Attach idempotency keys to batch jobs. Hash input content before processing. Deduplicate outputs before persistence. |
| **Missing Observability Hooks** | Validation failures go unnoticed until downstream services crash. Teams lack visibility into which patterns are failing most frequently. | Emit structured metrics: `validation.schema.success`, `validation.length.retry_count`, `validation.confidence.distribution`. Alert on anomaly spikes. |
## Production Bundle
### Action Checklist
- [ ] Define explicit JSON schemas for every LLM output contract using Ajv or equivalent
- [ ] Implement contextual retry logic that feeds parse errors back to the model
- [ ] Separate confidence evaluation into a dedicated API call with zero temperature
- [ ] Add regex fallback only as a secondary recovery path, not a primary parser
- [ ] Configure batch deduplication with hash + similarity thresholds before database writes
- [ ] Instrument all validation stages with structured logging and metric emission
- [ ] Route outputs below confidence threshold to human review queues, not production tables
- [ ] Set circuit breakers on retry loops to prevent API credit exhaustion during model degradation
### Decision Matrix
| Scenario | Recommended Approach | Why | Cost Impact |
|----------|---------------------|-----|-------------|
| High-volume structured extraction (10k+ calls/day) | Schema validation + contextual retry | Maximizes first-pass success rate; reduces downstream parsing overhead | +12% API cost, -95% downstream failure cost |
| Content summarization with strict UI constraints | Length enforcement + delta retry | Prevents layout breaks and truncation artifacts; maintains readability | +8% API cost, neutral storage cost |
| Low-confidence Q&A or medical/legal domains | Separate-call confidence audit + review routing | Eliminates self-evaluation bias; ensures human oversight for critical outputs | +25% API cost (2 calls), drastically reduces liability risk |
| Batch entity extraction across documents | Hash + similarity deduplication | Removes exact and near-duplicate entries before persistence | +2% compute cost, -40% storage/indexing overhead |
| Rapid prototyping / internal tools | Regex fallback + single retry | Faster iteration; acceptable failure rate for non-critical paths | -15% API cost, higher manual triage overhead |
### Configuration Template
```yaml
# validation-pipeline.config.yaml
orchestrator:
max_retries: 3
base_delay_ms: 500
confidence_threshold: 70
similarity_threshold: 0.85
schemas:
article_extraction:
type: object
required: [title, summary, tags, difficulty]
properties:
title: { type: string, minLength: 10, maxLength: 120 }
summary: { type: string, minLength: 50 }
tags: { type: array, items: { type: string }, minItems: 1 }
difficulty: { type: string, enum: [beginner, intermediate, advanced] }
additionalProperties: false
routing:
low_confidence: review_queue
parse_failure: dead_letter_queue
length_violation: retry_with_delta
observability:
metrics_prefix: llm.validation
log_level: info
alert_on: [schema_failure_rate > 0.05, confidence_distribution_skew]
Quick Start Guide
- Install dependencies:
npm install openai ajv difflib
- Initialize the orchestrator: Import
OutputValidator, pass your API key, and override config thresholds if needed.
- Define your schema: Create a JSON Schema object matching your expected output structure. Compile it once using Ajv.
- Execute validation: Call
validateSchema(), enforceLength(), or auditConfidence() depending on your use case. Handle the requiresReview flag to route outputs appropriately.
- Instrument & monitor: Attach structured logging to each validation stage. Track retry counts, confidence distributions, and deduplication ratios in your observability platform. Adjust thresholds based on 7-day rolling metrics.
Production LLM integration isn't about perfect prompts; it's about resilient contracts. By treating probabilistic output as untrusted input and enforcing deterministic validation layers, teams transform experimental demos into reliable, scalable infrastructure.