pe LogisticsRecord = z.infer<typeof LogisticsSchema>;
/**
 * Asks the LLM to extract shipment details from free-form text and validates
 * the reply against `LogisticsSchema`.
 *
 * @param rawText - Unstructured text to extract shipment details from. It is
 *   interpolated into the prompt verbatim.
 * @returns The schema-validated record, or `null` when the reply is not valid
 *   JSON or fails schema validation.
 */
async function extractLogistics(rawText: string): Promise<LogisticsRecord | null> {
  const prompt = `
Extract shipment details from the provided text.
Return ONLY valid JSON matching the required schema.
Use null or omit fields if information is absent.
Do not include explanations or markdown formatting.
Schema:
{
"shipmentId": "string (UUID)",
"origin": "string",
"destination": "string",
"weightKg": "number",
"hazardous": "boolean",
"specialHandling": ["string"]
}
Input: ${rawText}
`;
  const response = await llmClient.complete(prompt);
  try {
    // Remove only a Markdown code fence wrapping the whole reply. The patterns
    // are anchored to the start/end so whitespace and the word "json" inside
    // string values (e.g. "New York") are left untouched.
    const cleaned = response
      .trim()
      .replace(/^```(?:json)?\s*/i, '')
      .replace(/\s*```$/, '');
    return LogisticsSchema.parse(JSON.parse(cleaned));
  } catch {
    // Malformed JSON and schema violations are both reported as null.
    return null; // Triggers retry or manual review queue
  }
}
**Why this works:** The model handles fuzzy language normalization, while the application enforces the contract. Validation failures route to a retry queue with a narrowed prompt or escalate to human review. This eliminates format drift and ensures downstream services never receive malformed payloads.
### 2. Deterministic Context Injection for Drafting
Generative drafting fails when models are asked to recall policies, pricing, or account states. The solution is to decouple fact retrieval from language synthesis. Your application owns the truth; the model owns the tone.
**Architecture Decision:** Build a verified context object from authoritative sources before invoking the LLM. Never allow the model to infer business rules.
```typescript
/**
 * Fact set serialized into the drafting prompt by `generateSupportReply`.
 * Instances are built by `assembleContext` from order and account data.
 */
interface DraftContext {
/** Account tier; `assembleContext` copies it from `account.tier`. */
customerTier: 'standard' | 'premium' | 'enterprise';
/** Order status; `assembleContext` copies it from `order.status`. */
orderStatus: string;
/** Whether a refund may be offered; computed in `assembleContext`. */
refundEligible: boolean;
/** Refund amount: the order total or 0 (currency/units not shown here — TODO confirm). */
refundAmount: number;
/** Policy version label; `assembleContext` sets it to '2024-Q3'. */
policyVersion: string;
}
/**
 * Builds the fact set passed to the drafting prompt from order and account data.
 *
 * Refund eligibility is decided once and reused for the amount, so the
 * returned `refundEligible` flag and `refundAmount` can never contradict each
 * other.
 *
 * @param order - Source of the order status and total.
 * @param account - Source of the customer tier.
 * @returns The context object; `refundAmount` is `order.total` when the refund
 *   is eligible and 0 otherwise.
 */
function assembleContext(order: Order, account: Account): DraftContext {
  // Eligible when the account is premium or the order has been delivered.
  const refundEligible = account.tier === 'premium' || order.status === 'delivered';
  return {
    customerTier: account.tier,
    orderStatus: order.status,
    refundEligible,
    refundAmount: refundEligible ? order.total : 0,
    policyVersion: '2024-Q3'
  };
}
/**
 * Drafts a customer support reply from a pre-assembled fact set.
 *
 * The context is serialized as pretty-printed JSON and appended to a fixed
 * instruction header; the combined prompt is sent to `llmClient.complete`.
 *
 * @param context - Facts the model is instructed to rely on exclusively.
 * @returns Whatever `llmClient.complete` resolves to for the prompt.
 */
async function generateSupportReply(context: DraftContext): Promise<string> {
  // Leading '' yields the newline that opens the prompt.
  const instructions = [
    '',
    'Draft a customer support response using ONLY the provided facts.',
    'Do not invent policy details, pricing, or account states.',
    'Keep the response under 150 words. Use a professional tone.',
    'Facts:'
  ].join('\n');
  const facts = JSON.stringify(context, null, 2);
  return llmClient.complete(`${instructions}\n${facts}\n`);
}
```
**Why this works:** The model operates as a stylistic transformer rather than a knowledge source. Business logic remains in typed application code, making updates predictable and auditable. When policies change, you update the context assembler, not the prompt.
### 3. Confidence-Gated Classification Routing
Automated classification is highly effective when paired with explicit confidence scoring. Forcing a model to make binary decisions on ambiguous inputs creates silent failures. Instead, route based on confidence bands.
**Architecture Decision:** Request both a label and a confidence score. Implement tiered routing: high confidence auto-processes, medium confidence queues with suggestions, low confidence falls back to legacy logic.
/**
 * Label and confidence score returned by `classifyTicket` and consumed by
 * `routeTicket`.
 */
interface ClassificationResult {
/** One of the four ticket categories listed in the classification prompt. */
category: 'billing' | 'technical' | 'account' | 'other';
/** Score compared against the 0.85 and 0.60 routing thresholds in `routeTicket`. */
confidence: number; // 0.0 to 1.0
}
/**
 * Asks the LLM to classify a support ticket and validates the reply before
 * returning it.
 *
 * The reply is untrusted: it is parsed as `unknown` and accepted only when
 * `category` is one of the four known labels and `confidence` is a number in
 * [0, 1]. Any other shape yields `{ category: 'other', confidence: 0 }`, the
 * lowest-confidence result, so callers that gate on confidence take their
 * most conservative path.
 *
 * @param text - Ticket text; interpolated into the prompt verbatim.
 * @returns A validated classification, or the zero-confidence fallback.
 * @throws SyntaxError when the reply is not parseable JSON (unchanged from the
 *   previous behaviour).
 */
async function classifyTicket(text: string): Promise<ClassificationResult> {
  const prompt = `
Classify the support ticket into one of: billing, technical, account, other.
Return JSON with "category" and "confidence" (0.0-1.0).
Do not output anything else.
Input: ${text}
`;
  const raw = await llmClient.complete(prompt);
  const parsed: unknown = JSON.parse(raw);

  if (typeof parsed === 'object' && parsed !== null) {
    const { category, confidence } = parsed as Record<string, unknown>;
    const knownCategory =
      category === 'billing' ||
      category === 'technical' ||
      category === 'account' ||
      category === 'other';
    // typeof guards against string scores such as "0.9", which would otherwise
    // pass a numeric >= comparison through implicit coercion.
    if (knownCategory && typeof confidence === 'number' && confidence >= 0 && confidence <= 1) {
      return { category: category as ClassificationResult['category'], confidence };
    }
  }

  // Unusable shape: report zero confidence rather than an object that lies about its type.
  return { category: 'other', confidence: 0 };
}
/**
 * Classifies a ticket and dispatches it to one of three handlers based on the
 * reported confidence.
 *
 * - confidence >= 0.85: `autoProcess(category, text)`
 * - confidence >= 0.60: `queueForReview(text, result)`
 * - anything else: `fallbackToLegacyRouting(text)`
 *
 * @param text - Ticket text to classify and route.
 */
async function routeTicket(text: string): Promise<void> {
  const verdict = await classifyTicket(text);

  if (verdict.confidence >= 0.85) {
    await autoProcess(verdict.category, text);
    return;
  }

  if (verdict.confidence >= 0.60) {
    await queueForReview(text, verdict);
    return;
  }

  // Reached for every score that fails both comparisons above.
  await fallbackToLegacyRouting(text);
}
**Why this works:** Confidence thresholds create a controlled automation ramp. You can start conservative, inspect medium-confidence cases, and gradually adjust thresholds as eval data proves reliability. This prevents catastrophic misrouting while enabling progressive automation.
### 4. Curated Corpus Retrieval (RAG)
Retrieval-Augmented Generation fails when source material is inconsistent, duplicated, or poorly scoped. The model cannot compensate for garbage input. Corpus hygiene precedes chunking strategy.
**Architecture Decision:** Clean, deduplicate, and tag source documents before indexing. Restrict retrieval to specific product boundaries, customer tiers, or service areas. Smaller, high-signal contexts outperform large, noisy ones.
/**
 * A retrievable piece of source material plus the metadata that
 * `retrieveScopedContext` filters and sorts on.
 */
interface DocumentChunk {
/** Chunk identifier. */
id: string;
/** Chunk text. */
content: string;
metadata: {
/** Product tag; the retrieval filter matches it against `scope.product`. */
product: string;
/** Last-update stamp, compared as a string (presumably an ISO date — TODO confirm). */
lastUpdated: string;
/** Owner tag; chunks whose owner is 'deprecated' are dropped at retrieval time. */
owner: string;
/** Version label; not read by the retrieval code shown alongside this type. */
version: string;
};
}
/**
 * Retrieves at most two fresh, non-deprecated, non-duplicate chunks for a query
 * within a product/tier scope.
 *
 * @param query - Search text passed to `vectorStore.search`.
 * @param scope - Product and tier used as metadata filters for the search.
 * @returns Up to two chunks, newest `lastUpdated` first. When several
 *   candidates share an `id` or have identical `content`, only the newest is
 *   kept so a duplicate cannot take up a returned slot.
 */
async function retrieveScopedContext(
  query: string,
  scope: { product: string; tier: string }
): Promise<DocumentChunk[]> {
  // 1. Pass the metadata filter to the store together with the query.
  const candidates = await vectorStore.search(query, {
    filter: {
      product: scope.product,
      tier: scope.tier,
      lastUpdated: { $gte: '2024-01-01' }
    },
    limit: 3
  });

  // 2. Drop deprecated chunks, order newest first, then deduplicate.
  // NOTE(review): string comparison assumes `lastUpdated` is an ISO-style date — confirm.
  const seenIds = new Set<string>();
  const seenContent = new Set<string>();
  return candidates
    .filter(chunk => chunk.metadata.owner !== 'deprecated')
    .sort((a, b) => b.metadata.lastUpdated.localeCompare(a.metadata.lastUpdated))
    .filter(chunk => {
      // Runs after the sort, so the first occurrence kept is the newest copy.
      if (seenIds.has(chunk.id) || seenContent.has(chunk.content)) {
        return false;
      }
      seenIds.add(chunk.id);
      seenContent.add(chunk.content);
      return true;
    })
    .slice(0, 2);
}
**Why this works:** Metadata filtering reduces noise before embedding comparison. Freshness checks prevent stale policy injection. Scoping ensures the model receives only relevant context, reducing hallucination and token waste.
## Pitfall Guide
### 1. Prompt Drift from Schema Changes
**Explanation:** Product teams add fields, rename enums, or change status codes without updating prompts or validation schemas. The pipeline continues processing, but outputs silently degrade or fail validation.
**Fix:** Version prompts alongside application code. Implement schema migration tests that run against a static eval set before deployment. Use contract testing to verify prompt output matches the current Zod/TypeScript interface.
### 2. Evaluating Only on Clean, Hand-Picked Data
**Explanation:** Eval sets built on clean, hand-picked examples fail to capture OCR noise, copied email threads, mixed languages, or sarcastic phrasing. Production breaks on edge cases that never appeared in staging.
**Fix:** Build eval datasets from actual production logs, including failures. Inject synthetic noise (truncated text, special characters, missing fields) during testing. Run adversarial prompts weekly to measure degradation.
### 3. Missing Idempotency and Retry Semantics
**Explanation:** LLM calls time out, get rate-limited, or return malformed JSON. Without idempotency keys and queue semantics, retries create duplicate records or lost jobs.
**Fix:** Assign a deterministic jobId to each request. Use a message queue with exactly-once processing guarantees; in practice this means at-least-once delivery combined with consumers that are idempotent on the jobId. Implement exponential backoff with jitter. Log all retry attempts with correlation IDs for tracing.
### 4. Vague Success Metrics
**Explanation:** Teams measure "helpfulness" or "user satisfaction" without quantifiable baselines. Improvements cannot be validated, and regressions go unnoticed.
**Fix:** Define metrics before deployment: exact field match rate, validation pass percentage, handle time reduction, deflection rate, or human acceptance rate. Track these daily and alert on threshold breaches.
### 5. Over-Chunking Knowledge Bases
**Explanation:** Splitting documents into tiny fragments loses semantic context. Retrieving multiple small chunks creates contradictory or incomplete answers.
**Fix:** Chunk by logical sections (headings, paragraphs, code blocks). Maintain a minimum chunk size (250-500 tokens). Use overlapping windows (10-15%) to preserve context across chunk boundaries. Validate retrieval quality against a gold-standard Q&A set.
### 6. Treating LLM Output as Trusted Data
**Explanation:** Downstream services consume model output without validation, assuming the prompt guarantees correctness. This leads to data corruption and cascading failures.
**Fix:** Never trust LLM output. Always run it through a strict validator. Treat the model as an untrusted external API. Implement circuit breakers that halt processing if validation failure rates exceed 5%.
### 7. Unbounded Token Consumption
**Explanation:** Open-ended prompts generate verbose outputs, inflating costs and latency when no caps exist on response length or context window usage.
**Fix:** Enforce max_tokens limits. Use structured output formats to reduce verbosity. Implement context window budgeting: reserve 20% for model output, 80% for input. Monitor cost per task and alert on anomalies.
## Production Bundle
### Action Checklist
### Decision Matrix
| Scenario | Recommended Approach | Why | Cost Impact |
|---|---|---|---|
| High-volume form parsing | Schema-Bound Extraction | Deterministic validation prevents downstream corruption | Low (fixed tokens per record) |
| Customer email drafting | Deterministic Context Injection | Prevents policy hallucination; maintains brand voice | Medium (context assembly + generation) |
| Ticket routing/classification | Confidence-Gated Routing | Enables progressive automation with safe fallbacks | Low (classification is cheap; review is human) |
| Internal knowledge search | Curated Corpus Retrieval | Small, clean contexts outperform large noisy ones | Medium (vector storage + scoped search) |
| Open-ended brainstorming | Not Recommended for Production | Unbounded output, no validation path, high drift risk | High (variable tokens, manual review required) |
### Configuration Template
// llm-pipeline.config.ts
/**
 * Central settings for the four pipeline stages (extraction, drafting,
 * routing, retrieval) plus observability. This object only declares values;
 * how each one is consumed is not shown in this file.
 */
export const PipelineConfig = {
// Schema-bound extraction: schema name, retry budget, and failure queue.
extraction: {
schema: 'LogisticsSchema',
maxRetries: 3,
retryDelayMs: 1000,
fallbackQueue: 'manual-review'
},
// Reply drafting: context source plus generation limits.
drafting: {
contextSource: 'orderService',
maxTokens: 200,
temperature: 0.3,
policyVersion: '2024-Q3'
},
// Confidence bands and target queues; 0.85 / 0.60 match the thresholds used in routeTicket.
routing: {
highConfidence: 0.85,
mediumConfidence: 0.60,
autoProcessQueue: 'ticket-auto',
reviewQueue: 'ticket-review',
fallbackQueue: 'ticket-legacy'
},
// Retrieval limits and scoping keys.
retrieval: {
maxChunks: 3,
minFreshnessDays: 90,
scopeFilter: ['product', 'tier'],
deduplicate: true
},
// Tracing flags, alert threshold (0.05 = 5% validation failures), and eval schedule.
observability: {
traceLatency: true,
trackTokenUsage: true,
alertOnValidationFailureRate: 0.05,
evalRunSchedule: '0 0 * * 5' // Cron: 00:00 every Friday
}
};
### Quick Start Guide
- Pick a narrow workflow: Select a repetitive text-processing task your team already handles manually (e.g., invoice parsing, ticket classification, status updates).
- Define the contract: Write a strict TypeScript/Zod schema for the expected output. Build a static eval set of 50-100 real examples, including edge cases.
- Implement validation first: Create the extraction or classification function with runtime schema validation. Route failures to a review queue. Do not deploy without this guardrail.
- Add confidence routing: If classifying, implement threshold bands. Auto-process high confidence, queue medium, fallback low. Monitor pass rates for 7 days.
- Instrument and iterate: Enable latency, token, and validation tracking. Run weekly evals against your dataset. Adjust thresholds or prompts only when data proves improvement.