g them creates ambiguous signals.
3. Async Batch Processing: Evaluation calls are I/O bound. We use Promise.allSettled to compute the metrics for each sample concurrently, so one failed judge call does not discard the other scores.
4. Threshold-Based Alerting: Raw scores are useless without context. We implement configurable thresholds that trigger pipeline remediation workflows.
Implementation
import { OpenAI } from "openai";
import type { ChatCompletionMessageParam } from "openai/resources/chat/completions";
/**
 * One RAG interaction to be scored: the question asked, the chunks that were
 * retrieved for it, and the answer the pipeline generated.
 */
interface EvaluationSample {
/** The user's question; interpolated into every judge prompt. */
question: string;
/** Retrieved chunks, in the order given; joined with newlines or numbered `[1]`, `[2]`, … in the judge prompts. */
retrievedContext: string[];
/** The pipeline's answer; used by the Faithfulness and Answer Relevancy prompts. */
generatedAnswer: string;
/** Reference answer. When absent or empty, `evaluate` does not run Context Recall and reports it as skipped. */
groundTruth?: string; // Optional, required only for Context Recall
}
/** Outcome of one metric for one sample. */
interface MetricResult {
/** Metric display name, e.g. "Faithfulness" or "Context Recall". */
name: string;
/** Score reported by the judge (the prompts request a value in 0-1); 0 when the metric failed or was skipped. */
score: number;
/** The judge's stated reason, or a status message when the metric failed or was skipped. */
details: string;
/** Whether the score met the configured threshold; false for failed metrics, true for a skipped Context Recall. */
passed: boolean;
}
/** Settings consumed by `RagPipelineEvaluator`. */
interface EvaluationConfig {
/** Minimum score per metric; a metric is marked passed when `score >= threshold`. */
thresholds: {
faithfulness: number;
answerRelevancy: number;
contextPrecision: number;
contextRecall: number;
};
/** Model name sent as `model` on every judge chat-completion request. */
judgeModel: string;
/** Retry budget for judge calls. NOTE(review): verify where this value is applied. */
maxRetries: number;
}
/**
 * Scores a single RAG sample with an LLM judge on four metrics: Faithfulness,
 * Answer Relevancy, Context Precision and (when ground truth is supplied)
 * Context Recall.
 *
 * Each metric is one chat-completion request to `config.judgeModel`. The
 * requests run concurrently, and a failure in one metric is reported as a
 * failed `MetricResult` instead of rejecting the whole evaluation.
 */
class RagPipelineEvaluator {
  private readonly client: OpenAI;
  private readonly config: EvaluationConfig;

  /**
   * @param apiKey API key handed to the OpenAI client.
   * @param config Thresholds, judge model and retry budget.
   */
  constructor(apiKey: string, config: EvaluationConfig) {
    // Forward the configured retry budget so `maxRetries` is actually honoured
    // by the SDK instead of silently falling back to the client default.
    this.client = new OpenAI({ apiKey, maxRetries: config.maxRetries });
    this.config = config;
  }

  /**
   * Computes all metrics for one sample.
   *
   * @param sample The question, retrieved chunks, generated answer and optional ground truth.
   * @returns Four results in fixed order: Faithfulness, Answer Relevancy,
   *   Context Precision, Context Recall. This method does not reject on judge
   *   failures; a failed metric comes back with `passed: false`.
   */
  async evaluate(sample: EvaluationSample): Promise<MetricResult[]> {
    const [faithfulness, relevancy, precision, recall] = await Promise.allSettled([
      this.computeFaithfulness(sample),
      this.computeAnswerRelevancy(sample),
      this.computeContextPrecision(sample),
      sample.groundTruth ? this.computeContextRecall(sample) : Promise.resolve(null),
    ]);

    return [
      this.unwrap("Faithfulness", faithfulness),
      this.unwrap("Answer Relevancy", relevancy),
      this.unwrap("Context Precision", precision),
      this.resolveRecall(recall),
    ];
  }

  /** Asks the judge what fraction of the answer's claims the retrieved context supports. */
  private async computeFaithfulness(sample: EvaluationSample): Promise<MetricResult> {
    const prompt: ChatCompletionMessageParam[] = [
      { role: "system", content: "You are an evaluation judge. Analyze the generated answer against the retrieved context. Return a JSON object with 'score' (0-1) and 'reason'." },
      { role: "user", content: `Question: ${sample.question}\nContext: ${sample.retrievedContext.join("\n")}\nAnswer: ${sample.generatedAnswer}\n\nEvaluate: What fraction of the answer's claims are directly supported by the context?` },
    ];
    return this.judgeMetric("Faithfulness", this.config.thresholds.faithfulness, prompt);
  }

  /** Asks the judge how directly the generated answer addresses the original question. */
  private async computeAnswerRelevancy(sample: EvaluationSample): Promise<MetricResult> {
    const prompt: ChatCompletionMessageParam[] = [
      { role: "system", content: "You are an evaluation judge. Generate 3 hypothetical questions that the provided answer would directly address. Then calculate the average semantic similarity to the original question. Return JSON with 'score' (0-1) and 'reason'." },
      { role: "user", content: `Original Question: ${sample.question}\nGenerated Answer: ${sample.generatedAnswer}\n\nEvaluate: How directly does the answer address the original question?` },
    ];
    return this.judgeMetric("Answer Relevancy", this.config.thresholds.answerRelevancy, prompt);
  }

  /** Asks the judge whether the most useful retrieved chunks are ranked first. */
  private async computeContextPrecision(sample: EvaluationSample): Promise<MetricResult> {
    const prompt: ChatCompletionMessageParam[] = [
      { role: "system", content: "You are an evaluation judge. Evaluate each retrieved chunk for its usefulness in answering the question. Return JSON with 'score' (0-1, ranking-aware) and 'reason'." },
      { role: "user", content: `Question: ${sample.question}\nRetrieved Chunks:\n${sample.retrievedContext.map((c, i) => `[${i+1}] ${c}`).join("\n")}\n\nEvaluate: Are the most useful chunks ranked first? Penalize noisy or poorly ordered retrieval.` },
    ];
    return this.judgeMetric("Context Precision", this.config.thresholds.contextPrecision, prompt);
  }

  /**
   * Asks the judge how much of the ground-truth answer the retrieved context covers.
   *
   * @throws Error when the sample has no ground truth.
   */
  private async computeContextRecall(sample: EvaluationSample): Promise<MetricResult> {
    if (!sample.groundTruth) throw new Error("Ground truth required for Context Recall");
    const prompt: ChatCompletionMessageParam[] = [
      { role: "system", content: "You are an evaluation judge. Decompose the ground truth answer into factual statements. Determine what percentage can be attributed to the retrieved context. Return JSON with 'score' (0-1) and 'reason'." },
      { role: "user", content: `Question: ${sample.question}\nContext: ${sample.retrievedContext.join("\n")}\nGround Truth: ${sample.groundTruth}\n\nEvaluate: How much of the ground truth is covered by the retrieved context?` },
    ];
    return this.judgeMetric("Context Recall", this.config.thresholds.contextRecall, prompt);
  }

  /** Sends one prompt to the judge and turns its verdict into a thresholded result. */
  private async judgeMetric(
    name: string,
    threshold: number,
    prompt: ChatCompletionMessageParam[],
  ): Promise<MetricResult> {
    const { score, reason } = await this.callJudge(prompt);
    return { name, score, details: reason, passed: score >= threshold };
  }

  /**
   * Calls the judge model in JSON mode at temperature 0 and validates the reply.
   *
   * @returns The judge's score and reason (empty string when no string reason was given).
   * @throws Error when the reply is empty, is not a JSON object, or its
   *   `score` is not a finite number in [0, 1]; `JSON.parse` may also throw
   *   on malformed JSON.
   */
  private async callJudge(prompt: ChatCompletionMessageParam[]): Promise<{ score: number; reason: string }> {
    const completion = await this.client.chat.completions.create({
      model: this.config.judgeModel,
      messages: prompt,
      response_format: { type: "json_object" },
      temperature: 0,
    });

    const raw = completion.choices[0]?.message.content;
    if (!raw) throw new Error("Judge returned an empty response");

    const parsed: unknown = JSON.parse(raw);
    if (typeof parsed !== "object" || parsed === null) {
      throw new Error("Judge response is not a JSON object");
    }

    const { score, reason } = parsed as { score?: unknown; reason?: unknown };
    // Reject rather than clamp: a score outside 0-1 (e.g. a percentage) means
    // the judge ignored the scale, so the value cannot be trusted.
    if (typeof score !== "number" || !Number.isFinite(score) || score < 0 || score > 1) {
      throw new Error(`Judge returned an invalid score: ${JSON.stringify(score)}`);
    }
    return { score, reason: typeof reason === "string" ? reason : "" };
  }

  /** Returns the metric's value when its promise fulfilled, otherwise a failed result. */
  private unwrap(metric: string, outcome: PromiseSettledResult<MetricResult>): MetricResult {
    return outcome.status === "fulfilled"
      ? outcome.value
      : this.createErrorResult(metric, outcome.reason);
  }

  /**
   * Maps the Context Recall outcome to a result. A rejection is a failure;
   * only a fulfilled `null` (no ground truth supplied) counts as skipped.
   */
  private resolveRecall(outcome: PromiseSettledResult<MetricResult | null>): MetricResult {
    if (outcome.status === "rejected") {
      return this.createErrorResult("Context Recall", outcome.reason);
    }
    return outcome.value ?? { name: "Context Recall", score: 0, details: "Skipped (no ground truth)", passed: true };
  }

  /**
   * Builds the result reported for a metric whose computation failed.
   *
   * @param metric Metric display name.
   * @param cause Optional rejection reason; its message is appended to `details`.
   */
  private createErrorResult(metric: string, cause?: unknown): MetricResult {
    let message = "";
    if (cause instanceof Error) {
      message = cause.message;
    } else if (cause !== undefined && cause !== null) {
      message = String(cause);
    }
    const details = message ? `Evaluation failed: ${message}` : "Evaluation failed";
    return { name: metric, score: 0, details, passed: false };
  }
}
Why This Architecture Works
- Judge Model Isolation: The evaluator uses a dedicated `judgeModel` parameter. In production, you should route evaluation calls to a cost-optimized model (e.g., gpt-4o-mini or claude-3-haiku) rather than your primary generation model. This reduces evaluation costs by 60-80% while maintaining scoring consistency.
- Temperature Zero for Judges: LLM judges should be as deterministic as possible. Setting `temperature: 0` minimizes variance in scoring across repeated evaluations of the same sample.
- Promise.allSettled Pattern: Evaluation pipelines often process hundreds of samples. Using `allSettled` ensures one failed metric computation doesn't collapse the entire batch. You can log failures and continue scoring.
- Threshold-Driven Pass/Fail: Raw scores (e.g., `0.78`) are meaningless without business context. The `passed` flag enables automated CI gates, alerting, and rollback triggers.
Pitfall Guide
1. Treating All Metrics as Equally Critical
Explanation: Teams often enforce uniform thresholds across all four metrics. This is inefficient. A customer support bot prioritizes Faithfulness and Answer Relevancy, while a legal research tool demands high Context Recall.
Fix: Implement role-based threshold profiles. Define metric weights per use case and only block deployments when critical metrics breach their specific limits.
2. Ignoring the Ground-Truth Dependency for Context Recall
Explanation: Context Recall requires a reference answer to decompose into attributable statements. Teams attempting to compute it without ground truth receive meaningless scores or runtime errors.
Fix: Maintain a curated evaluation dataset with verified answers for your primary use cases. Use synthetic data generation carefully, but validate synthetic ground truth against domain experts before feeding it into the recall metric.
3. Over-Reliance on LLM Judges Without Calibration
Explanation: LLM-as-judge scoring exhibits model bias. A judge model trained on similar data as your generator may over-score outputs, while a stricter model may under-score them.
Fix: Calibrate your judge against a human-labeled validation set of 50-100 samples. Measure inter-rater agreement (Cohen's Kappa) and adjust thresholds until judge scores align with human consensus. Rotate judge models quarterly to detect drift.
4. Chasing High Scores Without Adjusting Retrieval Architecture
Explanation: Teams tweak prompts to improve Context Precision or Faithfulness, ignoring that the root cause is often chunking strategy or embedding quality. Prompt engineering has diminishing returns on retrieval defects.
Fix: When Context Precision drops, implement a cross-encoder reranker (Cohere Rerank, BGE-Reranker, Jina) as a second-stage filter. When Context Recall lags, increase top-K retrieval depth or switch to parent-document chunking before touching the prompt.
5. Running Evaluations Only Post-Deployment
Explanation: Treating RAG evaluation as a quarterly audit misses regression bugs introduced by embedding model updates, vector store migrations, or prompt changes.
Fix: Integrate the evaluator into your CI/CD pipeline. Run evaluation suites on every pull request that modifies retrieval logic, chunking parameters, or system prompts. Fail builds when critical metrics drop below baseline.
6. Misinterpreting Low Answer Relevancy as a Generator-Only Issue
Explanation: A low Answer Relevancy score often points to the generator, but it can also indicate that the retriever returned tangentially related chunks. The LLM faithfully summarizes noise, producing a coherent but off-topic response.
Fix: Cross-reference Answer Relevancy with Context Precision. If both are low, the retriever is pulling adjacent but irrelevant documents. Implement query rewriting or semantic query expansion before vector search.
7. Using High Temperature for Factual RAG Tasks
Explanation: Temperature controls output variance. Values above 0.3 introduce creative drift, causing the generator to fill context gaps with parametric memory rather than admitting uncertainty.
Fix: Lock temperature to 0 or 0.1 for all factual RAG pipelines. If the system needs to express uncertainty, handle it via prompt instructions ("If the context lacks sufficient information, state: 'I cannot answer based on the provided documents.'"), not stochastic sampling.
Production Bundle
Action Checklist
Decision Matrix
| Scenario | Recommended Approach | Why | Cost Impact |
|---|---|---|---|
| Low-budget internal tool | Use gpt-4o-mini as judge, evaluate 100 samples/week, threshold at 0.65 | Balances accuracy with compute spend; sufficient for internal SLAs | ~$15-30/month |
| Customer-facing support bot | Use claude-3-5-sonnet as judge, evaluate 500 samples/week, threshold at 0.85, enforce CI gates | High accuracy requirement; hallucinations directly impact brand trust | ~$120-200/month |
| Latency-sensitive real-time API | Pre-compute metrics on static eval set, cache results, only evaluate on pipeline changes | Avoids evaluation overhead during user requests; shifts cost to offline batch | Near-zero runtime impact |
| Legal/medical compliance | Human-in-the-loop validation for Context Recall, strict Faithfulness threshold (0.95), audit logs | Regulatory requirements demand verifiable grounding and traceability | High (human review + premium judge models) |
Configuration Template
// rag-eval.config.ts
/**
 * Example evaluation settings.
 *
 * `thresholds`, `judgeModel` and `maxRetries` match the fields declared on
 * `EvaluationConfig`. The remaining fields are not declared on that interface;
 * presumably they are consumed by surrounding batch/CI tooling — TODO confirm.
 */
export const evaluationConfig = {
// Minimum score per metric; the evaluator marks a metric passed when score >= threshold.
thresholds: {
faithfulness: 0.85,
answerRelevancy: 0.80,
contextPrecision: 0.75,
contextRecall: 0.80
},
// Model name used for judge requests.
judgeModel: "gpt-4o-mini",
maxRetries: 3,
// NOTE(review): presumably the delay between retries, in milliseconds — confirm against the consumer.
retryDelayMs: 1000,
// NOTE(review): presumably the number of samples per batch — confirm against the consumer.
batchSize: 50,
ciGateEnabled: true,
alerting: {
enabled: true,
// Read from the environment; undefined when EVAL_ALERT_WEBHOOK is not set.
webhook: process.env.EVAL_ALERT_WEBHOOK,
minScoreDrop: 0.05
},
caching: {
enabled: true,
// 86400 seconds = 24 hours.
ttlSeconds: 86400,
storage: "redis"
}
};
Quick Start Guide
- Install dependencies: `npm install openai ioredis` (or your preferred cache/LLM SDK)
- Create evaluation dataset: Compile 50-100 question/context/answer triples. Add ground truth for Context Recall.
- Initialize evaluator: Import the `RagPipelineEvaluator` class, then pass it your API key and the configuration template above.
- Run batch evaluation: Call `evaluator.evaluate(sample)` in a loop or batch processor. Log results to your monitoring stack.
- Integrate with CI: Add a GitHub Actions workflow that runs the evaluation suite on `main` branch pushes. Fail the pipeline if the share of results with `passed: false` exceeds your tolerance.
By treating RAG evaluation as a first-class engineering discipline rather than an afterthought, you transform vague "it seems fine" deployments into measurable, debuggable, and continuously improving systems. The metrics don't just tell you what's broken; they tell you exactly where to look and how to fix it.