requires query-context-answer triplets. Context must include ground-truth chunk IDs to enable precision/recall calculations.
2. Implementation: The RAG Triad Metrics
The following TypeScript implementation demonstrates the core metrics. This code uses a modular evaluator pattern suitable for integration into CI/CD pipelines or production monitoring.
Metric A: Context Precision
Context Precision measures the signal-to-noise ratio in the retrieved chunks. Low precision indicates the retriever is returning irrelevant data, which increases the risk of the LLM being distracted or confused.
import { LLMClient, EvaluationResult } from './llm-integration';
/**
 * A single retrieved passage as consumed by the evaluators in this section.
 */
interface Chunk {
// Identifier matched against GroundTruth.relevantChunkIds when scoring retrieval.
id: string;
// Passage text; RetrievalEvaluator joins these into the semantic-relevance judge prompt.
content: string;
// Free-form extra attributes. Not read by any evaluator shown in this section.
metadata: Record<string, any>;
}
/**
 * Labelled reference data for one evaluation example.
 */
interface GroundTruth {
// Query text for this example. NOTE(review): calculateContextPrecision takes the
// query as a separate argument and does not read this field.
query: string;
// IDs of the chunks considered relevant; the basis for the deterministic precision score.
relevantChunkIds: string[];
// Reference answer. Not consumed by the evaluators shown in this section.
expectedAnswer: string;
}
/**
 * Scores how focused a retriever's output is for a single query.
 *
 * Combines a deterministic ID-overlap precision with an LLM-judged semantic
 * relevance score, so results stay useful when the ground-truth labels are
 * incomplete.
 */
export class RetrievalEvaluator {
  private llmClient: LLMClient;

  constructor(llmClient: LLMClient) {
    this.llmClient = llmClient;
  }

  /**
   * Calculates the fraction of retrieved chunks that are relevant to the query.
   * High precision ensures the LLM receives focused context.
   *
   * @param query - The user query; only forwarded to the semantic judge.
   * @param retrievedChunks - Chunks returned by the retriever, in retrieval order.
   * @param groundTruth - Labelled example whose `relevantChunkIds` define relevance.
   * @returns A `context_precision` result. `score` is the ID-overlap precision
   *   (0 when nothing was retrieved), `semanticScore` is the judge rating
   *   clamped to [0, 1], and `details.missingRelevant` lists labelled IDs that
   *   were not retrieved.
   */
  async calculateContextPrecision(
    query: string,
    retrievedChunks: Chunk[],
    groundTruth: GroundTruth
  ): Promise<EvaluationResult> {
    const relevantSet = new Set(groundTruth.relevantChunkIds);
    const retrievedIds = retrievedChunks.map(c => c.id);
    // Set lookup keeps the missing-ID computation linear instead of quadratic.
    const retrievedSet = new Set(retrievedIds);

    // Deterministic intersection check
    const relevantRetrieved = retrievedIds.filter(id => relevantSet.has(id));
    const precision = retrievedIds.length > 0
      ? relevantRetrieved.length / retrievedIds.length
      : 0;

    // LLM judge for semantic relevance, useful if ground truth is incomplete
    const semanticScore = await this.assessSemanticRelevance(query, retrievedChunks);

    return {
      metric: 'context_precision',
      score: precision,
      semanticScore,
      details: {
        totalRetrieved: retrievedIds.length,
        relevantFound: relevantRetrieved.length,
        missingRelevant: groundTruth.relevantChunkIds.filter(id => !retrievedSet.has(id))
      }
    };
  }

  /**
   * Asks the lightweight judge model to rate overall chunk relevance.
   *
   * @returns A score in [0, 1]; 0 when there are no chunks (no judge call is
   *   made) or when the reply does not start with a finite number.
   */
  private async assessSemanticRelevance(query: string, chunks: Chunk[]): Promise<number> {
    // Nothing to rate: skip the paid judge call entirely.
    if (chunks.length === 0) {
      return 0;
    }

    // Use a lightweight judge model for semantic scoring
    const prompt = `
Query: "${query}"
Chunks: ${chunks.map(c => c.content).join('\n---\n')}
Rate the overall relevance of these chunks to the query on a scale of 0.0 to 1.0.
Return only the number.
`;
    const response = await this.llmClient.generate(prompt, { model: 'judge-small' });
    return this.parseUnitScore(response.text);
  }

  /** Parses a judge reply into [0, 1]; non-numeric or non-finite replies become 0. */
  private parseUnitScore(raw: string): number {
    const value = Number.parseFloat(raw);
    if (!Number.isFinite(value)) {
      return 0;
    }
    // The judge is asked for 0.0-1.0 but is not guaranteed to comply.
    return Math.min(1, Math.max(0, value));
  }
}
Metric B: Faithfulness (Groundedness)
Faithfulness measures whether the generated answer is strictly supported by the retrieved context. This is the primary defense against hallucination.
/**
 * Scores whether a generated answer is grounded in the retrieved context.
 *
 * Uses a two-stage LLM-as-judge flow: one call extracts factual claims from
 * the answer, then one call per claim checks it against the context.
 */
export class GenerationEvaluator {
  private llmClient: LLMClient;

  constructor(llmClient: LLMClient) {
    this.llmClient = llmClient;
  }

  /**
   * Evaluates if every claim in the answer is grounded in the context.
   * Breaks the answer into claims and verifies each against the source.
   *
   * @param answer - The generated answer to audit.
   * @param context - The retrieved context the answer should be supported by.
   * @returns A `faithfulness` result whose `score` is supported claims divided
   *   by total claims; 1.0 when no claims were extracted.
   * @throws SyntaxError if the claim-extraction reply contains no parseable JSON.
   * @throws TypeError if the parsed JSON is not an array.
   */
  async evaluateFaithfulness(
    answer: string,
    context: string
  ): Promise<EvaluationResult> {
    // Step 1: Extract claims
    const claimExtractionPrompt = `
Extract all factual claims from the following answer.
Return a JSON array of strings. Each string must be a single verifiable claim.
Answer: ${answer}
`;
    const claimsResponse = await this.llmClient.generate(claimExtractionPrompt, { model: 'judge-medium' });
    const claims = this.parseClaims(claimsResponse.text);

    // No claims means nothing can be unsupported, so the answer is vacuously faithful.
    if (claims.length === 0) {
      return { metric: 'faithfulness', score: 1.0, details: { claims: [] } };
    }

    // Step 2: Verify each claim (verification calls run concurrently)
    const verdicts = await Promise.all(
      claims.map(async (claim) => {
        const verifyPrompt = `
Context: ${context}
Claim: ${claim}
Is this claim directly supported by the context? Answer YES or NO.
`;
        const verdict = await this.llmClient.generate(verifyPrompt, { model: 'judge-small' });
        return { claim, supported: this.isAffirmative(verdict.text) };
      })
    );

    const supportedCount = verdicts.filter(v => v.supported).length;
    const faithfulnessScore = supportedCount / claims.length;

    return {
      metric: 'faithfulness',
      score: faithfulnessScore,
      details: {
        totalClaims: claims.length,
        supportedClaims: supportedCount,
        unsupportedClaims: verdicts.filter(v => !v.supported).map(v => v.claim)
      }
    };
  }

  /**
   * Parses the claim-extraction reply into a list of non-empty claim strings.
   *
   * Judge models often wrap JSON in markdown fences or add a preamble, so the
   * outermost `[` ... `]` span is parsed when one exists. Entries that are not
   * non-blank strings are dropped.
   */
  private parseClaims(raw: string): string[] {
    const start = raw.indexOf('[');
    const end = raw.lastIndexOf(']');
    const payload = start !== -1 && end > start ? raw.slice(start, end + 1) : raw;

    const parsed: unknown = JSON.parse(payload);
    if (!Array.isArray(parsed)) {
      throw new TypeError('Claim extraction did not return a JSON array');
    }
    return parsed.filter(
      (entry): entry is string => typeof entry === 'string' && entry.trim().length > 0
    );
  }

  /**
   * True when the verdict begins with the word YES, ignoring case, leading
   * whitespace/punctuation/markdown, and any trailing explanation.
   */
  private isAffirmative(raw: string): boolean {
    return /^[^A-Za-z]*YES\b/i.test(raw);
  }
}
Metric C: Answer Relevance
Answer Relevance ensures the output addresses the user's query, not just the context. An answer can be faithful but irrelevant if the retriever returned off-topic information.
/**
 * Scores whether a generated answer actually responds to the user's query,
 * independent of the retrieved context.
 */
export class RelevanceEvaluator {
  private llmClient: LLMClient;

  constructor(llmClient: LLMClient) {
    this.llmClient = llmClient;
  }

  /**
   * Measures how well the answer addresses the original query.
   * Uses a single LLM-judge call to detect non-responsive answers.
   *
   * @param query - The original user query.
   * @param answer - The generated answer to rate.
   * @returns An `answer_relevance` result whose `score` is the judge rating
   *   clamped to [0, 1]; 0 when the reply does not start with a finite number.
   */
  async evaluateAnswerRelevance(
    query: string,
    answer: string
  ): Promise<EvaluationResult> {
    const prompt = `
Query: "${query}"
Answer: "${answer}"
Does the answer directly address the query?
Rate relevance from 0.0 (irrelevant) to 1.0 (perfectly responsive).
Return only the number.
`;
    const response = await this.llmClient.generate(prompt, { model: 'judge-small' });
    const score = this.parseUnitScore(response.text);

    return {
      metric: 'answer_relevance',
      score,
      details: {
        query,
        answerLength: answer.length
      }
    };
  }

  /** Parses a judge reply into [0, 1]; non-numeric or non-finite replies become 0. */
  private parseUnitScore(raw: string): number {
    const value = Number.parseFloat(raw);
    if (!Number.isFinite(value)) {
      return 0;
    }
    // The judge is asked for 0.0-1.0 but is not guaranteed to comply.
    return Math.min(1, Math.max(0, value));
  }
}
3. Hyperparameter Optimization
Evaluation is useless without action. Use the metrics to sweep critical hyperparameters:
- Chunk Size: Smaller chunks improve precision but may fragment context. Larger chunks improve recall but introduce noise. Sweep chunk sizes (e.g., 256, 512, 1024 tokens) and plot Context Precision vs. Context Recall. Select the "knee" of the curve.
- Top-K Retrieval: Retrieving more chunks increases recall but can overwhelm the LLM context window and trigger position bias. Sweep K values and monitor Faithfulness. High K often degrades Faithfulness as the LLM struggles to focus.
- Embedding Models: Hold retrieval parameters constant and swap embedding models. Measure Context Recall to isolate the embedding model's impact on retrieval quality.
Pitfall Guide
1. The "Gold Standard" Trap
Explanation: Assuming your ground-truth answers are perfect. If the ground truth is incomplete or biased, your evaluation metrics will be skewed.
Fix: Use multiple annotators or LLM consensus to validate ground truth. Regularly audit the evaluation set for quality.
2. Position Bias Ignorance
Explanation: LLMs exhibit position bias, where information at the beginning and end of the context window is weighted more heavily than the middle. Standard retrieval may place critical chunks in the "middle slump."
Fix: Implement context reordering strategies. Place the most relevant chunks at the start and end of the context window. Monitor Faithfulness scores to detect position bias effects.
3. Chunk Size Myopia
Explanation: Optimizing only for token count ignores semantic boundaries. Fixed-size chunking can split sentences or concepts, degrading retrieval quality.
Fix: Use semantic chunking or hierarchical chunking. Evaluate chunking strategies using Context Recall, not just precision.
4. Entity Recall Blind Spots
Explanation: Entity recall relies on NER models that may miss domain-specific terms or handle synonyms poorly.
Fix: Augment entity extraction with domain-specific dictionaries. Use fuzzy matching for entity comparison. Validate entity recall against manual spot checks.
5. Evaluation Cost Blindness
Explanation: Running LLM-as-a-judge on every query in production can be prohibitively expensive and introduce latency.
Fix: Implement sampling strategies. Evaluate a representative subset of queries. Use caching for repeated queries. Deploy lightweight judge models for high-volume monitoring.
6. The "Completeness" Illusion
Explanation: High Faithfulness and Answer Relevance can mask incomplete answers. If the retriever misses half the relevant context, the LLM may faithfully summarize only what it received.
Fix: Always measure Context Recall alongside Faithfulness. Use Entity Recall as a proxy for completeness when ground truth is unavailable.
7. Static Evaluation Sets
Explanation: Evaluation sets become stale as the corpus and query distribution evolve. Metrics may look good in the lab but fail in production.
Fix: Implement continuous evaluation. Regularly update the evaluation set with production queries. Use shadow mode to evaluate new configurations against live traffic.
Production Bundle
Action Checklist
Decision Matrix
| Scenario | Recommended Approach | Why | Cost Impact |
|---|---|---|---|
| Low Budget / MVP | Deterministic Metrics + Sampling | Reduces LLM judge costs while maintaining core visibility. | Low |
| High Accuracy / Compliance | Full Triad + LLM Judges | Ensures rigorous validation of faithfulness and relevance. | High |
| Real-Time Monitoring | Lightweight Judges + Caching | Minimizes latency impact on production traffic. | Medium |
| Batch Evaluation | Heavy Judges + Full Coverage | Maximizes accuracy for offline tuning and analysis. | Medium |
| Long-Tail Focus | Entity Recall + Semantic Chunking | Improves retrieval coverage for niche queries. | Low |
Configuration Template
// rag-eval.config.ts

// Location and record shape of the evaluation data.
const evalSetSection = {
  path: './data/eval-set.json',
  format: 'triplet', // query, context, answer
  groundTruthRequired: true
};

// One entry per metric: on/off switch, threshold value, and how it is scored.
const metricSection = {
  contextPrecision: {
    enabled: true,
    threshold: 0.75,
    judgeModel: 'judge-small'
  },
  faithfulness: {
    enabled: true,
    threshold: 0.90,
    judgeModel: 'judge-medium',
    claimExtraction: true
  },
  answerRelevance: {
    enabled: true,
    threshold: 0.80,
    judgeModel: 'judge-small'
  },
  contextRecall: {
    enabled: true,
    threshold: 0.85,
    method: 'entity' // or 'ground-truth'
  }
};

// Candidate values for the hyperparameter sweep.
const sweepSection = {
  chunkSizes: [256, 512, 1024],
  topKValues: [3, 5, 10],
  embeddingModels: ['text-embedding-3-small', 'text-embedding-3-large']
};

// Production monitoring: sampling rate and per-metric alert thresholds.
const productionSection = {
  monitoring: {
    enabled: true,
    samplingRate: 0.1, // 10% of queries
    alertThresholds: {
      contextPrecision: 0.6,
      faithfulness: 0.7
    }
  }
};

/**
 * Template configuration for the RAG evaluation pipeline, assembled from the
 * sections above in the order: evaluation set, metrics, sweep, production.
 */
export const RagEvalConfig = {
  evaluationSet: evalSetSection,
  metrics: metricSection,
  hyperparameterSweep: sweepSection,
  production: productionSection
};
Quick Start Guide
- Install Dependencies: Set up your TypeScript project and install the LLM client library for your judge models.
- Prepare Evaluation Set: Create a JSON file with query-context-answer triplets, including ground-truth chunk IDs for a subset of queries.
- Run Initial Sweep: Execute the hyperparameter sweep using the configuration template to identify optimal chunk size and top-K values.
- Deploy Monitor: Integrate the evaluation pipeline into your CI/CD or production monitoring system with alerting thresholds.
- Iterate: Review metric trends weekly, update the evaluation set, and refine hyperparameters based on production data.