isolation. Every chunk carries source provenance, document type, and creation timestamps to enable post-retrieval filtering.
4. Metric Alignment: Cosine similarity is explicitly enforced across the embedding model, vector index, and query path to prevent distance metric mismatches.
Implementation
import { v4 as uuidv4 } from 'uuid';
// ─── Domain Interfaces ───────────────────────────────────────────────────────
/**
 * A single retrievable unit of text together with its identifier and
 * descriptive metadata. This is the record type passed between the chunker,
 * the vector index, and the orchestrator in this file.
 */
interface DocumentChunk {
/** Identifier for the chunk; QdrantVectorStore uses it as the storage key, so upserting the same id overwrites. */
id: string;
/** The chunk's text. RAGOrchestrator sends this string to the embedding provider. */
content: string;
/** Flat key/value annotations; values are limited to string, number, or boolean. */
metadata: Record<string, string | number | boolean>;
}
/**
 * Contract for anything that can split a document's raw text into chunks.
 * RAGOrchestrator.ingest calls `segment` once per input document.
 */
interface ChunkingStrategy {
/**
 * @param rawText Full text of one document.
 * @returns The chunks produced from that text (synchronously).
 */
segment(rawText: string): DocumentChunk[];
}
/**
 * Contract for a text-embedding backend.
 */
interface EmbeddingProvider {
/**
 * Embeds a batch of texts.
 * NOTE(review): RAGOrchestrator pairs the result with its chunk list by index,
 * so implementations presumably must return one vector per input, in input
 * order — confirm against each concrete provider.
 */
compute(texts: string[]): Promise<number[][]>;
/** Vector width reported by the provider; RAGOrchestrator forwards it to VectorIndex.initialize. */
getDimension(): number;
}
/**
 * Contract for a vector store that can be populated with chunk/vector pairs
 * and queried by vector.
 */
interface VectorIndex {
/** Prepares the index for vectors of the given width. */
initialize(dimensions: number): Promise<void>;
/** Stores chunks with their vectors; the two arrays are parallel (chunks[i] belongs to vectors[i]). */
upsert(chunks: DocumentChunk[], vectors: number[][]): Promise<void>;
/**
 * @param queryVector Embedding of the query.
 * @param limit Maximum number of chunks to return.
 * @returns Matching chunks (scores are not part of the return type).
 */
query(queryVector: number[], limit: number): Promise<DocumentChunk[]>;
}
// ─── Semantic Chunker Implementation ─────────────────────────────────────────
/**
 * Groups consecutive sentences into chunks based on embedding similarity.
 *
 * The text is split into sentences after `.`, `!` or `?`. The first sentence of
 * each chunk acts as the chunk's anchor: every following sentence whose cosine
 * similarity to that anchor is at least the threshold joins the chunk; the first
 * sentence that falls below the threshold closes the chunk and becomes the
 * anchor of the next one.
 *
 * Embeddings are produced by a deterministic local placeholder
 * (`simulateEmbedding`), not by a real model.
 */
class SemanticSegmenter implements ChunkingStrategy {
  /** Width of the placeholder embedding vectors. */
  private static readonly PLACEHOLDER_DIMENSIONS = 384;

  private readonly threshold: number;

  /**
   * @param similarityThreshold Minimum cosine similarity (inclusive) a sentence
   *   needs against the chunk's anchor sentence to stay in the same chunk.
   */
  constructor(similarityThreshold: number = 0.75) {
    this.threshold = similarityThreshold;
  }

  /**
   * Splits `rawText` into chunks.
   *
   * @param rawText Text of one document.
   * @returns Chunks in document order, each with a fresh UUID. Chunks closed by
   *   a similarity drop carry `boundary: 'high_similarity'`; the trailing chunk
   *   carries `boundary: 'final'`. Empty or whitespace-only input yields `[]`.
   */
  segment(rawText: string): DocumentChunk[] {
    const sentences = rawText
      .split(/(?<=[.!?])\s+/)
      // Trim so that whitespace around the document does not end up in chunk
      // content or influence the embedding of the first/last sentence.
      .map(sentence => sentence.trim())
      .filter(sentence => sentence.length > 0);

    const chunks: DocumentChunk[] = [];
    let buffer: string[] = [];
    let anchorEmbedding: number[] | null = null;

    for (const sentence of sentences) {
      // Simulate embedding generation (replace with actual API call)
      const currentEmbedding = this.simulateEmbedding(sentence);

      if (
        anchorEmbedding !== null &&
        this.cosineSimilarity(anchorEmbedding, currentEmbedding) >= this.threshold
      ) {
        buffer.push(sentence);
        continue;
      }

      // Either this is the very first sentence, or similarity dropped below the
      // threshold: close the open chunk (if any) and start a new one here.
      if (anchorEmbedding !== null) {
        chunks.push({
          id: uuidv4(),
          content: buffer.join(' '),
          metadata: { type: 'semantic', boundary: 'high_similarity' }
        });
      }
      buffer = [sentence];
      anchorEmbedding = currentEmbedding;
    }

    if (buffer.length > 0) {
      chunks.push({
        id: uuidv4(),
        content: buffer.join(' '),
        metadata: { type: 'semantic', boundary: 'final' }
      });
    }
    return chunks;
  }

  /**
   * Cosine similarity of two vectors. Returns 0 when either vector has zero
   * magnitude, instead of dividing by zero and producing NaN.
   */
  private cosineSimilarity(a: number[], b: number[]): number {
    const dot = a.reduce((sum, val, i) => sum + val * (b[i] ?? 0), 0);
    const normA = Math.sqrt(a.reduce((sum, val) => sum + val ** 2, 0));
    const normB = Math.sqrt(b.reduce((sum, val) => sum + val ** 2, 0));
    return normA === 0 || normB === 0 ? 0 : dot / (normA * normB);
  }

  /**
   * Deterministic placeholder for demonstration: derives each component from a
   * character code of `text` (cycling through the string) and the component
   * index. `text` must be non-empty; `segment` guarantees that.
   */
  private simulateEmbedding(text: string): number[] {
    return Array.from(
      { length: SemanticSegmenter.PLACEHOLDER_DIMENSIONS },
      (_, i) => Math.sin(text.charCodeAt(i % text.length) + i) * 0.5
    );
  }
}
// ─── Vector Index Abstraction (Qdrant-style) ─────────────────────────────────
/**
 * In-memory VectorIndex that ranks stored chunks by cosine similarity.
 *
 * Entries live in a Map keyed by chunk id, so upserting an existing id
 * replaces the previous entry. Once `initialize` has been called with a
 * positive dimension, every upserted vector and every query vector must have
 * exactly that length.
 */
class QdrantVectorStore implements VectorIndex {
  private readonly collectionName: string;
  /** 0 means "not initialized"; dimension checks are skipped in that state. */
  private dimensions: number = 0;
  private readonly storage: Map<string, { vector: number[]; chunk: DocumentChunk }> = new Map();

  /** @param collection Name used in log output to identify this store. */
  constructor(collection: string) {
    this.collectionName = collection;
  }

  /**
   * Records the expected vector width for subsequent upserts and queries.
   * @param dimensions Expected length of every vector.
   */
  async initialize(dimensions: number): Promise<void> {
    this.dimensions = dimensions;
    console.log(`[VectorStore] Initialized collection "${this.collectionName}" with ${dimensions} dimensions.`);
  }

  /**
   * Stores each chunk with the vector at the same index.
   *
   * @throws Error if the arrays differ in length, or if any vector's length
   *   differs from the initialized dimension. Nothing is written when
   *   validation fails.
   */
  async upsert(chunks: DocumentChunk[], vectors: number[][]): Promise<void> {
    if (chunks.length !== vectors.length) {
      throw new Error('Chunk and vector count mismatch during upsert.');
    }
    // Validate everything up front so a bad vector cannot leave a partial write.
    vectors.forEach((vector, i) => this.assertDimension(vector, `upsert vector at index ${i}`));

    for (let i = 0; i < chunks.length; i++) {
      this.storage.set(chunks[i].id, {
        vector: vectors[i],
        chunk: chunks[i]
      });
    }
    console.log(`[VectorStore] Upserted ${chunks.length} vectors.`);
  }

  /**
   * Returns up to `limit` chunks, most similar first.
   *
   * @param queryVector Embedding of the query.
   * @param limit Maximum number of results; zero, negative, or NaN yields `[]`.
   * @throws Error if `queryVector`'s length differs from the initialized dimension.
   */
  async query(queryVector: number[], limit: number): Promise<DocumentChunk[]> {
    this.assertDimension(queryVector, 'query vector');
    // A negative limit would otherwise reach slice() as a "from the end" index.
    if (!(limit > 0)) {
      return [];
    }
    const scored = Array.from(this.storage.values(), entry => ({
      chunk: entry.chunk,
      score: this.cosineSimilarity(queryVector, entry.vector)
    }));
    scored.sort((a, b) => b.score - a.score);
    return scored.slice(0, limit).map(item => item.chunk);
  }

  /** Throws when the store is initialized and `vector` has the wrong length. */
  private assertDimension(vector: number[], label: string): void {
    if (this.dimensions > 0 && vector.length !== this.dimensions) {
      throw new Error(
        `Vector dimension mismatch for ${label}: expected ${this.dimensions}, received ${vector.length}.`
      );
    }
  }

  /** Cosine similarity; 0 when either vector has zero magnitude. */
  private cosineSimilarity(a: number[], b: number[]): number {
    const dot = a.reduce((sum, val, i) => sum + val * (b[i] ?? 0), 0);
    const normA = Math.sqrt(a.reduce((sum, val) => sum + val ** 2, 0));
    const normB = Math.sqrt(b.reduce((sum, val) => sum + val ** 2, 0));
    return normA === 0 || normB === 0 ? 0 : dot / (normA * normB);
  }
}
// ─── RAG Orchestrator ────────────────────────────────────────────────────────
/**
 * Dependencies and settings injected into RAGOrchestrator.
 */
interface RAGConfig {
/** Splits each ingested document into chunks. */
chunker: ChunkingStrategy;
/** Embeds chunk texts during ingestion and the user query during retrieval. */
embedder: EmbeddingProvider;
/** Destination for chunk vectors and source of query results. */
vectorStore: VectorIndex;
/** Passed as the `limit` argument to vectorStore.query. */
topK: number;
}
/**
 * Coordinates ingestion (chunk, embed, store) and retrieval (embed query,
 * search) using the collaborators supplied in RAGConfig.
 */
class RAGOrchestrator {
  private readonly config: RAGConfig;

  /** @param config Chunker, embedder, vector store, and the retrieval limit. */
  constructor(config: RAGConfig) {
    this.config = config;
  }

  /**
   * Chunks every document, embeds all chunk texts in a single batch,
   * initializes the vector store with the embedder's dimension, and upserts.
   *
   * When no chunks are produced, the store is still initialized but the
   * embedder and upsert are skipped.
   *
   * @param rawDocuments Raw text of each document (not file paths).
   */
  async ingest(rawDocuments: string[]): Promise<void> {
    const { chunker, embedder, vectorStore } = this.config;
    const allChunks: DocumentChunk[] = [];
    const allTexts: string[] = [];

    for (const doc of rawDocuments) {
      // Push one at a time: spreading a very large chunk array into push()
      // can exceed the engine's argument-count limit.
      for (const chunk of chunker.segment(doc)) {
        allChunks.push(chunk);
        allTexts.push(chunk.content);
      }
    }

    if (allChunks.length === 0) {
      await vectorStore.initialize(embedder.getDimension());
      return;
    }

    const vectors = await embedder.compute(allTexts);
    await vectorStore.initialize(embedder.getDimension());
    await vectorStore.upsert(allChunks, vectors);
  }

  /**
   * Embeds the query and returns the store's top matches.
   *
   * @param userQuery Natural-language query text.
   * @returns Up to `topK` chunks as returned by the vector store.
   * @throws Error if the embedder returns no vector for the query.
   */
  async answer(userQuery: string): Promise<DocumentChunk[]> {
    const { embedder, vectorStore, topK } = this.config;
    const [queryVector] = await embedder.compute([userQuery]);
    if (queryVector === undefined) {
      throw new Error('Embedding provider returned no vector for the query.');
    }
    return vectorStore.query(queryVector, topK);
  }
}
The architecture separates concerns explicitly. The SemanticSegmenter handles boundary detection without coupling to storage. The QdrantVectorStore abstracts distance calculations and storage; in this listing it is an in-memory stand-in keyed by chunk ID rather than a live Qdrant client, so swap in the real client for persistence. The RAGOrchestrator coordinates ingestion and retrieval while remaining agnostic to the underlying models. This design enables independent scaling, unit testing of each layer, and straightforward migration between vector backends.
Pitfall Guide
Production RAG systems fail predictably when teams ignore retrieval engineering fundamentals. The following mistakes are consistently observed in early deployments.
| Pitfall | Explanation | Fix |
|---|---|---|
| Hardcoded Vector Dimensions | Assuming all embedding models output 384 dimensions. Swapping to all-mpnet-base-v2 or sentence-t5-xxl (both 768 dimensions) breaks index initialization and causes silent data corruption. | Dynamically query the embedding provider for dimensionality before initializing the vector store. Store dimensions in configuration, not constants. |
| Ignoring Chunk Boundary Artifacts | Splitting text at arbitrary character counts severs technical terms, code blocks, or table rows. The LLM receives fragmented context that degrades reasoning accuracy. | Use semantic or recursive chunking with overlap buffers. Preserve markdown/HTML structure by splitting on semantic delimiters rather than raw byte counts. |
| Metric Mismatch in Similarity Search | Training embeddings with cosine distance but querying with Euclidean or dot-product metrics. The ranking order becomes mathematically inconsistent, returning low-relevance documents. | Enforce a single distance metric across the entire pipeline. Document the metric in configuration and validate it during index creation. |
| Synchronous Embedding Calls | Generating embeddings one-by-one in a request loop. Each call waits for the previous round-trip to finish, which increases p99 latency and exhausts API rate limits under concurrent load (and blocks the event loop outright when the model runs in-process). | Batch embeddings using async queues. Implement backpressure and retry logic with exponential backoff. |
| Storing Raw Text Without Provenance | Chunks lack source URLs, document IDs, or timestamps. When the LLM hallucinates, there is no way to trace the error back to the original document for correction. | Attach structured metadata to every chunk. Include source_uri, doc_type, created_at, and version fields. Enable metadata filtering during retrieval. |
| Over-Reliance on Top-K Without Re-Ranking | Fetching 10 documents via vector similarity and feeding them all to the LLM. Many are marginally relevant, wasting context window and diluting signal. | Implement a lightweight cross-encoder re-ranker after initial retrieval. Filter chunks below a relevance threshold before augmentation. |
| Treating RAG as Stateless | Assuming every query requires full retrieval. Ignoring cache invalidation strategies leads to stale answers when source documents are updated. | Implement a retrieval cache with TTL. Version vector collections and trigger incremental re-indexing on document updates. |
Production Bundle
Action Checklist
Decision Matrix
| Scenario | Recommended Approach | Why | Cost Impact |
|---|---|---|---|
| Rapid prototyping / internal demo | Fixed-size chunking + all-MiniLM-L6-v2 + Chroma | Minimal setup, low infrastructure cost, fast iteration | Low compute, low storage |
| Enterprise knowledge base | Semantic chunking + all-mpnet-base-v2 + Qdrant | High recall, metadata filtering, scalable cloud deployment | Medium compute, medium storage |
| PostgreSQL-heavy infrastructure | Recursive chunking + all-mpnet-base-v2 + pgvector | Leverages existing DB, ACID compliance, unified backup strategy | Low infra overhead, higher query latency |
| High-accuracy compliance / legal | Semantic + metadata filter + sentence-t5-xxl + Qdrant | Maximum contextual depth, strict provenance tracking, audit-ready | High compute, high storage, low latency tolerance |
Configuration Template
// rag.config.ts
import { SemanticSegmenter } from './chunking/semantic-segmenter';
import { QdrantVectorStore } from './storage/qdrant-vector-store';
/**
 * Declarative settings for a RAG pipeline. Numeric fields are range-checked by
 * validateConfig in this file.
 */
export interface RAGPipelineConfig {
/** Name of the embedding model to use. */
embeddingModel: 'all-MiniLM-L6-v2' | 'all-mpnet-base-v2' | 'sentence-t5-xxl';
/** Which chunking approach to apply to documents. */
chunkingStrategy: 'semantic' | 'recursive' | 'fixed';
/** Which vector store backend to use. */
vectorBackend: 'chroma' | 'qdrant' | 'pgvector';
/** Number of chunks to retrieve per query; validateConfig requires 1–20. */
topK: number;
/** Similarity cut-off; validateConfig requires 0.5–0.95. */
similarityThreshold: number;
/** Embedding batch size; validateConfig warns when it is not a multiple of 8. */
batchSize: number;
/** Maps each metadata field name to a string naming its type. */
metadataSchema: Record<string, string>;
}
/**
 * Baseline configuration. Its topK, similarityThreshold, and batchSize all fall
 * inside the ranges validateConfig accepts, and batchSize is a multiple of 8.
 */
export const defaultConfig: RAGPipelineConfig = {
embeddingModel: 'all-mpnet-base-v2',
chunkingStrategy: 'semantic',
vectorBackend: 'qdrant',
topK: 5,
similarityThreshold: 0.75,
batchSize: 64,
// Provenance fields attached to every chunk, each mapped to a type name.
metadataSchema: {
source_uri: 'string',
doc_type: 'string',
created_at: 'timestamp',
version: 'string'
}
};
/**
 * Checks the numeric fields of a pipeline configuration.
 *
 * Range checks are written in the positive form (`!(lo <= x && x <= hi)`) so
 * that NaN, which fails every comparison, is rejected rather than slipping
 * through.
 *
 * @param config Configuration to validate.
 * @throws Error if `topK` is not an integer in [1, 20], if
 *   `similarityThreshold` is not in [0.5, 0.95], or if `batchSize` is not a
 *   positive integer.
 * Logs a warning (does not throw) when `batchSize` is not a multiple of 8.
 */
export function validateConfig(config: RAGPipelineConfig): void {
  const { topK, similarityThreshold, batchSize } = config;

  if (!Number.isInteger(topK)) {
    throw new Error('topK must be an integer.');
  }
  if (!(topK >= 1 && topK <= 20)) {
    throw new Error('topK must be between 1 and 20 for optimal context window utilization.');
  }
  if (!(similarityThreshold >= 0.5 && similarityThreshold <= 0.95)) {
    throw new Error('similarityThreshold should be tuned between 0.5 and 0.95 based on domain density.');
  }
  // 0 and negative multiples of 8 would otherwise pass the alignment check silently.
  if (!Number.isInteger(batchSize) || batchSize < 1) {
    throw new Error('batchSize must be a positive integer.');
  }
  if (batchSize % 8 !== 0) {
    console.warn('Batch size is not aligned to GPU warp boundaries. Consider multiples of 8.');
  }
}
Quick Start Guide
- Install dependencies:
npm install uuid @types/uuid (add your preferred vector client and embedding SDK)
- Initialize the pipeline: Import
defaultConfig, run validateConfig(), and instantiate SemanticSegmenter, QdrantVectorStore, and RAGOrchestrator
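- Ingest documents: Call
orchestrator.ingest([doc1Text, doc2Text]) with each document's raw text (read the files first; ingest treats every string as content, not as a file path) to chunk, embed, and upsert in a single async flow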
- Query the system: Call
orchestrator.answer('What are the deployment requirements?') to retrieve top-k chunks
- Feed to LLM: Pass the retrieved
content fields into your generator prompt with explicit citation instructions
This workflow establishes a production-grade retrieval foundation. From here, layer in monitoring, re-ranking, and cache invalidation to match your accuracy and latency SLAs.