et_encoding("cl100k_base");
/**
 * Creates a segmenter bound to the supplied chunking configuration.
 *
 * The parameter property below both declares the private `config` field and
 * assigns the argument to it, so no explicit field or assignment is needed.
 *
 * @param config - Chunking settings kept on the instance for later calls.
 */
constructor(private config: ChunkConfig) {}
/**
 * Splits `sourceText` into token-bounded chunks that overlap by
 * `config.overlapTokens` tokens.
 *
 * Text that already fits within `config.maxTokens` is returned unchanged as a
 * single-element array. Longer text is walked with a sliding window of
 * `maxTokens` tokens that advances by `maxTokens - overlapTokens` each step.
 *
 * @param sourceText - Raw text to segment.
 * @returns The decoded text of every window, in document order.
 * @throws RangeError If chunking is required and `overlapTokens` is negative,
 *   not smaller than `maxTokens`, or either value is not a finite number.
 */
public partition(sourceText: string): string[] {
  const tokens = this.encoder.encode(sourceText);
  const { maxTokens, overlapTokens } = this.config;
  if (tokens.length <= maxTokens) {
    return [sourceText];
  }

  // A non-positive (or NaN) stride would never advance the window, and a
  // negative overlap would skip tokens between windows; reject both up front.
  const stride = maxTokens - overlapTokens;
  if (!(stride > 0) || !(overlapTokens >= 0)) {
    throw new RangeError(
      `overlapTokens (${overlapTokens}) must be >= 0 and smaller than maxTokens (${maxTokens})`
    );
  }

  const utf8 = new TextDecoder();
  const chunks: string[] = [];
  for (let start = 0; start < tokens.length; start += stride) {
    const end = Math.min(start + maxTokens, tokens.length);
    // NOTE(review): the `tiktoken` package's decode() returns UTF-8 bytes while
    // some ports return a string; accept either so chunks are always strings.
    const decoded: string | Uint8Array = this.encoder.decode(
      tokens.slice(start, end)
    );
    chunks.push(typeof decoded === "string" ? decoded : utf8.decode(decoded));
    // Once a window reaches the last token, another step would only re-emit
    // overlap tokens that this chunk already contains.
    if (end === tokens.length) {
      break;
    }
  }
  return chunks;
}
}
**Architecture Rationale:** Using `tiktoken` ensures token counts align exactly with what the embedding model will process. The overlap mechanism prevents context loss at split boundaries by carrying trailing tokens into the next segment. This approach is deliberately lightweight, avoiding heavy NLP dependencies while maintaining structural awareness through configurable delimiters.
### Step 2: Structure-First Preprocessing
Raw text rarely contains explicit semantic boundaries. Production documents (Markdown, HTML, PDFs) embed hierarchy through headings, lists, and code blocks. Parsing this structure before segmentation drastically improves retrieval relevance.
```typescript
/**
 * One section of a parsed document: a block of body text together with the
 * heading it was found under.
 */
interface DocumentNode {
/** Text of the heading this content sits under, or `null` when the content appears before any heading. */
heading: string | null;
/** Body text of the section (the parser joins the section's lines with "\n" and trims the result). */
content: string;
/** String-to-string annotations; the Markdown parser writes `source` and `level` keys. */
metadata: Record<string, string>;
}
/**
 * Splits a Markdown document into heading-scoped content nodes.
 *
 * Every ATX heading (`#` through `######`) starts a new section; the lines
 * that follow it, up to the next heading, become that section's content.
 */
export class StructureParser {
  /** ATX heading: one to six `#`, whitespace, then the heading text. */
  private static readonly HEADING = /^(#{1,6})\s+(.+)$/;

  /** Opening or closing code-fence line (three or more backticks or tildes). */
  private static readonly FENCE = /^ {0,3}(`{3,}|~{3,})/;

  /**
   * Extracts one node per non-empty section of `rawMarkdown`.
   *
   * - `metadata.level` is the level (1–6) of the heading the content belongs
   *   to, or "0" for content that precedes the first heading.
   * - Both LF and CRLF line endings are accepted.
   * - Lines inside fenced code blocks are never treated as headings, so a
   *   `# comment` in a shell or Python snippet stays part of the content.
   * - Sections whose content is empty or whitespace-only produce no node.
   *
   * @param rawMarkdown - Markdown source text.
   * @returns Nodes in document order.
   */
  public extractNodes(rawMarkdown: string): DocumentNode[] {
    const nodes: DocumentNode[] = [];
    let currentHeading: string | null = null;
    let currentLevel = 0;
    let buffer: string[] = [];
    // Fence character (` or ~) of the code block we are inside, if any.
    let openFence: string | null = null;

    // Emits the buffered lines as a node owned by the current heading.
    const flush = (): void => {
      const content = buffer.join("\n").trim();
      buffer = [];
      if (content.length === 0) {
        return;
      }
      nodes.push({
        heading: currentHeading,
        content,
        metadata: { source: "markdown", level: currentLevel.toString() },
      });
    };

    // Split on \r?\n: a trailing "\r" would otherwise defeat the heading
    // regex, because "." does not match carriage returns.
    for (const line of rawMarkdown.split(/\r?\n/)) {
      const fence = StructureParser.FENCE.exec(line);
      if (fence) {
        // Simplified fence tracking: a fence closes on the next fence line
        // that uses the same character as the one that opened it.
        const marker = fence[1].charAt(0);
        if (openFence === null) {
          openFence = marker;
        } else if (openFence === marker) {
          openFence = null;
        }
        buffer.push(line);
        continue;
      }

      const headingMatch =
        openFence === null ? StructureParser.HEADING.exec(line) : null;
      if (headingMatch) {
        // Flush BEFORE updating heading state so the buffered content is
        // attributed to the heading it actually sits under.
        flush();
        currentLevel = headingMatch[1].length;
        currentHeading = headingMatch[2].trim();
      } else {
        buffer.push(line);
      }
    }

    flush();
    return nodes;
  }
}
```
**Architecture Rationale:** Extracting nodes by heading hierarchy allows downstream embedding to associate vectors with explicit topical labels. This enables hybrid search strategies where vector similarity is combined with metadata filtering. The parser is deliberately decoupled from the segmenter, allowing independent scaling and testing.
### Step 3: Semantic Boundary Validation
Recursive splitting handles structure, but it cannot detect topic shifts within long paragraphs. Semantic validation uses lightweight embedding comparisons to identify where conceptual density drops, triggering a split even if token limits haven't been reached.
import { createClient } from "@supabase/supabase-js";
/**
 * Flags topic shifts between adjacent text segments by comparing their
 * embeddings, which are obtained from the `generate_embedding` Supabase RPC.
 */
export class SemanticValidator {
  private supabase = createClient(
    process.env.SUPABASE_URL!,
    process.env.SUPABASE_ANON_KEY!
  );

  /**
   * Returns the indices at which a new topic starts.
   *
   * Index `i` is reported when the cosine similarity between segment `i - 1`
   * and segment `i` is strictly below `threshold`.
   *
   * @param textSegments - Segments in document order.
   * @param threshold - Similarity below which a shift is reported
   *   (defaults to 0.75).
   * @returns Ascending indices of the segments that begin a new topic; empty
   *   when fewer than two segments are supplied.
   * @throws Error If an embedding cannot be obtained.
   * @throws RangeError If two embeddings have different lengths.
   */
  public async detectTopicShifts(
    textSegments: string[],
    threshold = 0.75
  ): Promise<number[]> {
    const shifts: number[] = [];
    if (textSegments.length < 2) {
      return shifts;
    }

    // Carry the previous embedding forward so each segment is embedded once
    // rather than once per adjacent pair. Requests are issued one at a time
    // on purpose to keep load on the embedding RPC bounded.
    let previous = await this.getEmbedding(textSegments[0]);
    for (let i = 1; i < textSegments.length; i++) {
      const current = await this.getEmbedding(textSegments[i]);
      if (this.cosineSimilarity(previous, current) < threshold) {
        shifts.push(i);
      }
      previous = current;
    }
    return shifts;
  }

  /**
   * Fetches the embedding for `text` through the `generate_embedding` RPC.
   *
   * @throws Error If the RPC reports an error or does not return an array.
   */
  private async getEmbedding(text: string): Promise<number[]> {
    const { data, error } = await this.supabase.rpc("generate_embedding", {
      input_text: text,
    });
    if (error) {
      throw new Error(`generate_embedding RPC failed: ${error.message}`);
    }
    if (!Array.isArray(data)) {
      throw new Error("generate_embedding RPC did not return an embedding array");
    }
    return data as number[];
  }

  /**
   * Cosine similarity of two equal-length vectors.
   *
   * A zero-magnitude vector has no direction, so the result is defined as 0
   * instead of NaN; NaN would compare false against any threshold and hide
   * the shift.
   *
   * @throws RangeError If the vectors differ in length.
   */
  private cosineSimilarity(a: number[], b: number[]): number {
    if (a.length !== b.length) {
      throw new RangeError(
        `Embedding length mismatch: ${a.length} vs ${b.length}`
      );
    }
    let dotProduct = 0;
    let squaredA = 0;
    let squaredB = 0;
    for (let i = 0; i < a.length; i++) {
      dotProduct += a[i] * b[i];
      squaredA += a[i] * a[i];
      squaredB += b[i] * b[i];
    }
    const denominator = Math.sqrt(squaredA) * Math.sqrt(squaredB);
    return denominator === 0 ? 0 : dotProduct / denominator;
  }
}
**Architecture Rationale:** Semantic validation is applied selectively to avoid unnecessary compute. By comparing adjacent segments and triggering splits only when similarity drops below a calibrated threshold, the system preserves coherence while catching abrupt topic changes. The threshold is tunable based on domain specificity; legal and medical texts typically require lower thresholds (0.65–0.70) to capture nuanced shifts.
### Step 4: Metadata Enrichment and Indexing
Raw chunks lose provenance. Production systems must attach structural, temporal, and source metadata to each segment before vector insertion. This enables precise filtering during retrieval and supports auditability.
/**
 * A chunk that is ready to be written to the vector store: its text, its
 * embedding, and the provenance metadata stored alongside it.
 */
interface IndexedChunk {
/** Unique identifier of the chunk; the indexer uses it as the upsert conflict key. */
id: string;
/** Text of the chunk. */
content: string;
/** Embedding vector for the chunk (presumably computed from `content` — confirm with the producer). */
embedding: number[];
/** Provenance fields; the indexer flattens these into snake_case columns. */
metadata: {
/** Identifier of the document the chunk came from. */
sourceDocument: string;
/** Headings leading to the chunk (presumably outermost first — confirm with the producer). */
headingPath: string[];
/** Position of the chunk (presumably within its source document — confirm). */
chunkIndex: number;
/** Token count recorded for the chunk. */
tokenCount: number;
/** Creation timestamp as a string (format not fixed here; presumably ISO 8601 — confirm). */
createdAt: string;
};
}
/**
 * Persists indexed chunks to the `document_segments` table through Supabase.
 */
export class ChunkIndexer {
  // Same client construction as SemanticValidator; without this field,
  // `this.supabase` is undefined and persist() cannot run.
  private readonly supabase = createClient(
    process.env.SUPABASE_URL!,
    process.env.SUPABASE_ANON_KEY!
  );

  /**
   * Upserts `chunks` keyed on `id`, so re-indexing a chunk overwrites its row
   * instead of inserting a duplicate.
   *
   * @param chunks - Chunks to write; an empty array is a no-op.
   * @throws Error If Supabase reports an error for the upsert.
   */
  public async persist(chunks: IndexedChunk[]): Promise<void> {
    if (chunks.length === 0) {
      // Nothing to write; skip the network round-trip.
      return;
    }

    // Flatten the nested metadata into the table's snake_case columns.
    const payload = chunks.map(({ id, content, embedding, metadata }) => ({
      id,
      content,
      embedding,
      source_document: metadata.sourceDocument,
      heading_path: metadata.headingPath,
      chunk_index: metadata.chunkIndex,
      token_count: metadata.tokenCount,
      created_at: metadata.createdAt,
    }));

    // supabase-js resolves with an `error` field instead of rejecting, so it
    // must be checked explicitly or failed writes pass silently.
    const { error } = await this.supabase
      .from("document_segments")
      .upsert(payload, { onConflict: "id" });
    if (error) {
      throw new Error(
        `Failed to upsert ${payload.length} document segment(s): ${error.message}`
      );
    }
  }
}
**Architecture Rationale:** Upsert operations prevent duplicate embeddings during re-indexing. Storing `heading_path` as an array enables hierarchical filtering. Explicit `token_count` tracking allows dynamic prompt assembly that respects LLM context windows. This structure transforms raw text into queryable, auditable knowledge units.
## Pitfall Guide
### 1. The Arbitrary Token Trap
**Explanation:** Splitting text at fixed token intervals without respecting linguistic boundaries fractures sentences and merges unrelated concepts. Embeddings generated from these fragments lose semantic focus, causing retrieval to return partial or misleading context.
**Fix:** Always use recursive or delimiter-aware splitting. Configure delimiters to match your domain (e.g., `["\n\n", "\n", ". ", "? ", "! "]`). Validate splits against sentence boundaries before embedding.
### 2. Over-Overlap Inflation
**Explanation:** Excessive overlap (e.g., 40–50%) creates redundant vectors, inflating storage costs and retrieval latency. The vector database returns near-duplicate results, wasting generation tokens and confusing the LLM with repetitive context.
**Fix:** Cap overlap at 10–20% of chunk size. Monitor retrieval diversity metrics; if top-k results contain >30% semantic duplication, reduce overlap and increase chunk size instead.
### 3. Structure Blindness
**Explanation:** Treating Markdown, HTML, or PDFs as flat text ignores author-intended hierarchy. Headings, lists, and code blocks naturally segment topics. Ignoring them forces the embedding model to infer structure that already exists.
**Fix:** Parse structural markers before segmentation. Attach heading paths as metadata. Use structure-aware splitters for documentation-heavy workloads.
### 4. Code-as-Prose Mistake
**Explanation:** Applying natural language chunking to source code breaks function definitions, class declarations, and import statements mid-stream. The resulting embeddings lack syntactic context, making code retrieval unreliable for AI assistants.
**Fix:** Use AST-aware parsers (e.g., `tree-sitter`, `@babel/parser`) to split by function, class, or module boundaries. Preserve import statements and type definitions within the same chunk.
### 5. Static Sizing for Dynamic Content
**Explanation:** Using identical chunk sizes across heterogeneous documents (e.g., legal contracts, API docs, chat logs) ignores content density. Dense technical sections require smaller chunks; narrative sections tolerate larger ones.
**Fix:** Implement adaptive chunking. Analyze token density and semantic variance per section. Dynamically adjust `maxTokens` based on content type metadata.
### 6. Skipping Retrieval Evaluation
**Explanation:** Teams deploy chunking strategies without measuring retrieval precision, recall, or MRR (Mean Reciprocal Rank). Poor segmentation goes undetected until user complaints surface.
**Fix:** Establish a golden dataset of query-chunk pairs. Run automated evaluation after every segmentation change. Track metrics like `hit_rate@k` and `context_precision`.
### 7. Missing Chunk Metadata
**Explanation:** Storing chunks without source attribution, timestamps, or hierarchical paths makes debugging retrieval failures impossible. It also prevents hybrid search strategies that combine vector similarity with exact metadata filtering.
**Fix:** Enrich every chunk with `sourceDocument`, `headingPath`, `chunkIndex`, and `tokenCount`. Index metadata fields for fast filtering.
## Production Bundle
### Action Checklist
### Decision Matrix
| Scenario | Recommended Approach | Why | Cost Impact |
|---|---|---|---|
| Internal knowledge base (Markdown/Confluence) | Structure-Aware + Recursive | Preserves author hierarchy, reduces noise | Low storage, moderate compute |
| Legal/Compliance documents | Semantic Boundary + Overlap | Captures nuanced topic shifts, prevents context loss | High compute, high precision |
| Code repository indexing | AST/Code-Aware Splitting | Maintains syntactic integrity, enables accurate code retrieval | Medium compute, language-specific tooling |
| Rapid prototype / MVP | Recursive + 15% Overlap | Fast implementation, predictable behavior, good baseline | Minimal compute, low storage |
| High-volume customer support logs | Sentence-Based + Adaptive Sizing | Handles variable length, preserves conversational context | Low compute, moderate storage |
### Configuration Template
/**
 * Reference configuration for the chunking pipeline, grouped by stage:
 * recursive splitting, semantic validation, metadata enrichment, and
 * retrieval evaluation.
 *
 * NOTE(review): the code that consumes these settings is not shown alongside
 * this template; confirm each key against its consumer before relying on it.
 */
export const CHUNKING_CONFIG = {
// Token-window splitting. 80 overlap tokens is about 15.6% of the 512-token window.
recursive: {
maxTokens: 512,
overlapTokens: 80,
// Candidate split delimiters (presumably tried in the listed order — confirm).
delimiters: ["\n\n", "\n", ". ", "? ", "! ", "; "],
stripWhitespace: true,
},
// Embedding-based topic-shift detection between segments.
semantic: {
enabled: true,
// Similarity value used as the cut-off for reporting a topic shift.
similarityThreshold: 0.72,
windowSize: 2,
embeddingModel: "text-embedding-3-small",
},
// Switches for the provenance data attached to each chunk.
metadata: {
attachHeadingPath: true,
trackTokenCount: true,
includeSourceTimestamp: true,
enableHybridFiltering: true,
},
// Retrieval-quality evaluation settings.
evaluation: {
// JSON Lines file holding the golden evaluation queries.
goldenDatasetPath: "./data/eval_queries.jsonl",
metrics: ["hit_rate@5", "context_precision", "mrr"],
runOnDeploy: true,
},
};
### Quick Start Guide
- Initialize the segmenter: Import the `RecursiveSegmenter` and `StructureParser` classes. Pass your document text and configuration object.
- Parse structure: Run `StructureParser.extractNodes()` to isolate headings and content blocks. Attach metadata paths.
- Partition & validate: Feed each node into the segmenter. Run `SemanticValidator.detectTopicShifts()` on long outputs to catch boundary fragmentation.
- Enrich & index: Map results to `IndexedChunk` format. Attach token counts, source paths, and timestamps. Upsert to your vector database.
- Evaluate: Run your golden query dataset against the new index. Verify `hit_rate@5` exceeds 0.85. Adjust overlap or semantic threshold if precision drops.