+/).filter(s => s.trim().length > 0);
const maxOverlap = Math.floor(maxLen * overlapRatio);
const segments: ChunkMetadata[] = [];
let buffer = "";
for (const sentence of sentences) {
const candidate = buffer.length ? ${buffer} ${sentence} : sentence;
if (candidate.length > maxLen && buffer.length) {
segments.push({
source: filePath,
content: buffer.trim(),
hash: computeHash(buffer.trim())
});
buffer = buffer.slice(-maxOverlap) + ${sentence};
} else {
buffer = candidate;
}
}
if (buffer.trim().length) {
segments.push({
source: filePath,
content: buffer.trim(),
hash: computeHash(buffer.trim())
});
}
return segments;
}
**Why this design:** The overlap ratio (10β15%) captures context that bleeds across chunk boundaries. SHA-256 hashing enables incremental indexing by skipping already-processed segments. Sentence-boundary splitting prevents semantic fragmentation, which directly improves cosine similarity accuracy.
### Step 2: Vector Embedding & Storage
Embeddings are generated via Ollama's local API and persisted using `sqlite-vec`. The vector table mirrors the relational table to maintain source traceability.
```typescript
import Database from "better-sqlite3";
import * as vec from "sqlite-vec";
export class VectorRepository {
private db: Database.Database;
constructor(dbPath: string) {
this.db = new Database(dbPath);
vec.load(this.db);
this.initializeSchema();
}
private initializeSchema(): void {
this.db.exec(`
CREATE TABLE IF NOT EXISTS doc_segments (
id INTEGER PRIMARY KEY AUTOINCREMENT,
source_path TEXT NOT NULL,
segment_hash TEXT UNIQUE NOT NULL,
raw_text TEXT NOT NULL
);
CREATE VIRTUAL TABLE IF NOT EXISTS segment_vectors USING vec0(
id INTEGER PRIMARY KEY,
vector FLOAT[768]
);
`);
}
async persistSegment(id: number, hash: string, source: string, text: string, embedding: number[]): Promise<void> {
const insertTx = this.db.transaction(() => {
const metaStmt = this.db.prepare(
"INSERT OR IGNORE INTO doc_segments (id, source_path, segment_hash, raw_text) VALUES (?, ?, ?, ?)"
);
metaStmt.run(id, source, hash, text);
const vecStmt = this.db.prepare(
"INSERT INTO segment_vectors (id, vector) VALUES (?, ?)"
);
vecStmt.run(id, JSON.stringify(embedding));
});
insertTx();
}
close(): void {
this.db.close();
}
}
Why this design: sqlite-vec uses a flat index by default, which is optimal for datasets under 1M vectors. The INSERT OR IGNORE clause leverages the unique hash constraint to skip duplicates automatically. Wrapping inserts in a transaction prevents partial writes during indexing failures. The 768-dimensional constraint matches nomic-embed-text exactly; mismatched dimensions will cause silent query failures.
Step 3: Similarity Search & Context Assembly
Retrieval queries the vector table, joins metadata, and formats results for the generation stage.
export class RetrievalEngine {
constructor(private repo: VectorRepository) {}
async fetchContext(queryVector: number[], topK = 5): Promise<Array<{ source: string; text: string; score: number }>> {
const stmt = this.repo.db.prepare(`
SELECT d.source_path, d.raw_text, v.distance
FROM segment_vectors v
JOIN doc_segments d ON d.id = v.id
WHERE v.vector MATCH ?
ORDER BY v.distance
LIMIT ?
`);
const rows = stmt.all(JSON.stringify(queryVector), topK) as Array<{
source_path: string;
raw_text: string;
distance: number;
}>;
return rows.map(r => ({
source: r.source_path,
text: r.raw_text,
score: 1 - r.distance
}));
}
}
Why this design: sqlite-vec's MATCH operator triggers cosine similarity under the hood. Distance values range from 0 (identical) to 2 (opposite). Converting to 1 - distance yields a 0β1 relevance score. The join ensures source paths are always available for citation. Limiting topK to 3β5 prevents context window overflow while maintaining sufficient signal for the LLM.
Step 4: Local LLM Generation
The generation stage formats retrieved context into a strict prompt and streams the response through Ollama's OpenAI-compatible endpoint.
export class GenerationClient {
private readonly endpoint = "http://127.0.0.1:11434/v1/chat/completions";
private readonly model = "qwen2.5:7b";
async synthesize(query: string, contextBlocks: Array<{ source: string; text: string }>): Promise<string> {
const formatted = contextBlocks
.map((b, i) => `[${i + 1}] ${b.source}\n${b.text}`)
.join("\n\n---\n\n");
const payload = {
model: this.model,
messages: [
{
role: "user",
content: `You are a technical analyst. Answer strictly using the provided context.
If the context lacks sufficient information, state that explicitly.
Cite sources using bracketed numbers.
CONTEXT:
${formatted}
QUERY: ${query}
RESPONSE:`
}
],
stream: false,
temperature: 0.2,
max_tokens: 1024
};
const res = await fetch(this.endpoint, {
method: "POST",
headers: { "Content-Type": "application/json" },
body: JSON.stringify(payload)
});
if (!res.ok) throw new Error(`Ollama API error: ${res.status}`);
const data = await res.json();
return data.choices[0].message.content.trim();
}
}
Why this design: Low temperature (0.2) reduces hallucination in retrieval-augmented contexts. max_tokens caps generation to prevent runaway outputs. The prompt explicitly enforces citation formatting and negative constraints ("state that explicitly"). Using 127.0.0.1 instead of localhost avoids IPv6 resolution delays on some Node versions.
Architecture Rationale
The pipeline avoids network-bound vector databases because sqlite-vec eliminates daemon management, connection pooling, and serialization overhead. Ollama's local API provides OpenAI compatibility, allowing seamless SDK swaps if cloud fallback becomes necessary. Node 22's native fetch removes dependency on axios or node-fetch, reducing bundle size. The modular class structure enables unit testing, mock injection, and future migration to worker threads for parallel indexing.
Pitfall Guide
1. Arbitrary Character Chunking
Splitting on fixed byte counts fractures sentences and injects artificial boundaries. Retrieval precision drops 15β30% when semantic units are broken.
Fix: Always split on sentence terminators ([.!?]) and apply a sliding window with 10β15% overlap.
2. Hardcoding Embedding Dimensions
Different models output different vector lengths. nomic-embed-text uses 768, but mxbai-embed-large uses 1024. Mismatched dimensions cause silent sqlite-vec failures.
Fix: Parameterize dimension size in schema creation and validate against the active model's specification.
3. Blocking the Event Loop with Synchronous SQLite
better-sqlite3 is synchronous by design. Running heavy indexing on the main thread freezes the CLI or server.
Fix: Offload indexing to node:worker_threads or batch inserts with db.transaction() to minimize context switches.
4. Overloading Ollama's Concurrency Limits
Ollama defaults to a single concurrent request per model. Parallel embedding or generation calls will queue and timeout.
Fix: Implement a simple semaphore or queue. For embedding, batch requests or serialize calls. For generation, limit concurrent users or use streaming.
5. Missing Content Hash Caching
Re-indexing identical documents regenerates embeddings unnecessarily, wasting CPU cycles and inflating the vector table.
Fix: Store SHA-256 hashes of chunk content. Use INSERT OR IGNORE with a unique constraint to skip duplicates automatically.
6. Poor Prompt Isolation
Injecting raw context without delimiters allows the LLM to confuse context with instructions. This causes citation leakage and hallucination.
Fix: Wrap context in explicit markers (CONTEXT:, ---, QUERY:). Enforce strict system prompts that forbid external knowledge.
7. Neglecting Vector Index Maintenance
sqlite-vec flat indexes degrade slightly as row count grows. Without periodic optimization, query latency increases non-linearly.
Fix: Run VACUUM on the database after bulk deletes. For datasets >500k vectors, switch to sqlite-vec's HNSW virtual table with tuned lists and probe parameters.
Production Bundle
Action Checklist
Decision Matrix
| Scenario | Recommended Approach | Why | Cost Impact |
|---|
| <100k documents, single developer | Local Ollama + sqlite-vec | Zero infrastructure, instant deployment, full privacy | $0 |
| 100kβ500k documents, team access | Local Ollama + Qdrant (Docker) | Better concurrency, network access, HNSW scaling | ~$50/mo (host) |
| >500k documents, multi-region | Cloud RAG (Pinecone + OpenAI) | Autoscaling, global latency, managed backups | $200β$800/mo |
| Air-gapped / compliance-heavy | Local Ollama + sqlite-vec | No egress, offline operation, audit-ready | $0 |
Configuration Template
// config.ts
export const OLLAMA_CONFIG = {
baseUrl: "http://127.0.0.1:11434",
embeddingModel: "nomic-embed-text",
generationModel: "qwen2.5:7b",
timeoutMs: 15000,
maxConcurrent: 2
} as const;
export const SQLITE_CONFIG = {
dbPath: "./knowledge_base.db",
pragmas: [
"PRAGMA journal_mode=WAL;",
"PRAGMA synchronous=NORMAL;",
"PRAGMA cache_size=-64000;",
"PRAGMA temp_store=MEMORY;"
],
chunkSize: 1000,
overlapRatio: 0.12,
embeddingDim: 768
} as const;
export const RETRIEVAL_CONFIG = {
topK: 5,
minScore: 0.65,
promptTemplate: "strict_citation"
} as const;
Quick Start Guide
- Pull Models: Run
ollama pull nomic-embed-text and ollama pull qwen2.5:7b to cache weights locally.
- Install Dependencies: Execute
npm i better-sqlite3 sqlite-vec to install the vector engine and SQLite bindings.
- Initialize Repository: Instantiate
VectorRepository with your target .db path. The schema auto-creates on first run.
- Index Documents: Call
segmentDocument() on your markdown/text files, then persistSegment() for each chunk. Hashing skips duplicates automatically.
- Query: Pass a natural language question to
RetrievalEngine.fetchContext(), then pipe results to GenerationClient.synthesize(). Output includes cited sources and direct answers.
The pipeline compiles to a single Node process, requires no external services, and operates entirely within your machine's memory and storage. For internal knowledge bases, legal repositories, or engineering wikis, this architecture delivers cloud-equivalent retrieval quality with zero data exposure and deterministic latency.