kiDir = path.join(projectRoot, '.knowledgebase', 'output');
}
/**
 * Lists the markdown documents available in the source directory.
 *
 * @returns The entries of `this.sourceDir` whose names end with `.md`
 *   (names only, not joined with the directory).
 * @throws Error with the "No source documents found" message when the
 *   directory does not exist or holds no `.md` entries. Any other error
 *   raised by `fs.readdir` is rethrown unchanged.
 */
async validateSources(): Promise<string[]> {
  const noSourcesMessage =
    'No source documents found. Create .knowledgebase/inputs/ with architecture docs.';

  let files: string[];
  try {
    files = await fs.readdir(this.sourceDir);
  } catch (err: unknown) {
    // A directory that was never created is the same user mistake as an empty
    // one, so report it with the same actionable message instead of a raw ENOENT.
    const isMissingDir =
      typeof err === 'object' &&
      err !== null &&
      (err as { code?: unknown }).code === 'ENOENT';
    if (isMissingDir) {
      throw new Error(noSourcesMessage);
    }
    throw err;
  }

  const markdownFiles = files.filter(f => f.endsWith('.md'));
  if (markdownFiles.length === 0) {
    throw new Error(noSourcesMessage);
  }
  return markdownFiles;
}
/**
 * Reads one source document as text.
 *
 * @param filename Entry name that is joined onto `this.sourceDir`.
 * @returns The file contents decoded as UTF-8.
 */
async readSource(filename: string): Promise<string> {
  return fs.readFile(path.join(this.sourceDir, filename), 'utf-8');
}
}
### Step 2: Configure the Domain Schema
The wiki structure must adapt to your project's domain. A schema file defines entity types, relationship rules, and page templates. This prevents generic chunking and forces structured extraction.
```typescript
// src/knowledge/schema-config.ts
/**
 * Shape of a domain schema: the entity types, the relationships between them,
 * the page templates, and the extraction rules applied to source text.
 */
export interface WikiSchema {
  /** Domain the schema is written for. */
  domain: 'code' | 'research' | 'business';
  /** Names of the entity types. */
  entities: Array<string>;
  /** Typed links from one entity type to another. */
  relationships: Array<{ source: string; target: string; type: string }>;
  /** One template string per page kind. */
  pageTemplates: { entity: string; concept: string; synthesis: string };
  /** Regular expressions paired with the category assigned to their matches. */
  extractionRules: Array<{ pattern: RegExp; category: string }>;
}
/**
 * Schema for the 'code' domain: five entity types, three relationships,
 * three page templates, and three regex extraction rules.
 */
export const defaultCodeSchema: WikiSchema = {
  domain: 'code',

  entities: ['Module', 'Endpoint', 'DatabaseTable', 'Service', 'Configuration'],

  relationships: [
    { source: 'Module', target: 'Endpoint', type: 'exposes' },
    { source: 'Service', target: 'DatabaseTable', type: 'queries' },
    { source: 'Configuration', target: 'Service', type: 'configures' },
  ],

  // Templates use {placeholder} tokens.
  pageTemplates: {
    entity:
      '# {name}\n**Type:** {type}\n**Location:** {path}\n**Description:** {desc}\n**References:** {refs}',
    concept:
      '# {name}\n**Category:** {category}\n**Context:** {context}\n**Related:** {related}',
    synthesis:
      '# {name}\n**Question:** {question}\n**Answer:** {answer}\n**Sources:** {sources}',
  },

  extractionRules: [
    // Group 1 is the declaration keyword, group 2 the declared identifier.
    {
      pattern: /export\s+(class|interface|function)\s+(\w+)/g,
      category: 'Module',
    },
    // Group 1 is the receiver, group 2 the HTTP verb, group 3 the route string.
    {
      pattern: /(app|router)\.(get|post|put|delete)\(['"]([^'"]+)['"]/g,
      category: 'Endpoint',
    },
    // Case-insensitive; group 1 is the table name.
    {
      pattern: /CREATE\s+TABLE\s+(\w+)/gi,
      category: 'DatabaseTable',
    },
  ],
};
```
### Step 3: Build the Ingestion Pipeline
The pipeline reads immutable sources, applies extraction rules, generates cross-referenced markdown pages, and writes them to the wiki directory. This runs once per sync cycle, not per query.
```typescript
// src/knowledge/ingestion-pipeline.ts
import { SourceManager } from './source-manager';
import { WikiSchema } from './schema-config';
/** One entity pulled out of a source document by an extraction rule. */
interface ExtractedEntity {
  /** Text of the rule's last participating capture group (or the whole match). */
  name: string;
  /** Category of the rule that produced the match. */
  category: string;
  /** Name of the source file in which the entity was first seen. */
  source: string;
  /** Slice of the document around the start of the match. */
  context: string;
}

/**
 * Reads every validated source document, applies the schema's extraction
 * rules to it, and hands the collected entities to page generation.
 */
export class IngestionPipeline {
  /** Number of characters kept on each side of a match start as context. */
  private static readonly CONTEXT_RADIUS = 200;

  constructor(
    private sourceManager: SourceManager,
    private schema: WikiSchema
  ) {}

  /**
   * Runs one full sync: validate sources, extract entities, generate pages.
   *
   * Entities are keyed by name; the first occurrence (in source order, then
   * rule order) wins and later matches with the same name are ignored.
   *
   * @throws Whatever `validateSources` / `readSource` reject with.
   */
  async execute(): Promise<void> {
    const sources = await this.sourceManager.validateSources();
    const extractedEntities = new Map<string, ExtractedEntity>();

    for (const file of sources) {
      const content = await this.sourceManager.readSource(file);

      for (const rule of this.schema.extractionRules) {
        const regex = IngestionPipeline.toGlobalCopy(rule.pattern);

        // matchAll advances past zero-length matches on its own, so a pattern
        // that can match the empty string cannot stall the loop.
        for (const match of content.matchAll(regex)) {
          const name = IngestionPipeline.entityName(match);
          if (name === '' || extractedEntities.has(name)) {
            continue;
          }
          const start = match.index ?? 0;
          extractedEntities.set(name, {
            name,
            category: rule.category,
            source: file,
            context: content.substring(
              Math.max(0, start - IngestionPipeline.CONTEXT_RADIUS),
              start + IngestionPipeline.CONTEXT_RADIUS
            )
          });
        }
      }
    }

    await this.generateWikiPages(extractedEntities);
  }

  /**
   * Returns a fresh copy of `pattern` that is guaranteed to carry the `g` flag.
   *
   * A fresh copy starts at `lastIndex` 0 regardless of how the shared schema
   * pattern was used before, and the `g` flag is required for scanning all
   * occurrences (without it the previous `exec` loop never terminated).
   */
  private static toGlobalCopy(pattern: RegExp): RegExp {
    const flags = pattern.flags.includes('g') ? pattern.flags : `${pattern.flags}g`;
    return new RegExp(pattern.source, flags);
  }

  /**
   * Picks the entity name from a match: the last capture group that
   * participated, falling back to the whole match when there is none.
   *
   * Using the last group (rather than always group 2) yields the route for
   * `(app|router).(verb)('route'` patterns instead of the HTTP verb, while
   * still yielding the identifier for `export (keyword) (name)` and the table
   * name for single-group patterns.
   */
  private static entityName(match: RegExpMatchArray): string {
    for (let i = match.length - 1; i >= 1; i--) {
      const group = match[i];
      if (group) {
        return group;
      }
    }
    return match[0] ?? '';
  }

  private async generateWikiPages(entities: Map<string, ExtractedEntity>): Promise<void> {
    // Logic to format entities into schema templates, resolve cross-references,
    // and write markdown files to the wiki directory.
    // Health scoring and orphan detection run here.

    // Logged to stderr: when this pipeline is driven by a stdio MCP server,
    // stdout carries the protocol messages and must not receive log lines.
    console.error(`Wiki sync complete. ${entities.size} entities indexed.`);
  }
}
```
### Step 4: Register the MCP Server
The Model Context Protocol enables standardized tool exposure to AI agents. The server runs as a local subprocess, communicating via stdio. This eliminates network overhead and keeps data within the developer's machine.
```typescript
// src/mcp/knowledge-server.ts
import { Server } from '@modelcontextprotocol/sdk/server/index.js';
import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js';
import { CallToolRequestSchema, ListToolsRequestSchema } from '@modelcontextprotocol/sdk/types.js';
import { IngestionPipeline } from '../knowledge/ingestion-pipeline';
import { defaultCodeSchema } from '../knowledge/schema-config';
import { SourceManager } from '../knowledge/source-manager';
/**
 * MCP server that exposes the knowledge base to agents over stdio through
 * two tools: `query_wiki` and `sync_knowledge`.
 */
export class KnowledgeBaseServer {
  private readonly server: Server;

  constructor() {
    this.server = new Server(
      { name: 'persistent-knowledge-server', version: '1.0.0' },
      { capabilities: { tools: {} } }
    );
    this.registerTools();
  }

  /**
   * Registers the `tools/list` and `tools/call` handlers.
   *
   * Handlers are keyed by the SDK's request schema objects, not by the
   * method name string.
   */
  private registerTools(): void {
    // The `tools` capability is advertised, so clients need a listing to
    // discover what they can call.
    this.server.setRequestHandler(ListToolsRequestSchema, async () => ({
      tools: [
        {
          name: 'query_wiki',
          description: 'Answer a question from the generated wiki pages.',
          inputSchema: {
            type: 'object',
            properties: {
              question: { type: 'string', description: 'Question to answer.' }
            },
            required: ['question']
          }
        },
        {
          name: 'sync_knowledge',
          description: 'Run the ingestion pipeline over the source documents.',
          inputSchema: { type: 'object', properties: {} }
        }
      ]
    }));

    this.server.setRequestHandler(CallToolRequestSchema, async (request) => {
      const { name, arguments: args } = request.params;

      if (name === 'query_wiki') {
        const question = args?.question;
        // Arguments come from the client unvalidated; reject anything that
        // is not a usable question instead of answering for "undefined".
        if (typeof question !== 'string' || question.trim() === '') {
          throw new Error('query_wiki requires a non-empty string "question" argument.');
        }
        // Direct lookup in generated wiki pages, synthesize answer with citations
        return { content: [{ type: 'text', text: `Synthesized answer for: ${question}` }] };
      }

      if (name === 'sync_knowledge') {
        // A real schema is required: the pipeline iterates its extraction rules.
        const pipeline = new IngestionPipeline(new SourceManager(process.cwd()), defaultCodeSchema);
        await pipeline.execute();
        return { content: [{ type: 'text', text: 'Knowledge base synchronized.' }] };
      }

      throw new Error(`Unknown tool: ${name}`);
    });
  }

  /** Connects the server to a stdio transport and starts serving requests. */
  async start(): Promise<void> {
    const transport = new StdioServerTransport();
    await this.server.connect(transport);
    // stdout is the protocol channel for a stdio transport; status messages
    // must go to stderr or they corrupt the message stream.
    console.error('Knowledge server listening on stdio...');
  }
}
```
## Architecture Rationale
Why Markdown Over Vector Databases? Vector stores require embedding models, sync pipelines, and external dependencies. Markdown files are natively git-trackable, diffable, and instantly readable by both humans and LLMs. The trade-off is slightly higher token usage per query, but this is offset by eliminating embedding latency and vector search overhead.
Why stdio Communication? Network-based MCP servers introduce latency, authentication complexity, and potential data leakage. stdio keeps the agent-tool communication local, deterministic, and secure. It aligns with how Claude, Copilot, and Cursor natively consume MCP tools.
Why Immutable Sources? Separating human-authored inputs from machine-generated outputs creates a clear boundary for audits. If the wiki degrades, you rebuild from sources. If sources change, you trigger a sync. This prevents drift and ensures reproducibility.
Pitfall Guide
1. Treating Generated Wiki as Source
Explanation: Developers occasionally edit the generated markdown pages directly, expecting changes to persist. The next sync cycle overwrites these edits because the pipeline treats the wiki as ephemeral output.
Fix: Enforce strict read-only permissions on the wiki directory. Route all modifications through the source layer or use the save_answer_as_page tool, which safely appends to the wiki without breaking the sync contract.
2. Ignoring Health Scores & Orphan Nodes
Explanation: The wiki accumulates pages over time. Without monitoring, orphaned concepts (pages with no incoming references) and stale entities drift into the knowledge base, degrading answer quality.
Fix: Run the health diagnostic tool after every sync. Address orphan nodes by adding explicit cross-references in related pages. Archive pages that haven't been referenced in 3+ sync cycles.
3. Overloading the Ingestion Pipeline
Explanation: Dumping entire codebases into the source directory causes the extraction engine to generate thousands of low-signal pages. The agent spends tokens navigating noise instead of high-value architecture.
Fix: Curate sources intentionally. Limit inputs to architecture diagrams, API contracts, deployment guides, and critical workflow documentation. Let the agent discover trivial files on-demand.
4. Skipping Domain Schema Customization
Explanation: Using the default schema on specialized projects (e.g., data pipelines, embedded systems, research papers) results in misaligned entity extraction. The wiki generates generic pages that miss domain-specific relationships.
Fix: Override the WikiSchema configuration before first sync. Define custom extraction patterns, relationship types, and page templates that match your project's terminology and structure.
5. Neglecting Cross-Reference Validation
Explanation: The LLM generates pages independently. Without explicit cross-reference rules, pages become isolated islands. Agents receive fragmented answers instead of connected knowledge graphs.
Fix: Implement a post-sync validation step that scans for broken or missing references. Use the dependency graph tool to visualize coupling and manually link disconnected clusters.
6. Running Without Watch Mode in Active Development
Explanation: Codebases evolve rapidly. Static syncs become stale within hours. Agents query outdated wiki pages, leading to hallucinated implementations or deprecated API references.
Fix: Enable background watch mode during active development. Configure it to trigger incremental syncs on file changes, focusing only on modified source documents rather than full rebuilds.
7. Mixing Environment Secrets with Source Docs
Explanation: Developers accidentally include .env files, API keys, or internal credentials in the source directory. The ingestion pipeline extracts and indexes them, exposing sensitive data to agent queries.
Fix: Implement a pre-sync scanner that rejects files containing secret patterns. Maintain a .knowledgeignore file similar to .gitignore to explicitly exclude sensitive paths.
Production Bundle
Action Checklist
Decision Matrix
| Scenario | Recommended Approach | Why | Cost Impact |
|---|---|---|---|
| Small team, rapid iteration | Watch mode + incremental sync | Keeps wiki current without full rebuilds | Low compute, minimal latency |
| Large monorepo, strict compliance | Manual sync + health validation | Prevents drift, ensures audit trail | Higher manual overhead, zero risk |
| Multi-domain project (code + research) | Custom schema per domain | Aligns extraction with domain terminology | Medium setup time, high accuracy |
| Air-gapped / high-security environment | Local stdio + markdown storage | No external APIs, zero network exposure | Zero cloud cost, full data control |
| Legacy codebase, sparse docs | AI-assisted doc generation + sync | Bootstraps wiki from code patterns | Higher initial LLM cost, fast ROI |
Configuration Template
{
"knowledgeBase": {
"version": "1.0",
"schema": {
"domain": "code",
"entities": ["Service", "Endpoint", "DatabaseTable", "Middleware", "Configuration"],
"relationships": [
{"source": "Service", "target": "Endpoint", "type": "exposes"},
{"source": "Middleware", "target": "Endpoint", "type": "protects"},
{"source": "Service", "target": "DatabaseTable", "type": "queries"}
],
"extractionPatterns": [
{"regex": "export\\s+(class|interface)\\s+(\\w+)", "category": "Service"},
{"regex": "(app|router)\\.(get|post|put|delete)\\(['\"]([^'\"]+)['\"]", "category": "Endpoint"},
{"regex": "process\\.env\\.(\\w+)", "category": "Configuration"}
]
},
"sync": {
"mode": "watch",
"debounceMs": 1500,
"maxConcurrency": 3,
"healthThreshold": 85
},
"ignore": [
"**/node_modules/**",
"**/.git/**",
"**/*.test.ts",
"**/.env*",
"**/secrets/**"
]
}
}
Quick Start Guide
- Create the knowledge directory: Run `mkdir -p .knowledgebase/inputs` in your project root. Add 3-5 markdown files covering architecture, authentication, database schema, and deployment.
- Configure the schema: Copy the configuration template above into `.knowledgebase/config.json`. Adjust extraction patterns to match your framework's syntax.
- Initialize the server: Install the MCP SDK, register the stdio transport, and expose the `query_wiki`, `sync_knowledge`, and `health_check` tools. Start the subprocess.
- Connect your agent: Configure Claude, Copilot, or Cursor to use the local MCP server. Test with a query like `query_wiki("How does authentication flow work?")`.
- Enable continuous sync: Launch the watch daemon. Verify that modifying a source document triggers an incremental wiki update within 2 seconds.