=> ({
name: col.column_name,
dataType: col.data_type,
isNullable: col.is_nullable === 'YES',
description: this.generateColumnDescription(col),
enumValues: this.extractEnumConstraints(col),
unit: this.detectMeasurementUnit(col.column_name),
references: this.resolveForeignKey(col)
}))
}));
}
private inferBusinessPurpose(tableName: string): string {
const map: Record<string, string> = {
tenant_accounts: 'Core identity and subscription state for each organization',
billing_invoices: 'Financial records tracking charges, payments, and refunds',
feature_usage: 'Event-level telemetry for product feature adoption'
};
return map[tableName] || 'Application data store';
}
}
**Why this choice:** Enrichment happens at ingestion time, not query time. This prevents redundant LLM calls for metadata generation and ensures consistent schema representation across all queries. Explicit `references` fields eliminate join guesswork.
### Step 2: Vector Indexing for Semantic Retrieval
When databases exceed 50 tables, static prompt injection becomes unsustainable. We embed table and column descriptions into a vector store. At query time, the user's natural language input is embedded and matched against the index to retrieve only relevant topology.
```typescript
import { createClient } from '@supabase/supabase-js'; // Example vector client
class SchemaRetriever {
constructor(private vectorClient: any) {}
async findRelevantSchema(userQuery: string, topK: number = 4): Promise<TableSchema[]> {
const queryEmbedding = await this.generateEmbedding(userQuery);
const matches = await this.vectorClient.search({
index: 'schema_metadata_index',
vector: queryEmbedding,
limit: topK,
filter: { type: 'table_or_column' }
});
return this.hydrateSchemaFromMatches(matches);
}
private async generateEmbedding(text: string): Promise<number[]> {
// Integration with OpenAI/text-embedding-3-small or equivalent
const response = await fetch('https://api.openai.com/v1/embeddings', {
method: 'POST',
headers: { 'Authorization': `Bearer ${process.env.OPENAI_API_KEY}` },
body: JSON.stringify({ model: 'text-embedding-3-small', input: text })
});
const data = await response.json();
return data.data[0].embedding;
}
}
Why this choice: Vector search reduces context window usage by 60-80% on average. It scales linearly with database growth. The topK parameter allows tuning based on query complexity without bloating prompts.
Step 3: Structured Prompt Assembly
The retrieved schema must be formatted to maximize LLM comprehension. We use a strict template that separates structural definitions from semantic hints. Foreign keys are explicitly declared. Enumerations and units are inline.
class PromptAssembler {
assemble(userQuestion: string, schemaContext: TableSchema[]): string {
const schemaBlock = schemaContext.map(table => {
const colDefs = table.columns.map(col => {
let line = ` - ${col.name} (${col.dataType})`;
if (col.isNullable) line += ', nullable';
if (col.references) line += ` β ${col.references.table}.${col.references.column}`;
if (col.description) line += ` // ${col.description}`;
if (col.enumValues) line += ` [valid: ${col.enumValues.join(', ')}]`;
if (col.unit) line += ` (unit: ${col.unit})`;
return line;
}).join('\n');
return `Table: ${table.name}\nDescription: ${table.description}\nColumns:\n${colDefs}`;
}).join('\n\n');
return `You are a SQL query generator. Use ONLY the provided schema to construct the query.
Do not invent tables or columns. Use explicit JOIN conditions based on foreign key declarations.
### Available Schema
${schemaBlock}
### User Question
${userQuestion}
### Output Format
Return only the SQL query. Wrap in \`\`\`sql blocks.`;
}
}
Why this choice: The template enforces strict boundaries. Inline comments and bracketed enumerations reduce semantic drift. Explicit foreign key arrows (β) give the model a deterministic join path. The output constraint prevents conversational filler.
Step 4: Execution and Validation
Generated SQL should never execute blindly. A lightweight validation layer checks syntax, verifies table/column existence against the injected schema, and enforces read-only execution modes.
class QueryValidator {
validate(sql: string, allowedSchema: TableSchema[]): { valid: boolean; errors: string[] } {
const errors: string[] = [];
const allowedTables = new Set(allowedSchema.map(t => t.name));
const allowedColumns = new Set(allowedSchema.flatMap(t => t.columns.map(c => c.name)));
// Basic structural checks (production systems use sql-parser libraries)
if (!/^\s*SELECT\s/i.test(sql)) errors.push('Query must be read-only SELECT');
const tableMatches = sql.match(/FROM\s+(\w+)/gi);
tableMatches?.forEach(match => {
const table = match.split(/\s+/)[1].toLowerCase();
if (!allowedTables.has(table)) errors.push(`Unauthorized table: ${table}`);
});
return { valid: errors.length === 0, errors };
}
}
Why this choice: Validation catches hallucination before execution. It enforces security boundaries and provides immediate feedback for retry loops.
Pitfall Guide
1. The "Kitchen Sink" Schema Dump
Explanation: Injecting every table definition into every prompt. This floods the context window with irrelevant tokens, degrading the model's attention mechanism and increasing latency.
Fix: Implement retrieval-augmented filtering. Only inject tables and columns that semantically match the query intent. Cache schema embeddings and update them on migration events.
2. Silent Foreign Key Assumptions
Explanation: Relying on the model to infer relationships from column naming patterns (e.g., user_id implies users.id). Models frequently guess incorrectly on composite keys or non-standard naming.
Fix: Explicitly declare foreign key mappings in the schema metadata. Use the references field in your enrichment layer and render it as β parent_table.parent_column in the prompt.
3. Abbreviated Column Naming
Explanation: Short or cryptic column names (ts, amt, st) break semantic alignment. The model must guess whether ts means timestamp, timeseries, or transaction_status.
Fix: Enforce canonical naming conventions at the database level. If legacy abbreviations exist, add a description field that maps the abbreviation to its business meaning. Never rely on the model to reverse-engineer naming conventions.
4. Missing Enum and Value Constraints
Explanation: Columns like status or tier contain discrete values. Without explicit lists, models generate invalid filters like WHERE status = 'active' when the actual value is 'ACTIVE' or 'enabled'.
Fix: Extract enum constraints from database check constraints or application code. Inject them as [valid: value1, value2] in the column definition. This eliminates case-sensitivity and spelling hallucinations.
5. Ignoring Data Type Semantics
Explanation: Numeric columns often store scaled values (e.g., cents instead of dollars, milliseconds instead of seconds). Models unaware of scaling produce mathematically incorrect aggregations.
Fix: Add a unit annotation to relevant columns. Include scaling instructions in the prompt template: Divide monetary columns by 100 for display. This preserves precision while ensuring correct output formatting.
6. Static Schema Caching Without Drift Detection
Explanation: Caching enriched schema metadata indefinitely. Database migrations alter column types, drop tables, or rename fields. Stale metadata causes query failures or silent data corruption.
Fix: Version your schema snapshots. Trigger cache invalidation via database migration hooks or CDC streams. Implement a fallback mechanism that re-enriches on validation failure.
7. Over-Reliance on Single-Pass Generation
Explanation: Expecting the model to generate perfect SQL on the first attempt. Complex queries involving window functions, CTEs, or multi-table aggregations often require iterative refinement.
Fix: Implement a self-correction loop. If validation fails or execution returns zero rows, feed the error message back to the model with the original schema context. Limit retries to 2-3 iterations to control cost.
Production Bundle
Action Checklist
Decision Matrix
| Scenario | Recommended Approach | Why | Cost Impact |
|---|
| Small DB (<30 tables) | Static enriched prompt | Retrieval overhead outweighs benefits; full schema fits comfortably in context | Low (fixed token cost) |
| Medium DB (30-150 tables) | Keyword + vector hybrid filtering | Balances precision with latency; reduces irrelevant tokens by ~60% | Medium (embedding + search calls) |
| Enterprise DB (150+ tables) | Full retrieval-augmented pipeline | Prevents context window saturation; maintains accuracy at scale | High (vector infrastructure + per-query retrieval) |
| Real-time analytics dashboard | Pre-computed schema views + caching | Query patterns are predictable; caching eliminates repeated retrieval | Low (high cache hit rate) |
Configuration Template
// schema-pipeline.config.ts
export const SchemaPipelineConfig = {
enrichment: {
batchSize: 50,
descriptionModel: 'gpt-4o-mini',
fallbackDescriptions: true,
includeSampleRows: 1
},
retrieval: {
vectorModel: 'text-embedding-3-small',
dimension: 1536,
topK: 5,
similarityThreshold: 0.72,
indexName: 'prod_schema_index'
},
prompt: {
maxTokens: 4096,
temperature: 0.1,
enforceReadOnly: true,
includeFKArrows: true,
includeEnumBrackets: true
},
validation: {
allowedDialects: ['postgresql', 'mysql'],
maxExecutionTimeMs: 5000,
blockWriteOperations: true
},
caching: {
ttlSeconds: 3600,
invalidationTriggers: ['migration_complete', 'ddl_change'],
fallbackStrategy: 're_enrich_on_failure'
}
};
Quick Start Guide
- Extract Metadata: Run a script against your database to pull table names, column definitions, data types, and foreign key constraints from
information_schema.
- Enrich & Embed: Pipe the raw metadata through an enrichment service that adds business descriptions and FK mappings. Generate embeddings for each table/column description and store them in a vector database.
- Deploy Retrieval & Assembly: Implement a query handler that embeds the user's natural language input, retrieves the top-K relevant schema fragments, and formats them using the strict prompt template.
- Validate & Execute: Pass the generated SQL through a read-only validator. Execute against a replica database with strict timeout limits. Log queries and accuracy metrics for continuous improvement.