bles[tableName];
return columns?.has(columnName) ?? false;
}
private resolveAlias(alias: string): string {
// Maps query aliases to actual table names
return this.aliases[alias] || alias;
}
}
**Rationale:** Hardcoding schema checks into review processes is error-prone. A centralized registry enables deterministic validation. By resolving aliases first, the validator handles complex queries with multiple table references without false positives.
### Step 2: AST Parsing & Join Topology Validation
Join mistakes are the primary source of silent data distortion. Missing `ON` clauses produce Cartesian products. Incorrect join types drop or duplicate rows. Parsing the query into an Abstract Syntax Tree (AST) allows programmatic inspection of join relationships.
```typescript
interface JoinNode {
type: 'INNER' | 'LEFT' | 'RIGHT' | 'FULL';
leftTable: string;
rightTable: string;
onCondition: string | null;
}
class JoinValidator {
validateTopology(joins: JoinNode[]): ValidationResult {
const errors: string[] = [];
joins.forEach(join => {
if (!join.onCondition) {
errors.push(`Missing ON clause in ${join.type} JOIN between ${join.leftTable} and ${join.rightTable}`);
}
const fkExists = this.verifyForeignKey(join.leftTable, join.rightTable, join.onCondition!);
if (!fkExists) {
errors.push(`Invalid join key in ${join.leftTable} -> ${join.rightTable}`);
}
});
return { isValid: errors.length === 0, errors };
}
}
Rationale: String matching fails on complex queries. AST parsing isolates join nodes, enabling precise validation of foreign key relationships and join cardinality. This catches accidental cross joins and mismatched key types before execution.
Step 3: Aggregation Scope Enforcement
AI frequently confuses row-level filtering (WHERE) with group-level filtering (HAVING). It also omits non-aggregated columns from GROUP BY clauses, violating SQL standard compliance.
class AggregationValidator {
validateSelectClause(selectColumns: string[], groupByColumns: string[]): ValidationResult {
const errors: string[] = [];
selectColumns.forEach(col => {
const isAggregate = /^(COUNT|SUM|AVG|MIN|MAX)\(/i.test(col);
const isGrouped = groupByColumns.some(g => col.includes(g));
if (!isAggregate && !isGrouped) {
errors.push(`Column ${col} must appear in GROUP BY or be wrapped in an aggregate function`);
}
});
return { isValid: errors.length === 0, errors };
}
}
Rationale: Enforcing aggregation rules at parse time prevents runtime errors in strict SQL dialects (PostgreSQL, SQL Server) and ensures consistent behavior across databases. This step also flags misplaced HAVING conditions that reference non-aggregated columns.
Step 4: Tenant Scoping Injection
Multi-tenant applications require mandatory row-level filtering. AI has no context for tenant boundaries, making this the highest-risk validation step.
class SecurityInjector {
injectTenantFilter(query: string, tenantId: string): string {
const hasTenantFilter = /WHERE\s+.*tenant_id\s*=/i.test(query);
if (hasTenantFilter) return query;
const whereIndex = query.search(/WHERE\s+/i);
if (whereIndex !== -1) {
return query.replace(/WHERE\s+/i, `WHERE tenant_id = '${tenantId}' AND `);
}
const groupIndex = query.search(/GROUP\s+BY/i);
if (groupIndex !== -1) {
return query.replace(/GROUP\s+BY/i, `WHERE tenant_id = '${tenantId}' GROUP BY`);
}
return `${query} WHERE tenant_id = '${tenantId}'`;
}
}
Rationale: Relying on developers to manually add tenant filters is a security anti-pattern. Automated injection guarantees that every query respects data isolation boundaries. The injector prioritizes existing WHERE clauses to avoid syntax conflicts.
Step 5: Staging Execution & Delta Validation
Never execute unverified queries against production. A staging environment with representative data enables safe validation.
class ExecutionGuard {
async validateAgainstStaging(query: string, expectedRowCount: number): Promise<boolean> {
const stagingResult = await this.db.staging.execute(query);
const actualCount = stagingResult.rows.length;
const tolerance = 0.15; // 15% variance allowed for dynamic data
const isWithinTolerance = Math.abs(actualCount - expectedRowCount) / expectedRowCount <= tolerance;
if (!isWithinTolerance) {
console.warn(`Row count mismatch: expected ~${expectedRowCount}, got ${actualCount}`);
return false;
}
return true;
}
}
Rationale: Row count delta checking catches logical errors that pass syntax validation. A 15% tolerance accounts for natural data drift while flagging catastrophic join multiplication or missing filters.
Step 6: Transaction Wrapping for Mutations
Write operations require blast-radius containment. AI-generated UPDATE, INSERT, or DELETE statements must never execute without explicit preview.
class MutationGuard {
async dryRunWrite(query: string): Promise<{ affectedRows: number; sample: any[] }> {
await this.db.execute('BEGIN');
const countQuery = query.replace(/^(UPDATE|INSERT|DELETE)/i, 'SELECT COUNT(*) FROM').replace(/SET\s+.*/i, '');
const countResult = await this.db.execute(countQuery);
const sampleQuery = query.replace(/^(UPDATE|INSERT|DELETE)/i, 'SELECT * FROM').replace(/SET\s+.*/i, '').replace(/;$/, ' LIMIT 10');
const sampleResult = await this.db.execute(sampleQuery);
await this.db.execute('ROLLBACK');
return {
affectedRows: countResult.rows[0].count,
sample: sampleResult.rows
};
}
}
Rationale: Transaction wrapping with immediate rollback provides a safe preview mechanism. Developers can inspect affected row counts and sample data before committing. This eliminates accidental mass updates or deletions.
Pitfall Guide
1. The Plausible Result Fallacy
Explanation: Developers accept query output because the numbers "look reasonable." AI generates statistically likely results, not necessarily correct ones. A revenue dashboard showing $1.2M instead of $1.4M may pass casual inspection until audit season.
Fix: Implement deterministic validation against known test datasets. Never trust visual plausibility over schema-enforced correctness.
2. Implicit Cartesian Joins
Explanation: Missing ON clauses or malformed join conditions produce cross joins. The query executes successfully but multiplies row counts exponentially, corrupting aggregations.
Fix: Enforce mandatory ON clause validation in the AST parser. Flag any JOIN without an explicit condition as a critical error.
3. Aggregation Scope Confusion
Explanation: AI frequently places aggregate filters in WHERE clauses or omits non-aggregated columns from GROUP BY. This causes syntax errors in strict dialects or silent misaggregation in lenient ones.
Fix: Validate SELECT columns against GROUP BY lists. Rewrite WHERE conditions referencing aggregates to HAVING automatically during validation.
4. NULL Equality Blind Spots
Explanation: The != operator excludes NULL values silently. Queries filtering optional fields (e.g., discount_code != 'PROMO') drop rows where the column is NULL, skewing results.
Fix: Inject OR column IS NULL clauses automatically when validating inequality filters on nullable columns. Document this behavior in team SQL standards.
5. Cross-Tenant Data Bleed
Explanation: AI has no awareness of multi-tenant boundaries. Queries generated for "top performing pages" or "recent transactions" return global datasets unless explicitly scoped.
Fix: Treat tenant scoping as non-negotiable. Implement automated WHERE tenant_id = ? injection at the query builder layer. Never allow raw AI output to bypass this step.
6. Unbounded Production Reads
Explanation: Testing AI queries directly on production databases without LIMIT clauses can trigger full table scans, lock contention, or memory exhaustion on large datasets.
Fix: Enforce LIMIT 100 during validation. Strip the limit only after staging verification passes. Use read replicas for all AI-generated query testing.
7. Schema Drift Ignorance
Explanation: AI models are trained on static datasets. They cannot detect recent column renames, table deprecations, or index changes. Queries that worked last month may silently break after schema migrations.
Fix: Tie validation to a live schema registry updated via CI/CD pipelines. Reject queries referencing deprecated tables or renamed columns automatically.
Production Bundle
Action Checklist
Decision Matrix
| Scenario | Recommended Approach | Why | Cost Impact |
|---|
| Internal reporting dashboard | Schema-Guarded Pipeline + Staging Validation | Balances speed with data accuracy; prevents silent metric drift | Low infrastructure cost, moderate dev time |
| Multi-tenant SaaS analytics | Automated RLS Injection + Tenant Scoping Enforcement | Eliminates cross-tenant data leakage; meets compliance requirements | High compliance value, low runtime overhead |
| Ad-hoc data exploration | Raw AI Output + Manual Review + LIMIT 100 | Maximizes flexibility for non-critical queries; contains blast radius | Zero infrastructure cost, high human review time |
| Production write operations | Transaction Wrapping + Dry-Run Preview + Approval Gate | Prevents accidental mass mutations; enables rollback safety | Critical risk mitigation, minimal performance impact |
| Legacy schema with drift | Live Schema Registry + CI/CD Validation Hook | Catches deprecated references before deployment; maintains compatibility | Moderate pipeline complexity, high stability gain |
Configuration Template
// query-validator.config.ts
export const validationPipeline = {
schemaRegistry: {
source: 'live', // 'live' | 'snapshot'
refreshInterval: 300000, // 5 minutes
tables: ['tenant_profiles', 'billing_invoices', 'usage_events', 'audit_logs']
},
joinValidation: {
enforceOnClause: true,
validateForeignKey: true,
allowCrossJoins: false
},
aggregationRules: {
strictGroupBy: true,
rewriteHaving: true,
allowImplicitCasts: false
},
security: {
tenantColumn: 'tenant_id',
injectMandatoryFilter: true,
bypassForAdmin: false
},
execution: {
stagingEndpoint: 'postgresql://staging-db:5432/analytics',
rowTolerance: 0.15,
maxExecutionTime: 5000,
writeDryRun: true
}
};
Quick Start Guide
- Initialize Schema Registry: Connect the validator to your database's information schema or ORM metadata. Run
npx codcompass init-validator --schema-source=live to generate the initial registry.
- Configure Validation Rules: Copy the configuration template above and adjust table names, tenant columns, and tolerance thresholds to match your environment.
- Integrate Query Interceptor: Wrap your database client with the validation pipeline. Route all AI-generated queries through
QueryValidator.validate() before execution.
- Deploy Staging Mirror: Provision a read-only staging database with anonymized production data. Configure the execution guard to route validation queries to this endpoint.
- Enforce Pipeline in CI/CD: Add a pre-commit hook that rejects PRs containing raw SQL without validation wrapper calls. Require staging delta approval for all write operations.