g;
fallbackModel: string;
maxRemediationAttempts: number;
}
/**
 * Routing table keyed by security domain.
 *
 * Each entry names the model tried first (`primaryModel`), the model used as
 * a fallback (`fallbackModel`), and the cap on remediation attempts for that
 * domain (`maxRemediationAttempts`).
 */
const DOMAIN_ROUTING: Record<SecurityDomain, ModelRoutingConfig> = {
  database: {
    primaryModel: 'haiku-4.5',
    fallbackModel: 'gemini-2.5-pro',
    maxRemediationAttempts: 2,
  },
  authentication: {
    primaryModel: 'haiku-4.5',
    fallbackModel: 'opus-4.6',
    maxRemediationAttempts: 3,
  },
  'file-io': {
    primaryModel: 'gemini-2.5-pro',
    fallbackModel: 'opus-4.6',
    maxRemediationAttempts: 2,
  },
  'command-execution': {
    primaryModel: 'haiku-4.5',
    fallbackModel: 'sonnet-4.5',
    maxRemediationAttempts: 1,
  },
  configuration: {
    primaryModel: 'gemini-2.5-flash',
    fallbackModel: 'opus-4.6',
    maxRemediationAttempts: 2,
  },
};
**Rationale:** Routing by domain prevents the "one model fits all" anti-pattern. Haiku 4.5 excels at generating minimal database and authentication code, while Gemini 2.5 Pro handles complex file I/O patterns better. Opus 4.6 is reserved for high-stakes remediation where fix rate matters more than initial generation cleanliness.
### Step 2: Generation and Static Analysis Integration
The generation engine produces the initial code artifact, which is immediately passed to a security linter. The linter returns structured violations that feed the remediation loop.
```typescript
/**
 * One finding reported by the security linter for a piece of generated code.
 */
interface SecurityViolation {
/** Rule identifier; rendered as the label of each bullet in the remediation prompt. */
ruleId: string;
/** Severity bucket; only these three levels are representable. */
severity: 'critical' | 'high' | 'medium';
/** Pair of line numbers — presumably [start, end] of the offending span; TODO confirm base and inclusivity. */
lineRange: [number, number];
/** Human-readable explanation; rendered after the rule id in the remediation prompt. */
description: string;
}
/**
 * A code artifact together with the linter findings for it.
 * This is the shape `RemediationEngine.fix` resolves with.
 */
interface GenerationResult {
/** The code text returned by the model. */
code: string;
/** Security domain the code was generated or remediated for. */
domain: SecurityDomain;
/** Name of the model that produced `code`. */
model: string;
/** Violations the linter reported for `code`; an empty array means none were reported. */
violations: SecurityViolation[];
/** When the result object was created. */
timestamp: Date;
}
/**
 * Static-analysis entry point for generated code.
 *
 * This is a placeholder: it reports no violations for any input.
 */
class SecurityLinter {
  /**
   * Scans `code` and resolves with the violations found.
   *
   * @param code - Source text to analyze (ignored by this placeholder).
   * @returns A promise resolving to an empty list.
   */
  async analyze(code: string): Promise<SecurityViolation[]> {
    // A production implementation would delegate to ESLint, Semgrep, or
    // custom AST scanners and map their output to domain-specific rules.
    const violations: SecurityViolation[] = [];
    return violations;
  }
}
```

**Rationale:** Static analysis must run immediately after generation, as part of the same pipeline step. Delaying security checks until code review creates technical debt and breaks the feedback loop. The linter output must be machine-readable to enable automated remediation.
### Step 3: Automated Remediation Loop
When violations are detected, the pipeline constructs a targeted prompt that includes the original code, the specific rule violations, and domain context. The model is asked to restructure the code without altering the public API.
/**
 * Input to `RemediationEngine.fix`: the code to repair plus what is wrong with it.
 */
interface RemediationRequest {
/** Code to be fixed; embedded verbatim in the remediation prompt. */
originalCode: string;
/** Violations to fix; each is listed in the prompt as `- ruleId: description`. */
violations: SecurityViolation[];
/** Security domain; written into the prompt and copied onto the result. */
domain: SecurityDomain;
/** Name of the model the prompt is sent to; copied onto the result as `model`. */
targetModel: string;
}
/**
 * Runs a single remediation pass: builds a prompt from the flagged code and
 * its violations, asks the target model for a corrected version, then
 * re-lints that version so the caller can see what is still outstanding.
 *
 * The linter and the model transport are injected through the constructor;
 * `fix` relies on both and the class has no other way to obtain them.
 */
class RemediationEngine {
  /**
   * @param linter - Analyzer used to re-scan the model's output. Any object
   *   exposing `analyze(code)` is accepted (for example a `SecurityLinter`).
   * @param callModel - Sends `prompt` to the model named `model` and resolves
   *   with the text the model returned.
   */
  constructor(
    private readonly linter: { analyze(code: string): Promise<SecurityViolation[]> },
    private readonly callModel: (model: string, prompt: string) => Promise<string>,
  ) {}

  /**
   * Requests a fix from `request.targetModel` and lints the response.
   *
   * @param request - Code, violations, domain and target model for this pass.
   * @returns The model's code with the violations the linter reports for it.
   *   A non-empty `violations` array means the fix was incomplete.
   * @throws Propagates any rejection from `callModel` or `linter.analyze`.
   */
  async fix(request: RemediationRequest): Promise<GenerationResult> {
    const prompt = this.buildRemediationPrompt(request);
    const fixedCode = await this.callModel(request.targetModel, prompt);
    // Re-lint the model output rather than trusting that the fix worked.
    const newViolations = await this.linter.analyze(fixedCode);
    return {
      code: fixedCode,
      domain: request.domain,
      model: request.targetModel,
      violations: newViolations,
      timestamp: new Date(),
    };
  }

  /**
   * Renders the remediation prompt: domain, original code, one bullet per
   * violation, and the fixed instruction list.
   */
  private buildRemediationPrompt(req: RemediationRequest): string {
    const violationLines = req.violations
      .map((v) => `- ${v.ruleId}: ${v.description}`)
      .join('\n');

    // Assembled line by line so source indentation cannot leak into the
    // prompt. The empty first and last entries keep the leading and trailing
    // newline of the prompt text.
    return [
      '',
      `Domain: ${req.domain}`,
      'Original Code:',
      req.originalCode,
      'Security Violations:',
      violationLines,
      'Instructions:',
      '1. Fix all listed violations',
      '2. Preserve the original function signature',
      '3. Do not introduce new dependencies',
      '4. Return only the corrected code block',
      '',
    ].join('\n');
  }
}
Rationale: Remediation prompts must be explicit about constraints. AI models tend to refactor aggressively when given vague instructions. By locking the function signature and forbidding new dependencies, we ensure the fix integrates cleanly into existing codebases.
### Step 4: Net Risk Calculation
The pipeline tracks initial vulnerability rate, fix rate, and residual risk. This metric replaces aggregate benchmarks as the primary success indicator.
/**
 * Summary metrics for one generation-and-remediation cycle.
 * As produced by `RiskCalculator.calculate`, every value is a percentage
 * (ratio multiplied by 100) rounded to one decimal place.
 */
interface SecurityMetrics {
/** Initial violations relative to the total number of functions. */
initialVulnRate: number;
/** Fixed violations relative to the initial violations. */
fixRate: number;
/** Violations still unfixed relative to the total number of functions. */
netRemainingRisk: number;
}
/**
 * Turns raw violation counts into the percentage metrics the pipeline tracks.
 */
class RiskCalculator {
  /**
   * Computes initial vulnerability rate, fix rate and net remaining risk.
   *
   * Each value is a percentage rounded to one decimal place. A ratio whose
   * denominator is zero is reported as 0 instead of `NaN`/`Infinity`:
   * with no functions there is nothing at risk, and with no initial
   * violations no fixes were attempted.
   *
   * @param initialViolations - Violations found before remediation.
   * @param fixedViolations - Violations resolved by remediation.
   * @param totalFunctions - Number of functions that were generated.
   * @returns The three metrics on a 0–100 percentage scale.
   */
  calculate(initialViolations: number, fixedViolations: number, totalFunctions: number): SecurityMetrics {
    const remaining = initialViolations - fixedViolations;
    return {
      initialVulnRate: RiskCalculator.toPercent(initialViolations, totalFunctions),
      fixRate: RiskCalculator.toPercent(fixedViolations, initialViolations),
      netRemainingRisk: RiskCalculator.toPercent(remaining, totalFunctions),
    };
  }

  /** Returns `numerator / denominator` as a one-decimal percentage, or 0 when the denominator is zero. */
  private static toPercent(numerator: number, denominator: number): number {
    if (denominator === 0) {
      return 0;
    }
    return parseFloat(((numerator / denominator) * 100).toFixed(1));
  }
}
Rationale: Net remaining risk is the only metric that correlates with production security posture. It accounts for both generation quality and remediation effectiveness, providing a realistic view of what actually ships to staging.
Pitfall Guide
1. Chasing the Lowest Initial Vulnerability Rate
Explanation: Teams often select models based on clean initial generation, assuming fewer flags equals better security. This ignores remediation capacity. Haiku 4.5 generates the fewest initial violations but only fixes 38.2% of them, leaving a 30.0% net risk.
Fix: Evaluate models using net remaining risk after a full generation-and-remediation cycle. Prioritize fix rate over initial cleanliness for security-critical domains.
2. Ignoring Architectural Constraints in File I/O and Command Execution
Explanation: File operations with dynamic filenames and shell command execution inherently trigger static analysis rules like detect-non-literal-fs-filename and detect-child-process. These are not model failures; they are architectural constraints. Expecting 0% violation rates in these domains is unrealistic.
Fix: Apply domain-specific tolerance thresholds. For command execution, rely on runtime sandboxing and least-privilege execution rather than AI remediation. For file I/O, implement allowlist-based path resolution outside the AI pipeline.
3. Leaving the Remediation Loop Manual
Explanation: Many teams run AI generation, manually review flags, and discard or rewrite flagged code. This breaks the feedback loop and wastes the model's contextual understanding.
Fix: Automate the remediation loop. Feed ESLint or Semgrep violations directly back to the same model with structured instructions. Track fix rates per domain to identify which models handle which violation types best.
4. Overfitting to a Single Model’s Code Style
Explanation: Heavy models like Gemini 2.5 Pro generate complex, production-ready patterns that trigger more rules initially but provide deeper semantic context for fixes. Lightweight models produce minimal code that is easier to generate but harder to restructure securely.
Fix: Route prompts by domain complexity. Use lightweight models for simple configuration and authentication scaffolding. Reserve heavy models for database operations and complex file handling where architectural depth matters.
5. Skipping Domain-Specific Rule Tuning
Explanation: Applying a uniform security rule set across all domains creates false positives. Database rules like pg/no-select-all are irrelevant to authentication, while JWT payload rules don't apply to command execution.
Fix: Maintain separate rule configurations per domain. Load only relevant rules during static analysis to reduce noise and improve remediation accuracy. Tune thresholds based on domain risk profiles.
6. Underestimating Remediation Variance in Authentication
Explanation: Authentication remediation shows extreme variance. Opus 4.6 achieves 100% fix rate on authentication vulnerabilities, while Gemini 2.5 Flash only fixes 25%. Teams that route auth prompts to the wrong model will accumulate technical debt.
Fix: Lock authentication domains to models with proven remediation dominance. Implement manual review gates for JWT generation and password hashing, regardless of AI fix rates.
7. Routing Every Prompt to Premium Models
Explanation: Heavy models cost significantly more per token. Routing every prompt to Opus 4.6 or Gemini 2.5 Pro inflates infrastructure costs without proportional security gains in low-risk domains.
Fix: Implement cost-aware routing. Use lightweight models for configuration and simple database queries. Reserve premium models for authentication remediation and complex command execution fallbacks.
Production Bundle
Action Checklist
Decision Matrix
| Scenario | Recommended Approach | Why | Cost Impact |
|---|---|---|---|
| Authentication-heavy workloads | Route to Opus 4.6 for remediation | 100% fix rate on JWT and password vulnerabilities | High token cost, but reduces audit remediation overhead |
| Database CRUD operations | Use Haiku 4.5 for generation, fallback to Gemini 2.5 Pro | Minimal initial flags, Pro handles complex query restructuring | Low generation cost, moderate remediation cost |
| File I/O with dynamic paths | Accept architectural constraints, route to Gemini 2.5 Pro | 86% initial rate is best available; Pro adds path validation | Moderate cost, requires runtime sandboxing |
| Configuration and secrets | Route to Gemini 2.5 Flash | 21% initial rate, 100% remediation fix rate | Lowest cost, highest reliability |
| Command execution | Use Haiku 4.5, enforce runtime restrictions | AI cannot reliably fix shell access; 0% fix rate on Pro | Low generation cost, high operational control needed |
Configuration Template
# ai-security-pipeline.config.yml
#
# Pipeline configuration: per-domain model routing, linter rule sets, and
# metric/alert settings.

# Model routing per security domain. `primary` is tried first, `fallback` is
# the alternate model, `max_retries` caps remediation attempts.
routing:
  database:
    primary: haiku-4.5
    fallback: gemini-2.5-pro
    max_retries: 2
  authentication:
    primary: haiku-4.5
    fallback: opus-4.6
    max_retries: 3
  file-io:
    primary: gemini-2.5-pro
    fallback: opus-4.6
    max_retries: 2
  command-execution:
    primary: haiku-4.5
    fallback: sonnet-4.5
    max_retries: 1
  configuration:
    primary: gemini-2.5-flash
    fallback: opus-4.6
    max_retries: 2

# Static-analysis settings. Each domain loads only its own rule set.
security:
  linter: semgrep
  rule_sets:
    database: [pg/no-select-all, pg/no-unsafe-query, pg/prefer-pool-query]
    authentication: [jwt/no-sensitive-payload, jwt/algorithm-whitelist, bcrypt/rounds-min]
    file-io: [detect-non-literal-fs-filename, no-arbitrary-file-access]
    command-execution: [detect-child-process, no-shell-injection]
    configuration: [no-hardcoded-credentials, no-unsafe-deserialization]

metrics:
  track_net_risk: true
  # NOTE(review): presumably a fraction (0.35 = 35%). RiskCalculator reports
  # netRemainingRisk as a percentage, so put both on the same scale before
  # comparing — TODO confirm intended unit.
  alert_threshold: 0.35
  log_remediation_loops: true
Quick Start Guide
- Initialize the pipeline: Clone the routing and remediation modules, configure your API keys for the five target models, and load the YAML configuration template.
- Define your domain map: Tag incoming prompts with one of the five security domains. Use keyword matching or a lightweight classifier to automate this step.
- Run generation + analysis: Execute the primary model for the domain, pipe the output through your configured linter, and capture structured violations.
- Trigger remediation: If violations exist, route the code and violations to the fallback model. Repeat until max retries are reached or violations drop below threshold.
- Calculate net risk: Use the risk calculator to determine residual vulnerability percentage. Log results and adjust routing tables based on observed fix rates.