evidence: string;
}
interface SecurityVerdict {
score: number;
level: 'SAFE' | 'SUSPICIOUS' | 'DANGEROUS' | 'CRITICAL';
action: 'ALLOW' | 'WARN' | 'SANITIZE' | 'BLOCK' | 'REQUIRE_HUMAN_APPROVAL';
signals: RiskSignal[];
}
### Step 2: Implement the Weighted Scoring Engine
Instead of binary allow/deny checks, a scoring engine accumulates risk weights across multiple detection patterns. This enables graduated responses and reduces false positives.
```typescript
const RISK_SIGNATURES: Array<{ pattern: RegExp; category: string; weight: number }> = [
{ pattern: /ignore\s+previous\s+instructions?/gi, category: 'PROMPT_INJECTION', weight: 30 },
{ pattern: /reveal\s+(?:api\s+)?keys?|export\s+secrets?/gi, category: 'DATA_EXFILTRATION', weight: 80 },
{ pattern: /[\u200b-\u200d\ufeff]/g, category: 'HIDDEN_CHARACTERS', weight: 20 },
{ pattern: /(?:[A-Za-z0-9+\/]{40,}={0,2})/g, category: 'ENCODED_PAYLOAD', weight: 35 },
];
function evaluateInputRisk(surface: SecuritySurface): SecurityVerdict {
const signals: RiskSignal[] = [];
let cumulativeScore = 0;
for (const sig of RISK_SIGNATURES) {
const matches = surface.payload.match(sig.pattern);
if (matches) {
cumulativeScore += sig.weight;
signals.push({
category: sig.category,
weight: sig.weight,
evidence: `Detected ${matches.length} instance(s) of ${sig.category}`,
});
}
}
const level = cumulativeScore <= 20 ? 'SAFE' :
cumulativeScore <= 50 ? 'SUSPICIOUS' :
cumulativeScore <= 80 ? 'DANGEROUS' : 'CRITICAL';
const action = level === 'SAFE' ? 'ALLOW' :
level === 'SUSPICIOUS' ? 'WARN' :
level === 'DANGEROUS' ? 'SANITIZE' : 'BLOCK';
return { score: cumulativeScore, level, action, signals };
}
Architecture Rationale: Weighted scoring prevents over-blocking. A single injection phrase (+30) triggers a warning, while combined hidden characters (+20) and encoded payloads (+35) push the score to 85, triggering a block. This graduated model preserves usability while maintaining strict boundaries for high-risk inputs.
Step 3: Policy Override & Decision Routing
Business logic must sometimes supersede heuristic scoring. A policy resolver applies environment-specific rules before finalizing the verdict.
function resolveFinalAction(verdict: SecurityVerdict, policy: Record<string, string>): SecurityVerdict {
const policyOverride = policy[verdict.level];
if (policyOverride && policyOverride !== verdict.action) {
verdict.action = policyOverride as SecurityVerdict['action'];
}
return verdict;
}
Why this matters: Production environments require different risk tolerances. A staging environment might downgrade BLOCK to SANITIZE for testing, while a compliance-bound production environment might upgrade WARN to REQUIRE_HUMAN_APPROVAL. Policy overrides decouple security heuristics from business rules.
Step 4: Construct the Sanitization Pipeline
When inputs fall into the DANGEROUS tier, sanitization removes malicious artifacts while preserving legitimate content.
function sanitizePayload(raw: string): string {
return raw
.replace(/<!--[\s\S]*?-->/g, '')
.replace(/[\u200b-\u200d\ufeff]/g, '')
.replace(/(?:[A-Za-z0-9+\/]{40,}={0,2})/g, '[STRIPPED_ENCODING]')
.replace(/ignore\s+previous\s+instructions?/gi, '[FILTERED_DIRECTIVE]')
.trim();
}
Production Tip: Always validate sanitized output against the scoring engine a second time. Some payloads require iterative cleaning, especially when attackers chain multiple obfuscation techniques.
Tools represent the highest-risk surface. A dedicated guard intercepts arguments before execution, enforcing path allowlists, shell pattern blocking, and SSRF prevention.
interface ToolGuardResult {
status: 'ALLOW' | 'BLOCK' | 'REQUIRE_HUMAN_APPROVAL';
reason?: string;
}
function validateToolExecution(toolName: string, args: Record<string, unknown>): ToolGuardResult {
if (toolName === 'system.shell') {
const command = String(args.command ?? '');
const destructivePatterns = /\b(?:rm\s+-rf|mkfs|shutdown|reboot|sudo)\b/;
if (destructivePatterns.test(command)) {
return { status: 'BLOCK', reason: 'Destructive shell command detected' };
}
return { status: 'REQUIRE_HUMAN_APPROVAL', reason: 'Shell execution requires manual review' };
}
if (toolName === 'network.fetch') {
const target = String(args.url ?? '');
if (/^(?:http:\/\/localhost|127\.0\.0\.1|169\.254\.169\.254)/.test(target)) {
return { status: 'BLOCK', reason: 'SSRF target blocked' };
}
}
return { status: 'ALLOW' };
}
Architecture Decision: Tool guards operate independently from prompt scoring. A safe prompt can still trigger dangerous tool calls. Separating these concerns prevents lateral movement attacks that bypass text-based filters.
Pitfall Guide
1. Regex Overconfidence
Explanation: Relying solely on static regular expressions fails against obfuscated payloads, character substitution, or multi-stage attacks.
Fix: Normalize input (lowercase, strip whitespace variants), apply entropy checks for encoded strings, and use multi-stage scanning that validates context before flagging.
2. Hardcoded Thresholds
Explanation: Fixed scoring boundaries (0-20, 21-50, etc.) break when threat landscapes shift or when business requirements change.
Fix: Externalize thresholds to environment configuration. Implement dynamic recalibration based on historical false positive rates.
3. Ignoring Contextual Vectors
Explanation: Scanning only user prompts misses poisoned memory retrievals, compromised tool outputs, or injected system instructions.
Fix: Intercept every data surface in the agent loop. Apply the same scoring engine to memory chunks, RAG results, and intermediate messages.
4. Blocking Legitimate Encoded Data
Explanation: Base64 or hex strings are common in API responses, file uploads, and cryptographic operations. Blanket blocking causes operational failures.
Fix: Combine pattern matching with entropy analysis. Legitimate base64 typically exhibits high entropy but predictable structure. Flag only when combined with injection context or excessive length.
5. Synchronous Blocking in Async Flows
Explanation: Halting the entire agent loop for security checks introduces latency spikes and degrades conversational UX.
Fix: Implement non-blocking WARN and SANITIZE paths. Queue REQUIRE_HUMAN_APPROVAL requests asynchronously with TTL expiration and fallback routing.
6. Missing Human-in-the-Loop Fallback
Explanation: Hard BLOCK actions on critical scores can deadlock workflows, especially in autonomous or semi-autonomous agents.
Fix: Route critical scores to an approval queue with multi-channel notifications (Telegram, Slack, webhooks). Implement automatic escalation if no response occurs within a defined window.
7. Audit Trail Gaps
Explanation: Security middleware that doesn't log decisions fails compliance requirements and prevents post-incident analysis.
Fix: Emit structured logs for every verdict. Include trace IDs, surface type, raw payload hash, score breakdown, applied policy, and final action. Forward to SIEM or immutable storage.
Production Bundle
Action Checklist
Decision Matrix
| Scenario | Recommended Approach | Why | Cost Impact |
|---|
| Internal developer tooling | SANITIZE + WARN | Preserves workflow speed while catching obvious abuse | Low |
| Public-facing customer agent | REQUIRE_HUMAN_APPROVAL for CRITICAL | Prevents brand damage from unfiltered outputs | Medium (approval queue infra) |
| High-compliance financial agent | BLOCK + IMMEDIATE_LOG | Zero-tolerance policy required by regulations | High (strict false positive tuning) |
| Multi-agent orchestration | Policy Bundles + Surface-Specific Rules | Different agents handle different risk profiles | Medium (config management overhead) |
Configuration Template
security_middleware:
scoring:
thresholds:
safe: 20
suspicious: 50
dangerous: 80
critical: 100
signatures:
- category: PROMPT_INJECTION
weight: 30
pattern: "ignore\\s+previous\\s+instructions?"
- category: DATA_EXFILTRATION
weight: 80
pattern: "reveal\\s+(?:api\\s+)?keys?"
- category: HIDDEN_CHARACTERS
weight: 20
pattern: "[\\u200b-\\u200d\\ufeff]"
- category: ENCODED_PAYLOAD
weight: 35
pattern: "(?:[A-Za-z0-9+\\/]{40,}={0,2})"
policy_overrides:
SAFE: "ALLOW"
SUSPICIOUS: "WARN"
DANGEROUS: "SANITIZE"
CRITICAL: "BLOCK"
tool_guards:
system.shell:
block_patterns: ["rm\\s+-rf", "mkfs", "shutdown", "reboot", "sudo"]
default_action: "REQUIRE_HUMAN_APPROVAL"
network.fetch:
block_targets: ["localhost", "127.0.0.1", "169.254.169.254"]
default_action: "ALLOW"
human_approval:
channel: "telegram"
ttl_minutes: 15
fallback_action: "BLOCK"
observability:
log_level: "INFO"
include_raw_payload: false
trace_header: "x-security-trace-id"
siem_endpoint: "/api/v1/audit"
Quick Start Guide
- Install dependencies: Add the middleware package to your agent runtime. Ensure TypeScript 5.0+ and Node.js 18+ are active.
- Configure environment variables: Set
SECURITY_THRESHOLDS_ENV (dev/staging/prod), APPROVAL_CHANNEL_TOKEN, and SIEM_FORWARD_URL. Copy the configuration template and adjust weights to match your risk tolerance.
- Register middleware hooks: Attach the scoring engine to your agent's input/output pipeline. Wrap all external tool definitions with the execution guard. Verify that memory retrieval and RAG chunks pass through the same evaluation contract.
- Validate with adversarial inputs: Run the built-in test suite against injection phrases, encoded payloads, and shell commands. Confirm that scores align with expected tiers and that sanitization preserves legitimate content.
- Enable audit logging: Deploy structured logging with trace correlation. Forward verdicts to your observability stack. Verify that human approval requests route correctly and that TTL expiration triggers fallback actions.
This middleware architecture transforms agent security from an afterthought into a deterministic, auditable, and production-ready control plane. By decoupling scoring from policy, enforcing graduated remediation, and maintaining full pipeline visibility, teams can deploy autonomous systems without sacrificing safety or compliance.