: string;
description: string;
parameters: Record<string, unknown>;
handler: (args: TArgs) => Promise<TResult>;
requiresApproval: boolean;
}
/**
 * Tool definition that flags expired rows of the `audit_logs` table as archived.
 *
 * The handler enforces its own limits instead of trusting the declared
 * `parameters` schema:
 *  - a `retentionDays` value that is not a finite number is rejected, because
 *    `NaN` would otherwise slip through the min/max clamp and produce an
 *    invalid cutoff date;
 *  - any finite value is clamped to the inclusive range 1–90 days.
 *
 * Rows with `status = 'active'` and `created_at` older than the cutoff are
 * updated to `status = 'archived'`, and `archived_at` records the moment the
 * archive ran (not the cutoff itself).
 *
 * Resolves to `{ archivedCount }`, the number of rows reported by the driver
 * (0 when the driver reports no row count).
 *
 * @throws RangeError when `retentionDays` is not a finite number.
 */
const archiveExpiredLogs: ToolDefinition<{ retentionDays: number }, { archivedCount: number }> = {
  name: "archive_expired_logs",
  description: "Moves log entries older than the specified threshold to cold storage. Only affects the audit_logs table.",
  parameters: {
    retentionDays: { type: "number", minimum: 1, maximum: 90 }
  },
  requiresApproval: true,
  handler: async ({ retentionDays }) => {
    // Math.min/Math.max propagate NaN, so the clamp alone cannot be trusted
    // with malformed input.
    if (typeof retentionDays !== "number" || !Number.isFinite(retentionDays)) {
      throw new RangeError(
        `retentionDays must be a finite number, received: ${String(retentionDays)}`
      );
    }

    const safeDays = Math.min(Math.max(retentionDays, 1), 90);

    // Take one clock reading so the cutoff and the archive timestamp agree.
    const now = Date.now();
    const cutoff = new Date(now - safeDays * 24 * 60 * 60 * 1000);
    const archivedAt = new Date(now);

    const result = await db.query(
      `UPDATE audit_logs SET status = 'archived', archived_at = $1 WHERE status = 'active' AND created_at < $2`,
      [archivedAt, cutoff]
    );
    return { archivedCount: result.rowCount ?? 0 };
  }
};
**Architecture Rationale:** Hard-capping parameters at the handler level prevents the agent from bypassing constraints through creative prompting. The `requiresApproval` flag decouples safety policy from business logic, enabling centralized governance.
### 2. Implementation-Enforced Execution Gates
Destructive or irreversible operations must never execute without explicit authorization. A confirmation gate should intercept the tool call, serialize the intended action, and route it through an approval workflow before the handler is invoked.
```typescript
/** Possible states of an approval request: awaiting a decision, approved, or rejected. */
type ApprovalStatus = "pending" | "approved" | "rejected";
/**
 * Construction options for {@link ExecutionGateway}. Every field is optional,
 * so `new ExecutionGateway()` keeps the original defaults.
 */
interface ExecutionGatewayOptions {
  /** Milliseconds to wait for a decision before the request is settled automatically. Defaults to 300_000 (5 minutes). */
  approvalTimeoutMs?: number;
  /** When `true` (the default) a timed-out request counts as rejected; when `false` it counts as approved. */
  defaultRejectOnTimeout?: boolean;
  /** Optional routing hint; when set it is forwarded as `approvalChannel` in the emitted request payload. */
  approvalChannel?: string;
}

/**
 * Intercepts tool calls that require approval, publishes an approval request
 * on the event bus, and only invokes the tool handler once the request has
 * been approved.
 */
class ExecutionGateway {
  private static readonly DEFAULT_TIMEOUT_MS = 300_000;

  private readonly approvalQueue: Map<
    string,
    {
      status: ApprovalStatus;
      resolve: (v: boolean) => void;
      timer: ReturnType<typeof setTimeout>;
    }
  > = new Map();

  private readonly approvalTimeoutMs: number;
  private readonly defaultRejectOnTimeout: boolean;
  private readonly approvalChannel: string | undefined;

  /**
   * @param options Optional overrides; see {@link ExecutionGatewayOptions}.
   *   A timeout that is negative or not a finite number falls back to the default.
   */
  constructor(options: ExecutionGatewayOptions = {}) {
    const requestedTimeout = options.approvalTimeoutMs;
    this.approvalTimeoutMs =
      typeof requestedTimeout === "number" && Number.isFinite(requestedTimeout) && requestedTimeout >= 0
        ? requestedTimeout
        : ExecutionGateway.DEFAULT_TIMEOUT_MS;
    this.defaultRejectOnTimeout = options.defaultRejectOnTimeout ?? true;
    this.approvalChannel = options.approvalChannel;
  }

  /**
   * Publishes an `agent:approval_request` event and waits for a decision.
   *
   * @param toolName Name of the tool awaiting approval.
   * @param args Arguments the tool would be invoked with.
   * @returns `true` when approved, `false` when rejected. On timeout the result
   *   is `false` unless `defaultRejectOnTimeout` was explicitly disabled.
   * @throws Whatever the event bus throws while emitting; the pending entry
   *   and its timer are cleaned up first.
   */
  async requestApproval(toolName: string, args: Record<string, unknown>): Promise<boolean> {
    const requestId = crypto.randomUUID();
    const payload = {
      toolName,
      args,
      requestId,
      timestamp: new Date().toISOString(),
      ...(this.approvalChannel !== undefined ? { approvalChannel: this.approvalChannel } : {})
    };

    // Register the pending entry BEFORE emitting: a decision delivered while
    // the emit is still in flight must find the entry, otherwise it would be
    // dropped and the request would later time out.
    const decision = new Promise<boolean>((resolve) => {
      const timer = setTimeout(() => {
        // delete() reports whether the entry was still pending.
        if (this.approvalQueue.delete(requestId)) {
          resolve(!this.defaultRejectOnTimeout);
        }
      }, this.approvalTimeoutMs);
      this.approvalQueue.set(requestId, { status: "pending", resolve, timer });
    });

    try {
      // Emit to external approval system (Slack, UI, webhook, etc.)
      await eventBus.emit("agent:approval_request", payload);
    } catch (error) {
      // Nobody was notified, so nobody can answer: drop the entry and timer.
      const orphan = this.approvalQueue.get(requestId);
      if (orphan) {
        clearTimeout(orphan.timer);
        this.approvalQueue.delete(requestId);
      }
      throw error;
    }

    return decision;
  }

  /**
   * Records the decision for a pending request and releases its waiter.
   * Unknown or already-settled request ids are ignored.
   *
   * @param requestId Id that was published in the approval request payload.
   * @param approved Whether the request was approved.
   */
  async processApproval(requestId: string, approved: boolean): Promise<void> {
    const entry = this.approvalQueue.get(requestId);
    if (!entry) {
      return;
    }
    // Cancel the timeout so it neither fires later nor keeps the process alive.
    clearTimeout(entry.timer);
    this.approvalQueue.delete(requestId);
    entry.status = approved ? "approved" : "rejected";
    entry.resolve(approved);
  }

  /**
   * Runs a tool handler, first obtaining approval when the tool requires it.
   *
   * @param tool Tool definition to execute.
   * @param args Arguments forwarded to the approval request and the handler.
   * @returns The handler's result.
   * @throws Error `Execution denied for <tool name>` when approval is not granted.
   */
  // eslint-disable-next-line @typescript-eslint/no-explicit-any -- signature kept for existing callers
  async executeWithGate<T>(tool: ToolDefinition<any, T>, args: any): Promise<T> {
    if (tool.requiresApproval) {
      const approved = await this.requestApproval(tool.name, args);
      if (!approved) throw new Error(`Execution denied for ${tool.name}`);
    }
    return tool.handler(args);
  }
}
```
**Architecture Rationale:** Synchronous terminal prompts do not scale to production. This gateway pattern externalizes approval to asynchronous channels while maintaining a deterministic execution flow. Timeouts default to rejection, preventing indefinite blocking.
### 3. Environment Abstraction & Credential Isolation
Agents should never receive direct connection strings or environment-specific endpoints. Instead, they interact with abstracted tool interfaces while infrastructure layers resolve credentials based on deployment context.
/**
 * Process-wide singleton that reads database settings from environment
 * variables once, at first use, and builds a PostgreSQL connection URI from
 * them.
 *
 * Variables read: `DATABASE_HOST`, `DATABASE_PORT` (defaults to "5432"),
 * `DATABASE_NAME`, `DATABASE_USER`, `DATABASE_PASSWORD`. Unset variables other
 * than the port fall back to an empty string.
 */
class EnvironmentResolver {
  private static instance: EnvironmentResolver | undefined;

  private readonly config: {
    readonly DB_HOST: string;
    readonly DB_PORT: string;
    readonly DB_NAME: string;
    readonly DB_USER: string;
    readonly DB_PASS: string;
  };

  private constructor() {
    this.config = {
      DB_HOST: process.env.DATABASE_HOST ?? "",
      DB_PORT: process.env.DATABASE_PORT ?? "5432",
      DB_NAME: process.env.DATABASE_NAME ?? "",
      DB_USER: process.env.DATABASE_USER ?? "",
      DB_PASS: process.env.DATABASE_PASSWORD ?? ""
    };
  }

  /** Returns the shared resolver, creating it on the first call. */
  static getInstance(): EnvironmentResolver {
    if (!EnvironmentResolver.instance) {
      EnvironmentResolver.instance = new EnvironmentResolver();
    }
    return EnvironmentResolver.instance;
  }

  /**
   * Builds `postgresql://user:password@host:port/database` from the captured
   * settings.
   *
   * User, password and database name are percent-encoded so that reserved
   * characters such as `@`, `:`, `/` or `#` inside a credential cannot change
   * how the URI is parsed. Host and port are inserted unchanged.
   */
  getConnectionUri(): string {
    const { DB_HOST, DB_PORT, DB_NAME, DB_USER, DB_PASS } = this.config;
    const user = encodeURIComponent(DB_USER);
    const password = encodeURIComponent(DB_PASS);
    const database = encodeURIComponent(DB_NAME);
    return `postgresql://${user}:${password}@${DB_HOST}:${DB_PORT}/${database}`;
  }
}
**Architecture Rationale:** By injecting environment variables at the container or function level, staging agents physically cannot resolve production endpoints. This eliminates cross-environment contamination regardless of agent instructions. Multi-agent orchestrators must enforce the same isolation per sub-agent, preventing permission inheritance.
### 4. Structured Observability Pipeline
Agent reasoning traces are not debugging noise; they are audit logs. Every tool invocation, argument payload, and intermediate decision step must be captured before execution. This enables real-time alerting and precise post-incident reconstruction.
/**
 * Wraps a tool execution with structured audit events: one emitted before the
 * execution starts, and one emitted when it completes or fails. All events of
 * a single call share the same trace id.
 */
class AgentAuditTracer {
  constructor(private logger: Logger) {}

  /**
   * Runs `execution` and logs its lifecycle.
   *
   * @param toolName Name of the tool being invoked.
   * @param args Argument payload, recorded in the "initiated" event.
   * @param execution Callback that performs the actual tool call.
   * @returns Whatever `execution` resolves to.
   * @throws Re-throws any error raised inside the traced section after
   *   logging an "agent.tool_call.failed" event.
   */
  async traceExecution<T>(
    toolName: string,
    args: Record<string, unknown>,
    execution: () => Promise<T>
  ): Promise<T> {
    const traceId = crypto.randomUUID();
    const startedAt = performance.now();
    const elapsedMs = (): number => performance.now() - startedAt;
    const isoNow = (): string => new Date().toISOString();

    // Intent is recorded before the tool is allowed to run.
    this.logger.info({
      event: "agent.tool_call.initiated",
      traceId,
      toolName,
      args,
      timestamp: isoNow()
    });

    try {
      const outcome = await execution();
      this.logger.info({
        event: "agent.tool_call.completed",
        traceId,
        toolName,
        durationMs: elapsedMs(),
        timestamp: isoNow()
      });
      return outcome;
    } catch (failure) {
      const durationMs = elapsedMs();
      // Non-Error throwables are stringified so the event always carries text.
      const message = failure instanceof Error ? failure.message : String(failure);
      this.logger.error({
        event: "agent.tool_call.failed",
        traceId,
        toolName,
        durationMs,
        error: message,
        timestamp: isoNow()
      });
      throw failure;
    }
  }
}
**Architecture Rationale:** Pre-execution logging captures intent before state changes occur. Structured events enable metric aggregation, anomaly detection, and automated alerting on high-risk tool names. The trace ID links reasoning steps to actual database mutations, creating a complete decision-to-action chain.
Pitfall Guide
1. Description-Only Constraints
Explanation: Teams define safety boundaries in tool descriptions or system prompts, assuming the model will respect them. LLMs optimize for goal completion and will ignore descriptive warnings if a more direct path exists.
Fix: Move all constraints to the handler implementation. Validate inputs, enforce hard limits, and reject out-of-scope operations programmatically.
2. The "Read-Only" Illusion
Explanation: Assuming agents with only SELECT permissions cannot cause harm. Read access enables data exfiltration, schema enumeration, and downstream trigger activation that can indirectly modify state.
Fix: Apply least-privilege at the database role level. Restrict schema visibility, disable trigger execution for agent roles, and monitor query patterns for reconnaissance behavior.
3. Prompt-Dependent Safety
Explanation: Relying on instructions like "never delete production data" as the primary safety mechanism. Prompts are suggestions, not enforcement. Adversarial or ambiguous phrasing easily bypasses them.
Fix: Treat prompts as intent signals, not security controls. Enforce safety through capability scoping, approval gates, and runtime validation.
4. Inherited Orchestrator Permissions
Explanation: Multi-agent systems where sub-agents inherit the orchestrator's full permission set. A single compromised or misdirected sub-agent can access resources outside its intended scope.
Fix: Implement per-agent capability manifests. The orchestrator should dynamically provision temporary, scoped credentials to sub-agents and revoke them upon task completion.
5. Silent Dry-Run Failures
Explanation: Dry-run modes that suppress errors or skip validation steps to "simulate" execution. This creates false confidence and masks permission or schema issues that will surface in production.
Fix: Run dry modes against isolated staging environments with identical schema structures. Validate execution paths, permission checks, and argument serialization without suppressing failures.
6. Unstructured Reasoning Logs
Explanation: Storing agent chain-of-thought as plain text blobs. This makes querying, alerting, and correlation with tool calls nearly impossible during incident response.
Fix: Serialize reasoning steps as structured JSON events with matching trace IDs. Index them alongside tool execution logs for unified querying and timeline reconstruction.
7. Missing Idempotency Controls
Explanation: Agents retrying failed operations without idempotency keys, causing duplicate mutations, double charges, or cascading state corruption.
Fix: Attach unique operation IDs to every tool call. Implement idempotency checks in handlers to detect and safely ignore duplicate requests.
Production Bundle
Action Checklist
Decision Matrix
| Scenario | Recommended Approach | Why | Cost Impact |
|---|---|---|---|
| Internal Dev Automation | Scoped tools + dry-run validation | Low blast radius; speed prioritized over manual approval | Minimal infrastructure overhead |
| Customer-Facing Workflows | Capability scoping + async approval gates | Prevents accidental data loss; maintains user trust | Moderate latency from approval routing |
| High-Risk Financial/Compliance Ops | Strict least-privilege + mandatory human sign-off + full audit trail | Regulatory requirements demand deterministic control and complete traceability | Higher operational cost; requires dedicated approval infrastructure |
Configuration Template
// agent-safety.config.ts
import { ExecutionGateway } from "./ExecutionGateway";
import { AgentAuditTracer } from "./AgentAuditTracer";
import { EnvironmentResolver } from "./EnvironmentResolver";
// NOTE(review): confirm that the option objects passed below match the
// constructor signatures of the imported ExecutionGateway and AgentAuditTracer.

/** Approval gateway: 5-minute timeout, reject on timeout, webhook channel. */
const gateway = new ExecutionGateway({
  approvalTimeoutMs: 300_000,
  defaultRejectOnTimeout: true,
  approvalChannel: "webhook://internal-approval-service"
});

/** Audit tracer with structured output and alerting on the listed tool names. */
const tracer = new AgentAuditTracer({
  logLevel: "info",
  structuredOutput: true,
  alertOnDestructive: ["archive_expired_logs", "purge_stale_records", "update_schema"]
});

/** Shared resolver for environment-specific connection settings. */
const envResolver = EnvironmentResolver.getInstance();

/** Runtime policy limits applied to agent tool calls. */
const policies = {
  maxConcurrentToolCalls: 3,
  retryLimit: 2,
  idempotencyWindowMs: 60_000,
  requireApprovalFor: ["write", "delete", "update", "send"]
};

/** Aggregated safety configuration for the agent runtime. */
export const agentSafetyConfig = { gateway, tracer, envResolver, policies };
Quick Start Guide
- Define scoped tools: Replace generic execution handlers with narrowly bounded operations. Enforce parameter limits and schema restrictions directly in the handler code.
- Deploy the approval gateway: Integrate the `ExecutionGateway` into your agent loop. Route destructive operations through your existing notification or UI approval system.
- Abstract environment access: Remove all hardcoded credentials from agent configurations. Inject environment-specific secrets at deployment time and resolve connections through a centralized resolver.
- Instrument execution traces: Wrap every tool call with the `AgentAuditTracer`. Configure your logging pipeline to index `traceId`, `toolName`, and `args` for real-time querying and alerting.
- Validate in staging: Run adversarial prompts and ambiguous instructions against an isolated environment. Verify that constraints, approval gates, and observability signals function as expected before promoting to production.