| 'medium' | 'high' | 'critical'>;
requiresApproval: string[];
runtimeLimitMs: number;
}
/**
 * In-memory table of workflow policies, keyed by workflow ID.
 *
 * Each entry's `workflowId` repeats its own key. The single entry below
 * configures the invoice reconciliation workflow.
 */
const policyRegistry: Record<string, WorkflowPolicy> = {
  invoice_reconciliation: {
    workflowId: 'invoice_reconciliation',

    // Tool names this workflow may use.
    allowedTools: [
      'billing.fetch_invoices',
      'billing.match_payments',
      'docs.query_terms',
    ],

    // Tool names this workflow must never use.
    blockedTools: [
      'billing.refund_transaction',
      'admin.delete_records',
    ],

    // Per-workflow ceilings: number of tool calls and estimated spend (cents, per the field name).
    maxToolCalls: 10,
    maxEstimatedCostCents: 150,

    // Risk classification for each allowed tool.
    riskTiers: {
      'billing.fetch_invoices': 'low',
      'billing.match_payments': 'medium',
      'docs.query_terms': 'low',
    },

    // Tools that are gated behind an approval step.
    requiresApproval: ['billing.match_payments'],

    // Runtime ceiling (milliseconds, per the field name).
    runtimeLimitMs: 75000,
  },
};
**Rationale**: Decoupling policy from execution allows independent versioning, A/B testing of limits, and tenant-specific overrides without modifying core routing logic.
### Step 2: Implement Context Pruning
Context window bloat is the silent killer of agent reliability. The pruner filters the MCP tool registry based on the active workflow and tenant permissions.
```typescript
/**
 * One entry of the tool registry that `pruneToolContext` filters.
 */
interface ToolDefinition {
  /** Tool identifier; matched against the policy's tool lists and the tenant's permissions. */
  name: string;

  /** Free-text description of the tool. */
  description: string;

  /** Parameter definitions for the tool; the shape is not constrained here. */
  parameters: Record<string, unknown>;

  /** Risk classification declared on the tool itself. */
  riskTier: 'low' | 'medium' | 'high' | 'critical';
}
/**
 * Filters the tool registry down to the tools usable for one workflow and tenant.
 *
 * A tool is kept only when all of the following hold:
 *  - its name is in `policy.allowedTools`;
 *  - its name is not in `policy.blockedTools` (a block always wins over an allow);
 *  - its name is in `tenantPermissions`;
 *  - its effective risk tier is not `'critical'`.
 *
 * The effective risk tier is the policy's own entry in `policy.riskTiers` when one
 * exists, otherwise the tier declared on the tool definition. Falling back to the
 * declared tier keeps a critical tool from slipping through merely because the
 * policy forgot to classify it.
 *
 * @param fullRegistry Every known tool definition; not mutated.
 * @param policy Workflow policy supplying the allow/block lists and risk tiers.
 * @param tenantPermissions Tool names the tenant is permitted to use.
 * @returns A new array with the surviving tools, in registry order.
 */
function pruneToolContext(
  fullRegistry: ToolDefinition[],
  policy: WorkflowPolicy,
  tenantPermissions: string[]
): ToolDefinition[] {
  // Sets give constant-time membership tests instead of rescanning each list per tool.
  const allowed = new Set(policy.allowedTools);
  const blocked = new Set(policy.blockedTools);
  const permitted = new Set(tenantPermissions);

  return fullRegistry.filter(tool => {
    if (!allowed.has(tool.name) || blocked.has(tool.name) || !permitted.has(tool.name)) {
      return false;
    }

    // Own-property lookup: a tool named e.g. "toString" must not pick up an
    // inherited Object.prototype member as its "tier".
    const policyTier = Object.prototype.hasOwnProperty.call(policy.riskTiers, tool.name)
      ? policy.riskTiers[tool.name]
      : undefined;
    const effectiveTier = policyTier ?? tool.riskTier;

    return effectiveTier !== 'critical';
  });
}
```
**Rationale**: Pruning occurs before prompt assembly. This reduces token consumption, eliminates routing ambiguity, and prevents the model from attempting operations outside the tenant's scope.
### Step 3: Build the Execution Gateway
The gateway sits between the agent and the MCP server. It validates budgets, enforces approval gates, and records telemetry before forwarding requests.
/**
 * A single tool invocation submitted to `ToolExecutionGateway.execute`.
 */
interface ToolExecutionRequest {
  /** Tenant on whose behalf the call is made; used to look up usage in the ledger. */
  tenantId: string;

  /** Workflow whose policy governs this call. */
  workflowId: string;

  /** Name of the tool to invoke. */
  toolName: string;

  /** Arguments handed to the tool unchanged. */
  payload: unknown;

  /** Caller-supplied cost estimate, compared against the policy's budget. */
  estimatedCostCents: number;
}
/**
 * Outcome returned by `ToolExecutionGateway.execute`.
 */
interface ExecutionResult {
  /** `true` when the tool ran; `false` when a policy check stopped the call. */
  success: boolean;

  /** Tool output; set on success. */
  data?: unknown;

  /** Machine-readable reason code, e.g. `'BUDGET_EXHAUSTED'`; set on failure. */
  error?: string;

  /** Policy markers raised while handling the request, e.g. `'AWAITING_HUMAN_REVIEW'`. */
  policyFlags: string[];
}
/**
 * Policy enforcement layer that sits in front of MCP tool calls.
 *
 * For every request it resolves the workflow policy, applies the scope, call-count,
 * budget and approval checks, and only then invokes the tool and records the
 * execution in the usage ledger.
 */
class ToolExecutionGateway {
  /**
   * Timeout handed to `withTimeout` for one tool call
   * (presumably milliseconds — confirm against the helper).
   */
  private static readonly TOOL_TIMEOUT = 15000;

  /**
   * @param ledger Usage ledger queried for current usage and written to on
   *   executions and pending approvals.
   * @param policyStore Lookup of workflow policies by workflow ID.
   */
  constructor(
    private readonly ledger: UsageLedger,
    private readonly policyStore: PolicyRegistry
  ) {}

  /**
   * Validates a tool request against its workflow policy and, if every check
   * passes, runs the tool.
   *
   * Checks run cheapest-first so that requests rejected on static policy never
   * touch the ledger:
   *  1. unknown workflow            -> `UNKNOWN_WORKFLOW`
   *  2. tool not allowed or blocked -> `TOOL_NOT_IN_SCOPE` (flag `TOOL_BLOCKED` when block-listed)
   *  3. non-finite or negative cost -> `INVALID_COST_ESTIMATE`
   *  4. call count at the limit     -> `CALL_LIMIT_REACHED`
   *  5. cost would exceed budget    -> `BUDGET_EXHAUSTED`
   *  6. tool needs approval         -> `APPROVAL_GATE` (flag `AWAITING_HUMAN_REVIEW`)
   *
   * @param request The tool invocation to evaluate.
   * @returns A result with `success: false` and an `error` code when a check
   *   rejects the request, or `success: true` with the tool output in `data`.
   * @throws Whatever the ledger or the MCP call rejects with; those failures are
   *   not converted into an `ExecutionResult`.
   */
  async execute(request: ToolExecutionRequest): Promise<ExecutionResult> {
    const flags: string[] = [];
    const reject = (error: string): ExecutionResult => ({
      success: false,
      error,
      policyFlags: flags,
    });

    const policy = this.policyStore.get(request.workflowId);
    if (!policy) {
      // Without this guard an unregistered workflow ID crashes on `policy.allowedTools`.
      return reject('UNKNOWN_WORKFLOW');
    }

    if (!policy.allowedTools.includes(request.toolName)) {
      return reject('TOOL_NOT_IN_SCOPE');
    }
    if (policy.blockedTools.includes(request.toolName)) {
      // A block-list entry overrides the allow-list, matching pruneToolContext.
      flags.push('TOOL_BLOCKED');
      return reject('TOOL_NOT_IN_SCOPE');
    }

    // NaN makes the budget comparison below evaluate to false, and a negative
    // estimate lowers the projected spend; either would bypass the budget.
    if (!Number.isFinite(request.estimatedCostCents) || request.estimatedCostCents < 0) {
      return reject('INVALID_COST_ESTIMATE');
    }

    const currentUsage = await this.ledger.getTenantUsage(request.tenantId, request.workflowId);

    if (currentUsage.callCount >= policy.maxToolCalls) {
      return reject('CALL_LIMIT_REACHED');
    }
    if (currentUsage.cumulativeCost + request.estimatedCostCents > policy.maxEstimatedCostCents) {
      return reject('BUDGET_EXHAUSTED');
    }

    if (policy.requiresApproval.includes(request.toolName)) {
      flags.push('AWAITING_HUMAN_REVIEW');
      await this.ledger.recordPendingApproval(request);
      return reject('APPROVAL_GATE');
    }

    const result = await this.invokeMcpTool(request.toolName, request.payload);
    await this.ledger.recordExecution(request, result);
    return { success: true, data: result, policyFlags: flags };
  }

  /**
   * Calls the MCP tool through the `withTimeout` wrapper.
   *
   * @param toolName Name of the tool to call.
   * @param payload Arguments passed through to the tool unchanged.
   * @returns Whatever the MCP client resolves with.
   */
  private async invokeMcpTool(toolName: string, payload: unknown): Promise<unknown> {
    // NOTE(review): only a timeout is applied here; any circuit breaking would
    // have to live inside withTimeout or mcpClient — confirm.
    return await withTimeout(
      () => mcpClient.callTool(toolName, payload),
      ToolExecutionGateway.TOOL_TIMEOUT
    );
  }
}
**Rationale**: The gateway enforces policy at runtime without modifying the model's behavior. Circuit breakers and timeouts around the MCP client prevent downstream API saturation, and approval gates ensure high-risk operations require explicit consent. Each ledger write is awaited before the gateway returns, so the recorded telemetry stays consistent with what was actually executed.
### Step 4: Instrument the Usage Ledger
Accurate cost attribution requires structured event streaming. The ledger tracks token consumption, API invocations, tenant spend, and policy decisions.
/**
 * Structured record pushed to the event stream by the usage ledger.
 */
interface LedgerEvent {
  /** Unique identifier of this event. */
  eventId: string;

  /** Creation time as a numeric timestamp (`Date.now()` when written by `UsageLedger`). */
  timestamp: number;

  /** Tenant the event is attributed to. */
  tenantId: string;

  /** Workflow the tool call belonged to. */
  workflowId: string;

  /** Tool the event refers to. */
  toolName: string;

  /** Cost attributed to the call, in cents per the field name. */
  costCents: number;

  /** Token count attributed to the call. */
  tokenCount: number;

  /** What the policy layer decided for this call. */
  policyDecision: 'allowed' | 'blocked' | 'pending_approval';

  /** Extra scalar attributes, e.g. the size of the tool result. */
  metadata: Record<string, string | number | boolean>;
}
/**
 * Writes structured usage events for tool executions to an event stream.
 */
class UsageLedger {
  /**
   * @param eventStream Sink that receives every ledger event via `push`.
   */
  constructor(private readonly eventStream: EventStream) {}

  /**
   * Records a completed tool call as an `'allowed'` ledger event.
   *
   * The event's cost is the request's estimate, its token count comes from
   * `estimateTokenUsage(request.payload)`, and `metadata.resultSize` is the
   * length of the JSON-serialised result (0 when the result has no JSON form).
   *
   * @param request The request that was executed.
   * @param result The value the tool returned; may be `undefined`.
   * @returns Resolves once the event has been pushed to the stream.
   * @throws Whatever `estimateTokenUsage` or the event stream rejects with.
   */
  async recordExecution(request: ToolExecutionRequest, result: unknown): Promise<void> {
    const event: LedgerEvent = {
      eventId: generateUuid(),
      timestamp: Date.now(),
      tenantId: request.tenantId,
      workflowId: request.workflowId,
      toolName: request.toolName,
      costCents: request.estimatedCostCents,
      tokenCount: await estimateTokenUsage(request.payload),
      policyDecision: 'allowed',
      metadata: { resultSize: UsageLedger.serializedLength(result) },
    };
    await this.eventStream.push(event);
  }

  /**
   * Length of `value` as JSON text, or 0 when it cannot be measured.
   *
   * `JSON.stringify` returns `undefined` for `undefined`, functions and symbols,
   * and throws on circular structures and BigInt values. The size is best-effort
   * telemetry, so neither case may fail the recording of a call that already ran.
   */
  private static serializedLength(value: unknown): number {
    try {
      return JSON.stringify(value)?.length ?? 0;
    } catch {
      return 0;
    }
  }
}
**Rationale**: Centralized telemetry enables real-time budget monitoring, anomaly detection, and compliance reporting. Structured events integrate seamlessly with existing observability stacks (Prometheus, Datadog, OpenTelemetry).
Pitfall Guide
1. Context Saturation from Tool Overloading
Explanation: Loading every registered tool into the system prompt forces the model to parse irrelevant definitions, increasing token costs and degrading routing accuracy. The model may select broad, unsafe tools when narrow alternatives exist.
Fix: Implement workflow-scoped tool filtering. Only inject definitions required for the active intent. Use semantic routing to map user queries to predefined workflows before context assembly.
2. Flattened Risk Classification
Explanation: Treating read-only and write operations identically removes safety boundaries. A search tool and a payment refund tool carry vastly different blast radii, yet unclassified systems apply identical execution rules.
Fix: Assign explicit risk tiers (low, medium, high, critical). Enforce approval gates for high and critical operations. Disable destructive tools by default and require explicit tenant opt-in.
3. Static Credential Sprawl
Explanation: Embedding long-lived API keys in agent prompts or tool configurations creates a single point of failure. If a workflow leaks credentials, the entire tenant ecosystem becomes vulnerable.
Fix: Rotate to short-lived, scoped tokens. Use OAuth 2.0 for user-impersonated actions and tenant-scoped service accounts for backend automation. Store secrets in a vault (HashiCorp Vault, AWS Secrets Manager) and inject them at runtime via the execution gateway.
4. Silent Budget Exhaustion
Explanation: Hard failures without contextual feedback degrade user trust. When an agent abruptly stops due to a limit, operators cannot diagnose whether the issue stems from policy, downstream latency, or model routing errors.
Fix: Implement graceful degradation. Return actionable messages that explain the constraint and suggest next steps (e.g., "Workflow limit reached. Narrow the date range or request admin approval for extended scope."). Expose budget states in the UI dashboard.
5. Incomplete Telemetry Coverage
Explanation: Logging only final responses leaves operational blind spots. Without tracking intermediate tool calls, retry attempts, and policy decisions, teams cannot reconstruct failure paths or optimize cost allocation.
Fix: Stream structured events at every decision point. Capture tool availability, selection rationale, execution cost, tenant context, and policy flags. Integrate with distributed tracing to correlate agent behavior with downstream service performance.
6. Over-Optimization on Day One
Explanation: Chasing perfect cost attribution before production data arrives leads to premature complexity. Teams often build custom pricing engines that drift from actual usage patterns.
Fix: Start with estimated units and flat-rate caps. Refine attribution models as telemetry accumulates. Use sampling for high-frequency tools and aggregate costs at the workflow level before drilling into individual API calls.
Production Bundle
Action Checklist
Decision Matrix
| Scenario | Recommended Approach | Why | Cost Impact |
|---|---|---|---|
| Low-risk read operations | Direct execution with logging | Minimal blast radius; high throughput required | Low inference cost, negligible API fees |
| Medium-risk data updates | Scoped execution with tenant caps | Balances automation with financial predictability | Moderate API costs, controlled by workflow limits |
| High-risk financial actions | Approval gate + audit trail | Compliance requirement; prevents unauthorized charges | Higher operational overhead, reduced error costs |
| Multi-tenant SaaS deployment | Centralized policy registry + tenant overrides | Enables plan-based feature gating and spend control | Infrastructure cost scales with policy engine complexity |
| Legacy system integration | Gateway abstraction + credential vault | Isolates unstable APIs and secures access tokens | Initial integration cost, long-term stability gains |
Configuration Template
# Policy configuration template.
# NOTE(review): the nesting below was reconstructed from text whose indentation
# had been lost. Confirm in particular that `tenant_overrides` belongs under
# `policy` and that `telemetry` is a top-level section.
policy:
  version: "1.2.0"
  workflows:
    - id: "customer_support_triage"
      # Tools the workflow may use.
      allowed_tools:
        - "crm.search_contacts"
        - "tickets.fetch_history"
        - "knowledge_base.query"
      # Tools the workflow must never use.
      blocked_tools:
        - "crm.update_record"
        - "billing.issue_refund"
      limits:
        max_tool_calls: 8
        max_estimated_cost_cents: 120
        runtime_limit_ms: 60000
      risk_tiers:
        crm.search_contacts: low
        tickets.fetch_history: low
        knowledge_base.query: low
      approval_required: []
  # Per-plan replacements for the workflow limits above.
  tenant_overrides:
    enterprise:
      max_tool_calls: 15
      max_estimated_cost_cents: 300
    startup:
      max_tool_calls: 5
      max_estimated_cost_cents: 80
telemetry:
  retention_days: 90
  export_format: "json"
  alert_thresholds:
    daily_spend_cents: 5000
    failed_call_rate_percent: 15
    context_token_spike_percent: 20
Quick Start Guide
- Define Workflow Boundaries: Identify the top 5 agent workflows and map each to a minimal set of required tools. Document allowed, blocked, and approval-gated operations.
- Deploy the Policy Registry: Load workflow configurations into a version-controlled store. Implement a lightweight lookup service that resolves policies by tenant and workflow ID.
- Integrate the Execution Gateway: Wrap MCP client calls with the gateway. Add budget validation, risk tier checks, and approval routing before forwarding requests to downstream services.
- Instrument Telemetry: Configure structured event streaming for all policy decisions and tool executions. Connect the ledger to your observability stack and set alert thresholds for budget anomalies.
- Validate with Synthetic Traffic: Run controlled test workflows to verify context pruning, budget enforcement, and failure messaging. Adjust limits based on observed token consumption and downstream latency before enabling production traffic.