ication tests, compares outputs against acceptance criteria, and attaches structured proof to the task record.
5. Supervisor Loop: The control plane that resolves dependencies, enforces approval gates, manages AFK execution, and transitions state only when evidence thresholds are met.
Implementation (TypeScript)
import { execa } from 'execa';
import { createHash } from 'crypto';
import { EventEmitter } from 'events';
// Core state definitions

/** Every status value a task record in the execution graph can hold. */
type TaskStatus =
  | 'pending'
  | 'ready'
  | 'executing'
  | 'verifying'
  | 'completed'
  | 'blocked'
  | 'failed';
/**
 * One structured piece of proof attached to a task record.
 */
interface EvidenceArtifact {
  /** Category of the artifact. */
  type:
    | 'test_output'
    | 'diff'
    | 'process_log'
    | 'http_response'
    | 'screenshot'
    | 'blocker_reason';
  /** Location string recorded for the artifact (e.g. `artifacts/<taskId>_stdout.log`). */
  path: string;
  /** Digest string recorded for the artifact; the orchestrator in this file stores SHA-256 hex digests. */
  hash: string;
  /** Free-form details about the artifact; values are untyped and must be narrowed before use. */
  metadata: Record<string, unknown>;
}
/**
 * A unit of work tracked in the orchestrator's task graph.
 */
interface ExecutionTask {
  /** Key under which the task is stored in the task graph. */
  id: string;
  /** Description of what the task should achieve. */
  objective: string;
  /** IDs of tasks that must reach `completed` before this one is marked `ready`. */
  dependencies: string[];
  /** Criteria strings the collected evidence is checked against during verification. */
  acceptanceCriteria: string[];
  /** Risk classification; `critical` tasks are subject to the approval gate. */
  riskLevel: 'low' | 'medium' | 'high' | 'critical';
  /** Current status of the task. */
  status: TaskStatus;
  /** Evidence artifacts attached to the task so far. */
  evidence: EvidenceArtifact[];
  /** How to spawn the worker process; execution throws when this is absent. */
  workerConfig?: {
    /** Executable to run. */
    command: string;
    /** Arguments passed to the executable. */
    args: string[];
    /** Value handed to the process runner as its timeout, in milliseconds. */
    timeoutMs: number;
  };
}
/**
 * The subset of a worker process result that the orchestrator reads.
 * Every field is optional and is narrowed at the point of use.
 */
interface WorkerProcessResult {
  stdout?: unknown;
  exitCode?: number;
  failed?: boolean;
  timedOut?: boolean;
  duration?: unknown;
}

/**
 * Supervisor that owns the task graph, spawns workers, collects evidence and
 * performs every state transition.
 *
 * Events emitted (payload is the task; `task.blocked` also carries a reason string):
 * `task.ready`, `task.executing`, `task.blocked`, `task.completed`, `task.failed`.
 */
class ExecutionOrchestrator extends EventEmitter {
  /** Delay between polls of the task graph while the AFK loop has nothing to run. */
  private static readonly AFK_POLL_INTERVAL_MS = 5000;

  private taskGraph: Map<string, ExecutionTask> = new Map();
  /** IDs of tasks for which approval has been recorded. */
  private approvalQueue: Set<string> = new Set();
  private afkMode: boolean = false;
  /** Resolves the AFK loop's current idle wait early; null when the loop is not waiting. */
  private wakeAfkLoop: (() => void) | null = null;

  /**
   * @param workspaceRoot Directory used as the working directory of every worker process.
   */
  constructor(private workspaceRoot: string) {
    super();
  }

  /**
   * Register a new task into the execution graph.
   *
   * The stored copy always starts as `pending` with no evidence, regardless of
   * the `status`/`evidence` values on the argument. If its dependencies are
   * already completed it is promoted to `ready` immediately.
   */
  registerTask(task: ExecutionTask): void {
    this.taskGraph.set(task.id, { ...task, status: 'pending', evidence: [] });
    this.resolveDependencies(task.id);
  }

  /**
   * Record approval for a task.
   *
   * A task that was parked as `blocked` by the approval gate is returned to
   * `ready` (and `task.ready` is emitted) so it can be executed. Approving an
   * ID before the task reaches the gate is allowed.
   */
  approveTask(taskId: string): void {
    this.approvalQueue.add(taskId);
    const task = this.taskGraph.get(taskId);
    if (task && task.status === 'blocked') {
      task.status = 'ready';
      this.emit('task.ready', task);
    }
  }

  /** Promote a pending task to `ready` once every dependency is `completed`. */
  private resolveDependencies(taskId: string): void {
    const task = this.taskGraph.get(taskId);
    if (!task) return;
    // An unknown dependency ID counts as unmet, so the task stays pending.
    const depsMet = task.dependencies.every(
      depId => this.taskGraph.get(depId)?.status === 'completed'
    );
    if (depsMet && task.status === 'pending') {
      task.status = 'ready';
      this.emit('task.ready', task);
    }
  }

  /**
   * Spawn the task's worker, capture its output and verify the result.
   *
   * Unapproved `critical` tasks are moved to `blocked` without running the
   * worker. Worker failures are recorded on the task (`failed`) rather than
   * thrown.
   *
   * @throws Error when the task is unknown or not `ready`, or when it has no
   *   worker configuration. In both cases the task's status is left untouched.
   */
  async executeTask(taskId: string): Promise<void> {
    const task = this.taskGraph.get(taskId);
    if (!task || task.status !== 'ready') throw new Error('Task not ready for execution');
    // Validate before any state change so a misconfigured task is not left stuck in 'executing'.
    const worker = task.workerConfig;
    if (!worker) {
      throw new Error('No worker configuration provided');
    }

    // The approval gate must run BEFORE the worker is spawned; checking it
    // afterwards would let the command execute without approval.
    if (task.riskLevel === 'critical' && !this.approvalQueue.has(task.id)) {
      task.status = 'blocked';
      this.emit('task.blocked', task, 'Approval gate pending for critical task');
      return;
    }

    task.status = 'executing';
    this.emit('task.executing', task);
    try {
      const result: WorkerProcessResult = await execa(worker.command, worker.args, {
        cwd: this.workspaceRoot,
        timeout: worker.timeoutMs,
        reject: false
      });
      await this.collectEvidence(task, result);
      // With `reject: false` a failing worker does not throw, so failure has
      // to be detected from the result itself.
      if (this.workerFailed(result)) {
        const reason = `Worker failed (exitCode=${String(result.exitCode)}, timedOut=${String(result.timedOut === true)})`;
        this.recordFailure(task, `logs/${taskId}_error.log`, reason, {
          error: reason,
          exitCode: result.exitCode,
          timedOut: result.timedOut === true
        });
        return;
      }
      await this.verifyAndTransition(task);
    } catch (error) {
      const message = String(error);
      this.recordFailure(task, `logs/${taskId}_error.log`, message, { error: message });
    }
  }

  /** True when the worker result reports a failed run, a timeout or a non-zero exit code. */
  private workerFailed(result: WorkerProcessResult): boolean {
    if (result.failed === true || result.timedOut === true) return true;
    return typeof result.exitCode === 'number' && result.exitCode !== 0;
  }

  /** Mark the task failed, attach a `process_log` artifact describing why, and emit `task.failed`. */
  private recordFailure(
    task: ExecutionTask,
    path: string,
    hashedText: string,
    metadata: Record<string, unknown>
  ): void {
    task.status = 'failed';
    task.evidence.push({
      type: 'process_log',
      path,
      hash: createHash('sha256').update(hashedText).digest('hex'),
      metadata
    });
    this.emit('task.failed', task);
  }

  /** Replace the task's evidence with artifacts derived from the worker result. */
  private async collectEvidence(task: ExecutionTask, result: WorkerProcessResult): Promise<void> {
    const artifacts: EvidenceArtifact[] = [];
    // Capture test output or process logs
    const stdout = result.stdout;
    if (typeof stdout === 'string' && stdout.length > 0) {
      artifacts.push({
        type: 'process_log',
        path: `artifacts/${task.id}_stdout.log`,
        hash: createHash('sha256').update(stdout).digest('hex'),
        metadata: { exitCode: result.exitCode, duration: result.duration }
      });
    }
    // Capture file diffs if worker modifies workspace.
    // In production, run git diff or file watcher here.
    // NOTE: this is a placeholder — the hash covers a synthetic string, not real diff content.
    artifacts.push({
      type: 'diff',
      path: `artifacts/${task.id}_diff.patch`,
      hash: createHash('sha256').update(`diff-${task.id}-${Date.now()}`).digest('hex'),
      metadata: { scope: task.objective }
    });
    task.evidence = artifacts;
  }

  /**
   * Whether any evidence on the task satisfies the criterion: either a
   * `test_output` artifact exists, or an artifact's `metadata.scope` contains
   * the criterion.
   */
  private isCriterionSatisfied(task: ExecutionTask, criterion: string): boolean {
    return task.evidence.some(artifact => {
      if (artifact.type === 'test_output') return true;
      // `metadata` values are `unknown`, so narrow before calling `includes`.
      const scope = artifact.metadata.scope;
      if (typeof scope === 'string') return scope.includes(criterion);
      return Array.isArray(scope) && scope.includes(criterion);
    });
  }

  /** Verify evidence against acceptance criteria, then move the task to `completed` or `failed`. */
  private async verifyAndTransition(task: ExecutionTask): Promise<void> {
    task.status = 'verifying';
    // In production: parse test results, validate diffs, run regression checks
    const missingCriteria = task.acceptanceCriteria.filter(
      criterion => !this.isCriterionSatisfied(task, criterion)
    );

    if (missingCriteria.length === 0 && task.evidence.length > 0) {
      task.status = 'completed';
      this.emit('task.completed', task);
      // Trigger downstream dependency resolution
      this.taskGraph.forEach(candidate => {
        if (candidate.dependencies.includes(task.id)) this.resolveDependencies(candidate.id);
      });
      return;
    }

    task.status = 'failed';
    task.evidence.push({
      type: 'blocker_reason',
      path: `logs/${task.id}_verification_failure.log`,
      hash: createHash('sha256').update('criteria_not_met').digest('hex'),
      metadata: { missingCriteria }
    });
    this.emit('task.failed', task);
  }

  /**
   * AFK execution loop with conservative assumptions.
   *
   * Runs ready tasks one at a time. When nothing is ready it halts if any task
   * is blocked, otherwise it polls again after a delay. The loop ends when
   * `stopAfkLoop()` is called, when blockers halt it, or when `executeTask`
   * throws (the error propagates to the caller).
   */
  async runAfkLoop(): Promise<void> {
    this.afkMode = true;
    try {
      while (this.afkMode) {
        const tasks = Array.from(this.taskGraph.values());
        const readyTasks = tasks.filter(t => t.status === 'ready');
        if (readyTasks.length === 0) {
          if (tasks.some(t => t.status === 'blocked')) {
            console.warn('AFK loop halted: unresolved blockers detected');
            break;
          }
          await this.idle(ExecutionOrchestrator.AFK_POLL_INTERVAL_MS);
          continue;
        }
        for (const task of readyTasks) {
          if (!this.afkMode) break;
          // The snapshot may be stale if something else ran the task while we awaited.
          if (task.status !== 'ready') continue;
          await this.executeTask(task.id);
        }
      }
    } finally {
      // Leave the orchestrator in a consistent state however the loop ended.
      this.afkMode = false;
      this.wakeAfkLoop = null;
    }
  }

  /** Ask a running AFK loop to stop; an idle wait in progress is cut short. */
  stopAfkLoop(): void {
    this.afkMode = false;
    this.wakeAfkLoop?.();
  }

  /** Wait up to `ms` milliseconds, returning early if `stopAfkLoop()` is called. */
  private idle(ms: number): Promise<void> {
    return new Promise<void>(resolve => {
      const finish = (): void => {
        clearTimeout(timer);
        this.wakeAfkLoop = null;
        resolve();
      };
      const timer = setTimeout(finish, ms);
      this.wakeAfkLoop = finish;
    });
  }
}
Architecture Decisions & Rationale
Separation of Supervisor and Worker: The runtime never delegates state management to external CLIs. Codex CLI or Claude Code execute commands, but the orchestrator owns the task graph, evidence validation, and state transitions. This prevents workers from self-reporting completion without verification.
Explicit Evidence Schema: Evidence is structured, not free-form text. Each artifact carries a type, path, cryptographic hash, and metadata. This enables deterministic verification, audit trails, and automated regression checks.
Dependency-First Execution: Tasks remain pending until all dependencies report completed. This eliminates race conditions and ensures that foundational changes (e.g., API schema updates) are verified before dependent tasks (e.g., UI integration) begin.
Approval Gates for Critical Paths: High-risk tasks automatically transition to blocked until explicit approval is granted. This prevents silent deployments, destructive operations, or backward-incompatible changes from proceeding without human oversight.
AFK Conservatism: The autonomous loop avoids optimistic continuation. It halts on unresolved blockers, skips non-critical clarification prompts, and maintains strict evidence requirements. AFK accelerates execution but never bypasses safety invariants.
Pitfall Guide
1. Treating LLM Output as Verification
Explanation: Assuming that a model's summary ("I fixed the timeout bug") constitutes proof of completion. LLMs optimize for coherence, not correctness.
Fix: Route all worker output through an evidence pipeline. Require test execution, diff validation, or log parsing before state transition. Never accept textual claims as closure.
2. Skipping Dependency Resolution
Explanation: Executing tasks in parallel or out of order without verifying upstream completion. This causes cascading failures when foundational changes are incomplete.
Fix: Implement a strict dependency graph. Tasks remain pending until all referenced task IDs report completed. Use topological sorting for execution order.
3. Weak Evidence Definitions
Explanation: Accepting vague artifacts like "checked the file" or "ran a command" as evidence. This provides no audit trail or reproducibility.
Fix: Define an evidence schema requiring structured types (test_output, diff, process_log, blocker_reason). Mandate cryptographic hashing and metadata attachment. Reject tasks with missing or malformed evidence.
4. Unbounded AFK Execution
Explanation: Allowing autonomous loops to continue indefinitely without safety boundaries. This can trigger destructive operations, exhaust API quotas, or ignore critical blockers.
Fix: Implement explicit halt conditions: unresolved blockers, missing approvals, credential boundaries, or destructive command filters. Log all AFK decisions and require manual resume for critical paths.
5. Ignoring Approval Gates for High-Risk Tasks
Explanation: Executing migrations, deployments, or schema changes without human verification. This leads to production incidents and data loss.
Fix: Tag tasks with risk levels. Automatically transition critical or high tasks to blocked until explicit approval is recorded. Maintain an approval queue with audit timestamps.
6. Terminal Session Leaks
Explanation: Spawning long-running processes (dev servers, watchers, REPLs) without cleanup or isolation. This consumes resources, causes port conflicts, and leaves orphaned processes.
Fix: Implement session lifecycle management. Assign unique session IDs, enforce workspace boundaries, log all I/O, and provide explicit kill/restart commands. Monitor resource usage and auto-terminate idle sessions.
7. Cross-Worker State Drift
Explanation: Running multiple subagents or CLI workers that modify the same workspace without synchronization. This causes merge conflicts, overwritten files, and inconsistent state.
Fix: Use workspace partitioning or file locking. Assign exclusive file scopes to workers. Run a post-execution merge verification step. Maintain a central state store that all workers read from but only the supervisor writes to.
Production Bundle
Action Checklist
Decision Matrix
| Scenario | Recommended Approach | Why | Cost Impact |
|---|---|---|---|
| Single-file refactor with clear tests | Conversational agent with tool calls | Low complexity, fast feedback loop | Minimal compute cost |
| Multi-module migration with backward compatibility | Evidence-driven state runtime | Requires dependency tracking, approval gates, and regression verification | Moderate compute + storage for evidence artifacts |
| Long-running audit or heavy test suite | Remote CLI worker + supervisor verification | Isolates resource-heavy work, prevents local environment drift | Higher infrastructure cost, but safer execution |
| Security/bug bounty scope validation | Intake-gated agent with strict boundaries | Prevents unauthorized scanning, enforces program rules | Low compute, high compliance value |
| Rapid prototyping with uncertain requirements | AFK mode with conservative assumptions | Accelerates exploration while halting on real blockers | Moderate compute, reduces idle wait time |
Configuration Template
runtime:
  workspace: ./project-root
  state_store: ./state/execution_graph.json
  evidence_dir: ./artifacts
  log_level: info
tasks:
  default_risk: low
  approval_required:
    - critical
    - high
  evidence_schema:
    required_fields:
      - type
      - path
      - hash
      - metadata
    allowed_types:
      - test_output
      - diff
      - process_log
      - http_response
      - screenshot
      - blocker_reason
workers:
  codex_cli:
    command: codex
    args: ["exec", "--headless"]
    timeout_ms: 300000
    workspace_bound: true
  claude_code:
    command: claude
    args: ["--print", "--output-format", "json"]
    timeout_ms: 240000
    workspace_bound: true
afk:
  enabled: false
  conservative_assumptions: true
  halt_on_blockers: true
  max_concurrent_tasks: 3
  idle_timeout_ms: 60000
security:
  destructive_commands:
    - rm -rf
    - sudo
    - chmod 777
  approval_bypass: false
  credential_scopes:
    - production
    - billing
    - infrastructure
Quick Start Guide
- Initialize the runtime: Create a new project directory, install the one external dependency (`npm i execa`; `crypto` and `events` are Node.js built-in modules and should not be installed from npm), and scaffold the ExecutionOrchestrator class. Configure the workspace root and evidence directory.
- Define your execution plan: Write a YAML or JSON plan specifying objectives, task dependencies, acceptance criteria, and risk levels. Register each task using
orchestrator.registerTask().
- Configure workers: Set up Codex CLI or Claude Code configurations in the runtime. Ensure commands run in headless mode, respect workspace boundaries, and output structured results.
- Run verification loop: Execute
orchestrator.executeTask() for ready tasks. The runtime will spawn the worker, capture evidence, validate against acceptance criteria, and transition state. Monitor events via orchestrator.on('task.completed', ...).
- Enable AFK or remote execution: For long-running workflows, set
afk.enabled: true and deploy workers to isolated environments. The supervisor will continue processing ready tasks, halt on blockers, and maintain a complete audit trail of all state transitions and evidence artifacts.