// GB RAM minimum
quantizationMode: 'W8A16' | 'W8A8' | 'FP16';
fallbackEnabled: boolean;
complianceZone: 'strict-local' | 'perimeter-cloud' | 'hybrid';
}
### Step 2: Implement Local GUI Bridge
The local backend captures screen state, dispatches input events, and runs inference on-device. It leverages quantization to balance throughput and memory footprint.
```typescript
/**
 * Execution backend that works on the local machine: it captures screen
 * state, dispatches input events, and holds a locally loaded model.
 *
 * Lifecycle: `initialize()` must succeed before `observeState()` or
 * `executeAction()` are used. `teardown()` may be called at any time, any
 * number of times.
 */
class LocalExecutionBridge implements ExecutionBackend {
  readonly type = 'local' as const;

  // Only `release()` is ever invoked on the handle, so that is all the type promises.
  private modelHandle: { release(): unknown } | null = null;
  private screenCapture: ScreenCaptureService | null = null;
  private inputDispatcher: InputEventDispatcher | null = null;

  /**
   * Checks the RAM threshold, loads the quantized model, and creates the
   * capture and input services.
   *
   * @param config Runtime settings; `hardwareThreshold` (GB) and
   *   `quantizationMode` are read here.
   * @throws Error when available RAM is below `config.hardwareThreshold`.
   */
  async initialize(config: RuntimeConfig): Promise<void> {
    const availableRAM = await getSystemMemoryGB();
    if (availableRAM < config.hardwareThreshold) {
      throw new Error(`Insufficient RAM: ${availableRAM}GB < ${config.hardwareThreshold}GB required`);
    }

    // Load quantized model based on config
    this.modelHandle = await loadLocalModel({
      quantization: config.quantizationMode,
      device: 'unified-memory' // Optimized for Apple Silicon / modern GPUs
    });
    this.screenCapture = new ScreenCaptureService();
    this.inputDispatcher = new InputEventDispatcher();
  }

  /**
   * Captures a screenshot and the active process list, stamped with the
   * current wall-clock time.
   *
   * @throws Error when called before `initialize()`.
   */
  async observeState(): Promise<EnvironmentSnapshot> {
    const capture = this.screenCapture;
    if (capture === null) {
      throw new Error('LocalExecutionBridge is not initialized; call initialize() first');
    }
    const screenshot = await capture.capture();
    const activeWindows = await capture.listActiveProcesses();
    return { screenshot, activeWindows, timestamp: Date.now() };
  }

  /**
   * Dispatches one `click`, `type`, or `shortcut` action as an input event.
   *
   * @returns A `completed` result carrying the action's id.
   * @throws Error when called before `initialize()`, or when the action type
   *   is not one of the three supported kinds. An unsupported action is never
   *   reported as completed.
   */
  async executeAction(action: AgentAction): Promise<ActionResult> {
    const dispatcher = this.inputDispatcher;
    if (dispatcher === null) {
      throw new Error('LocalExecutionBridge is not initialized; call initialize() first');
    }

    // Captured before the switch so the default branch can still name the type
    // even if the compiler has narrowed `action` away.
    const actionType: string = action.type;
    switch (action.type) {
      case 'click':
        await dispatcher.sendClick(action.coordinates);
        break;
      case 'type':
        await dispatcher.sendText(action.payload);
        break;
      case 'shortcut':
        await dispatcher.sendHotkey(action.payload);
        break;
      default:
        throw new Error(`Unsupported action type: ${actionType}`);
    }
    return { status: 'completed', actionId: action.id };
  }

  /**
   * Releases the model and disposes the screen capture service. Safe to call
   * before `initialize()` and safe to call repeatedly.
   */
  async teardown(): Promise<void> {
    const model = this.modelHandle;
    const capture = this.screenCapture;

    // Drop the references first so a second teardown cannot release twice.
    this.modelHandle = null;
    this.screenCapture = null;
    this.inputDispatcher = null;

    try {
      if (model !== null) {
        await model.release();
      }
    } finally {
      // Runs even when release() throws, so the capture service is not leaked.
      capture?.dispose();
    }
  }
}
```
### Step 3: Implement Cloud Sandbox Client
The cloud backend provisions ephemeral environments, handles web-native tasks, and enforces isolation boundaries.
```typescript
/**
 * Execution backend that talks to a remote sandbox over HTTP.
 *
 * One session is created per `initialize()` call and deleted by `teardown()`.
 * The endpoint and API key are read from the `CLOUD_SANDBOX_API` and
 * `SANDBOX_API_KEY` environment variables.
 */
class CloudSandboxClient implements ExecutionBackend {
  readonly type = 'cloud' as const;

  private sessionToken: string | null = null;
  private apiEndpoint = '';
  private apiKey = '';

  /**
   * Reads the endpoint and key from the environment and creates a session.
   *
   * @param config Runtime settings; not read by this backend.
   * @throws Error when either environment variable is missing, when the
   *   session request returns a non-2xx status, or when the response carries
   *   no usable `sessionId`.
   */
  async initialize(config: RuntimeConfig): Promise<void> {
    // Trailing slashes are stripped so path joins never produce "//sessions".
    const endpoint = (process.env.CLOUD_SANDBOX_API ?? '').trim().replace(/\/+$/, '');
    if (endpoint === '') {
      throw new Error('CLOUD_SANDBOX_API is not set; cannot reach the cloud sandbox');
    }
    const apiKey = (process.env.SANDBOX_API_KEY ?? '').trim();
    if (apiKey === '') {
      // Fail here rather than sending the literal header "Bearer undefined".
      throw new Error('SANDBOX_API_KEY is not set; cannot authenticate with the cloud sandbox');
    }
    this.apiEndpoint = endpoint;
    this.apiKey = apiKey;

    const response = await this.sendChecked('/sessions', { method: 'POST' });
    const data: unknown = await response.json();
    const sessionId =
      typeof data === 'object' && data !== null
        ? (data as { sessionId?: unknown }).sessionId
        : undefined;
    if ((typeof sessionId !== 'string' && typeof sessionId !== 'number') || sessionId === '') {
      throw new Error('Cloud sandbox did not return a sessionId');
    }
    this.sessionToken = String(sessionId);
  }

  /**
   * Fetches the current state of the session.
   *
   * The response body is returned as parsed; it is not validated against
   * `EnvironmentSnapshot`.
   *
   * @throws Error when no session exists or the request returns a non-2xx status.
   */
  async observeState(): Promise<EnvironmentSnapshot> {
    const response = await this.sendChecked(`${this.sessionPath()}/state`);
    return response.json();
  }

  /**
   * Posts one action to the session for execution.
   *
   * The response body is returned as parsed; it is not validated against
   * `ActionResult`.
   *
   * @throws Error when no session exists or the request returns a non-2xx status.
   */
  async executeAction(action: AgentAction): Promise<ActionResult> {
    const response = await this.sendChecked(`${this.sessionPath()}/execute`, {
      method: 'POST',
      body: JSON.stringify({ action, sessionId: this.sessionToken })
    });
    return response.json();
  }

  /**
   * Deletes the session if one exists. The HTTP status of the delete is
   * deliberately not checked, so an already-expired session does not make
   * cleanup fail. A no-op when no session is active.
   */
  async teardown(): Promise<void> {
    if (this.sessionToken) {
      await this.send(this.sessionPath(), { method: 'DELETE' });
      this.sessionToken = null;
    }
  }

  /** Returns the URL path of the active session, or throws when there is none. */
  private sessionPath(): string {
    if (this.sessionToken === null) {
      throw new Error('Cloud sandbox session is not initialized; call initialize() first');
    }
    return `/sessions/${encodeURIComponent(this.sessionToken)}`;
  }

  /**
   * Issues a request carrying the bearer token, and a JSON content type
   * whenever a body is present.
   */
  private async send(path: string, init: { method?: string; body?: string } = {}) {
    const headers: Record<string, string> = { 'Authorization': `Bearer ${this.apiKey}` };
    if (init.body !== undefined) {
      headers['Content-Type'] = 'application/json';
    }
    return fetch(`${this.apiEndpoint}${path}`, { ...init, headers });
  }

  /** Same as `send`, but turns a non-2xx status into an Error. */
  private async sendChecked(path: string, init: { method?: string; body?: string } = {}) {
    const response = await this.send(path, init);
    if (!response.ok) {
      throw new Error(
        `Cloud sandbox ${init.method ?? 'GET'} ${path} failed with HTTP ${response.status} ${response.statusText}`
      );
    }
    return response;
  }
}
```
### Step 4: Runtime Router & Orchestration
The router evaluates task metadata, hardware availability, and compliance rules to select the appropriate backend. It implements fallback logic and idempotent action tracking.
```typescript
/**
 * Picks an execution backend for a task, runs the task's planned actions on
 * it one at a time, and checks the observed state after every action.
 */
class TaskOrchestrator {
  /** Tasks with more concurrent sessions than this are treated as needing cloud scale. */
  private static readonly CONCURRENCY_THRESHOLD = 10;

  constructor(private readonly config: RuntimeConfig) {}

  /**
   * Runs every action in `task.plan.actions` sequentially on the selected
   * backend. The backend is always torn down once it has been initialized,
   * whether the run succeeds or fails.
   *
   * @returns The per-action results and the type of backend that ran them.
   * @throws Error when an action fails, or when the state observed after an
   *   action does not pass verification against the state observed before it.
   */
  async routeTask(task: TaskDefinition): Promise<TaskResult> {
    const backend = await this.selectBackend(task);
    await backend.initialize(this.config);
    try {
      let previousState = await backend.observeState();
      const results: ActionResult[] = [];
      for (const action of task.plan.actions) {
        results.push(await backend.executeAction(action));

        // Verify state transition before proceeding
        const currentState = await backend.observeState();
        if (!this.verifyStateTransition(previousState, currentState, action)) {
          throw new Error(`State drift detected after action ${action.id}`);
        }
        // Each action is judged against the snapshot taken just before it,
        // not against the snapshot from the start of the task.
        previousState = currentState;
      }
      return { success: true, results, backendType: backend.type };
    } finally {
      await backend.teardown();
    }
  }

  /**
   * Chooses the backend for a task.
   *
   * Order of precedence:
   * 1. A `strict-local` compliance zone, a task touching local apps, or a task
   *    containing PII always runs locally. The cloud is never selected.
   * 2. High-concurrency or web-native tasks run in the cloud.
   * 3. Anything else runs in the cloud when `fallbackEnabled` is set, locally
   *    otherwise.
   */
  private async selectBackend(task: TaskDefinition): Promise<ExecutionBackend> {
    const { metadata } = task;
    const requiresLocalAccess = metadata.accessesLocalApps || metadata.containsPII;
    if (this.config.complianceZone === 'strict-local' || requiresLocalAccess) {
      return new LocalExecutionBridge();
    }

    const requiresScale = metadata.concurrency > TaskOrchestrator.CONCURRENCY_THRESHOLD;
    if (requiresScale || metadata.isWebNative) {
      return new CloudSandboxClient();
    }

    // Fallback routing for hybrid scenarios
    return this.config.fallbackEnabled
      ? new CloudSandboxClient()
      : new LocalExecutionBridge();
  }

  /**
   * Placeholder check: the newer snapshot must not be timestamped earlier than
   * the older one. Equal timestamps pass, because two snapshots can land in
   * the same millisecond. The action is accepted so domain-specific validation
   * can be added here; it is not inspected yet.
   */
  private verifyStateTransition(before: EnvironmentSnapshot, after: EnvironmentSnapshot, _action: AgentAction): boolean {
    // Implement domain-specific validation logic
    return after.timestamp >= before.timestamp;
  }
}
```
### Architecture Decisions & Rationale
- Backend Abstraction: Decoupling execution from reasoning prevents vendor lock-in and enables compliance-driven routing. The `ExecutionBackend` interface standardizes state observation and action dispatch across environments.
- Quantization Strategy: W8A16 quantization delivers ~80 tokens/s decode on Apple M5 Pro (64GB), while W8A8 activation quantization accelerates prefill by ~12.7%. This trade-off is critical for local deployment where memory bandwidth dictates throughput.
- State Verification Loop: Agents operating in GUI environments suffer from asynchronous UI updates. Verifying state transitions after each action prevents cascading failures from missed clicks or delayed renders.
- Ephemeral Cloud Sessions: Cloud sandboxes are provisioned and torn down per task to enforce isolation, contain blast radius, and align with per-session billing models. Persistent state must be explicitly serialized if required.
## Pitfall Guide
1. Treating the Screen as a Synchronous API
Explanation: GUI elements render asynchronously. Assuming immediate state changes after dispatching a click or keystroke causes the agent to act on stale observations.
Fix: Implement a polling or event-driven state verification loop with configurable timeouts. Validate that target elements exist and are interactive before proceeding.
2. Ignoring Hardware Memory Ceilings
Explanation: Vision-language models with high-resolution screen capture consume significant VRAM/RAM. Exceeding unified memory limits triggers swapping, collapsing throughput to unusable levels.
Fix: Profile memory usage during peak observation windows. Enforce hardware thresholds at initialization. Use W8A16/W8A8 quantization to reduce footprint without sacrificing task completion rates.
3. Synchronous Action Blocking
Explanation: Dispatching input events synchronously blocks the reasoning loop, preventing concurrent observation or fallback routing.
Fix: Use async action queues with independent execution threads. Decouple perception from actuation so the model can reason about UI state while actions are being processed.
4. Assuming Cloud Sandbox Parity for Local Apps
Explanation: Cloud environments lack native desktop applications, registry access, and local file systems. Tool schemas designed for web APIs fail when mapped to desktop interactions.
Fix: Maintain environment-specific tool registries. Map high-level intents to platform-specific actions (e.g., `open_file` → `os.startfile` on Windows, `open -a` on macOS).
5. Hybrid Sync Drift
Explanation: In cloud-brain/local-hands architectures, network latency causes action receipts to arrive out of order. The cloud model may issue subsequent commands before the local daemon completes previous ones.
Fix: Implement idempotent action receipts with sequence numbers. Use heartbeat monitoring to detect daemon unresponsiveness and trigger automatic fallback or session reset.
6. Over-Provisioning Local Compute
Explanation: Running 72B-class models locally for simple routing tasks wastes hardware and increases power consumption. Not all tasks require maximum model capacity.
Fix: Deploy tiered models. Use 4B quantized variants for routine UI navigation and escalation logic, reserving larger models for complex planning or ambiguous visual grounding. Route dynamically based on task complexity scores.
7. Neglecting Compliance Boundaries
Explanation: Accidentally routing PII or internal documents through cloud sandboxes violates data residency requirements and triggers legal blockers.
Fix: Enforce data classification at the task ingestion layer. Tag tasks with containsPII, internalOnly, or publicWeb. The router must reject cloud backend selection for strict-local compliance zones.
## Production Bundle
### Action Checklist
### Decision Matrix
| Scenario | Recommended Approach | Why | Cost Impact |
|---|---|---|---|
| Web scraping, API orchestration, public data tasks | Cloud Sandbox | Ephemeral isolation, instant horizontal scale, no local hardware dependency | Variable (per-session/token), scales linearly with concurrency |
| Cross-app desktop automation, legacy ERP interaction, strict data residency | Local GUI | Full environment access, zero data egress, fixed compute amortization | Fixed (hardware procurement), marginal cost per additional session |
| Mixed workloads, compliance-sensitive routing, tiered model deployment | Hybrid | Flexibility to route by task type, right-sized compute, fallback resilience | Mixed (cloud egress + local hardware), highest operational overhead |
| Prototyping, CI/CD integration, demo environments | Cloud Sandbox | Zero hardware setup, managed orchestration, rapid iteration | Low initial cost, scales predictably for short-lived tasks |
| Long-running automations, high-frequency inference, offline requirements | Local GUI | Eliminates per-token pricing, operates without network dependency, consistent latency | High initial hardware cost, near-zero marginal inference cost |
### Configuration Template
# Runtime-wide settings.
runtime:
  compliance_zone: "strict-local" # strict-local | perimeter-cloud | hybrid
  fallback_enabled: true
  hardware_threshold_gb: 32
  quantization_mode: "W8A16" # W8A16 | W8A8 | FP16

# Which backend each task category is sent to.
routing:
  web_native_tasks: "cloud"
  local_app_tasks: "local"
  pii_containing_tasks: "local"
  concurrency_threshold: 10
  fallback_backend: "cloud"

# Verification, budget, and alerting limits.
monitoring:
  state_verification_timeout_ms: 2000
  action_receipt_sequence_tracking: true
  cloud_session_budget_usd_per_hour: 15.00
  local_memory_pressure_alert_gb: 28

# One model per tier; the two tiers are siblings under model_tiers.
model_tiers:
  routing:
    model: "mano-p-4b-quantized"
    quantization: "W8A8"
    target_throughput_toks_per_sec: 120
  planning:
    model: "mano-p-72b-eval"
    quantization: "W8A16"
    target_throughput_toks_per_sec: 80
### Quick Start Guide
- Provision Hardware or Cloud Credentials: Ensure local machines meet the 32GB RAM minimum (M4/M5 series recommended) or configure cloud sandbox API keys and billing alerts.
- Install Runtime Dependencies: Deploy the quantized model weights, screen capture utilities, and input dispatch libraries. Verify unified memory bandwidth and GPU/driver compatibility.
- Initialize the Router: Load the configuration template, set compliance boundaries, and run a dry-run task to validate state observation and action dispatch pipelines.
- Deploy Tiered Models: Load the 4B quantized variant for routing and simple UI navigation. Reserve the 72B evaluation configuration for complex planning or ambiguous visual grounding tasks.
- Monitor and Iterate: Track state verification success rates, action receipt latency, and hardware memory pressure. Adjust quantization modes and routing thresholds based on production telemetry.