response = await self.client.post("/api/chat", json=payload)
response.raise_for_status()
data = response.json()
return data.get("message", {}).get("content", "")
async def close(self):
    """Close the client held in ``self.client``.

    Awaits ``self.client.aclose()``; call this once when the owner of the
    client is shutting down so the client can release what it holds.
    """
    await self.client.aclose()
**Architecture Rationale**: Direct async HTTP calls to Ollama avoid unnecessary middleware. Pydantic enforces strict payload shapes, preventing malformed requests from reaching the model. Context window tuning (`num_ctx`) ensures local models handle larger codebase snippets without truncation.
### Step 2: Secure Tool Registry & Execution Engine
Hardcoded `if/elif` routing is unmaintainable. A decorator-based registry enables dynamic tool discovery, input validation, and output sanitization.
```python
# src/tools/registry.py
import re
import subprocess
from functools import wraps
from typing import Any, Callable, Dict, List

from pydantic import BaseModel, Field
class ToolSchema(BaseModel):
    """Description of one registered tool, as returned by ``get_schemas``."""

    # Name the tool is registered and looked up under.
    name: str
    # Human-readable summary of what the tool does.
    description: str
    # Parameter specification for the tool; stored as given at registration.
    parameters: Dict[str, Any]
class ToolRegistry:
    """Registry mapping tool names to async callables and their schemas.

    Tools are added with the :meth:`register` decorator and invoked by name
    through :meth:`execute`, which reports every failure as a string instead
    of raising so the result can always be fed back to the model.
    """

    def __init__(self):
        # Tool name -> wrapped coroutine function.
        self._tools: Dict[str, Callable] = {}
        # Tool name -> schema describing the tool.
        self._schemas: Dict[str, ToolSchema] = {}

    def register(self, name: str, description: str, parameters: Dict[str, Any]):
        """Return a decorator that registers an async function as tool *name*.

        Args:
            name: Key under which the tool is stored and later executed.
            description: Summary stored in the tool's schema.
            parameters: Parameter specification stored in the tool's schema.

        Returns:
            A decorator; the decorated name is rebound to the wrapper, which
            forwards all arguments to the original coroutine function.
        """
        def decorator(func: Callable):
            @wraps(func)
            async def wrapper(*args, **kwargs):
                # Forward positional arguments too: the decorator replaces the
                # module-level function, so direct callers must keep working.
                return await func(*args, **kwargs)

            # Build the schema first so an invalid schema cannot leave a tool
            # registered without a matching schema entry.
            schema = ToolSchema(name=name, description=description, parameters=parameters)
            self._tools[name] = wrapper
            self._schemas[name] = schema
            return wrapper
        return decorator

    def get_schemas(self) -> List[Dict[str, Any]]:
        """Return every registered schema as a plain dictionary."""
        return [schema.model_dump() for schema in self._schemas.values()]

    async def execute(self, name: str, kwargs: Dict[str, Any]) -> str:
        """Run the tool registered as *name* with keyword arguments *kwargs*.

        Args:
            name: Registered tool name. Anything that is not a known string
                name is reported as "not found" rather than raising.
            kwargs: Keyword arguments for the tool; ``None`` or an empty value
                is treated as "no arguments".

        Returns:
            ``str()`` of the tool's result, or an error message when the tool
            is unknown or raises.
        """
        # Model output is untrusted: a non-string (possibly unhashable) name
        # must not reach the dict lookup.
        tool = self._tools.get(name) if isinstance(name, str) else None
        if tool is None:
            return f"Error: Tool '{name}' not found."
        try:
            result = await tool(**(kwargs or {}))
            return str(result)
        except Exception as e:
            # Boundary between model-chosen input and tool code: convert any
            # failure into text the caller can show or feed back.
            return f"Execution error in '{name}': {str(e)}"
# Shared module-level registry: tool modules import it to register themselves
# and the orchestrator imports it to execute tools by name.
registry = ToolRegistry()
```

### Step 3: Command Execution with Safety Guardrails

Raw subprocess calls are dangerous. Implement an allowlist, timeout enforcement, and output truncation.

```python
# src/tools/shell.py
import subprocess
import asyncio
from .registry import registry
# Programs the agent may launch; execute_command rejects any command whose
# first token is not in this set.
ALLOWED_COMMANDS = {"git", "ls", "cat", "grep", "find", "tree", "npm", "yarn", "docker", "make"}
# Maximum number of output characters returned; longer output is cut and a
# note with the number of dropped characters is appended.
MAX_OUTPUT_CHARS = 2000
@registry.register(
    name="execute_command",
    description="Run a safe shell command in the current working directory",
    parameters={"type": "object", "properties": {"cmd": {"type": "string"}}, "required": ["cmd"]}
)
async def execute_command(cmd: str) -> str:
    """Run an allowlisted program and return its (possibly truncated) output.

    The command line is tokenised with ``shlex`` and executed directly,
    without a shell. Shell metacharacters (``;``, ``|``, ``&&``, ``$()``,
    redirections, globs) are therefore passed to the program as literal
    arguments and cannot be used to chain a second, non-allowlisted command
    behind an allowed one.

    Args:
        cmd: Command line whose first token must be in ``ALLOWED_COMMANDS``.

    Returns:
        Stripped stdout (stderr when stdout is empty), truncated to
        ``MAX_OUTPUT_CHARS``, or a message describing why the command was
        rejected, timed out, or failed to start.
    """
    import shlex  # Local import keeps this snippet self-contained.

    try:
        argv = shlex.split(cmd or "")
    except ValueError as e:
        # Unbalanced quotes and similar: refuse rather than guess.
        return f"Security violation: Command could not be parsed ({e})."

    # Empty or whitespace-only input yields no tokens and is rejected below.
    base_cmd = argv[0] if argv else ""
    if base_cmd not in ALLOWED_COMMANDS:
        return f"Security violation: Command '{base_cmd}' is not in the allowlist."

    # NOTE(review): several allowlisted programs (e.g. `find -exec`, `make`,
    # `npm run`, `docker run`) can themselves launch arbitrary programs; the
    # allowlist only constrains the top-level executable.
    try:
        proc = await asyncio.create_subprocess_exec(
            *argv,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
            cwd="."
        )
        try:
            stdout, stderr = await asyncio.wait_for(proc.communicate(), timeout=15.0)
        except asyncio.TimeoutError:
            # wait_for only cancels the read; the child keeps running unless
            # it is killed and reaped here.
            try:
                proc.kill()
            except ProcessLookupError:
                pass  # Already exited between the timeout and the kill.
            await proc.wait()
            return "Command timed out after 15 seconds."

        # errors="replace" keeps binary or non-UTF-8 output from failing the call.
        output = (
            stdout.decode(errors="replace").strip()
            or stderr.decode(errors="replace").strip()
        )
        if len(output) > MAX_OUTPUT_CHARS:
            output = output[:MAX_OUTPUT_CHARS] + f"\n... [truncated {len(output) - MAX_OUTPUT_CHARS} chars]"
        return output
    except Exception as e:
        # Tool boundary: report failures (e.g. executable not found) as text.
        return f"Shell execution failed: {str(e)}"
```

### Step 4: Terminal Multiplexer Integration

tmux provides pane isolation and output capture. The agent should target specific panes, send commands, and poll results without blocking the main event loop.

```python
# src/integrations/tmux.py
import subprocess
import time
from typing import Optional
class TmuxController:
    """Synchronous helper that drives one tmux session through the ``tmux`` CLI."""

    def __init__(self, session: str = "dev-agent"):
        """Remember the session name and make sure the session exists."""
        self.session = session
        self._ensure_session()

    def _ensure_session(self):
        """Start a detached session unless ``tmux has-session`` reports success."""
        probe = subprocess.run(
            ["tmux", "has-session", "-t", self.session],
            capture_output=True,
        )
        # A non-zero exit status means the session does not exist yet.
        if probe.returncode != 0:
            subprocess.run(["tmux", "new-session", "-d", "-s", self.session])

    def split_pane(self, target: str = "0", vertical: bool = False) -> str:
        """Split ``<session>:<target>`` and return that same target string.

        Args:
            target: Target within the session, appended after the colon.
            vertical: Pass ``-v`` to ``split-window`` when true, else ``-h``.

        Returns:
            The ``<session>:<target>`` string that was split (not an
            identifier of the newly created pane).
        """
        location = f"{self.session}:{target}"
        if vertical:
            orientation = "-v"
        else:
            orientation = "-h"
        subprocess.run(["tmux", "split-window", orientation, "-t", location])
        return location

    def send_command(self, pane: str, command: str) -> None:
        """Send *command* followed by the Enter key to *pane*."""
        argv = ["tmux", "send-keys", "-t", pane, command, "Enter"]
        subprocess.run(argv)

    def capture_output(self, pane: str, lines: int = 50) -> str:
        """Return the stripped text of *pane*, starting *lines* lines back."""
        start_line = f"-{lines}"
        argv = ["tmux", "capture-pane", "-p", "-S", start_line, "-t", pane]
        completed = subprocess.run(argv, capture_output=True, text=True)
        return completed.stdout.strip()
```

### Step 5: Orchestrator Assembly

The main loop routes user input to the LLM, parses function calls, executes tools, and feeds results back for final reasoning.

```python
# src/orchestrator.py
import asyncio
import json
from typing import List
from .inference.router import InferenceRouter, InferenceRequest, Message
from .tools.registry import registry
from .tools.shell import execute_command
from .integrations.tmux import TmuxController
class TerminalOrchestrator:
    """Run chat turns against a local model, executing at most one tool per turn.

    Each turn sends the conversation to the model. If the reply contains a
    fenced ``json`` block it is treated as a tool call: the tool is executed
    through the registry, its output is appended to the history, and the model
    is asked again for the final answer.
    """

    # Sent with every inference request so the second pass (which carries the
    # tool output) gets the same context size as the first.
    _REQUEST_OPTIONS = {"num_ctx": 4096}

    def __init__(self, model: str = "llama3"):
        """Create the router and tmux controller and seed the system prompt."""
        self.router = InferenceRouter()
        self.tmux = TmuxController()
        self.model = model
        self.history: List[Message] = [
            Message(role="system", content="You are a terminal automation assistant. Use available tools to execute safe commands. Return concise results.")
        ]

    def _build_request(self):
        """Build an inference request for the current history and model."""
        return InferenceRequest(
            model=self.model,
            messages=self.history,
            options=dict(self._REQUEST_OPTIONS),
        )

    @staticmethod
    def _parse_tool_call(response_text: str):
        """Extract ``(tool_name, tool_args)`` from the first fenced json block.

        Raises:
            ValueError: The block is not valid JSON, is not a JSON object,
                lacks a string ``tool`` field, or has non-object ``args``.
        """
        json_block = response_text.split("```json")[1].split("```")[0].strip()
        tool_call = json.loads(json_block)  # JSONDecodeError is a ValueError.
        if not isinstance(tool_call, dict):
            raise ValueError("tool call must be a JSON object")
        tool_name = tool_call.get("tool")
        if not isinstance(tool_name, str):
            raise ValueError("tool call needs a string 'tool' field")
        # A missing or null "args" means "no arguments".
        tool_args = tool_call.get("args") or {}
        if not isinstance(tool_args, dict):
            raise ValueError("'args' must be a JSON object")
        return tool_name, tool_args

    async def run(self, user_input: str) -> str:
        """Process one user message and return the assistant's reply.

        Args:
            user_input: Text of the user's message; appended to the history.

        Returns:
            The model's answer, or a ``Tool parsing failed: ...`` message when
            the reply contained a json block that is not a usable tool call.

        Raises:
            Whatever ``self.router.generate`` raises, on either pass; only
            tool-call parsing errors are converted to a returned message.
        """
        self.history.append(Message(role="user", content=user_input))
        response_text = await self.router.generate(self._build_request())

        # No tool call requested: the first reply is the final answer.
        if "```json" not in response_text:
            self.history.append(Message(role="assistant", content=response_text))
            return response_text

        # Keep the try limited to parsing so inference or tool failures are
        # never reported as parsing failures.
        try:
            tool_name, tool_args = self._parse_tool_call(response_text)
        except ValueError as e:
            return f"Tool parsing failed: {str(e)}"

        tool_output = await registry.execute(tool_name, tool_args)
        self.history.append(Message(role="assistant", content=f"Tool output: {tool_output}"))

        # Second pass: let the model turn the tool output into the final answer.
        final_response = await self.router.generate(self._build_request())
        self.history.append(Message(role="assistant", content=final_response))
        return final_response

    async def shutdown(self):
        """Close the inference router."""
        await self.router.close()
```

**Architecture Decisions**:
- Async-first: Prevents terminal freezing during model inference or long-running shell commands.
- Schema-driven tools: Pydantic validation catches malformed arguments before execution.
- Allowlisted execution: Blocks destructive commands (`rm`, `sudo`, `curl | sh`) by default.
- Two-pass reasoning: Local models often struggle with single-shot function calling. Capturing tool output and feeding it back improves accuracy.
Pitfall Guide
| Pitfall | Explanation | Fix |
|---|---|---|
| Unrestricted Shell Execution | Passing model output directly to subprocess enables command injection or accidental data loss. | Implement a strict allowlist, validate arguments, and run commands in a sandboxed directory with limited permissions. |
| Blocking I/O in CLI | Synchronous HTTP calls or shell execution freeze the terminal, degrading UX. | Use asyncio and httpx/aiofiles. Stream responses where possible and enforce timeouts. |
| Token Overflow from Tool Output | Large file listings or build logs exceed local context windows, causing silent truncation. | Truncate output at a configurable limit, summarize long results, or paginate tool responses. |
| Hardcoded Secrets & Keys | Embedding API keys or tokens in scripts exposes them to version control or process listings. | Use environment variables, python-dotenv, or OS keyrings. Never log credentials. |
| tmux State Desync | Sending commands to wrong panes or capturing stale output breaks automation flows. | Explicitly target panes by index, verify pane existence before sending, and poll output with retry logic. |
| Ignoring Local Model Limits | llama3 and codellama:7b have finite context windows; feeding entire repos causes degradation. | Chunk codebases, use ripgrep/fd for targeted searches, and implement RAG-style retrieval for large projects. |
| Poor Error Recovery | Silent failures or generic exceptions leave the agent in an inconsistent state. | Implement structured logging, retry mechanisms with exponential backoff, and fallback to manual prompts on critical failures. |
Production Bundle
Action Checklist
Decision Matrix
| Scenario | Recommended Approach | Why | Cost Impact |
|---|---|---|---|
| Solo developer / offline work | Local-only agent with llama3 | Zero API cost, works without internet, low latency | ~$0/month |
| Team CI/CD automation | Hybrid agent (local routing + cloud fallback) | Balances speed with complex reasoning needs | $5–$20/month |
| Air-gapped enterprise | Local-only + custom fine-tuned model | Meets compliance, no data exfiltration | Hardware + maintenance |
| High-throughput devops | Cloud-optimized agent with streaming | Handles massive logs, scales horizontally | $50–$200/month |
Configuration Template

```yaml
# agent.config.yaml
# Settings are grouped by section; keys must be indented under their section
# header or they are parsed as unrelated top-level keys.

# Model endpoint and request limits.
inference:
  model: "llama3"
  base_url: "http://localhost:11434"
  context_window: 4096
  timeout_seconds: 30

# Guardrails for command execution.
safety:
  allowed_commands:
    - git
    - ls
    - cat
    - grep
    - find
    - tree
    - npm
    - yarn
    - docker
    - make
  max_output_chars: 2000
  working_directory: "."

# tmux session used by the agent.
tmux:
  session_name: "dev-agent"
  pane_layout: "horizontal"
  capture_lines: 50

# Log output settings.
logging:
  level: "INFO"
  format: "%(asctime)s | %(levelname)s | %(message)s"
  file: "agent.log"
```

Quick Start Guide
- Install dependencies: `pip install typer pydantic httpx` (`asyncio` is part of the Python standard library and should not be installed from PyPI)
- Start Ollama: `ollama serve` (runs in background)
- Pull models: `ollama pull llama3` and `ollama pull codellama:7b`
- Run agent: `python -m src.orchestrator "Show git status and list recent commits"`
- Verify output: Agent routes to local model, executes `git status`, returns formatted result within 200–400ms.
By treating terminal AI agents as production systems rather than experimental scripts, developers gain reliable, low-latency automation that respects security boundaries and scales with project complexity. The architecture outlined here provides a foundation for offline workflows, secure devops automation, and context-aware development assistance without vendor lock-in.