, "https://integrate.api.nvidia.com/v1"),
api_key=os.getenv("NIM_API_KEY")
)
# Model identifier passed as the `model` argument of every chat-completion request.
TARGET_MODEL = "meta/llama-3.3-70b-instruct"
### Step 2: Tool Definition & Schema Generation
Tools should be plain functions with explicit type hints. We will use Pydantic to auto-generate the JSON schema, eliminating manual schema maintenance and ensuring runtime validation matches the model's expectations.
```python
import json
from datetime import datetime, timezone
from pydantic import BaseModel, Field
from typing import Optional
class TimeQuery(BaseModel):
    # Argument model for the `fetch_system_time` tool: its JSON schema is what
    # build_tool_schemas() publishes, and run_agent_cycle() instantiates it to
    # validate the model-supplied arguments before dispatch.
    timezone: str = Field(
        default="UTC",
        description="IANA timezone identifier (e.g., America/New_York, Europe/London)",
    )
class KnowledgeQuery(BaseModel):
    # Argument model for the `query_document_store` tool. `query` has no
    # default, so constructing the model without it fails validation.
    query: str = Field(
        description="Search phrase for internal documentation or policy databases",
    )
def fetch_system_time(params: TimeQuery) -> str:
    """Return the current date and time for an IANA timezone (falls back to UTC)."""
    # NOTE: the docstring is kept to one line because build_tool_schemas()
    # forwards it to the model as this tool's description.

    # Imported here so the fix does not depend on a file-level import.
    from zoneinfo import ZoneInfo

    now_utc = datetime.now(timezone.utc)
    try:
        # `datetime.timezone` only accepts a fixed `timedelta` offset, so the
        # previous `timezone(params.timezone)` raised for every zone name and
        # the function always answered in UTC. `ZoneInfo` resolves IANA names.
        local_now = now_utc.astimezone(ZoneInfo(params.timezone))
    except (KeyError, ValueError, TypeError, OSError):
        # Unknown, malformed or unreadable zone key: deliberate best-effort
        # fallback to UTC (ZoneInfoNotFoundError is a KeyError subclass).
        local_now = now_utc
    return local_now.strftime("%Y-%m-%d %H:%M:%S %Z")
def query_document_store(params: KnowledgeQuery) -> str:
    """Search internal documentation or policy databases for the given phrase."""
    # The one-line docstring doubles as the tool description: build_tool_schemas()
    # reads `func.__doc__` and otherwise falls back to the generic
    # "Execute query_document_store", which gives the model nothing to route on.
    #
    # Placeholder for vector retrieval or search API.
    # In production, this would call a RAG pipeline or search index.
    return f"Retrieved 3 relevant chunks for: '{params.query}'"
```

### Step 3: Dispatch Registry & Schema Assembly
The model never sees Python functions. It only sees JSON schemas. We must maintain a strict 1:1 mapping between schema names and executable functions.
# Name -> callable lookup for every tool exposed to the model. Keys are taken
# from the functions' own names, so the published schema name and the
# dispatched function always refer to the same tool.
TOOL_REGISTRY = {
    tool.__name__: tool
    for tool in (fetch_system_time, query_document_store)
}
def build_tool_schemas() -> list[dict]:
    """Build the `tools` list passed to the chat-completion request.

    For every entry in ``TOOL_REGISTRY`` the parameter schema is generated
    from the class that annotates the tool's first parameter. The previous
    implementation hard-coded the two known tool names, so any additional
    registered tool would have been published with ``KnowledgeQuery``'s schema.

    Returns:
        One ``{"type": "function", "function": {...}}`` dict per registered
        tool, in registry order.

    Raises:
        TypeError: If a registered tool's first parameter is not annotated
            with a class that provides ``model_json_schema()``.
    """
    # Local imports keep this block independent of the file-level import list.
    import inspect
    from typing import get_type_hints

    schemas = []
    for name, func in TOOL_REGISTRY.items():
        param_names = list(inspect.signature(func).parameters)
        # get_type_hints also resolves string annotations, unlike
        # reading func.__annotations__ directly.
        params_model = get_type_hints(func).get(param_names[0]) if param_names else None
        if not hasattr(params_model, "model_json_schema"):
            raise TypeError(
                f"Tool '{name}' must annotate its first parameter with a Pydantic model."
            )
        schemas.append({
            "type": "function",
            "function": {
                "name": name,
                # getdoc() strips docstring indentation before it reaches the model.
                "description": inspect.getdoc(func) or f"Execute {name}",
                "parameters": params_model.model_json_schema(),
            },
        })
    return schemas
### Step 4: The Execution Loop
The loop manages conversation state, handles tool invocations, and enforces termination. Key architectural decisions:
- `tool_choice="auto"` allows the model to bypass tools when unnecessary.
- `tool_call_id` binding ensures the model can correlate results with its original request.
- A hard iteration cap prevents infinite routing spirals.
- Temperature is capped at 0.2 to reduce routing variance.
# Upper bound on model round-trips per request: run_agent_cycle() returns a
# fixed "iteration limit" message if no final answer arrives within this many.
MAX_ITERATIONS = 3
# System message placed first in every conversation built by run_agent_cycle().
SYSTEM_PROMPT = """You are an infrastructure assistant. Use available tools when the query requires external data.
If tools return insufficient information, respond with: 'Data unavailable. Escalate to human review.'
Never fabricate tool results or call the same tool consecutively for identical parameters."""
def _execute_tool_call(tool_name: str, raw_arguments) -> str:
    """Validate model-supplied arguments and run one registered tool.

    Never raises for bad input: every failure is returned as a short error
    string so it can be sent back to the model as the tool result.
    """
    # Local imports keep this block independent of the file-level import list.
    import inspect
    from typing import get_type_hints

    tool = TOOL_REGISTRY.get(tool_name)
    if tool is None:
        return f"Error: Tool '{tool_name}' not registered."

    if not raw_arguments or not str(raw_arguments).strip():
        # Absent or blank arguments mean "no arguments"; the tool's own
        # defaults (or its validation error) then apply, as before.
        args = {}
    else:
        try:
            args = json.loads(raw_arguments)
        except (json.JSONDecodeError, TypeError) as exc:
            # Previously malformed JSON was silently replaced by {}, which ran
            # the tool with defaults and hid the problem from the model.
            return f"Error: Arguments for '{tool_name}' are not valid JSON: {exc}"
    if not isinstance(args, dict):
        return f"Error: Arguments for '{tool_name}' must be a JSON object."

    try:
        # Look the tool up in the registry and validate against the model that
        # annotates its first parameter, instead of an if/else on two
        # hard-coded names that sent every other tool to query_document_store.
        param_names = list(inspect.signature(tool).parameters)
        params_model = get_type_hints(tool)[param_names[0]]
        return str(tool(params_model(**args)))
    except Exception as exc:
        # Deliberately broad: this is the error boundary between tool code and
        # the conversation; the model receives the message, not a traceback.
        return f"Execution failed: {str(exc)}"


def run_agent_cycle(user_input: str) -> str:
    """Run the tool-calling loop for one user request.

    Sends the conversation to the model, executes any tool calls it returns,
    appends each result bound to its ``tool_call_id``, and repeats until the
    model answers without tool calls or ``MAX_ITERATIONS`` is reached.

    Args:
        user_input: The user's message for this request.

    Returns:
        The model's final text, ``"No response generated."`` if that text is
        empty, or ``"Iteration limit reached. Routing failed to converge."``
        when the loop ends without a final answer.
    """
    conversation_history = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": user_input},
    ]
    tool_definitions = build_tool_schemas()

    for _ in range(MAX_ITERATIONS):
        response = nim_client.chat.completions.create(
            model=TARGET_MODEL,
            messages=conversation_history,
            tools=tool_definitions,
            tool_choice="auto",
            temperature=0.2,
            max_tokens=512,
        )
        assistant_msg = response.choices[0].message
        # exclude_none keeps unset fields out of the replayed history.
        conversation_history.append(assistant_msg.model_dump(exclude_none=True))

        if not assistant_msg.tool_calls:
            return assistant_msg.content or "No response generated."

        for call in assistant_msg.tool_calls:
            tool_name = call.function.name
            conversation_history.append({
                "role": "tool",
                # Must be the id issued by the model so it can match the result.
                "tool_call_id": call.id,
                "name": tool_name,
                "content": _execute_tool_call(tool_name, call.function.arguments),
            })

    return "Iteration limit reached. Routing failed to converge."
### Step 5: Invocation & Fallback Behavior
The loop naturally handles multi-tool requests. If the model calls both tools in a single turn, each result is appended with its corresponding tool_call_id, and the next iteration receives a consolidated context. The system prompt enforces graceful degradation when tools yield no actionable data.
# Smoke-test prompts, one per line of behaviour. The last one matches neither
# registered tool -- presumably there to exercise the fallback reply.
test_queries = [
    "What is the current time in Tokyo?",
    "Find the deployment policy for staging environments",
    "Reset the production database immediately",
]

for q in test_queries:
    print(f"Input: {q}")
    reply = run_agent_cycle(q)
    print(f"Output: {reply}\n")
Pitfall Guide
1. Mismatched tool_call_id Binding
Explanation: Failing to match the tool_call_id from the model's response when appending tool results causes the model to treat the output as unstructured text, breaking the routing chain.
Fix: Always extract call.id from message.tool_calls and pass it exactly as tool_call_id in the role="tool" message. Never generate or modify this ID.
2. Schema Ambiguity & Routing Collisions
Explanation: Vague tool descriptions or overlapping parameter names cause the model to route to the wrong function. This is especially common when tools share similar intents (e.g., search_docs vs query_knowledge_base).
Fix: Use explicit, mutually exclusive descriptions. Include negative constraints in the schema (e.g., "Do not use for time queries"). Validate schemas against test prompts before deployment.
3. Unbounded Iteration Spirals
Explanation: Without a hard cap, models can enter recursive tool-calling loops, exhausting token budgets and incurring unnecessary costs.
Fix: Implement a strict iteration limit (3–5 is standard). Log each iteration count and trigger alerts when the cap is consistently hit, indicating a routing or schema issue.
4. State Contamination Across Turns
Explanation: Appending assistant messages without filtering out None values, or with internal metadata still attached, pollutes the context window, increasing latency and confusing the model.
Fix: Use message.model_dump(exclude_none=True) or equivalent serialization. Strip internal fields before appending to history. Maintain a clean conversation buffer separate from internal state.
5. Temperature-Induced Routing Instability
Explanation: High temperature values (>0.5) increase token variance, causing the model to hallucinate tool names or misparse JSON parameters.
Fix: Cap temperature at 0.2–0.3 for tool-calling turns. Use higher temperatures only for final answer generation if creative phrasing is required.
6. Missing Error Boundaries in Dispatch
Explanation: Unhandled exceptions in tool execution crash the loop or return raw tracebacks to the model, which may attempt to "fix" the error by calling the tool again with modified parameters.
Fix: Wrap dispatch calls in try/except blocks. Return structured error messages (e.g., Execution failed: timeout) instead of stack traces. Implement retry logic at the application layer, not the model layer.
7. Ignoring Token Budget in History Accumulation
Explanation: Each tool call adds multiple messages to the context window. Without pruning or truncation, long sessions exceed model limits, causing silent failures or degraded routing.
Fix: Implement sliding window truncation or summarize older tool results. Monitor token count per turn and enforce a hard context limit. Use max_tokens strategically to reserve space for tool responses.
Production Bundle
Action Checklist
Decision Matrix
| Scenario | Recommended Approach | Why | Cost Impact |
|---|---|---|---|
| Simple routing (1–3 tools) | Bare-metal loop | Minimal overhead, full state control, easier debugging | Low (baseline token usage) |
| Complex multi-agent workflows | Framework orchestration | Built-in state management, retry logic, and sub-agent routing | High (20–30% token overhead + framework licensing) |
| High-throughput production | Bare-metal + async dispatch | Parallel tool execution reduces latency; custom loop avoids framework bottlenecks | Medium (requires engineering effort for observability) |
| Rapid prototyping | Framework orchestration | Faster iteration, pre-built integrations, reduced boilerplate | Low initial, high long-term (vendor lock-in, hidden costs) |
Configuration Template
# agent_config.py
import os
from openai import OpenAI
class AgentConfig:
    """Central settings for the tool-calling agent loop.

    Every value is a class attribute evaluated once, when the class body runs;
    the two connection settings are read from the environment at that point.
    """

    # Base URL handed to the OpenAI client; overridable through NIM_BASE_URL.
    NIM_BASE_URL = os.getenv("NIM_BASE_URL", "https://integrate.api.nvidia.com/v1")
    # Credential for that endpoint; None when NIM_API_KEY is not exported.
    API_KEY = os.getenv("NIM_API_KEY")
    TARGET_MODEL = "meta/llama-3.3-70b-instruct"
    MAX_ITERATIONS = 3
    TEMPERATURE = 0.2
    MAX_TOKENS = 512
    TOOL_CHOICE = "auto"
    SYSTEM_PROMPT = """You are a deterministic assistant. Use tools only when explicitly required.
If tools cannot satisfy the query, respond with: 'Insufficient data. Escalate to human review.'
Maintain strict parameter validation. Never fabricate tool outputs."""

    @classmethod
    def get_client(cls) -> OpenAI:
        """Create an OpenAI client bound to the configured endpoint and key.

        Returns:
            A client constructed with ``NIM_BASE_URL`` and ``API_KEY``.

        Raises:
            RuntimeError: If ``API_KEY`` is empty or unset.
        """
        if not cls.API_KEY:
            # With api_key=None the SDK falls back to OPENAI_API_KEY, which
            # would either send an unrelated credential to this endpoint or
            # fail with an error that never mentions NIM_API_KEY.
            raise RuntimeError(
                "NIM_API_KEY is not set; export it before creating the client."
            )
        return OpenAI(base_url=cls.NIM_BASE_URL, api_key=cls.API_KEY)
Quick Start Guide
- Install dependencies: `pip install openai pydantic`
- Set environment variables: Export `NIM_BASE_URL` and `NIM_API_KEY` with your NVIDIA NIM credentials.
- Define tools: Implement plain Python functions with Pydantic models for parameter validation.
- Initialize the loop: Use the provided execution template, ensuring `tool_call_id` binding and iteration caps are enforced.
- Test routing: Run 10–20 diverse prompts to validate tool selection accuracy, error handling, and fallback behavior before scaling.