, reserve: usize) -> Self {
Self {
max_tokens,
response_reserve: reserve,
direction: TrimDirection::FromStart,
}
}
/// Returns this budget with its trim direction replaced by `dir`.
///
/// Takes the budget by value and hands it back, so the call can be chained.
pub fn with_direction(self, dir: TrimDirection) -> Self {
    let mut updated = self;
    updated.direction = dir;
    updated
}
}
### Step 2: Implement Atomic Pair Detection
Tool calls and results must be treated as indivisible units. The truncation engine scans the message array and tags paired sequences before applying any slice operations.
```rust
/// Role of a message within an agent conversation.
///
/// The enum carries no data, so it is `Copy` and can be compared and hashed freely.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum MessageRole {
    /// The system prompt.
    System,
    /// A user turn.
    User,
    /// An assistant turn.
    Assistant,
    /// A tool invocation; linked to its `ToolResult` through `call_id`.
    ToolCall,
    /// The output of a tool invocation; linked to its `ToolCall` through `call_id`.
    ToolResult,
}

/// A single conversation message.
///
/// Derives `PartialEq`/`Eq` so messages can be compared with `==`, e.g. when
/// locating a message inside a slice with `Iterator::position`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct AgentMessage {
    /// Who produced the message.
    pub role: MessageRole,
    /// Text payload; this is what token estimators measure.
    pub content: String,
    /// Links ToolCall <-> ToolResult.
    pub call_id: Option<String>,
}
/// Groups messages into atomic blocks for safe truncation.
///
/// A block is a contiguous run of messages that must be kept or dropped together:
///
/// * Every `ToolCall` is matched with the next unclaimed `ToolResult` that carries the
///   same `call_id`. The call, its result, and everything in between form one block.
/// * Spans that overlap (for example several tool calls issued back-to-back before
///   their results arrive) are merged into a single block, so no pair is ever split.
/// * Every other message (including unanswered calls and orphaned results) becomes a
///   block of its own, so truncation can work message by message.
///
/// Blocks are returned in conversation order and together contain every input message
/// exactly once. An empty input yields an empty vector.
fn build_atomic_blocks(messages: &[AgentMessage]) -> Vec<Vec<AgentMessage>> {
    // span_end[i] is the index of the last message that has to travel with message i.
    let mut span_end: Vec<usize> = (0..messages.len()).collect();

    // Tool calls still waiting for a result, keyed by call id. When an id is reused,
    // the most recent call is matched first, so an old unanswered call cannot stretch
    // a block across the whole conversation.
    let mut open_calls: std::collections::HashMap<Option<&str>, Vec<usize>> =
        std::collections::HashMap::new();

    for (idx, msg) in messages.iter().enumerate() {
        match msg.role {
            MessageRole::ToolCall => {
                open_calls
                    .entry(msg.call_id.as_deref())
                    .or_default()
                    .push(idx);
            }
            MessageRole::ToolResult => {
                let call_idx = open_calls
                    .get_mut(&msg.call_id.as_deref())
                    .and_then(|waiting| waiting.pop());
                if let Some(call_idx) = call_idx {
                    span_end[call_idx] = idx;
                }
            }
            MessageRole::System | MessageRole::User | MessageRole::Assistant => {}
        }
    }

    // Sweep left to right, extending each block while any message inside it points
    // further to the right (this is what merges overlapping call/result spans).
    let mut blocks: Vec<Vec<AgentMessage>> = Vec::new();
    let mut start = 0;
    while start < messages.len() {
        let mut end = span_end[start];
        let mut cursor = start;
        while cursor <= end {
            end = end.max(span_end[cursor]);
            cursor += 1;
        }
        blocks.push(messages[start..=end].to_vec());
        start = end + 1;
    }
    blocks
}
```
### Step 3: Token Counting with Pluggable Estimators
Hardcoding a character ratio creates budget drift. A trait-based tokenizer allows swapping between fast approximations and provider-specific encodings without rewriting the truncation logic.
/// Strategy for measuring how many tokens a piece of text occupies.
///
/// Implementations must be `Send + Sync`. Whether the returned number is exact or an
/// approximation depends on the implementation.
pub trait TokenEstimator: Send + Sync {
/// Returns the token count this estimator assigns to `text`.
fn count(&self, text: &str) -> usize;
}
/// Heuristic estimator that assumes roughly four characters per token.
#[derive(Debug, Clone, Copy, Default)]
pub struct CharRatioEstimator;

impl TokenEstimator for CharRatioEstimator {
    /// Returns the number of Unicode scalar values in `text` divided by four, rounded up.
    ///
    /// Rounding up keeps the estimate on the safe side of a budget: any non-empty text
    /// costs at least one token, and only the empty string counts as zero.
    fn count(&self, text: &str) -> usize {
        (text.chars().count() + 3) / 4
    }
}
/// Token counter backed by the `cl100k_base` encoding from `tiktoken_rs`.
pub struct TiktokenEstimator {
    encoding: tiktoken_rs::CoreBPE,
}

impl TiktokenEstimator {
    /// Builds an estimator around `tiktoken_rs::cl100k_base()`.
    ///
    /// # Errors
    ///
    /// Propagates whatever error `tiktoken_rs::cl100k_base()` reports.
    pub fn new() -> Result<Self, Box<dyn std::error::Error>> {
        let encoding = tiktoken_rs::cl100k_base()?;
        Ok(Self { encoding })
    }
}

impl TokenEstimator for TiktokenEstimator {
    /// Encodes `text` with `encode_ordinary` and returns how many tokens came back.
    fn count(&self, text: &str) -> usize {
        let tokens = self.encoding.encode_ordinary(text);
        tokens.len()
    }
}
### Step 4: Execute Truncation with System Prompt Isolation
The system message is extracted before budget calculation. If it exceeds the target window, the operation fails explicitly rather than silently degrading agent behavior.
/// Fits a conversation into a token budget by dropping whole atomic blocks.
pub struct ContextTrimmer {
    /// Measures the token cost of each message's `content`.
    estimator: Box<dyn TokenEstimator>,
}

impl ContextTrimmer {
    /// Creates a trimmer that measures messages with `estimator`.
    pub fn new(estimator: Box<dyn TokenEstimator>) -> Self {
        Self { estimator }
    }

    /// Returns the messages that fit into `budget`, preserving conversation order.
    ///
    /// The first `System` message is pinned: it is never trimmed and is always placed
    /// first in the result. All remaining messages are grouped with
    /// `build_atomic_blocks`, and whole blocks are dropped from the side selected by
    /// `budget.direction` until the conversation fits into
    /// `max_tokens - response_reserve - system prompt tokens`.
    ///
    /// Only `content` is measured; the estimator is invoked once per message.
    ///
    /// # Errors
    ///
    /// * `TruncationError::SystemPromptExceedsBudget` if the pinned system message does
    ///   not fit into `max_tokens - response_reserve` (this includes a reserve that is
    ///   larger than the window when a system message is present).
    /// * `TruncationError::SingleMessageOversized` if the conversation is non-empty but
    ///   every block had to be dropped, i.e. the last surviving block alone is larger
    ///   than the remaining budget.
    pub fn fit(
        &self,
        messages: &[AgentMessage],
        budget: &ContextBudget,
    ) -> Result<Vec<AgentMessage>, TruncationError> {
        let system = messages
            .iter()
            .enumerate()
            .find(|(_, m)| m.role == MessageRole::System);
        let system_idx = system.map(|(idx, _)| idx);
        let system_msg = system.map(|(_, m)| m);
        let system_tokens = system_msg.map_or(0, |m| self.estimator.count(&m.content));

        // A reserve larger than the window leaves no input budget; it must not underflow.
        let effective_budget = budget.max_tokens.saturating_sub(budget.response_reserve);
        if system_tokens > effective_budget {
            return Err(TruncationError::SystemPromptExceedsBudget);
        }
        let remaining_budget = effective_budget.saturating_sub(system_tokens);

        // Take the pinned system message out *before* grouping, so that messages that
        // would share a block with it are not discarded along with it.
        let conversation: Vec<AgentMessage> = messages
            .iter()
            .enumerate()
            .filter(|(idx, _)| Some(*idx) != system_idx)
            .map(|(_, m)| m.clone())
            .collect();

        // Pair every block with its token cost so nothing is tokenised twice.
        let mut blocks: std::collections::VecDeque<(usize, Vec<AgentMessage>)> =
            build_atomic_blocks(&conversation)
                .into_iter()
                .map(|block| {
                    let tokens: usize = block
                        .iter()
                        .map(|m| self.estimator.count(&m.content))
                        .sum();
                    (tokens, block)
                })
                .collect();
        let had_conversation = !blocks.is_empty();
        let mut current_tokens: usize = blocks.iter().map(|(tokens, _)| *tokens).sum();

        // Removing a block never reorders the survivors, so no re-sorting is needed.
        while current_tokens > remaining_budget {
            let dropped = match budget.direction {
                TrimDirection::FromStart => blocks.pop_front(),
                TrimDirection::FromEnd => blocks.pop_back(),
                TrimDirection::FromMiddle => blocks.remove(blocks.len() / 2),
            };
            match dropped {
                Some((tokens, _)) => current_tokens = current_tokens.saturating_sub(tokens),
                None => break,
            }
        }

        if had_conversation && blocks.is_empty() {
            return Err(TruncationError::SingleMessageOversized);
        }

        let mut result: Vec<AgentMessage> = Vec::new();
        result.extend(system_msg.cloned());
        result.extend(blocks.into_iter().flat_map(|(_, block)| block));
        Ok(result)
    }
}
/// Reasons a conversation cannot be fitted into a token budget.
///
/// Implements `Display` and `std::error::Error`, so it works with `?` in functions
/// returning `Box<dyn std::error::Error>`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TruncationError {
    /// The system prompt alone needs more tokens than the budget allows.
    SystemPromptExceedsBudget,
    /// A single message is too large to fit into the budget.
    SingleMessageOversized,
}

impl std::fmt::Display for TruncationError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let text = match self {
            Self::SystemPromptExceedsBudget => "system prompt exceeds the available token budget",
            Self::SingleMessageOversized => "a single message exceeds the available token budget",
        };
        f.write_str(text)
    }
}

impl std::error::Error for TruncationError {}
Architecture Rationale
- Atomic Block Grouping: Tool sequences are grouped before truncation. This guarantees that `tool_use` and `tool_result` are always added or removed together, preventing API contract violations.
- System Prompt Isolation: Extracting the system message before budget calculation ensures it is never silently truncated. The explicit error path forces developers to address oversized prompts upstream.
- Tokenizer Abstraction: The `TokenEstimator` trait decouples counting logic from truncation logic. This allows hot-swapping between `chars/4` for development and `cl100k_base` for production without modifying the core algorithm.
- Response Reserve: Subtracting a buffer from the budget prevents the model from hitting the hard limit during generation, which would otherwise truncate its own output mid-sentence.
Pitfall Guide
1. Splitting Tool Call/Result Pairs
Explanation: Dropping messages by index without checking structural dependencies severs tool_use from its tool_result. The API rejects the payload with a validation error.
Fix: Always group messages into atomic blocks before applying slice operations. Validate pairing integrity after every truncation pass.
2. Assuming Character-to-Token Ratios are Universal
Explanation: chars / 4 works reasonably for English prose but fails on code, JSON, or multi-byte Unicode. A budget calculated with this heuristic will consistently overflow when passed to the model.
Fix: Use provider-specific tokenizers (tiktoken for OpenAI, cl100k_base encoding) in production. Reserve character ratios only for rapid prototyping or non-critical budget checks.
3. Silently Dropping the System Prompt
Explanation: Treating the system prompt as a disposable message during truncation removes agent instructions, tool schemas, and safety constraints. The model continues operating but with degraded or unpredictable behavior.
Fix: Isolate the system message before budget calculation. Return a hard error if it exceeds the target window, forcing explicit resolution rather than silent degradation.
4. Ignoring the Response Buffer
Explanation: Allocating 100% of the context window to input leaves no room for the model's output. The API may truncate the response mid-generation or reject the request if the model attempts to exceed the limit.
Fix: Always subtract a response reserve (typically 10-15% of the window) from the input budget. This guarantees headroom for completion tokens.
5. Treating Single Oversized Messages as Truncatable
Explanation: Attempting to split a single message that exceeds the budget breaks semantic continuity and creates malformed prompts. LLM APIs do not accept partial message payloads.
Fix: Validate individual message sizes before truncation. If a single message exceeds the budget, reject it upstream or implement a dedicated summarization pipeline rather than mid-message slicing.
6. Overlooking Multi-Modal Token Costs
Explanation: Text tokenizers do not account for image, audio, or document embeddings. Vision models charge per-image tokens based on resolution and processing mode. Relying solely on text counts leads to budget overflow.
Fix: Maintain separate token counters for modalities. Add provider-specific image token costs to the budget calculation before invoking the truncation engine.
7. Hardcoding Truncation Direction
Explanation: Always dropping from the oldest end (FromStart) works for linear support threads but fails for research or coding agents where early context contains critical constraints or established facts.
Fix: Expose truncation direction as a configurable policy. Use FromMiddle for sessions requiring boundary preservation, and FromEnd for real-time debugging or rollback scenarios.
Production Bundle
Action Checklist
Decision Matrix
| Scenario | Recommended Approach | Why | Cost Impact |
|---|---|---|---|
| Linear support threads | FromStart truncation | Oldest context is least relevant; preserves recent user intent | Low (predictable token usage) |
| Research/Coding sessions | FromMiddle truncation | Preserves initial constraints and recent findings; drops intermediate noise | Medium (requires careful boundary tracking) |
| Real-time debugging | FromEnd truncation | Keeps historical context intact; drops latest turns for rollback analysis | Low (minimal API overhead) |
| Code-heavy prompts | TiktokenEstimator (cl100k_base) | Character ratios diverge significantly on syntax; exact counts prevent overflow | High (CPU overhead for encoding) |
| Multi-modal agents | Separate modality counters + text truncation | Vision/audio tokens follow different pricing and counting rules | High (requires provider-specific math) |
Configuration Template
# Cargo.toml
[dependencies]
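# Must be optional so that the `exact-token-counting` feature below can enable it.
tiktoken-rs = { version = "0.6", optional = true }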
serde = { version = "1.0", features = ["derive"] }
[features]
default = ["exact-token-counting"]
exact-token-counting = ["tiktoken-rs"]
fast-prototype = [] # Uses char/4 estimator
// config.rs
use serde::Deserialize;
/// Deserializable settings for context trimming.
#[derive(Debug, Deserialize)]
pub struct AgentContextConfig {
    /// Size of the context window, in tokens.
    pub max_context_tokens: usize,
    /// Share of the window reserved for the response. It is multiplied directly with
    /// `max_context_tokens`, so it is a fraction: `0.10` means 10%.
    pub response_reserve_pct: f32,
    pub truncation_direction: String, // "start", "end", "middle"
    /// Whether an exact tokenizer is requested; not read by the methods below.
    pub use_exact_tokenizer: bool,
}

impl AgentContextConfig {
    /// Returns the number of tokens to reserve for the response, rounded up.
    ///
    /// The fraction is limited to `0.0..=1.0` (NaN is treated as `0.0`) and the result
    /// never exceeds `max_context_tokens`, so subtracting the reserve from the window
    /// cannot underflow.
    pub fn response_reserve(&self) -> usize {
        // `f32::max` returns the non-NaN operand, which maps NaN to 0.0 here.
        let fraction = self.response_reserve_pct.max(0.0).min(1.0);
        let reserve = (self.max_context_tokens as f32 * fraction).ceil() as usize;
        // f32 rounding on very large windows could land slightly above the window itself.
        reserve.min(self.max_context_tokens)
    }

    /// Maps `truncation_direction` to a `TrimDirection`.
    ///
    /// Matching ignores ASCII case and surrounding whitespace. Any value other than
    /// `"end"` or `"middle"` (including `"start"`) selects `TrimDirection::FromStart`.
    pub fn trim_direction(&self) -> TrimDirection {
        let requested = self.truncation_direction.trim();
        if requested.eq_ignore_ascii_case("end") {
            TrimDirection::FromEnd
        } else if requested.eq_ignore_ascii_case("middle") {
            TrimDirection::FromMiddle
        } else {
            TrimDirection::FromStart
        }
    }
}
Quick Start Guide
- Add Dependencies: Include `tiktoken-rs` for production token counting or rely on the built-in character estimator for prototyping.
- Initialize Budget: Create a `ContextBudget` with your model's maximum window and a 10-15% response reserve.
- Select Estimator: Instantiate `TiktokenEstimator` for exact counts or `CharRatioEstimator` for fast development cycles.
- Execute Truncation: Pass your message array and budget to the trimmer. Handle `SystemPromptExceedsBudget` explicitly before sending to the API.
- Validate Output: Log the resulting token count and message count. Verify tool pairing integrity before dispatching the request.