ict[str, Any]]
total_tokens: int
model_limit: int
utilization_pct: float
eviction_applied: bool
class ContextReplayEngine:
    """Reconstructs the exact context window a model saw at a given turn.

    Token counts use tiktoken so utilization figures match real tokenizer
    behavior; eviction strategies are mapped per model because providers do
    not truncate context uniformly.
    """

    def __init__(self, model_id: str, tokenizer_name: str = "cl100k_base"):
        # BUGFIX: was `def init`, so `ContextReplayEngine("gpt-4o")` raised
        # TypeError — Python constructors must be named __init__.
        self.model_id = model_id
        self.tokenizer = tiktoken.get_encoding(tokenizer_name)
        # Context-window limits (tokens) per model.
        self.limits = {
            "gpt-4o": 128000,
            "claude-3-5-sonnet": 200000,
            "deepseek-chat": 64000,
            "gemma-7b": 8192,
        }
        # NOTE: "evacuation" is a misnomer for "eviction"; the attribute name
        # is kept so existing callers keep working.
        self.evacuation_strategies = {
            "gpt-4o": "left_truncation",
            "claude-3-5-sonnet": "left_truncation",
            "deepseek-chat": "sliding_window",
            "gemma-7b": "local_global_sampling",
        }

    def _count_tokens(self, text: str) -> int:
        """Return the exact token count of *text* under the active tokenizer."""
        return len(self.tokenizer.encode(text))

    def _apply_eviction(self, messages: List[Dict], limit: int) -> List[Dict]:
        """Simulate the model's eviction policy until the payload fits *limit*.

        Under left truncation, system messages are anchored (never evicted)
        and the oldest non-system messages are dropped first.
        """
        strategy = self.evacuation_strategies.get(self.model_id, "left_truncation")
        if strategy == "left_truncation":
            # Keep system prompt, truncate oldest user/assistant/tool messages.
            system_msgs = [m for m in messages if m.get("role") == "system"]
            other_msgs = [m for m in messages if m.get("role") != "system"]
            while other_msgs:
                # BUGFIX: recount over the *surviving* messages, not the
                # original `messages` list — otherwise the total never
                # shrinks and every non-system message gets evicted.
                total = sum(
                    self._count_tokens(m.get("content", ""))
                    for m in system_msgs + other_msgs
                )
                if total <= limit:
                    break
                other_msgs.pop(0)
            return system_msgs + other_msgs
        elif strategy == "sliding_window":
            # DeepSeek-style: keep recent context + recency bias
            # Simplified simulation for demonstration
            return messages[-20:] if len(messages) > 20 else messages
        return messages

    def reconstruct_turn(self, session_log: List[Dict], turn_index: int) -> ContextSnapshot:
        """Rebuild the context payload as it stood at *turn_index* (inclusive)."""
        limit = self.limits.get(self.model_id, 128000)
        accumulated = []
        for i in range(turn_index + 1):
            turn_data = session_log[i]
            accumulated.extend(turn_data.get("messages", []))
        # BUGFIX: eviction_applied must compare the PRE-eviction total to the
        # limit; the post-eviction total is always <= limit after truncation,
        # so the original flag was always False.
        raw_total = sum(self._count_tokens(m.get("content", "")) for m in accumulated)
        final_context = self._apply_eviction(accumulated, limit)
        total_tokens = sum(self._count_tokens(m.get("content", "")) for m in final_context)
        return ContextSnapshot(
            turn_index=turn_index,
            messages=final_context,
            total_tokens=total_tokens,
            model_limit=limit,
            utilization_pct=(total_tokens / limit) * 100,
            eviction_applied=(raw_total > limit),
        )
**Architecture Rationale:** We use `tiktoken` for exact token counting because OpenAI's official tokenizer aligns with their API billing and context limits. Eviction strategies are mapped explicitly per model because assuming uniform behavior causes false positives in debugging. System messages are anchored because major providers generally preserve them under window pressure — verify this assumption against each provider's documentation before relying on it.
### Step 2: Fact Persistence Tracking
Agents fail when they "forget" constraints. Instead of manual log searching, we embed target facts and track their presence across every turn's reconstructed context.
```python
from sentence_transformers import SentenceTransformer
import numpy as np
class FactPersistenceAnalyzer:
    """Tracks whether a target fact stays semantically present in the
    reconstructed context of every turn of a session.
    """

    def __init__(self, embedding_model: str = "all-MiniLM-L6-v2"):
        self.encoder = SentenceTransformer(embedding_model)
        # Cosine-similarity cutoff above which the fact counts as "present".
        self.presence_threshold = 0.75
        # text -> embedding cache; avoids re-encoding repeated content.
        self._cache = {}

    def _get_embedding(self, text: str) -> np.ndarray:
        """Return the embedding for *text*, encoding at most once per string."""
        if text not in self._cache:
            self._cache[text] = self.encoder.encode(text)
        return self._cache[text]

    def track_across_session(self, session_log: List[Dict], target_fact: str) -> Dict:
        """Build a per-turn presence timeline for *target_fact*.

        Returns the first/last turn the fact was present and the first turn it
        dropped out after appearing (all None if it never appeared).
        """
        fact_vec = self._get_embedding(target_fact)
        presence_timeline = []
        for turn_idx in range(len(session_log)):
            # Reconstruct context for this turn
            snapshot = ContextReplayEngine("gpt-4o").reconstruct_turn(session_log, turn_idx)
            max_similarity = 0.0
            for msg in snapshot.messages:
                content = msg.get("content", "")
                if not content:
                    continue
                msg_vec = self._get_embedding(content)
                # Cosine similarity between the fact and this message.
                sim = np.dot(fact_vec, msg_vec) / (np.linalg.norm(fact_vec) * np.linalg.norm(msg_vec))
                max_similarity = max(max_similarity, sim)
            presence_timeline.append({
                "turn": turn_idx,
                "present": max_similarity >= self.presence_threshold,
                "similarity_score": round(max_similarity, 3),
            })
        first_seen = next((p["turn"] for p in presence_timeline if p["present"]), None)
        last_seen = next((p["turn"] for p in reversed(presence_timeline) if p["present"]), None)
        # BUGFIX: the original evaluated `p["turn"] > first_seen` even when the
        # fact never appeared (first_seen is None), raising TypeError on any
        # non-empty timeline without a match.
        if first_seen is None:
            disappeared_at = None
        else:
            disappeared_at = next(
                (p["turn"] for p in presence_timeline
                 if not p["present"] and p["turn"] > first_seen),
                None,
            )
        return {
            "fact": target_fact,
            "first_appeared": first_seen,
            "last_present": last_seen,
            "disappeared_at": disappeared_at,
            "timeline": presence_timeline,
        }
Architecture Rationale: Local embeddings (all-MiniLM-L6-v2) eliminate API latency and cost for forensic analysis. The 0.75 cosine similarity threshold balances semantic recall against false positives. Caching prevents redundant encoding during multi-turn sweeps.
Step 3: Cross-Session Divergence Detection
When two runs start identically but yield different outcomes, the root cause usually lies in an early context divergence. We align turns, reconstruct both contexts, and measure semantic drift.
class ExecutionDivergenceDetector:
    """Finds the earliest turn at which two session runs' reconstructed
    contexts diverge semantically beyond a cosine-similarity threshold.
    """

    def __init__(self, divergence_threshold: float = 0.85):
        # Below this average max cosine similarity, contexts count as diverged.
        self.threshold = divergence_threshold
        self.encoder = SentenceTransformer("all-MiniLM-L6-v2")

    def find_earliest_divergence(self, run_a: List[Dict], run_b: List[Dict]) -> Dict:
        """Walk aligned turns of *run_a*/*run_b* and return the first turn
        whose reconstructed contexts fall below the similarity threshold.
        """
        min_turns = min(len(run_a), len(run_b))
        if min_turns == 0:
            return {"divergence_turn": None, "diagnosis": "No significant divergence detected"}
        # PERF: the replay engines are loop-invariant — constructing them once
        # avoids repeated tokenizer setup on every turn.
        engine_a = ContextReplayEngine("gpt-4o")
        engine_b = ContextReplayEngine("gpt-4o")
        # PERF: contexts are cumulative, so the same message text recurs on
        # every turn; cache embeddings to avoid quadratic re-encoding.
        embed_cache: Dict[str, np.ndarray] = {}

        def _embed(text: str) -> np.ndarray:
            if text not in embed_cache:
                embed_cache[text] = self.encoder.encode(text)
            return embed_cache[text]

        for turn_idx in range(min_turns):
            snap_a = engine_a.reconstruct_turn(run_a, turn_idx)
            snap_b = engine_b.reconstruct_turn(run_b, turn_idx)
            # Compute average max similarity between contexts
            vecs_a = [_embed(m.get("content", "")) for m in snap_a.messages if m.get("content")]
            vecs_b = [_embed(m.get("content", "")) for m in snap_b.messages if m.get("content")]
            if not vecs_a or not vecs_b:
                continue
            similarities = []
            for va in vecs_a:
                max_sim = max(
                    np.dot(va, vb) / (np.linalg.norm(va) * np.linalg.norm(vb))
                    for vb in vecs_b
                )
                similarities.append(max_sim)
            avg_max_sim = np.mean(similarities)
            if avg_max_sim < self.threshold:
                return {
                    "divergence_turn": turn_idx,
                    "similarity_score": round(avg_max_sim, 3),
                    "context_a_length": len(snap_a.messages),
                    "context_b_length": len(snap_b.messages),
                    "diagnosis": "Context payload drifted below acceptable similarity threshold",
                }
        return {"divergence_turn": None, "diagnosis": "No significant divergence detected"}
Architecture Rationale: Aligning by turn index ensures fair comparison. The 0.85 threshold flags meaningful structural or content shifts without triggering on minor phrasing variations. This replaces manual diffing with automated root-cause localization.
Pitfall Guide
1. Assuming Uniform Eviction Behavior
Explanation: Treating all models as left-truncation causes false eviction predictions. DeepSeek uses sliding windows with recency bias; Gemma samples locally and globally.
Fix: Maintain an explicit model-to-strategy mapping and validate against provider documentation before reconstruction.
2. Token Count Mismatch
Explanation: Using generic character-to-token ratios or outdated tokenizers produces inaccurate utilization metrics.
Fix: Always use the official tokenizer (tiktoken for OpenAI, anthropic SDK for Claude) and account for tool/function schema overhead in token calculations.
3. Overlooking System Prompt Immunity
Explanation: Eviction algorithms that remove system messages contradict provider guarantees and break constraint tracking.
Fix: Anchor system messages at the top of the reconstructed payload and exclude them from truncation logic.
4. Similarity Threshold Drift
Explanation: Hardcoding 0.75 for fact tracking fails in domain-specific contexts where terminology varies slightly.
Fix: Calibrate thresholds per use case. Run a validation set of known-present facts and adjust the cosine cutoff to minimize false negatives.
5. Embedding Cache Invalidation
Explanation: Caching embeddings without versioning causes stale matches when session content updates or model vocabularies shift.
Fix: Hash session content or use turn indices as cache keys. Invalidate caches when session logs are modified or re-imported.
6. Tool Output Bloat
Explanation: Tool outputs often contain verbose JSON, stack traces, or HTML that consume disproportionate token budget.
Fix: Track tool payload size separately. Implement content summarization or truncation strategies for tool results before they enter the context window.
7. Parallel Turn Misalignment
Explanation: Comparing sessions with different turn counts or asynchronous tool calls causes index drift and false divergence flags.
Fix: Align sessions by logical turn markers (e.g., user input events) rather than raw array indices. Skip or interpolate missing turns during comparison.
Production Bundle
Action Checklist
Decision Matrix
| Scenario | Recommended Approach | Why | Cost Impact |
|---|---|---|---|
| Debugging a single failed 40-turn session | Context State Reconstruction | Pinpoints exact turn where constraints vanished | Low (local compute only) |
| Validating prompt changes across 100 runs | Fact Persistence Tracking | Quantifies constraint retention rate automatically | Medium (embedding compute) |
| Comparing success vs failure runs | Execution Divergence Detection | Isolates earliest context drift causing outcome split | Low-Medium (aligned reconstruction) |
| Real-time agent monitoring | Streaming context buffer + alerting | Catches budget overflow before eviction occurs | High (continuous API/infra) |
| Compliance/audit logging | Immutable session export + reconstruction | Provides deterministic proof of agent state | Low (post-hoc processing) |
Configuration Template
# context_forensics_config.yaml
model_mappings:
gpt-4o:
tokenizer: cl100k_base
context_limit: 128000
eviction_strategy: left_truncation
preserve_system: true
claude-3-5-sonnet:
tokenizer: claude
context_limit: 200000
eviction_strategy: left_truncation
preserve_system: true
deepseek-chat:
tokenizer: cl100k_base
context_limit: 64000
eviction_strategy: sliding_window
preserve_system: true
analysis_thresholds:
fact_presence_cosine: 0.75
divergence_cosine: 0.85
min_context_utilization_alert: 0.85
storage:
type: sqlite
path: ./forensics_data/sessions.db
max_sessions: 5000
embedding:
model: all-MiniLM-L6-v2
cache_enabled: true
cache_ttl_hours: 24
Quick Start Guide
- Install dependencies:
pip install tiktoken sentence-transformers pyyaml  # sqlite3 ships with Python's standard library — do not pip install it
- Export your session logs: Ensure each turn contains
role, content, and token_count fields in a JSON array or SQLite table.
- Initialize the engine: Load your configuration, instantiate
ContextReplayEngine with your target model, and call reconstruct_turn(session_log, turn_index) to verify context state.
- Track critical constraints: Pass key instructions to
FactPersistenceAnalyzer.track_across_session() to generate a presence timeline and identify exactly when constraints drop out of context.
- Compare divergent runs: Use
ExecutionDivergenceDetector.find_earliest_divergence() on success/failure pairs to isolate the turn where context payloads split, then adjust prompt engineering or tool output formatting accordingly.