```typescript
// Types and Interfaces
/** A single chat message held in conversation memory. */
export interface Message {
/** Who produced the message. */
role: 'user' | 'assistant' | 'system';
/** The message text. */
content: string;
/**
 * Numeric timestamp. The strategies in this file store `Date.now()` for
 * recorded turns and `0` for system messages they synthesize.
 */
timestamp: number;
}
/** Contract implemented by every conversation-memory strategy. */
export interface MemoryStrategy {
/** Resolves to the messages to replay to the model as prior context. */
getContext(): Promise<Message[]>;
/** Records one completed exchange: the user's message and the assistant's reply. */
addTurn(userMsg: string, assistantMsg: string): Promise<void>;
/** Returns the strategy's current token count for the content it stores. */
getTokenCount(): number;
/** Discards all stored state. */
reset(): void;
}
// Token Estimation Utility
/**
 * Rough token estimate for one message's text.
 *
 * Heuristic: about 4 characters of English text per token (rounded up), plus
 * a flat allowance for role tags and formatting.
 *
 * @param text - Message content to measure.
 * @returns Estimated token count; an empty string still costs the flat allowance.
 */
const estimateTokens = (text: string): number => {
  const charsPerToken = 4;
  const roleAndFormattingOverhead = 4;
  const contentTokens = Math.ceil(text.length / charsPerToken);
  return contentTokens + roleAndFormattingOverhead;
};
// Strategy 1: Sliding Window
/**
 * Keeps the most recent turns verbatim and evicts the oldest turns once the
 * estimated token total exceeds the configured budget.
 */
export class SlidingWindowMemory implements MemoryStrategy {
  private history: Message[] = [];
  private readonly maxTokens: number;

  /**
   * @param maxTokens - Estimated-token budget for the retained history.
   */
  constructor(maxTokens: number) {
    this.maxTokens = maxTokens;
  }

  /**
   * Appends one user/assistant exchange, then trims the window to the budget.
   *
   * @param userMsg - The user's message for this turn.
   * @param assistantMsg - The assistant's reply for this turn.
   */
  async addTurn(userMsg: string, assistantMsg: string): Promise<void> {
    const now = Date.now();
    this.history.push(
      { role: 'user', content: userMsg, timestamp: now },
      { role: 'assistant', content: assistantMsg, timestamp: now },
    );
    this.trimToBudget();
  }

  /**
   * @returns A shallow copy of the retained messages, oldest first. A copy is
   * returned so callers that push to or reorder the array cannot corrupt the
   * stored history.
   */
  async getContext(): Promise<Message[]> {
    return [...this.history];
  }

  /** @returns Sum of the estimated tokens of every retained message. */
  getTokenCount(): number {
    return this.history.reduce((sum, msg) => sum + estimateTokens(msg.content), 0);
  }

  /** Drops all retained history. */
  reset(): void {
    this.history = [];
  }

  /**
   * Evicts the oldest turns until the history fits the budget.
   *
   * Messages are removed two at a time (user message + assistant reply) so the
   * window never starts with an assistant reply whose question was evicted.
   * The most recent turn is always kept, even if it alone exceeds the budget.
   */
  private trimToBudget(): void {
    // Compute the total once and subtract as we evict, instead of re-summing
    // the whole history on every iteration.
    let total = this.getTokenCount();
    while (total > this.maxTokens && this.history.length > 2) {
      const evicted = this.history.splice(0, 2);
      for (const msg of evicted) {
        total -= estimateTokens(msg.content);
      }
    }
  }
}
// Strategy 2: Summary Compression
/**
 * Keeps recent messages verbatim and folds older ones into a running summary
 * that is replayed as a single system message.
 */
export class SummaryMemory implements MemoryStrategy {
  private recentHistory: Message[] = [];
  private summary = '';
  private readonly maxRecentTokens: number;
  private readonly summarizeThreshold: number;

  /**
   * @param maxRecentTokens - Estimated-token budget for the verbatim messages;
   *   exceeding it triggers compression.
   * @param summarizeThreshold - Number of verbatim messages (not turns) at
   *   which compression is triggered.
   */
  constructor(maxRecentTokens: number, summarizeThreshold: number) {
    this.maxRecentTokens = maxRecentTokens;
    this.summarizeThreshold = summarizeThreshold;
  }

  /**
   * Appends one user/assistant exchange and compresses older history when
   * either the message-count threshold or the token budget is exceeded.
   *
   * @param userMsg - The user's message for this turn.
   * @param assistantMsg - The assistant's reply for this turn.
   */
  async addTurn(userMsg: string, assistantMsg: string): Promise<void> {
    const now = Date.now();
    this.recentHistory.push(
      { role: 'user', content: userMsg, timestamp: now },
      { role: 'assistant', content: assistantMsg, timestamp: now },
    );

    const tooManyMessages = this.recentHistory.length >= this.summarizeThreshold;
    const overTokenBudget = this.recentTokenCount() > this.maxRecentTokens;
    // At most one compression pass per turn, so a production summarizer is
    // called no more than once per exchange.
    if (tooManyMessages || overTokenBudget) {
      await this.compressHistory();
    }
  }

  /**
   * @returns The summary (if any) as a leading system message, followed by
   * the verbatim recent messages, oldest first.
   */
  async getContext(): Promise<Message[]> {
    const context: Message[] = [];
    if (this.summary) {
      context.push({
        role: 'system',
        content: `[Previous Conversation Summary]: ${this.summary}`,
        timestamp: 0,
      });
    }
    context.push(...this.recentHistory);
    return context;
  }

  /** @returns Estimated tokens of the summary text plus the verbatim messages. */
  getTokenCount(): number {
    const summaryTokens = this.summary ? estimateTokens(this.summary) : 0;
    return summaryTokens + this.recentTokenCount();
  }

  /** Clears both the verbatim history and the summary. */
  reset(): void {
    this.recentHistory = [];
    this.summary = '';
  }

  /** Estimated tokens of the verbatim messages only. */
  private recentTokenCount(): number {
    return this.recentHistory.reduce((sum, msg) => sum + estimateTokens(msg.content), 0);
  }

  /**
   * Moves the oldest half of the verbatim history into the summary.
   *
   * The split point is rounded down to a whole number of turns so a user
   * message is never summarized away while its reply stays verbatim. When
   * fewer than two turns are stored there is nothing to compress and the
   * latest turn is kept as-is.
   */
  private async compressHistory(): Promise<void> {
    // In production, call LLM to summarize oldest half of recentHistory
    const splitIndex = Math.floor(this.recentHistory.length / 4) * 2;
    if (splitIndex === 0) {
      return;
    }
    const oldMessages = this.recentHistory.slice(0, splitIndex);
    this.recentHistory = this.recentHistory.slice(splitIndex);
    const oldText = oldMessages.map(m => `${m.role}: ${m.content}`).join('\n');
    // Mock summarization; replace with actual LLM call
    this.summary = this.summary
      ? `${this.summary} | ${oldText.substring(0, 100)}...`
      : `Summary: ${oldText.substring(0, 100)}...`;
  }
}
// Strategy 3: Entity Extraction
/**
 * Keeps a small key/value profile extracted from user messages alongside a
 * token-bounded window of recent conversation.
 */
export class EntityMemory implements MemoryStrategy {
  private entities: Map<string, string> = new Map();
  private conversationHistory: Message[] = [];
  private readonly maxTokens: number;

  /**
   * @param maxTokens - Estimated-token budget shared by the entity values and
   *   the retained conversation history.
   */
  constructor(maxTokens: number) {
    this.maxTokens = maxTokens;
  }

  /**
   * Appends one user/assistant exchange, updates the entity store from the
   * user's message, then trims the history to the budget.
   *
   * @param userMsg - The user's message for this turn.
   * @param assistantMsg - The assistant's reply for this turn.
   */
  async addTurn(userMsg: string, assistantMsg: string): Promise<void> {
    const now = Date.now();
    this.conversationHistory.push(
      { role: 'user', content: userMsg, timestamp: now },
      { role: 'assistant', content: assistantMsg, timestamp: now },
    );
    // Extract entities from user message
    await this.extractEntities(userMsg);
    this.trimHistory();
  }

  /**
   * @returns The entity profile (if any) as a leading system message with one
   * `key: value` line per entity, followed by the retained history.
   */
  async getContext(): Promise<Message[]> {
    const entityContext = Array.from(this.entities.entries())
      .map(([key, value]) => `${key}: ${value}`)
      .join('\n');
    const context: Message[] = [];
    if (entityContext) {
      context.push({
        role: 'system',
        content: `[User Profile]: ${entityContext}`,
        timestamp: 0,
      });
    }
    context.push(...this.conversationHistory);
    return context;
  }

  /** @returns Estimated tokens of all entity values plus the retained history. */
  getTokenCount(): number {
    const entityTokens = Array.from(this.entities.values())
      .reduce((sum, val) => sum + estimateTokens(val), 0);
    const historyTokens = this.conversationHistory
      .reduce((sum, msg) => sum + estimateTokens(msg.content), 0);
    return entityTokens + historyTokens;
  }

  /** Clears the entity store and the conversation history. */
  reset(): void {
    this.entities.clear();
    this.conversationHistory = [];
  }

  /**
   * Updates the entity store from a user message.
   *
   * A later match overwrites the earlier value stored under the same key.
   *
   * @param text - The user's message.
   */
  private async extractEntities(text: string): Promise<void> {
    // Mock extraction; in production, use LLM with structured output
    // Example: Detect "My name is Alice" -> entities.set('user_name', 'Alice')
    const nameMatch = text.match(/my name is (\w+)/i);
    if (nameMatch) {
      this.entities.set('user_name', nameMatch[1]);
    }
    const prefMatch = text.match(/i prefer (\w+)/i);
    if (prefMatch) {
      this.entities.set('preference', prefMatch[1]);
    }
  }

  /**
   * Evicts the oldest turns until entities plus history fit the budget.
   *
   * Messages are removed two at a time (user message + assistant reply) so the
   * history never starts with a reply whose question was evicted. Entities are
   * never evicted, and the most recent turn is always kept.
   */
  private trimHistory(): void {
    // Compute the total once and subtract as we evict, instead of re-summing
    // entities and history on every iteration.
    let total = this.getTokenCount();
    while (total > this.maxTokens && this.conversationHistory.length > 2) {
      const evicted = this.conversationHistory.splice(0, 2);
      for (const msg of evicted) {
        total -= estimateTokens(msg.content);
      }
    }
  }
}
// Chat Session Manager
/**
 * Drives a conversation: builds a prompt from the memory strategy's context
 * plus the new user message, calls the model, and records the completed turn.
 */
export class ChatSession {
  private memory: MemoryStrategy;
  private systemPrompt: string;

  /**
   * @param memory - Strategy that stores and replays conversation context.
   * @param systemPrompt - Instruction text placed first in every prompt.
   */
  constructor(memory: MemoryStrategy, systemPrompt: string) {
    this.memory = memory;
    this.systemPrompt = systemPrompt;
  }

  /**
   * Runs one conversational turn.
   *
   * @param userInput - The user's new message.
   * @param llmCall - Callback that sends the assembled prompt to the model and
   *   resolves to the reply text.
   * @returns The model's reply.
   *
   * If `llmCall` rejects, the rejection propagates and memory is left
   * unchanged, because the turn is only recorded after a successful reply.
   */
  async processTurn(userInput: string, llmCall: (prompt: string) => Promise<string>): Promise<string> {
    const contextMessages = await this.memory.getContext();
    // Build prompt
    const promptParts = [
      `System: ${this.systemPrompt}`,
      ...contextMessages.map(m => `${ChatSession.speakerLabel(m.role)}: ${m.content}`),
      // The current message is not in memory yet (it is stored only after the
      // reply arrives), so it must be appended here for the model to see it.
      `Human: ${userInput}`,
      'Assistant:',
    ];
    const prompt = promptParts.join('\n');
    // Invoke LLM
    const response = await llmCall(prompt);
    // Update memory
    await this.memory.addTurn(userInput, response);
    return response;
  }

  /** Clears the underlying memory strategy. */
  reset(): void {
    this.memory.reset();
  }

  /**
   * Maps a message role to the speaker label used in the prompt transcript.
   * System-role context (summaries, user profiles) is labelled `System` so it
   * is not presented as something the assistant said.
   */
  private static speakerLabel(role: string): string {
    switch (role) {
      case 'user':
        return 'Human';
      case 'system':
        return 'System';
      default:
        return 'Assistant';
    }
  }
}
```

#### Rationale
* **Async Support:** Real-world LLM calls are asynchronous. The `addTurn` and `getContext` methods are async to accommodate external summarization or entity extraction services.
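* **Token Awareness:** Each strategy implements `getTokenCount` and its own trimming or compression logic. Because the count is a character-based estimate that covers only stored memory (not the system prompt or the model's response), treat it as a guide for keeping requests within the context window—reducing API errors and cost overruns—rather than a hard guarantee.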
* **Extensibility:** New strategies (e.g., vector search memory) can be added by implementing the `MemoryStrategy` interface without modifying the `ChatSession`.
### Pitfall Guide
1. **The "Lost in the Middle" Trap**
* *Explanation:* LLMs exhibit reduced attention for information located in the middle of the context window. Critical instructions or facts buried between old history and the current query may be ignored.
* *Fix:* Place system instructions and critical context at the beginning or end of the prompt. Use summary strategies to compress middle content rather than retaining full verbatim history.
2. **Token Budget Blowout**
* *Explanation:* Developers often estimate tokens based on message count, ignoring overhead from role tags, formatting, and system prompts. This leads to unexpected truncation or API errors.
* *Fix:* Implement strict token counting that includes all overhead. Reserve a buffer (e.g., 10%) for the model's response. The `estimateTokens` helper in the core solution is a rough starting point; switch to the model's actual tokenizer when you need exact counts.
3. **Summary Hallucination**
* *Explanation:* When compressing history via summarization, the LLM may introduce facts not present in the original text or omit critical details.
* *Fix:* Use extraction-focused prompts for summarization rather than generative ones. Verify summaries against source text in high-stakes applications. Consider chunking summaries to preserve granularity.
4. **Entity Drift and Staleness**
* *Explanation:* Entity stores may retain outdated information if the user changes preferences or facts without explicit correction.
* *Fix:* Implement versioning or timestamps for entities. Use confidence scores during extraction. Allow the model to query the entity store and update it dynamically based on user confirmation.
5. **Prompt Injection via History**
* *Explanation:* Malicious users can inject instructions into the conversation history that override system prompts when the history is replayed.
* *Fix:* Sanitize user inputs before storing them. Use distinct delimiters for system vs. user content. Consider encoding user messages or using structured formats that separate content from instructions.
6. **Async Race Conditions**
* *Explanation:* In concurrent environments, multiple messages may update memory simultaneously, leading to out-of-order history or lost turns.
* *Fix:* Use sequence numbers or locking mechanisms for memory updates. Ensure the chat loop processes turns sequentially or uses a queue to maintain order.
7. **Cost of Summarization Calls**
* *Explanation:* Summary strategies require additional LLM calls to compress history, increasing latency and cost.
* *Fix:* Batch summarization operations. Use cheaper, faster models for summarization tasks. Cache summaries and only re-summarize when new content is added.
### Production Bundle
#### Action Checklist
- [ ] Define a strict token budget for each request, including system prompt and response buffer.
- [ ] Implement a token counting utility that accounts for role overhead and formatting.
- [ ] Select a memory strategy based on use case requirements (fidelity vs. efficiency).
- [ ] Add entity extraction for applications requiring long-term personalization.
- [ ] Sanitize user inputs to prevent prompt injection attacks.
- [ ] Monitor context window usage metrics to detect budget violations.
- [ ] Implement error handling for token limit exceeded scenarios.
- [ ] Test memory strategies with edge cases (rapid turns, long responses, empty history).
#### Decision Matrix
| Scenario | Recommended Approach | Why | Cost Impact |
| :--- | :--- | :--- | :--- |
| **Customer Support Bot** | Sliding Window | Resolutions are typically short; immediate context is sufficient. | Low |
| **Creative Writing Co-pilot** | Summary Compression | Requires long narrative arc; verbatim detail of early turns is less critical. | Medium |
| **Personal Health Coach** | Entity Extraction | Needs persistent user stats, goals, and symptom history; narrative detail is secondary. | High (due to extraction calls) |
| **Legal Document Review** | Sliding Window + RAG | Precision is paramount; full context of recent analysis is needed. | Medium |
| **Multi-User Chat Room** | Summary + Entity | Balances group context with individual user profiles. | High |
#### Configuration Template
```json
{
"memory": {
"strategy": "summary",
"params": {
"max_recent_tokens": 2000,
"summarize_threshold": 15,
"entity_extraction": true,
"entity_fields": ["name", "preference", "location"]
},
"budget": {
"max_input_tokens": 8000,
"response_buffer": 1000,
"system_prompt_tokens": 200
}
}
}
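```

#### Quick Start Guide

- Initialize Memory Manager:
  ```typescript
  const memory = new SummaryMemory(2000, 15);
  const session = new ChatSession(memory, "You are a helpful assistant.");
  ```
- Define LLM Handler:
  ```typescript
  const llmHandler = async (prompt: string) => {
  // Call your LLM API here
  return await openai.chat.completions.create({ model: "gpt-4", messages: parsePrompt(prompt) });
  };
  ```
  Note: `processTurn` expects the handler to resolve to the reply text as a string, so extract the text from the API response (e.g. `choices[0].message.content`) before returning it. `parsePrompt` is a placeholder for your own prompt-to-messages conversion.
- Process Turns:
  ```typescript
  const response = await session.processTurn("Tell me about Paris.", llmHandler);
  console.log(response);
  ```
- Monitor Usage:
  ```typescript
  console.log(`Current token usage: ${memory.getTokenCount()}`);
  ```
- Reset Session:
  ```typescript
  session.reset();
  ```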