mber): void {
this.turns.push({ role, content, tokens: tokenCount, timestamp: Date.now() });
this.enforceBudget();
}
/**
 * Builds the context for the next model request.
 *
 * @returns The running summary as `systemContext`, plus a copy of the
 *   trailing `PRESERVE_COUNT` turns as `recentTurns`.
 */
getPromptPayload(): { systemContext: string; recentTurns: MessageTurn[] } {
  const systemContext = this.runningSummary;
  const recentTurns = this.turns.slice(-this.PRESERVE_COUNT);
  return { systemContext, recentTurns };
}
/**
 * Adds up the token counts of all buffered turns and triggers compression
 * when the total is strictly greater than `MAX_TOKEN_BUDGET`.
 */
private enforceBudget(): void {
  let usedTokens = 0;
  for (const turn of this.turns) {
    usedTokens += turn.tokens;
  }
  const overBudget = usedTokens > this.MAX_TOKEN_BUDGET;
  if (overBudget) {
    this.compressHistory();
  }
}
/**
 * Folds every turn older than the trailing `PRESERVE_COUNT` turns into
 * `runningSummary` and removes those turns from the buffer.
 *
 * System-role turns are left out of the summary text. The summary is rebuilt
 * from the current overflow on each call and replaces the previous one; when
 * there is nothing to summarize, the previous summary is left untouched.
 */
private compressHistory(): void {
  // Split by explicit index: `slice(0, -0)` is empty and `slice(-0)` is the
  // whole array, so negative-offset slicing misbehaves when PRESERVE_COUNT is 0.
  const keepFrom = Math.max(this.turns.length - this.PRESERVE_COUNT, 0);
  const overflowTurns = this.turns.slice(0, keepFrom);
  if (overflowTurns.length === 0) {
    // Everything still fits in the preserved tail; do not clobber the summary.
    return;
  }
  const compressed = overflowTurns
    .filter(t => t.role !== 'system')
    .map(t => `${t.role}: ${t.content}`)
    .join('\n');
  if (compressed.length > 0) {
    // In production, route this to a lightweight summarization model
    this.runningSummary = `[Compressed Context] ${compressed.substring(0, 2000)}...`;
  }
  this.turns = this.turns.slice(keepFrom);
}
}
**Architecture Rationale:** The buffer maintains a fixed tail of recent turns for immediate conversational flow while compressing older turns into a running summary. The token budget prevents context window exhaustion. System-role turns are left out of the compressed summary; because overflow turns are then discarded, system instructions and tool definitions should be held outside this buffer so that compression can never drop them.
### Step 2: Cross-Session State Partitioning
Long-term memory requires storage optimized for different access patterns. Preferences need millisecond reads. Episodic summaries need semantic search. Structured facts need relational queries.
```typescript
/** Per-user profile returned by `StatePartitioner.loadSessionContext`. */
export interface UserProfile {
/** Identifier of the user this profile belongs to. */
userId: string;
/** String-valued preference settings keyed by preference name. */
preferences: Record<string, string>; // e.g., { tone: 'formal', language: 'en' }
/** Last-activity marker; presumably a timestamp, unit not shown here — TODO confirm. */
lastActive: number;
}
/**
 * Read-through cache for user profiles: Redis is consulted first, Postgres is
 * the fallback, and a profile loaded from Postgres is cached for one hour.
 */
export class StatePartitioner {
  constructor(
    private readonly redis: RedisClient,
    private readonly postgres: Pool
  ) {}

  /**
   * Loads the profile for `userId`.
   *
   * @param userId - Identifier used for both the cache key and the SQL lookup.
   * @returns The cached or stored profile. When no row exists for the user, an
   *   empty profile (`preferences: {}`, `lastActive: 0`) is returned and is
   *   not cached, so a profile created later is picked up immediately.
   */
  async loadSessionContext(userId: string): Promise<UserProfile> {
    const cacheKey = `user:pref:${userId}`;
    const cached = await this.redis.get(cacheKey);
    if (cached) {
      try {
        return this.toProfile(userId, JSON.parse(cached));
      } catch {
        // Corrupt or non-object cache entry: fall through and rebuild it from Postgres.
      }
    }
    const result = await this.postgres.query(
      'SELECT preferences, last_active FROM user_profiles WHERE user_id = $1',
      [userId]
    );
    const row = result.rows[0];
    if (row == null) {
      return this.toProfile(userId, {});
    }
    const profile = this.toProfile(userId, row);
    await this.redis.setex(cacheKey, 3600, JSON.stringify(profile));
    return profile;
  }

  /**
   * Maps a database row or cached payload onto the `UserProfile` shape.
   * Accepts both `last_active` (column name) and `lastActive` (cached profile).
   */
  private toProfile(userId: string, raw: Record<string, unknown>): UserProfile {
    // NOTE(review): assumes `preferences` arrives as an already-parsed JSON object — confirm column type.
    const preferences = (raw.preferences ?? {}) as Record<string, string>;
    // Number() handles numeric strings and Date values; other encodings yield NaN — confirm column type.
    const lastActive = Number(raw.lastActive ?? raw.last_active ?? 0);
    return { userId, preferences, lastActive };
  }
}
```

**Architecture Rationale:** Redis handles high-frequency preference lookups without hitting the relational database. Postgres stores structured profiles and session metadata. This hybrid pattern prevents database connection pool exhaustion during peak traffic while maintaining ACID compliance for critical user data.
### Step 3: Entity Resolution Layer
LLMs struggle with alias resolution. "John", "Mr. Smith", and "the founder" often refer to the same entity. A dedicated resolution layer maps textual references to canonical IDs.
/** Canonical record for one entity together with the names that refer to it. */
export interface EntityRecord {
/** Identifier of the record; `EntityRegistry` derives it from a SHA-256 hash of the first normalized name. */
canonicalId: string;
/** Names that resolve to this record; `EntityRegistry` stores them lower-cased and trimmed. */
aliases: string[];
/** Arbitrary key/value data attached to the entity. */
attributes: Record<string, unknown>;
/** Revision counter: `EntityRegistry` starts it at 1 and increments it on each update. */
version: number;
}
/**
 * Maps raw textual mentions onto canonical entity records stored in the
 * `entities` table, creating a record the first time a name is seen.
 */
export class EntityRegistry {
  /**
   * Resolves `rawName` to an existing record (merging `attributes` into it and
   * bumping its version) or creates a new record with version 1.
   *
   * @param rawName - Mention as it appeared in text; lower-cased and trimmed before lookup.
   * @param attributes - Attributes to merge over the stored ones (new keys win).
   * @returns The record as written to the database.
   * @throws Error if `rawName` is empty after normalization, or if the stored
   *   record was changed by another writer between lookup and write.
   */
  async resolveOrUpsert(rawName: string, attributes: Partial<EntityRecord['attributes']>): Promise<EntityRecord> {
    const normalized = rawName.toLowerCase().trim();
    if (normalized.length === 0) {
      // Every blank mention would otherwise hash to the same bogus entity.
      throw new Error('Entity name must not be empty');
    }
    // Check existing aliases first
    const existing = await this.findByAlias(normalized);
    if (existing) {
      // Build a new object so the looked-up record is untouched if the write is rejected.
      return this.persist({
        ...existing,
        attributes: { ...existing.attributes, ...attributes },
        version: existing.version + 1
      });
    }
    // Create new canonical record
    const newEntity: EntityRecord = {
      canonicalId: createHash('sha256').update(normalized).digest('hex').slice(0, 12),
      aliases: [normalized],
      attributes,
      version: 1
    };
    return this.persist(newEntity);
  }

  /**
   * Inserts the record, or updates the stored row only when its version is
   * exactly one behind `record.version` (optimistic concurrency check).
   *
   * @throws Error when the conditional update matched no row, i.e. the stored
   *   version is not the one this write was based on.
   */
  private async persist(record: EntityRecord): Promise<EntityRecord> {
    const result = await this.postgres.query(
      `INSERT INTO entities (canonical_id, aliases, attributes, version)
       VALUES ($1, $2, $3, $4)
       ON CONFLICT (canonical_id) DO UPDATE
         SET aliases = EXCLUDED.aliases, attributes = EXCLUDED.attributes, version = EXCLUDED.version
         WHERE entities.version = EXCLUDED.version - 1`,
      [record.canonicalId, record.aliases, JSON.stringify(record.attributes), record.version]
    );
    // A conflicting row whose version does not match is left unchanged and reported as 0 affected rows.
    if (result.rowCount === 0) {
      throw new Error(
        `Concurrent update detected for entity ${record.canonicalId} (attempted version ${record.version})`
      );
    }
    return record;
  }
}
**Architecture Rationale:** Entity memory must be deterministic. By normalizing inputs and maintaining an alias-to-canonical mapping, the system prevents the LLM from conflating similar names. The version field enables optimistic concurrency control, preventing race conditions when multiple agents update the same record simultaneously.
### Step 4: Semantic Recall with Temporal Weighting
When session history exceeds practical limits, retrieval-augmented generation (RAG) replaces full injection. Embedding past summaries and applying time-decay ensures recent context ranks higher.
/** Retrieves stored memory summaries by vector similarity, re-ranked with an exponential time decay. */
export class RecallPipeline {
  /**
   * Returns the summaries of the `topK` memories with the highest
   * decay-weighted similarity to `query` for the given user.
   *
   * @param query - Text to embed and search with.
   * @param userId - Restricts the search to this user's memories.
   * @param topK - Maximum number of summaries to return; values that are not greater than 0 yield `[]`.
   * @param decayLambda - Decay rate per hour in `exp(-decayLambda * ageHours)`;
   *   the default 0.05 gives a half-life of roughly 14 hours.
   * @returns Up to `topK` summaries, best match first.
   */
  async retrieveRelevantMemories(
    query: string,
    userId: string,
    topK: number = 3,
    decayLambda: number = 0.05
  ): Promise<string[]> {
    // A negative topK would make `slice(0, topK)` return all-but-last items; nothing is requested, so skip the lookups.
    if (!(topK > 0)) {
      return [];
    }
    const queryEmbedding = await this.embedder.vectorize(query);
    const results = await this.vectorDb.search({
      vector: queryEmbedding,
      filter: { user_id: userId },
      limit: topK * 2 // Fetch extra to apply decay
    });
    const now = Date.now();
    const scored = results.map(r => {
      // Clamp at zero so a timestamp slightly in the future cannot boost the score above its raw similarity.
      const ageHours = Math.max(0, now - r.metadata.timestamp) / 3_600_000;
      const temporalDecay = Math.exp(-decayLambda * ageHours);
      return { ...r, weightedScore: r.score * temporalDecay };
    });
    return scored
      .sort((a, b) => b.weightedScore - a.weightedScore)
      .slice(0, topK)
      .map(r => r.metadata.summary);
  }
}
**Architecture Rationale:** Vector similarity alone treats a memory from three years ago as equally relevant as one from yesterday. The exponential decay function (`Math.exp(-λt)`) naturally downweights stale context while preserving semantic relevance. Fetching `topK * 2` and re-ranking prevents the vector index from returning outdated but semantically close summaries.
Pitfall Guide
1. Unbounded Context Growth
Explanation: Appending every message to the prompt without token accounting. Works in dev, explodes in prod.
Fix: Implement a hard token budget. Use a sliding window for recent turns and trigger compression when the threshold is breached. Never allow unbounded growth.
2. Vector Databases for Exact Facts
Explanation: Using semantic search to retrieve precise data like account numbers, pricing tiers, or ticket IDs. Vectors approximate meaning, not exact values.
Fix: Store structured facts in relational or key-value stores. Use vectors exclusively for episodic summaries, preferences, and unstructured context.
3. Ignoring Temporal Decay in Retrieval
Explanation: Returning the most semantically similar memory regardless of age. Causes agents to act on outdated policies or resolved issues.
Fix: Apply a time-decay multiplier to retrieval scores. Tune the decay constant (λ) based on domain volatility (e.g., 0.02 for stable enterprise data, 0.1 for fast-moving support tickets).
4. Silent Context Truncation
Explanation: Relying on the LLM provider to handle overflow. Many APIs truncate oldest tokens without error codes, dropping critical system instructions.
Fix: Count tokens client-side before every request. Implement a fallback routine that strips low-priority context or triggers summarization when approaching 85% of the model's limit.
5. Compliance Afterthought
Explanation: Storing user data in memory without audit trails, export capabilities, or retention policies. Creates immediate GDPR/CCPA liability.
Fix: Design memory with privacy by default. Log every read/write, implement user-initiated deletion endpoints, and enforce TTLs on episodic data. Never store PII in vector embeddings without hashing or tokenization.
6. Over-Compression of Critical Context
Explanation: Compressing system prompts, tool definitions, or explicit user constraints alongside conversational turns.
Fix: Isolate behavioral constraints outside the compression window. Only compress user/assistant dialogue. Preserve tool schemas and safety guidelines in a separate, immutable context block.
7. Entity Name Collisions
Explanation: Assuming the LLM will naturally disambiguate "Apple" (fruit) vs "Apple" (company) based on context alone.
Fix: Implement a resolution layer that maps raw mentions to canonical IDs using domain-specific rules or lightweight NER pipelines. Cache resolved entities to avoid repeated inference.
Production Bundle
Action Checklist
Decision Matrix
| Scenario | Recommended Approach | Why | Cost Impact |
|---|---|---|---|
| MVP Chatbot (<1k users) | Session buffer + Redis preferences | Minimal infrastructure, fast iteration, predictable costs | Low ($50–$150/mo) |
| CRM/Support Assistant | Layered stack with entity registry | Prevents account conflation, enables cross-session personalization | Medium ($200–$500/mo) |
| High-Volume Knowledge Base | Vector recall + temporal decay | Scales to millions of documents, avoids context window limits | Medium-High ($400–$900/mo) |
| Regulated/Healthcare/Finance | Full stack + immutable audit + encryption | Meets compliance requirements, enables data subject requests | High ($800–$1.5k/mo) |
Configuration Template
// memory.config.ts
/** Values for the `session` section of the memory configuration. */
const sessionSettings = {
  tokenBudget: 8000,
  preserveRecentTurns: 5,
  compressionModel: 'gpt-4o-mini', // Lightweight summarizer
  fallbackStrategy: 'truncate_oldest'
};

/** Values for the `storage` section: one entry per kind of stored memory. */
const storageSettings = {
  preferences: {
    provider: 'redis',
    ttl: 3600,
    keyPrefix: 'user:pref:'
  },
  entities: {
    provider: 'postgres',
    tableName: 'entities',
    conflictStrategy: 'upsert_with_version'
  },
  episodic: {
    provider: 'qdrant',
    collection: 'session_summaries',
    similarityMetric: 'cosine',
    defaultTopK: 5
  }
};

/** Values for the `retrieval` section. */
const retrievalSettings = {
  temporalDecayLambda: 0.05, // Adjust based on domain
  maxContextInjection: 3000, // Tokens reserved for recalled memories
  deduplication: true
};

/** Values for the `compliance` section. */
const complianceSettings = {
  auditLogging: true,
  retentionDays: 180,
  encryptionAtRest: true,
  userExportEndpoint: '/api/v1/memory/export',
  userDeleteEndpoint: '/api/v1/memory/delete'
};

/** Complete memory configuration template, assembled from the sections above. */
export const MemoryConfig = {
  session: sessionSettings,
  storage: storageSettings,
  retrieval: retrievalSettings,
  compliance: complianceSettings
};
Quick Start Guide
- Initialize Storage Layer: Deploy Redis, Postgres, and a vector instance (Qdrant or Weaviate). Run the provided schema migrations for `user_profiles` and `entities`.
- Deploy the Orchestrator: Instantiate `SessionBuffer`, `StatePartitioner`, `EntityRegistry`, and `RecallPipeline` using the configuration template. Wire them into a single `MemoryOrchestrator` class that exposes `loadContext(userId)` and `updateState(userId, payload)`.
- Integrate with LLM Client: Before every model invocation, call `orchestrator.loadContext(userId)`. Inject the returned `systemContext`, `recentTurns`, and `recalledMemories` into the request payload. Enforce the token budget client-side.
- Validate with Session Replay: Run a test suite that simulates 50+ turn conversations. Verify that token counts remain stable, entity aliases resolve correctly, and retrieval scores decay appropriately over simulated time. Monitor API costs and context utilization metrics.