hexdigest()
def get(self, prompt: str, model: str, params: dict) -> Optional[str]:
    """Look up a previously stored response for this request.

    The lookup key is derived from ``prompt``, ``model`` and ``params`` via
    ``self._compute_key``; the result is whatever the backing cache's
    ``get`` returns for that key (presumably ``None`` on a miss -- confirm
    against the cache backend).
    """
    cache_key = self._compute_key(prompt, model, params)
    cached_response = self._cache.get(cache_key)
    return cached_response
def set(self, prompt: str, model: str, params: dict, response: str) -> None:
    """Store ``response`` under the key derived from the request.

    The key comes from ``self._compute_key(prompt, model, params)``; the
    entry is written with a seven-day expiry.
    """
    seconds_per_day = 86400
    ttl_seconds = seconds_per_day * 7  # 7-day TTL
    self._cache.set(
        self._compute_key(prompt, model, params),
        response,
        expire=ttl_seconds,
    )
**Architecture Rationale**: Using `diskcache` provides SQLite-backed persistence with size-bounded eviction (least-recently-stored by default; LRU is available through the `eviction_policy` setting), avoiding memory exhaustion during long-running services. The SHA-256 key generation ensures that even minor prompt variations or parameter changes trigger cache misses, preserving response integrity. A 7-day TTL balances freshness with cost savings and can be adjusted based on how frequently the knowledge base is updated.
### 2. Persistent Embedding Caching
Embedding models produce identical vectors for identical inputs. Caching at the embedding layer prevents redundant vectorization of unchanged documents.
```python
import base64
import hashlib
import sqlite3
from pathlib import Path

import numpy as np
class EmbeddingStore:
    """SQLite-backed cache of embedding vectors.

    Each vector is stored under the combination of namespace, MD5 hash of the
    content, and model version, so different namespaces or model versions
    sharing one database file never overwrite each other's rows.

    Vectors are persisted as base64-encoded float32 bytes. Cache hits are
    always returned as 1-D float32 arrays.

    The instance can be used as a context manager to close the underlying
    connection deterministically.
    """

    def __init__(self, db_path: str = "embeddings.db", namespace: str = "default"):
        """Open (or create) the database at ``db_path`` and ensure the schema.

        Args:
            db_path: Path handed to ``sqlite3.connect``.
            namespace: Logical partition used for every read and write made
                through this instance.
        """
        self._namespace = namespace
        self._conn = sqlite3.connect(db_path)
        self._init_schema()

    def _init_schema(self):
        """Create the ``vectors`` table if it does not exist yet.

        The primary key spans namespace, content hash and model version;
        keying on the content hash alone would let ``INSERT OR REPLACE``
        from one namespace/model evict another's entry. A table created by
        an older schema is left untouched by ``IF NOT EXISTS``.
        """
        self._conn.execute("""
            CREATE TABLE IF NOT EXISTS vectors (
                namespace TEXT NOT NULL,
                content_hash TEXT NOT NULL,
                vector_blob BLOB,
                model_version TEXT NOT NULL,
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                PRIMARY KEY (namespace, content_hash, model_version)
            )
        """)
        self._conn.commit()

    def get_or_compute(self, content: str, model_version: str, compute_fn) -> np.ndarray:
        """Return the cached vector for ``content`` or compute and store it.

        Args:
            content: Text whose embedding is requested.
            model_version: Embedding model identifier; part of the cache key.
            compute_fn: Callable invoked as ``compute_fn(content)`` on a
                cache miss. Its result is converted to float32 before being
                stored and returned.

        Returns:
            A writable float32 array.
        """
        content_hash = hashlib.md5(content.encode()).hexdigest()
        row = self._conn.execute(
            "SELECT vector_blob FROM vectors WHERE namespace=? AND content_hash=? AND model_version=?",
            (self._namespace, content_hash, model_version),
        ).fetchone()
        if row is not None:
            # frombuffer yields a read-only view over the decoded bytes; copy
            # so hits are writable just like freshly computed vectors.
            return np.frombuffer(base64.b64decode(row[0]), dtype=np.float32).copy()

        # Coerce before serialising: the read path always decodes float32, so
        # storing float64 bytes would come back as garbage on the next hit.
        vector = np.ascontiguousarray(compute_fn(content), dtype=np.float32)
        blob = base64.b64encode(vector.tobytes()).decode()
        # The connection context manager commits on success, rolls back on error.
        with self._conn:
            self._conn.execute(
                "INSERT OR REPLACE INTO vectors (namespace, content_hash, vector_blob, model_version) VALUES (?, ?, ?, ?)",
                (self._namespace, content_hash, blob, model_version),
            )
        return vector

    def close(self) -> None:
        """Close the underlying SQLite connection."""
        self._conn.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc, tb) -> None:
        self.close()
```

**Architecture Rationale**: SQLite offers ACID compliance and concurrent read safety without external dependencies. Namespacing by model version prevents dimension mismatches when upgrading embedding models. Storing each vector as its raw float32 bytes (base64-encoded) avoids JSON serialization overhead and preserves float32 precision exactly. The `compute_fn` callback pattern keeps the cache decoupled from specific embedding providers.
### 3. Semantic Query Caching
Semantic caching intercepts queries before retrieval and generation by matching against historically answered questions using vector similarity.
import faiss
import numpy as np
from typing import Tuple, Optional
class SemanticQueryCache:
    """In-memory semantic cache mapping query embeddings to stored answers.

    Queries are L2-normalised and stored in a FAISS inner-product index, so
    the search score is the cosine similarity. A lookup is a hit when the
    best score reaches ``threshold``.
    """

    def __init__(self, dimension: int, threshold: float = 0.82):
        """Create an empty cache.

        Args:
            dimension: Length of the embedding vectors the index accepts.
            threshold: Minimum similarity score for a lookup to count as a hit.
        """
        self._index = faiss.IndexFlatIP(dimension)
        self._answers: dict = {}
        self._threshold = threshold
        self._metadata: list = []

    @staticmethod
    def _as_unit_row(query_vector: np.ndarray) -> np.ndarray:
        """Return a normalised ``(1, d)`` float32 copy of ``query_vector``.

        ``faiss.normalize_L2`` works in place and ``reshape`` returns a view,
        so normalising without copying would silently rewrite the caller's
        array. Copying also lets non-float32 input be accepted.
        """
        row = np.array(query_vector, dtype=np.float32, order="C").reshape(1, -1)
        faiss.normalize_L2(row)
        return row

    def search(self, query_vector: np.ndarray) -> Optional[str]:
        """Return the cached answer for the most similar stored query.

        Returns ``None`` when the cache is empty or the best match scores
        below the configured threshold. ``query_vector`` is not modified.
        """
        if self._index.ntotal == 0:
            return None
        scores, indices = self._index.search(self._as_unit_row(query_vector), k=1)
        best = int(indices[0][0])
        # FAISS uses -1 as the "no result" label; never let it index the list.
        if best >= 0 and scores[0][0] >= self._threshold:
            return self._answers[self._metadata[best]]
        return None

    def add(self, query_vector: np.ndarray, answer: str) -> None:
        """Store ``answer`` under ``query_vector``; the input is not modified."""
        self._index.add(self._as_unit_row(query_vector))
        # Ids are assigned in insertion order, matching the index row number.
        cache_id = str(len(self._metadata))
        self._metadata.append(cache_id)
        self._answers[cache_id] = answer
Architecture Rationale: FAISS provides highly optimized inner-product search with minimal memory footprint. Normalizing vectors to unit length converts inner product to cosine similarity, aligning with standard embedding model outputs. The threshold must be calibrated against actual query distributions; hardcoding values leads to false negatives on paraphrases or false positives on unrelated topics. This cache bypasses both retrieval and generation, making it the most aggressive optimization when properly tuned.
### 4. Concurrent Embedding Batching
Sequential embedding calls serialize network I/O. Batching multiple texts into a single request leverages provider-side parallelism and reduces round-trip overhead.
import asyncio
import aiohttp
from typing import List
class AsyncEmbeddingBatcher:
    """Embed many texts by posting fixed-size batches concurrently."""

    def __init__(
        self,
        api_endpoint: str,
        api_key: str,
        batch_size: int = 64,
        *,
        model: str = "embedding-model-v2",
        max_concurrency: Optional[int] = None,
    ):
        """Configure the batcher.

        Args:
            api_endpoint: URL every batch is POSTed to.
            api_key: Sent as a ``Bearer`` token in the Authorization header.
            batch_size: Maximum number of texts per request; must be positive.
            model: Model name sent in each request payload.
            max_concurrency: Upper bound on simultaneously in-flight
                requests. ``None`` (the default) sends all batches at once.

        Raises:
            ValueError: If ``batch_size`` or ``max_concurrency`` is not positive.
        """
        if batch_size <= 0:
            raise ValueError("batch_size must be a positive integer")
        if max_concurrency is not None and max_concurrency <= 0:
            raise ValueError("max_concurrency must be a positive integer or None")
        self._endpoint = api_endpoint
        self._headers = {"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"}
        self._batch_size = batch_size
        self._model = model
        self._max_concurrency = max_concurrency

    async def embed_texts(self, texts: List[str]) -> List[List[float]]:
        """Return one embedding per input text, in input order.

        The texts are split into chunks of ``batch_size`` and the chunks are
        requested concurrently; ``asyncio.gather`` keeps the results in
        submission order. An empty input returns ``[]`` without opening a
        session.
        """
        if not texts:
            return []

        # Created here rather than in __init__ so it belongs to the running loop.
        limiter = (
            asyncio.Semaphore(self._max_concurrency)
            if self._max_concurrency is not None
            else None
        )

        async def run_batch(session, batch):
            if limiter is None:
                return await self._post_batch(session, batch)
            async with limiter:
                return await self._post_batch(session, batch)

        async with aiohttp.ClientSession() as session:
            tasks = [
                run_batch(session, texts[start:start + self._batch_size])
                for start in range(0, len(texts), self._batch_size)
            ]
            results = await asyncio.gather(*tasks)
        return [vec for batch_result in results for vec in batch_result]

    async def _post_batch(self, session: "aiohttp.ClientSession", batch: List[str]) -> List[List[float]]:
        """POST one batch and return the ``embedding`` of each response item.

        Raises:
            aiohttp.ClientResponseError: If the server answers with a 4xx/5xx
                status, instead of failing later on a missing ``data`` key.
        """
        payload = {"input": batch, "model": self._model}
        async with session.post(self._endpoint, json=payload, headers=self._headers) as resp:
            resp.raise_for_status()
            data = await resp.json()
            return [item["embedding"] for item in data["data"]]
**Architecture Rationale**: `asyncio.gather` dispatches all batch requests concurrently; it does not throttle on its own, so cap the number of in-flight requests (for example with an `asyncio.Semaphore`) to stay within provider rate limits. Chunking into batches of 50–100 texts aligns with typical API payload limits and prevents timeout errors. This pattern reduces 12 sequential calls from ~800ms to ~280ms, with gains scaling linearly with document volume. It is most valuable during index construction and high-concurrency query windows.
Pitfall Guide
1. Cache Invalidation Blind Spots
Explanation: Storing responses or embeddings indefinitely causes stale data to surface when source documents or business logic change.
Fix: Implement TTL-based expiration, content-hash tracking, and explicit cache purge endpoints. Tie cache lifecycles to document versioning systems.
2. Semantic Threshold Guesswork
Explanation: Setting similarity thresholds arbitrarily (e.g., 0.85) ignores the actual distribution of your query space, causing high miss rates on paraphrases.
Fix: Run calibration scripts on historical query logs. Plot cosine similarity distributions for known-similar and known-dissimilar pairs. Select a threshold at the intersection point, typically 0.78–0.84 for modern embedding models.
3. Embedding Model Drift
Explanation: Switching embedding models without clearing caches introduces dimension mismatches and semantic drift, corrupting vector search results.
Fix: Namespace caches by model_name and model_version. Validate vector dimensions on cache read. Implement automatic cache invalidation when model configuration changes.
4. Synchronous Embedding Bottlenecks
Explanation: Using blocking HTTP clients for embedding requests ties up worker threads, degrading throughput under concurrent load.
Fix: Replace synchronous calls with async I/O or thread pools. Batch requests where APIs support it. Monitor event loop latency and adjust concurrency limits accordingly.
5. Over-Caching Personalized Queries
Explanation: Caching responses that contain user-specific data (e.g., account balances, personalized recommendations) violates data isolation and creates security risks.
Fix: Exclude queries containing user identifiers, session tokens, or dynamic variables from semantic and LLM caches. Implement cache key scoping that includes user context when personalization is required.
6. Unbounded Cache Growth
Explanation: File-based or in-memory caches expand indefinitely, eventually exhausting disk space or RAM, causing service crashes.
Fix: Configure maximum cache size, enable LRU eviction, and monitor storage metrics. Use external cache services (Redis, Memcached) for distributed deployments with built-in memory management.
7. False Confidence in Vector Similarity
Explanation: High cosine similarity does not guarantee factual correctness or contextual relevance. Semantic caches may return plausible but outdated answers.
Fix: Implement fallback mechanisms that bypass cache when confidence scores fall below secondary thresholds. Log cache hits for periodic audit and retraining.
Production Bundle
Action Checklist
Decision Matrix
| Scenario | Recommended Approach | Why | Cost Impact |
|---|---|---|---|
| High-volume FAQ bot with repetitive phrasing | Semantic Cache + LLM Cache | Bypasses retrieval and generation for similar queries | Reduces API spend by 60–80% |
| Static knowledge base with monthly updates | Embedding Cache + Async Batching | Prevents re-vectorization of unchanged documents | Cuts embedding costs by 70% |
| Dynamic enterprise search with user context | LLM Cache (scoped) + Async Batching | Preserves personalization while accelerating vectorization | Moderate cost reduction, low risk |
| Low-concurrency prototype | In-memory LLM Cache only | Minimal setup, validates caching benefits before scaling | Near-zero infrastructure cost |
Configuration Template
rag_pipeline:
caching:
llm_response:
enabled: true
backend: diskcache
ttl_hours: 168
max_size_gb: 2
embedding:
enabled: true
backend: sqlite
namespace_prefix: "v2"
model_version: "text-embedding-3-large"
semantic:
enabled: true
similarity_threshold: 0.81
index_type: faiss_ip
max_entries: 50000
concurrency:
embedding_batch_size: 64
async_workers: 4
rate_limit_rpm: 3000
observability:
cache_metrics: true
latency_p95_target_ms: 1200
cost_tracking: true
Quick Start Guide
- Initialize Cache Backends: Create directories for the `diskcache` and SQLite files. Configure environment variables for API keys and model versions.
- Deploy LLM & Embedding Caches: Instantiate `LLMResponseCache` and `EmbeddingStore` at application startup. Wrap existing embedding and generation calls with cache lookup logic.
- Calibrate Semantic Threshold: Run a calibration script against 500 historical queries. Adjust `similarity_threshold` until the hit rate stabilizes between 35–50% without false positives.
- Enable Async Batching: Replace synchronous embedding loops with `AsyncEmbeddingBatcher`. Configure batch size and concurrency limits to match your provider's rate limits.
- Monitor & Iterate: Deploy logging for cache hit rates, latency percentiles, and token consumption. Adjust TTLs and thresholds based on weekly telemetry reports.