fetchOrderHistory(sessionId, { limit: 3 })
]);
// Semantic retrieval against localized product embeddings
const matches = await vectorSearch({
query: userQuery,
collection: 'product_catalog',
filters: {
in_stock: true,
locale: profile.preferred_locale
},
topK: 5
});
return {
customer: sanitizePii(profile, ['first_name', 'tier', 'preferred_locale']),
cart: cart.items,
recentOrders: orders,
productMatches: matches
};
}
**Architecture Rationale:** Parallel fetching reduces latency by 60% compared to sequential calls. Vector search with `topK: 5` prevents context window pollution. PII sanitization is non-negotiable; models should never receive payment tokens, full addresses, or internal IDs. The pipeline returns a strictly typed payload, ensuring downstream layers cannot accidentally inject raw database rows.
### 2. Prompt Construction & Locale Locking
System prompts must enforce domain boundaries, inject assembled context, and lock language output. Relying on the model to self-regulate language or scope is a known failure pattern in multilingual deployments.
```typescript
/**
 * Builds the system prompt for one request.
 *
 * The result is two parts separated by a blank line: a fixed set of behavioral
 * rules (including the locale lock taken from `ctx.customer.preferred_locale`)
 * followed by a Markdown-style data section listing customer state, product
 * matches and recent orders.
 *
 * @param ctx Assembled context payload for the current customer/session.
 * @returns The complete system prompt text.
 */
export function buildSystemPrompt(ctx: ContextPayload): string {
  const { customer, cart, productMatches, recentOrders } = ctx;

  // Constraint layer: always emitted first.
  const guardrails = [
    'You are the commerce assistant for Acme Retail.',
    `Respond exclusively in ${customer.preferred_locale}.`,
    'Never switch languages mid-conversation.',
    'Only reference products, policies, or orders present in the provided context.',
  ].join('\n');

  let cartTotal = 0;
  cart.forEach((item) => {
    cartTotal += item.price * item.qty;
  });

  const productLines = productMatches.map(
    (product) =>
      `- ${product.name} | $${product.price} | SKU: ${product.sku}\n ${product.description}`,
  );
  const orderLines = recentOrders.map(
    (order) => `- #${order.id} | Status: ${order.status} | Placed: ${order.date}`,
  );

  // Data layer. The empty first/last entries reproduce the leading and trailing
  // newlines that are then removed by trim().
  const dataSection = [
    '',
    '## Customer State',
    `- Name: ${customer.first_name}`,
    `- Loyalty Tier: ${customer.tier}`,
    `- Cart: ${cart.length} items | Total: $${cartTotal.toFixed(2)}`,
    '## Relevant Products',
    productLines.join('\n'),
    '## Recent Orders',
    orderLines.join('\n'),
    '',
  ]
    .join('\n')
    .trim();

  return guardrails + '\n\n' + dataSection;
}
```
**Architecture Rationale:** The prompt is split into a constraint layer and a data layer. Placing the constraints first establishes behavioral boundaries before any customer data appears in the prompt. Locale locking uses an explicit instruction rather than an implicit expectation. Product data is formatted consistently to reduce parsing ambiguity during function call generation.
### 3. Tool Definition & Execution
Function calling transforms the assistant from a responder into an actor. Tools must be strictly typed, idempotent where possible, and executed asynchronously to avoid blocking streaming responses.
import { z } from 'zod';
/**
 * Tool (function-calling) definitions advertised to the model.
 *
 * Each entry pairs a tool name and description with a JSON Schema object
 * describing the arguments the model may supply.
 *
 * The `type` discriminant is pinned with `as const`. Without it TypeScript
 * widens `'function'` to `string`, and the array stops being assignable to a
 * tool type whose `type` field is the literal `'function'` (as in the OpenAI
 * SDK's chat-completion tool definition).
 */
export const commerceTools = [
  {
    type: 'function' as const,
    function: {
      name: 'modify_cart',
      description: 'Add, remove, or update quantity of a product in the active session cart',
      parameters: {
        type: 'object',
        properties: {
          action: { type: 'string', enum: ['add', 'remove', 'update'] },
          sku: { type: 'string' },
          // Optional: only `action` and `sku` are listed as required below.
          quantity: { type: 'integer', minimum: 1, maximum: 50 }
        },
        required: ['action', 'sku']
      }
    }
  },
  {
    type: 'function' as const,
    function: {
      name: 'retrieve_order_tracking',
      description: 'Fetch current status and carrier tracking details for a customer order',
      parameters: {
        type: 'object',
        properties: {
          order_id: { type: 'string' }
        },
        required: ['order_id']
      }
    }
  },
  {
    type: 'function' as const,
    function: {
      name: 'validate_promo_code',
      description: 'Check eligibility and apply discount to current cart total',
      parameters: {
        type: 'object',
        properties: {
          code: { type: 'string' }
        },
        required: ['code']
      }
    }
  }
];
/** Actions the `modify_cart` tool schema allows. */
const CART_ACTIONS: readonly string[] = ['add', 'remove', 'update'];
/** Inclusive quantity bounds the `modify_cart` tool schema allows. */
const MIN_CART_QUANTITY = 1;
const MAX_CART_QUANTITY = 50;

/** Returns `input[key]` when it is a string; otherwise throws an error naming the tool and argument. */
function requireStringArg(input: Record<string, unknown>, key: string, tool: string): string {
  const value = input[key];
  if (typeof value !== 'string') {
    throw new Error(`Tool "${tool}" requires a string "${key}" argument`);
  }
  return value;
}

/**
 * Executes a single tool call returned by the model and resolves with the
 * backing service's result.
 *
 * The arguments arrive as a JSON string produced by the model, so they are
 * treated as untrusted: the JSON must parse to an object and each argument is
 * checked against the constraints of the corresponding tool schema before any
 * service is invoked.
 *
 * @param toolCall Object with a `function` property holding `name` and the
 *   JSON-encoded `arguments` string. Typed `any` to keep the existing call
 *   signature unchanged.
 * @returns Whatever the dispatched service call resolves to.
 * @throws SyntaxError if `arguments` is not valid JSON.
 * @throws Error if the arguments are not an object, fail validation, or the
 *   tool name is not recognized.
 */
export async function executeToolCall(toolCall: any): Promise<any> {
  const { name, arguments: args } = toolCall.function;

  let parsed: unknown;
  try {
    parsed = JSON.parse(args);
  } catch {
    // Same error type JSON.parse throws, but with the tool name for diagnostics.
    throw new SyntaxError(`Tool "${name}" received arguments that are not valid JSON`);
  }
  if (typeof parsed !== 'object' || parsed === null || Array.isArray(parsed)) {
    throw new Error(`Tool "${name}" arguments must be a JSON object`);
  }
  const input = parsed as Record<string, unknown>;

  switch (name) {
    case 'modify_cart': {
      const action = requireStringArg(input, 'action', name);
      if (!CART_ACTIONS.includes(action)) {
        throw new Error(`Tool "${name}" received unsupported action "${action}"`);
      }
      const sku = requireStringArg(input, 'sku', name);
      const quantity = input.quantity;
      // quantity is optional in the schema; validate it only when supplied.
      if (quantity != null) {
        if (
          typeof quantity !== 'number' ||
          !Number.isInteger(quantity) ||
          quantity < MIN_CART_QUANTITY ||
          quantity > MAX_CART_QUANTITY
        ) {
          throw new Error(
            `Tool "${name}" requires "quantity" to be an integer between ${MIN_CART_QUANTITY} and ${MAX_CART_QUANTITY}`,
          );
        }
      }
      return await cartService.update(action, sku, quantity);
    }
    case 'retrieve_order_tracking':
      return await orderService.getTracking(requireStringArg(input, 'order_id', name));
    case 'validate_promo_code':
      return await promoEngine.apply(requireStringArg(input, 'code', name));
    default:
      throw new Error(`Unknown tool: ${name}`);
  }
}
**Architecture Rationale:** Tools are declared with strict JSON Schema parameter definitions (which can be mirrored in Zod for runtime validation) to constrain the arguments the model produces. Execution is decoupled from inference; the model returns tool calls, the router executes them, and results are fed back in the next turn. This prevents blocking the LLM stream and allows retry logic, circuit breakers, and rate limiting at the tool layer.
### 4. Session State Management & Summarization
GPT models do not retain memory. Your application owns the conversation history. Naive truncation loses early intent; unbounded history exhausts context windows. The solution is rolling summarization with explicit state persistence.
import { Redis } from 'ioredis';
// Module-level Redis client used by SessionOrchestrator to persist and load
// session state; the connection URL is read from the REDIS_URL environment variable.
const redis = new Redis(process.env.REDIS_URL);
/**
 * Owns the conversation state for one session: a verbatim window of recent
 * messages plus a rolling text summary of everything older, persisted to Redis
 * after every appended message.
 */
export class SessionOrchestrator {
  /**
   * Number of turns kept verbatim. The compaction threshold and the payload
   * window both treat one turn as two messages (user + assistant).
   */
  private maxActiveTurns = 8;
  /** How many of the oldest messages are folded into the summary per compaction. */
  private summaryBatchSize = 6;
  private history: Message[] = [];
  private summary: string | null = null;

  constructor(private sessionId: string) {}

  /**
   * Appends a message, compacts the oldest messages into the summary once the
   * window is exceeded, and persists the session.
   *
   * @throws Rethrows any summarization or persistence error. If summarization
   *   fails, the messages selected for compaction are put back so no history
   *   is lost.
   */
  async append(role: 'user' | 'assistant', content: string): Promise<void> {
    this.history.push({ role, content });
    if (this.history.length > this.maxActiveTurns * 2) {
      // Remove synchronously so overlapping appends never compact the same batch.
      const older = this.history.splice(0, this.summaryBatchSize);
      try {
        this.summary = await this.generateSummary(older, this.summary);
      } catch (err) {
        // Restore the batch at the front; otherwise these messages would exist
        // neither in the history nor in the summary.
        this.history.unshift(...older);
        throw err;
      }
    }
    await this.persist();
  }

  /**
   * Builds the message array for an inference call: one system message (the
   * summary, if any, prefixed to `systemPrompt`) followed by the active window.
   */
  getConversationPayload(systemPrompt: string): any[] {
    const contextPrefix = this.summary
      ? `Earlier in this conversation: ${this.summary}\n\n`
      : '';
    return [
      { role: 'system', content: `${contextPrefix}${systemPrompt}` },
      // The window is counted in messages (two per turn) to match the
      // compaction threshold in append(); slicing by turn count alone would
      // omit retained messages that have not been summarized yet.
      ...this.history.slice(-this.maxActiveTurns * 2)
    ];
  }

  /** Writes history and summary to Redis under `session:<id>` with a 3600-second expiry. */
  private async persist(): Promise<void> {
    await redis.setex(
      `session:${this.sessionId}`,
      3600,
      JSON.stringify({ history: this.history, summary: this.summary })
    );
  }

  /**
   * Restores a session from Redis, or returns an empty one when no entry exists.
   * Stored fields with an unexpected shape are ignored, leaving the defaults.
   *
   * @throws SyntaxError if the stored value is not valid JSON.
   */
  static async load(sessionId: string): Promise<SessionOrchestrator> {
    const raw = await redis.get(`session:${sessionId}`);
    const instance = new SessionOrchestrator(sessionId);
    if (raw) {
      const data: unknown = JSON.parse(raw);
      if (typeof data === 'object' && data !== null) {
        const { history, summary } = data as { history?: unknown; summary?: unknown };
        if (Array.isArray(history)) {
          instance.history = history as Message[];
        }
        if (typeof summary === 'string') {
          instance.summary = summary;
        }
      }
    }
    return instance;
  }

  /**
   * Asks the summarization model to condense `turns` and appends the result to
   * `existing`, separated by " | ".
   *
   * NOTE(review): summaries are concatenated rather than re-summarized, so the
   * summary text grows with every compaction. Consider feeding `existing` back
   * into the prompt if long sessions are expected.
   *
   * @returns The combined summary. When the model returns no content, the
   *   existing summary is returned unchanged (or an empty string if none).
   */
  private async generateSummary(turns: Message[], existing: string | null): Promise<string> {
    const transcript = turns.map(t => `${t.role}: ${t.content}`).join('\n');
    const prompt = `Summarize the following conversation turns concisely. Preserve product interests, cart actions, and unresolved questions.\n\n${transcript}`;
    const response = await llmClient.chat.completions.create({
      model: 'gpt-4o-mini',
      messages: [{ role: 'user', content: prompt }],
      max_tokens: 150
    });
    // content can be null or the choices list empty; never splice "null" into the summary.
    const fresh = response.choices[0]?.message?.content ?? '';
    if (fresh === '') {
      return existing ?? '';
    }
    return existing ? `${existing} | ${fresh}` : fresh;
  }
}
Architecture Rationale: The orchestrator maintains a sliding window of 8 active turns. Older turns are compressed into a summary via a lightweight model (gpt-4o-mini), preserving intent without token bloat. State is persisted to Redis with a 1-hour TTL, matching typical shopping session duration. Loading reconstructs the exact state, enabling seamless multi-device handoffs.
Pitfall Guide
1. Catalog Dumping
Explanation: Injecting the entire product database into the prompt to "ensure coverage." This inflates token usage, degrades attention quality, and increases hallucination risk.
Fix: Implement semantic filtering with topK: 5 and strict stock/locale filters. Use hybrid search (BM25 + vector) if your catalog exceeds 10k SKUs.
2. Implicit Language Routing
Explanation: Assuming the model will detect and maintain language automatically. GPT defaults to English under ambiguity, causing silent locale drift.
Fix: Run explicit language detection (franc, langdetect, or platform locale headers) before inference. Lock output language in the system prompt and validate responses against the detected locale.
3. History Truncation
Explanation: Dropping the oldest messages when the context window fills. This erases early purchase intent, cart modifications, and return requests.
Fix: Replace truncation with rolling summarization. Maintain a compact summary block and a fixed-size active window. Validate summary accuracy with periodic human review.
4. Blocking Tool Execution
Explanation: Executing tool calls inline while waiting for the LLM stream. This breaks the streaming experience and increases perceived latency.
Fix: Decouple inference and execution. Stream the initial response, pause for tool calls, execute asynchronously, then resume streaming with tool results. Implement retry logic with exponential backoff for failed cart or promo operations.
5. Unsanitized PII in Context
Explanation: Passing full customer profiles, payment tokens, or internal IDs to the model. This violates compliance standards and increases data exposure risk.
Fix: Implement a strict field whitelist. Strip PII at the context assembly layer. Use hashed identifiers for internal references and never expose raw database schemas.
6. Ignoring Token Budgets
Explanation: Letting context size grow unchecked, causing rate limit errors or degraded model performance.
Fix: Enforce dynamic token budgeting. Allocate ~40% for system prompt, ~30% for context, ~20% for history, and ~10% for tool definitions. Trim or summarize when thresholds are approached.
7. Hardcoded Fallbacks
Explanation: Returning generic error messages when tool calls fail or confidence is low. This breaks trust and increases support tickets.
Fix: Implement confidence scoring on model outputs. Route low-confidence intents to human agents or structured FAQ fallbacks. Log all fallback triggers for continuous prompt refinement.
Production Bundle
Action Checklist
Decision Matrix
| Scenario | Recommended Approach | Why | Cost Impact |
|---|---|---|---|
| Catalog < 5k SKUs | Pure vector search (pgvector) | Low latency, high precision, minimal infrastructure | Low |
| Catalog > 50k SKUs | Hybrid search (BM25 + vector) | Balances keyword accuracy with semantic relevance | Medium |
| Single Locale (EN) | Inline prompt translation | Simpler pipeline, lower compute overhead | Low |
| Multi-Locale (FR/EN/DE) | Offline batch translation + locale filtering | Prevents inference latency, ensures consistent terminology | Medium |
| High-Volume Cart Actions | Async tool execution + streaming | Maintains UX responsiveness under load | Low |
| Compliance-Heavy (EU/CA) | Strict PII scrubbing + hashed IDs | Meets GDPR/CCPA requirements without model exposure | Medium |
Configuration Template
// openai-config.ts
import OpenAI from 'openai';
import { commerceTools } from './tool-definitions';
/**
 * Shared OpenAI client, authenticated with the OPENAI_API_KEY environment variable.
 *
 * The request timeout is set through the client's `timeout` option
 * (milliseconds). The previous `defaultQuery: { timeout }` only appended a
 * `timeout` query-string parameter to every request URL and left the SDK's
 * default timeout in effect.
 */
export const llmClient = new OpenAI({
  apiKey: process.env.OPENAI_API_KEY,
  timeout: 15000
});
/**
 * Default inference options (model, tool definitions, sampling and streaming
 * settings). Presumably spread into chat-completion requests — the call site
 * is not shown here.
 *
 * `tool_choice` and `stream` are pinned with `as const`. Left unannotated they
 * widen to `string` and `boolean`, which is not assignable to the literal
 * `'auto'` option and prevents the SDK from selecting its streaming overload
 * when this object is spread into a request.
 */
export const inferenceConfig = {
  model: 'gpt-4o',
  tools: commerceTools,
  tool_choice: 'auto' as const,
  temperature: 0.3,
  max_tokens: 800,
  stream: true as const
};
// session-config.ts
// Session persistence and summarization settings, gathered into one exported object.
const redisUrl = process.env.REDIS_URL; // undefined when the variable is not set
const ttlSeconds = 60 * 60;
const maxActiveTurns = 8;
const summaryModel = 'gpt-4o-mini';
const summaryMaxTokens = 150;

export const sessionConfig = {
  redisUrl,
  ttlSeconds,
  maxActiveTurns,
  summaryModel,
  summaryMaxTokens
};
Quick Start Guide
- Initialize Context Pipeline: Deploy `assembleContext` with parallel fetching and vector search. Configure your product embeddings with locale tags and stock filters.
- Register Tools: Define function schemas matching your commerce backend. Implement idempotent handlers for cart, orders, and promotions. Add retry logic with circuit breakers.
- Deploy State Manager: Spin up Redis, configure TTL to 3600s, and instantiate `SessionOrchestrator`. Test rolling summarization with 20+ turn conversations.
- Enable Streaming: Configure `stream: true` in inference calls. Implement async tool execution that pauses streaming, runs backend actions, and resumes output.
- Validate & Monitor: Run load tests with simulated multilingual queries. Track token usage, tool success rates, and fallback triggers. Adjust `topK` and summarization thresholds based on telemetry.