documents=chunks,
embeddings=[e.embedding for e in embeddings],
metadatas=[{"source_doc": document_id} for _ in chunks]
)
**Architecture Rationale:** ChromaDB is selected for local development and rapid iteration. The persistent client avoids external dependencies during prototyping. In production, this layer should run as a scheduled job or CI/CD step, never inside the request path. Overlap chunking preserves cross-sentence therapeutic context, which is critical for CBT reframing techniques.
### 2. Retrieval & Generation API
The Next.js API route handles streaming responses, retrieves relevant context, and enforces safety boundaries before generation.
```typescript
// app/api/chat/route.ts
import { NextRequest, NextResponse } from "next/server";
import OpenAI from "openai";
// Shared OpenAI SDK client used by this route for both the embeddings call and
// the chat completion call. The API key is read from the OPENAI_API_KEY
// environment variable.
const openai = new OpenAI({ apiKey: process.env.OPENAI_API_KEY });
// System instructions sent with role "system" as the first message of the chat
// completion request built in POST. The text defines the assistant's scope:
// supportive, grounded in therapeutic principles, no diagnosis or prescribing,
// and emergency resources when crisis intent is expressed.
const SYSTEM_PROMPT = `You are a supportive mental health assistant.
You ground all responses in verified therapeutic principles.
Never diagnose, prescribe, or replace licensed care.
If a user expresses crisis intent, immediately provide emergency resources.`;
/** Roles a client is allowed to supply in conversation history. */
type HistoryRole = "user" | "assistant";

/** A prior conversation turn reduced to the fields the chat API accepts. */
interface HistoryMessage {
  role: HistoryRole;
  content: string;
}

/** Maximum number of prior turns forwarded to the model. */
const MAX_HISTORY_MESSAGES = 4;

/**
 * Reduces untrusted client history to the last few well-formed turns.
 *
 * Only `role` and `content` are kept, so UI-only fields (ids, timestamps) are
 * never forwarded upstream. Entries whose role is not "user"/"assistant" are
 * dropped, which prevents a client from injecting its own system message.
 */
function sanitizeHistory(raw: unknown): HistoryMessage[] {
  if (!Array.isArray(raw)) return [];
  const cleaned: HistoryMessage[] = [];
  for (const entry of raw) {
    if (typeof entry !== "object" || entry === null) continue;
    const { role, content } = entry as { role?: unknown; content?: unknown };
    const roleAllowed = role === "user" || role === "assistant";
    if (roleAllowed && typeof content === "string" && content.trim() !== "") {
      cleaned.push({ role, content });
    }
  }
  return cleaned.slice(-MAX_HISTORY_MESSAGES);
}

/**
 * Chat endpoint: embeds the user's query, retrieves grounding context and
 * streams the model's reply back as plain UTF-8 text.
 *
 * Request body: `{ messages: {role, content}[], userQuery: string }`.
 *
 * @returns A streaming `text/plain` response on success; a JSON error with
 *   status 400 for malformed input or 502 when the upstream call fails.
 */
export async function POST(req: NextRequest) {
  let payload: unknown;
  try {
    payload = await req.json();
  } catch {
    return NextResponse.json({ error: "Request body must be valid JSON." }, { status: 400 });
  }

  const { messages, userQuery } = (payload ?? {}) as { messages?: unknown; userQuery?: unknown };
  if (typeof userQuery !== "string" || userQuery.trim() === "") {
    return NextResponse.json({ error: "userQuery must be a non-empty string." }, { status: 400 });
  }
  const history = sanitizeHistory(messages);

  let stream;
  try {
    // 1. Generate query embedding
    const queryEmbedding = await openai.embeddings.create({
      model: "text-embedding-3-small",
      input: userQuery,
    });

    // 2. Retrieve context (mocked vector DB call for brevity)
    const contextChunks = await retrieveRelevantContext(queryEmbedding.data[0].embedding);
    const groundedContext = contextChunks.join("\n\n---\n\n");

    // 3. Stream generation with safety constraints.
    // History goes BEFORE the current turn so the model reads the conversation
    // in chronological order and answers the latest query.
    stream = await openai.chat.completions.create({
      model: "gpt-4o-mini",
      messages: [
        { role: "system", content: SYSTEM_PROMPT },
        ...history,
        { role: "user", content: `Context:\n${groundedContext}\n\nUser: ${userQuery}` },
      ],
      stream: true,
      temperature: 0.3, // Lower temperature reduces creative drift
    });
  } catch (err) {
    // Log the error object only; never log the user's message content.
    console.error("Chat generation failed before streaming started:", err);
    return NextResponse.json({ error: "The assistant is temporarily unavailable." }, { status: 502 });
  }

  const upstream = stream;
  const encoder = new TextEncoder(); // one encoder for the whole response
  const responseStream = new ReadableStream<Uint8Array>({
    async start(controller) {
      try {
        for await (const chunk of upstream) {
          const content = chunk.choices[0]?.delta?.content ?? "";
          if (content) controller.enqueue(encoder.encode(content));
        }
        controller.close();
      } catch (err) {
        // Surface mid-stream failures to the reader instead of hanging.
        controller.error(err);
      }
    },
    cancel() {
      // Client went away: stop the upstream generation to avoid wasted tokens.
      upstream.controller.abort();
    },
  });

  return new NextResponse(responseStream, {
    headers: {
      "Content-Type": "text/plain; charset=utf-8",
      // Replies can contain sensitive content; keep them out of caches.
      "Cache-Control": "no-store",
    },
  });
}
/**
 * Placeholder retrieval step: resolves to three fixed grounding passages.
 *
 * The query vector is accepted but not consulted yet. A production version
 * would query a real vector store (ChromaDB/Weaviate/Pinecone) for the top-3
 * semantically similar chunks.
 */
async function retrieveRelevantContext(queryVector: number[]) {
  const cbtPrinciple =
    "CBT Principle: Cognitive restructuring involves identifying automatic negative thoughts and evaluating evidence for and against them.";
  const therapeuticBoundary =
    "Therapeutic Boundary: The assistant provides psychoeducation, not clinical diagnosis. Always encourage professional consultation for persistent symptoms.";
  const crisisProtocol =
    "Crisis Protocol: If a user mentions self-harm, immediately share national crisis lifelines and discourage isolation.";

  return [cbtPrinciple, therapeuticBoundary, crisisProtocol];
}
```
**Architecture Rationale:** Streaming reduces perceived latency and improves UX for emotional conversations. Bounding conversation history prevents context window bloat and cost inflation. Lower temperature (0.3) constrains creative variance, which is essential for clinical alignment. The system prompt acts as a hard boundary, not a suggestion.
### 3. Frontend Integration & Mood Tracking
The UI combines a chat interface with a sentiment visualization layer. Tracking emotional trajectory over time provides users with reflective insights and helps identify patterns.
// components/TherapySession.tsx
"use client";
import { useState, useRef, useEffect } from "react";
import { LineChart, Line, XAxis, YAxis, Tooltip, ResponsiveContainer } from "recharts";
/** A single chat bubble rendered in the conversation pane. */
interface Message {
  /** Unique key for the message; generated with crypto.randomUUID() when the message is created. */
  id: string;
  /** Who authored the message; user bubbles are styled differently from assistant bubbles. */
  role: "user" | "assistant";
  /** Message text. Assistant messages start empty and are filled in as the reply streams. */
  content: string;
  /** Creation time in epoch milliseconds (Date.now()). */
  timestamp: number;
}
/** One logged mood value; used as a data point for the mood line chart. */
interface MoodEntry {
  /** Calendar day in YYYY-MM-DD form; logMood keeps at most one entry per day. */
  date: string;
  score: number; // 1-10 scale
}
/**
 * Chat interface with a side panel for daily mood logging.
 *
 * The chat posts the prior conversation plus the new query to `/api/chat` and
 * renders the plain-text reply incrementally as it streams. Mood entries live
 * in separate component state and are charted over time.
 */
export function TherapySession() {
  const [messages, setMessages] = useState<Message[]>([]);
  const [input, setInput] = useState("");
  const [moodHistory, setMoodHistory] = useState<MoodEntry[]>([]);
  const [isSending, setIsSending] = useState(false);
  const [sendError, setSendError] = useState<string | null>(null);
  const chatEndRef = useRef<HTMLDivElement>(null);
  // Synchronous in-flight flag: state updates are asynchronous, so a ref is
  // needed to reject a second send triggered before the next render.
  const sendingRef = useRef(false);

  // Keep the newest message in view.
  useEffect(() => {
    chatEndRef.current?.scrollIntoView({ behavior: "smooth" });
  }, [messages]);

  const handleSend = async () => {
    if (!input.trim() || sendingRef.current) return;
    sendingRef.current = true;
    setIsSending(true);
    setSendError(null);

    // `messages` is the state from the current render, i.e. the conversation
    // BEFORE this turn. Send all of it (role/content only); the new query
    // travels separately as `userQuery`.
    const history = messages
      .filter(m => m.content !== "")
      .map(({ role, content }) => ({ role, content }));

    const userMsg: Message = { id: crypto.randomUUID(), role: "user", content: input, timestamp: Date.now() };
    const assistantMsg: Message = { id: crypto.randomUUID(), role: "assistant", content: "", timestamp: Date.now() };
    setMessages(prev => [...prev, userMsg, assistantMsg]);
    setInput("");

    // Replaces the placeholder assistant bubble's text with the reply so far.
    const showReply = (text: string) => {
      setMessages(prev => prev.map(m => (m.id === assistantMsg.id ? { ...m, content: text } : m)));
    };

    try {
      const response = await fetch("/api/chat", {
        method: "POST",
        headers: { "Content-Type": "application/json" },
        body: JSON.stringify({ messages: history, userQuery: userMsg.content }),
      });
      if (!response.ok || !response.body) {
        throw new Error(`Chat request failed with status ${response.status}`);
      }

      const reader = response.body.getReader();
      const decoder = new TextDecoder();
      let buffer = "";
      while (true) {
        const { done, value } = await reader.read();
        if (done) break;
        buffer += decoder.decode(value, { stream: true });
        showReply(buffer);
      }
      // Flush bytes of a multi-byte character split across the final chunks.
      buffer += decoder.decode();
      if (buffer === "") {
        throw new Error("Chat response was empty");
      }
      showReply(buffer);
    } catch (err) {
      console.error(err);
      // Remove the placeholder if nothing was streamed; keep any partial reply.
      setMessages(prev => prev.filter(m => m.id !== assistantMsg.id || m.content !== ""));
      setSendError("Something went wrong while contacting the assistant. Please try again.");
    } finally {
      sendingRef.current = false;
      setIsSending(false);
    }
  };

  const logMood = (score: number) => {
    // Key entries by the user's LOCAL calendar day; toISOString() is UTC and
    // would roll over to the next/previous day for users away from UTC.
    const now = new Date();
    const month = String(now.getMonth() + 1).padStart(2, "0");
    const day = String(now.getDate()).padStart(2, "0");
    const today = `${now.getFullYear()}-${month}-${day}`;
    // One entry per day: a later log replaces the earlier one.
    setMoodHistory(prev => [...prev.filter(e => e.date !== today), { date: today, score }]);
  };

  return (
    <div className="grid grid-cols-1 lg:grid-cols-3 gap-6 p-4 max-w-6xl mx-auto">
      <div className="lg:col-span-2 border rounded-lg p-4 bg-white shadow-sm">
        <div className="h-96 overflow-y-auto space-y-3 mb-4">
          {messages.map(m => (
            <div key={m.id} className={`p-3 rounded-lg max-w-[80%] ${m.role === "user" ? "ml-auto bg-blue-100" : "bg-gray-100"}`}>
              {m.content || "Thinking..."}
            </div>
          ))}
          <div ref={chatEndRef} />
        </div>
        {sendError && (
          <p role="alert" className="mb-2 text-sm text-red-600">
            {sendError}
          </p>
        )}
        <div className="flex gap-2">
          <input
            value={input}
            onChange={e => setInput(e.target.value)}
            onKeyDown={e => {
              // Ignore the Enter that confirms an IME composition.
              if (e.key === "Enter" && !e.nativeEvent.isComposing) void handleSend();
            }}
            placeholder="Share how you're feeling..."
            aria-label="Message"
            className="flex-1 p-2 border rounded-md focus:outline-none focus:ring-2 focus:ring-blue-400"
          />
          <button
            type="button"
            onClick={() => void handleSend()}
            disabled={isSending}
            className="px-4 py-2 bg-blue-600 text-white rounded-md hover:bg-blue-700 disabled:opacity-50"
          >
            Send
          </button>
        </div>
      </div>
      <div className="border rounded-lg p-4 bg-white shadow-sm">
        <h3 className="font-semibold mb-3">Weekly Mood Trajectory</h3>
        <ResponsiveContainer width="100%" height={200}>
          <LineChart data={moodHistory}>
            <XAxis dataKey="date" tick={{ fontSize: 10 }} />
            <YAxis domain={[1, 10]} tick={{ fontSize: 10 }} />
            <Tooltip />
            <Line type="monotone" dataKey="score" stroke="#3b82f6" strokeWidth={2} dot={{ r: 4 }} />
          </LineChart>
        </ResponsiveContainer>
        <div className="flex justify-between mt-4">
          {[1, 3, 5, 7, 10].map(val => (
            <button
              key={val}
              type="button"
              onClick={() => logMood(val)}
              aria-label={`Log mood ${val} out of 10`}
              className="px-2 py-1 text-xs border rounded hover:bg-gray-50"
            >
              {val}
            </button>
          ))}
        </div>
      </div>
    </div>
  );
}
Architecture Rationale: Recharts provides lightweight, accessible visualization without heavy bundle overhead. Mood logging is decoupled from chat state to prevent unnecessary re-renders. The UI enforces a clear separation between conversational support and self-reflection tools, aligning with therapeutic best practices.
Pitfall Guide
1. Monolithic Ingestion & Inference Runtime
Explanation: Running embedding generation and vector storage inside the same request path as chat inference causes unpredictable latency and blocks user interactions.
Fix: Decouple ingestion into a background worker, scheduled cron job, or CI/CD pipeline. The inference path should only perform read operations against a pre-populated vector store.
2. Naive Text Chunking
Explanation: Splitting clinical texts by fixed character counts or single sentences breaks semantic continuity. Therapeutic concepts often span multiple paragraphs.
Fix: Implement overlap chunking (10-15% overlap) combined with semantic boundary detection. Preserve paragraph structure where possible, and tag chunks with metadata indicating source chapter or technique type.
3. Unbounded System Prompts
Explanation: Relying solely on the system prompt for safety boundaries fails under adversarial or emotionally charged inputs. LLMs can drift when context windows fill.
Fix: Implement a pre-generation safety filter that scans user input for crisis keywords. Route high-risk queries to explicit emergency protocols before invoking the LLM. Keep system prompts under 300 tokens to preserve context window for retrieval.
4. PII Leakage in Vector Stores
Explanation: Storing raw user messages or identifiable metadata in ChromaDB/Pinecone creates compliance risks. Embeddings themselves are generally safe, but associated metadata is not.
Fix: Hash or anonymize user identifiers before storage. Never store full conversation histories in the vector database. Use ephemeral in-memory caching for active sessions, and purge after timeout.
5. Single-Metric Retrieval
Explanation: Relying exclusively on cosine similarity ignores temporal relevance and document authority. Older therapeutic guidelines may conflict with current best practices.
Fix: Implement hybrid retrieval combining semantic similarity with metadata filtering (e.g., publication_year >= 2020, technique_type = "CBT"). Apply recency boosting or manual curation weights for high-authority sources.
6. Missing Fallback Protocols
Explanation: When retrieval returns zero relevant chunks, the system either hallucinates or fails silently.
Fix: Define a graceful degradation path. If similarity scores fall below a threshold (e.g., 0.65), trigger a fallback response: "I don't have specific guidance for that scenario, but here are general coping strategies..." Log the gap for corpus expansion.
7. Neglecting Rate Limiting & Cost Controls
Explanation: OpenAI embedding and completion endpoints have strict rate limits. Unbounded concurrent requests trigger 429 errors and inflate costs.
Fix: Implement token-aware rate limiting at the API gateway. Cache identical queries where appropriate. Use gpt-4o-mini for routine interactions and reserve larger models for complex reasoning. Monitor token consumption per session.
Production Bundle
Action Checklist
Decision Matrix
| Scenario | Recommended Approach | Why | Cost Impact |
|---|---|---|---|
| Prototype / Internal Testing | ChromaDB (Local) + gpt-4o-mini | Zero infrastructure overhead, fast iteration | Low (~$0.02/session) |
| Mid-Scale Production | Weaviate Cloud + gpt-4o-mini | Managed scaling, hybrid search, built-in RBAC | Medium (~$0.05/session + infra) |
| Enterprise / HIPAA-Adjacent | Pinecone Serverless + Custom Fine-Tuned Model | Strict compliance, dedicated tenancy, audit trails | High (~$0.12/session + infra) |
| Offline / Air-Gapped Environments | Ollama + Qdrant + nomic-embed-text | No external API calls, full data sovereignty | Medium (Hardware + maintenance) |
Configuration Template
# .env.local
# Secret read by the chat API route as process.env.OPENAI_API_KEY.
# The value below is a placeholder: replace it locally and never commit a real key.
OPENAI_API_KEY=sk-proj-xxxxxxxxxxxxxxxxxxxxxxxx
# NOTE(review): the code visible in this guide does not read any of the
# variables below; confirm where each one is consumed before relying on it.
# NEXT_PUBLIC_-prefixed variables are exposed to browser code by Next.js.
NEXT_PUBLIC_APP_URL=http://localhost:3000
# Presumably the on-disk location of the local vector store; verify against the ingestion script.
VECTOR_DB_PATH=./vector_store
MAX_CONTEXT_TOKENS=4096
# Presumably the number of chunks retrieved per query; the mocked retriever returns three.
RETRIEVAL_TOP_K=3
# Presumably the minimum similarity score before the fallback response is used; TODO confirm.
SIMILARITY_THRESHOLD=0.65
RATE_LIMIT_PER_MINUTE=30
# Comma-separated terms, presumably for the pre-generation crisis filter; TODO confirm.
CRISIS_KEYWORDS=self-harm,suicide,overdose,abuse,emergency
// next.config.js

// Header name/value pairs applied to every response under /api/.
const API_SECURITY_HEADERS = [
  ["X-Content-Type-Options", "nosniff"],
  ["X-Frame-Options", "DENY"],
  ["Referrer-Policy", "strict-origin-when-cross-origin"],
];

/** @type {import('next').NextConfig} */
const nextConfig = {
  experimental: {
    // Upper bound on request bodies accepted by Server Actions.
    serverActions: { bodySizeLimit: "2mb" },
  },
  // Builds a fresh rule list on each call: one rule covering all API routes.
  async headers() {
    const headers = API_SECURITY_HEADERS.map(([key, value]) => ({ key, value }));
    return [{ source: "/api/:path*", headers }];
  },
};

module.exports = nextConfig;
# docker-compose.yml (for production ChromaDB)
# Runs a single ChromaDB server and persists its data in ./vector_store on the host.
version: '3.8'
services:
  chroma:
    # NOTE(review): ":latest" is a moving tag; consider pinning a specific version for production.
    image: chromadb/chroma:latest
    ports:
      - "8000:8000"
    volumes:
      - ./vector_store:/chroma/chroma
    environment:
      # NOTE(review): no volume in this file provides /auth/credentials.json inside
      # the container; confirm how the credentials file is supplied.
      - CHROMA_SERVER_AUTH_CREDENTIALS_FILE=/auth/credentials.json
      - CHROMA_SERVER_AUTH_PROVIDER=chromadb.auth.token.TokenAuthServerProvider
    restart: unless-stopped
Quick Start Guide
- Initialize the project: Run `npx create-next-app@latest clinical-assistant --typescript --tailwind --app` and navigate into the directory.
- Install dependencies: Execute `npm install openai recharts lucide-react clsx tailwind-merge` and create the `.env.local` file with your OpenAI key.
- Populate the vector store: Run the Python ingestion script against your curated therapeutic corpus. Verify chunks are stored with correct metadata.
- Launch the application: Start the dev server with `npm run dev`, navigate to http://localhost:3000, and test chat streaming alongside mood logging.
- Validate safety boundaries: Input crisis-related phrases and confirm the system triggers emergency protocols instead of generating clinical advice.