authentication modes reduces the attack surface to local service accounts only.
2. Sanitized Ingestion Pipeline
Clinical documents must be de-identified before embedding. HIPAA Safe Harbor requires stripping 18 direct identifiers; Expert Determination allows statistical risk assessment. De-identification happens before chunking, not after.
# pipeline/ingestor.py
import hashlib
import re
from dataclasses import dataclass
from typing import List, Dict, Any
from sentence_transformers import SentenceTransformer
import actian_vectorai as vdb
@dataclass
class ClinicalChunk:
    """Record describing one chunk of a clinical document.

    A plain value container; the generated ``__init__`` accepts the fields
    in the order they are declared here.
    """

    # Identifier of the chunk.
    chunk_id: str
    # Text content carried by the chunk.
    raw_text: str
    # Department the source document belongs to.
    department: str
    # Document category label.
    doc_type: str
    # Role of the document's author.
    author_role: str
    # Ingestion date, kept as a string.
    ingestion_date: str
class ClinicalIngestor:
    """Sanitize, chunk, embed, and upsert clinical documents into a vector collection.

    Constructing an instance opens a client for ``host``, loads the sentence
    embedding model, and creates ``collection`` when it does not exist yet.
    """

    # Identifier shapes that sanitize() replaces with "[REDACTED]", applied in
    # this order. Compiled once at class creation instead of on every call.
    # NOTE(review): only four shapes are covered; this is a starting point,
    # not a complete de-identification of every identifier category.
    _REDACTION_PATTERNS = tuple(
        re.compile(pattern)
        for pattern in (
            r'\b\d{3}-\d{2}-\d{4}\b',  # SSN
            r'\b\d{2}/\d{2}/\d{4}\b',  # DOB
            r'(?:MRN|Patient ID)[:\s]*[A-Z0-9]+',
            r'(?:Dr\.|Patient)[:\s]*[A-Z][a-z]+',
        )
    )

    # Maximum number of chunk characters stored in the payload preview.
    _PREVIEW_CHARS = 150

    def __init__(self, host: str, collection: str):
        """Connect to the vector store, load the encoder, ensure the collection exists.

        Args:
            host: Address passed to ``vdb.Client``.
            collection: Name of the collection to write into.
        """
        self.client = vdb.Client(host)
        self.collection = collection
        self.encoder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
        self._init_collection()

    def _init_collection(self) -> None:
        """Create the collection with its metadata schema if it is missing."""
        if self.client.has_collection(self.collection):
            return
        self.client.create_collection(
            name=self.collection,
            vector_size=384,
            distance=vdb.Distance.COSINE,
            metadata_schema={
                "department": "string",
                "doc_type": "string",
                "author_role": "string",
                "ingestion_date": "string",
            },
        )

    def sanitize(self, text: str) -> str:
        """Return ``text`` with every redaction-pattern match replaced by ``[REDACTED]``."""
        sanitized = text
        for pattern in self._REDACTION_PATTERNS:
            sanitized = pattern.sub('[REDACTED]', sanitized)
        return sanitized

    @staticmethod
    def _make_preview(chunk: str, limit: int = 150) -> str:
        """Return at most ``limit`` characters of ``chunk``.

        The "..." marker is appended only when text was actually cut off, so a
        short chunk is not presented as truncated.
        """
        if len(chunk) <= limit:
            return chunk
        return chunk[:limit] + "..."

    def chunk_and_embed(
        self,
        documents: List[Dict[str, Any]],
        chunk_size: int = 512,
        overlap: int = 50,
    ) -> "List[vdb.PointStruct]":
        """Sanitize each document, split it into chunks, and embed every chunk.

        Sanitization runs before splitting, so no identifier matched by the
        redaction patterns can be cut in half by a chunk boundary.

        Args:
            documents: Dicts providing the keys ``id``, ``content``,
                ``department``, ``type``, ``author_role`` and ``date``.
            chunk_size: Words per chunk (default 512).
            overlap: Words shared by consecutive chunks (default 50).

        Returns:
            One point per chunk, ready to be passed to ``load``.

        Raises:
            KeyError: If a document lacks one of the required keys.
            ValueError: If ``chunk_size``/``overlap`` are inconsistent.
        """
        points = []
        for doc in documents:
            clean_text = self.sanitize(doc["content"])
            chunks = self._split_text(clean_text, chunk_size=chunk_size, overlap=overlap)
            for idx, chunk in enumerate(chunks):
                # Deterministic ID: the same document and chunk index always map
                # to the same point, so a retried run overwrites, not duplicates.
                # NOTE(review): truncating to 12 hex characters keeps 48 bits of
                # the digest; confirm that is enough for the expected corpus size.
                chunk_id = hashlib.sha256(f"{doc['id']}_{idx}".encode()).hexdigest()[:12]
                embedding = self.encoder.encode(chunk).tolist()
                points.append(vdb.PointStruct(
                    id=chunk_id,
                    vector=embedding,
                    payload={
                        "department": doc["department"],
                        "doc_type": doc["type"],
                        "author_role": doc["author_role"],
                        "ingestion_date": doc["date"],
                        "text_preview": self._make_preview(chunk, self._PREVIEW_CHARS),
                    },
                ))
        return points

    def _split_text(self, text: str, chunk_size: int, overlap: int) -> List[str]:
        """Split ``text`` into overlapping chunks of whitespace-separated words.

        Args:
            text: Text to split.
            chunk_size: Maximum number of words per chunk; must be positive.
            overlap: Words repeated at the start of the next chunk; must
                satisfy ``0 <= overlap < chunk_size``.

        Returns:
            The chunks in document order; an empty list for blank text.

        Raises:
            ValueError: If the size/overlap combination is invalid. Without
                this check a zero step fails inside ``range``, a negative
                step yields no chunks at all, and a negative overlap
                silently skips words between chunks.
        """
        if chunk_size <= 0:
            raise ValueError(f"chunk_size must be positive, got {chunk_size}")
        if not 0 <= overlap < chunk_size:
            raise ValueError(
                f"overlap must satisfy 0 <= overlap < chunk_size, "
                f"got overlap={overlap}, chunk_size={chunk_size}"
            )
        words = text.split()
        chunks = []
        step = chunk_size - overlap
        for start in range(0, len(words), step):
            chunks.append(" ".join(words[start:start + chunk_size]))
            # This window already reached the last word; a further window
            # would only repeat the overlapping tail.
            if start + chunk_size >= len(words):
                break
        return chunks

    def load(self, points: "List[vdb.PointStruct]") -> None:
        """Upsert ``points`` into the collection; an empty batch is a no-op."""
        if not points:
            return
        self.client.upsert(collection_name=self.collection, points=points)
Rationale: Metadata is enforced at the schema level, making it impossible to insert records without departmental or role classification. The 512-word chunk size with 50-word overlap (the splitter counts whitespace-separated words, not model tokens) preserves clinical context across boundaries without bloating retrieval windows. Chunk IDs are derived deterministically from the document ID and chunk index via SHA-256, so a pipeline retry upserts the same IDs instead of ingesting duplicates.
3. Query-Time Access Enforcement & Generation
Retrieval must be gated by role-based access control (RBAC) before reaching the vector index. Hybrid search combines semantic similarity with strict metadata filtering.
# pipeline/query_engine.py
import hashlib
import logging
from dataclasses import dataclass
from datetime import datetime, timezone
from typing import Optional

import actian_vectorai as vdb
from sentence_transformers import SentenceTransformer
@dataclass
class QueryContext:
    """Caller identity and query text for a single retrieval request.

    A plain value container; the generated ``__init__`` accepts the fields
    in the order they are declared here.
    """

    # Identifier of the requesting user; written to the audit record.
    user_id: str
    # Department used as the retrieval filter.
    department: str
    # Role checked against the restricted-role list.
    role: str
    # Query text as submitted; embedded for search and hashed for the audit log.
    raw_query: str
class ClinicalQueryEngine:
    """Role-gated retrieval over the clinical collection with audited generation.

    A query is checked against the restricted-role list, embedded, searched
    with a department and document-type filter, answered from the retrieved
    previews, and finally written to the ``clinical_audit`` logger.
    """

    # Roles that may never run clinical retrieval. Built once, not per call.
    _RESTRICTED_ROLES = frozenset({"billing_clerk", "scheduling_bot", "external_vendor"})

    def __init__(self, host: str, collection: str, llm_endpoint: str):
        """Open the vector client, load the encoder, and bind the audit logger.

        Args:
            host: Address passed to ``vdb.Client``.
            collection: Collection to search.
            llm_endpoint: Endpoint of the generation model (stored only; the
                generation step is still a placeholder).
        """
        self.client = vdb.Client(host)
        self.collection = collection
        self.llm_endpoint = llm_endpoint
        self.encoder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
        self.audit_logger = logging.getLogger("clinical_audit")

    def execute(self, ctx: "QueryContext", top_k: int = 5) -> dict:
        """Run one access-checked, filtered, audited query.

        Args:
            ctx: Caller identity and query text.
            top_k: Maximum number of search hits to retrieve.

        Returns:
            The dict produced by ``_generate_with_citations``.

        Raises:
            PermissionError: If ``ctx.role`` is a restricted role.
        """
        # Gate first: a restricted role must never reach the encoder or index.
        self._validate_access(ctx)
        query_vec = self.encoder.encode(ctx.raw_query).tolist()
        filter_payload = {
            "department": ctx.department,
            "doc_type": {"$in": ["clinical_note", "treatment_protocol"]},
        }
        results = self.client.search(
            collection_name=self.collection,
            query_vector=query_vec,
            limit=top_k,
            query_filter=filter_payload,
        )
        # NOTE(review): the generation context is built from the stored
        # "text_preview" payload, which the ingestion snippet cuts at 150
        # characters; confirm that is enough context for generation.
        context_blocks = [hit.payload["text_preview"] for hit in results]
        response = self._generate_with_citations(ctx.raw_query, context_blocks)
        self._log_audit(ctx, results, response)
        return response

    def _validate_access(self, ctx: "QueryContext") -> None:
        """Reject restricted roles and record the denial in the audit log.

        Raises:
            PermissionError: If ``ctx.role`` is in the restricted set.
        """
        if ctx.role in self._RESTRICTED_ROLES:
            # Denied attempts are audit-relevant too; without this entry a
            # refused request would leave no trace in the audit log.
            self.audit_logger.warning(
                "CLINICAL_ACCESS_DENIED",
                extra={
                    "user_id": ctx.user_id,
                    "department": ctx.department,
                    "role": ctx.role,
                },
            )
            raise PermissionError(f"Role {ctx.role} lacks clinical retrieval privileges")

    @staticmethod
    def _build_prompt(query: str, context: list) -> str:
        """Build the grounded prompt with context blocks numbered from 1.

        Each block carries its own ``[n]`` label so the citation markers the
        prompt asks for can be resolved back to a specific block.
        """
        numbered = "\n---\n".join(
            f"[{number}] {block}" for number, block in enumerate(context, start=1)
        )
        return (
            "Answer the following clinical query using ONLY the provided context. "
            "Include citation markers [1], [2] corresponding to context order. "
            "Do not invent information or reference external knowledge.\n\n"
            f"Context:\n{numbered}\n\nQuery: {query}"
        )

    def _generate_with_citations(self, query: str, context: list) -> dict:
        """Return the answer and the 1-based citation indexes of the context blocks.

        Generation is still a placeholder: the prompt is built but not yet
        sent to a model, and the answer is a fixed marker string.
        """
        prompt = self._build_prompt(query, context)  # to be sent to the local LLM
        # Placeholder for local LLM call
        return {"answer": "[Local LLM Output]", "citations": list(range(1, len(context) + 1))}

    def _log_audit(self, ctx: "QueryContext", results: list, response: dict) -> None:
        """Write one audit record for a completed query.

        The raw query text is never logged; only its SHA-256 hex digest is.
        """
        record = {
            # Actual event time in UTC; a fixed literal here would make every
            # audit entry carry the same timestamp.
            "timestamp": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
            "user_id": ctx.user_id,
            "department": ctx.department,
            "role": ctx.role,
            "query_hash": hashlib.sha256(ctx.raw_query.encode()).hexdigest(),
            "retrieved_ids": [hit.id for hit in results],
            "response_length": len(response.get("answer", "")),
        }
        self.audit_logger.info("CLINICAL_QUERY", extra=record)
Rationale: The MUST filter on department prevents cross-unit data leakage. Query hashing in audit logs preserves traceability without storing raw PHI. The generation prompt explicitly forbids external knowledge, instructing the model to ground responses in retrieved chunks only. This reduces the risk of hallucination-driven compliance violations; it does not replace post-generation validation of the answer against the retrieved chunks.
Pitfall Guide
1. Treating BAA as Application Compliance
Explanation: Organizations assume infrastructure agreements cover prompt construction, logging, and retrieval routing. BAAs explicitly exclude application-layer data handling.
Fix: Implement data flow mapping at the code level. Every function that touches PHI must include explicit sanitization, filtering, or logging controls.
2. Ingesting Documents Without Access Metadata
Explanation: Storing documents without structured departmental or role metadata forces post-retrieval filtering, which is slower and prone to bypass.
Fix: Enforce metadata at ingestion. Make department, document type, and author role required fields in the vector schema. Reject inserts that lack classification.
3. Unrestricted Debug Logging
Explanation: Development frameworks often log full request payloads to stdout or external observability tools. This silently exfiltrates PHI.
Fix: Override default loggers. Hash or redact query text before logging. Route audit records to a WORM (Write Once, Read Many) storage volume with strict retention policies.
4. Ignoring Chunk Boundary Artifacts
Explanation: Naive character splitting breaks clinical sentences, causing retrieval to return incomplete diagnostic context or fragmented medication lists.
Fix: Use token-aware chunking with overlap. Preserve sentence boundaries where possible. Validate chunk coherence by sampling retrieved segments during QA.
5. Weak Query-Time Filtering
Explanation: Relying on application-layer if statements to filter results after retrieval allows unauthorized data to briefly exist in memory.
Fix: Push filters to the vector engine. Use MUST clauses on department and cohort fields. Verify filter execution via database query plans.
6. Model Hallucination Without Grounding Constraints
Explanation: LLMs default to completing patterns, which can generate plausible but incorrect clinical advice when context is sparse.
Fix: Inject explicit grounding instructions in the system prompt. Require citation markers. Implement a post-generation validation step that cross-references claims against retrieved chunk IDs.
7. Audit Log Rotation Misconfiguration
Explanation: HIPAA requires 6-year retention for access logs. Default container logging or logrotate policies delete records prematurely.
Fix: Mount audit volumes to persistent host storage. Configure log rotation to archive, not delete. Implement checksum verification to detect tampering.
Production Bundle
Action Checklist
Decision Matrix
| Scenario | Recommended Approach | Why | Cost Impact |
|---|---|---|---|
| Small clinic (<500 beds) | Single-node local RAG | Sufficient for limited query volume; minimal hardware overhead | Low (commodity server + GPU) |
| Mid-size hospital (500-2000 beds) | Clustered vector store + dedicated inference node | Handles concurrent clinician queries; isolates embedding and generation workloads | Medium (2-3 servers + load balancer) |
| Research network / Multi-site | Federated on-prem nodes with centralized audit aggregation | Maintains data sovereignty per site while enabling cross-institutional compliance reporting | High (network infrastructure + centralized SIEM) |
| Hybrid cloud requirement | Local vector + cloud LLM via secure proxy | Only viable if PHI never leaves local boundary and prompts are fully sanitized | Variable (depends on proxy architecture) |
Configuration Template
# .env.production
VECTORAI_HOST=localhost:50052
COLLECTION_NAME=clinical_knowledge_base
EMBED_MODEL=sentence-transformers/all-MiniLM-L6-v2
CHUNK_SIZE=512
OVERLAP_SIZE=50
AUDIT_LOG_PATH=/app/audit_logs
LOG_RETENTION_DAYS=2190
RBAC_RESTRICTED_ROLES=billing_clerk,scheduling_bot,external_vendor
GENERATION_MAX_TOKENS=512
CITATION_REQUIRED=true
# config/pipeline_settings.py
from pydantic import BaseModel, Field
from typing import List
class PipelineConfig(BaseModel):
    """Typed view of the pipeline settings listed in ``.env.production``.

    Every field is required and is populated through an alias equal to the
    upper-case key used in the environment template.
    """

    # --- vector store and embedding ---
    vector_host: str = Field(alias="VECTORAI_HOST")
    collection: str = Field(alias="COLLECTION_NAME")
    embed_model: str = Field(alias="EMBED_MODEL")

    # --- chunking ---
    chunk_size: int = Field(alias="CHUNK_SIZE")
    overlap: int = Field(alias="OVERLAP_SIZE")

    # --- audit logging ---
    audit_dir: str = Field(alias="AUDIT_LOG_PATH")
    retention_days: int = Field(alias="LOG_RETENTION_DAYS")

    # --- access control ---
    # NOTE(review): the template supplies this as a comma-separated string;
    # confirm it is split into a list before validation, otherwise it may not
    # parse as List[str].
    restricted_roles: List[str] = Field(alias="RBAC_RESTRICTED_ROLES")

    # --- generation ---
    max_gen_tokens: int = Field(alias="GENERATION_MAX_TOKENS")
    enforce_citations: bool = Field(alias="CITATION_REQUIRED")

    class Config:
        # NOTE(review): env_file looks like a settings-class option; on a plain
        # BaseModel it is presumably not honoured, so the values would have to
        # be passed in explicitly. Confirm against the installed pydantic version.
        env_file = ".env.production"
        env_file_encoding = "utf-8"
Quick Start Guide
- Provision Infrastructure: Run `docker-compose up -d` to launch the local vector store and mount persistent volumes for data and audit logs.
- Initialize Environment: Copy `.env.production` to your project root and verify network isolation. Install dependencies using `uv pip install -r requirements.txt`.
- Load Sanitized Corpus: Execute the ingestion pipeline against a de-identified test dataset. Verify metadata schema enforcement and chunk coherence.
- Test Access Controls: Submit queries with restricted roles and valid department filters. Confirm that unauthorized requests raise `PermissionError` and that cross-department retrieval yields zero results.
- Validate Audit Trail: Query the host-mounted audit directory. Ensure every interaction logs timestamp, user ID, department, query hash, and retrieved chunk IDs without exposing raw PHI.