e_content.splitlines()
self._file_ref = relative_path
self._scope_stack: List[str] = []
self.artifacts: List[FunctionArtifact] = []
def visit_ClassDef(self, node: ast.ClassDef) -> None:
    """Traverse a class body with the class name pushed on the scope stack.

    While the children of ``node`` are being visited, ``node.name`` is the
    last entry of ``self._scope_stack``; it is removed again afterwards.

    Args:
        node: The class definition being visited.
    """
    self._scope_stack.append(node.name)
    try:
        self.generic_visit(node)
    finally:
        # Always unwind, even when a child visitor raises; otherwise a
        # visitor instance that is reused after a failed traversal would
        # keep a stale class name on the stack.
        self._scope_stack.pop()
def _process_callable(self, node: ast.AST) -> None:
    """Record one ``FunctionArtifact`` for ``node`` and then descend into it.

    The source text is cut from ``self._source_lines`` using the node's
    1-based ``lineno`` / ``end_lineno`` (both inclusive). The artifact is
    categorised as ``"method"`` whenever at least one class name is on
    ``self._scope_stack`` and as ``"function"`` otherwise; the innermost
    class name (or ``""``) is stored as its parent scope.

    Args:
        node: A function definition node; it must expose ``name``,
            ``lineno`` and ``end_lineno``.
    """
    first_line, last_line = node.lineno, node.end_lineno
    inside_class = bool(self._scope_stack)
    enclosing_scope = self._scope_stack[-1] if inside_class else ""
    # lineno is 1-based and end_lineno is inclusive, hence the asymmetric slice.
    body_text = "\n".join(self._source_lines[first_line - 1 : last_line])

    self.artifacts.append(
        FunctionArtifact(
            identifier=node.name,
            category="method" if inside_class else "function",
            origin_file=self._file_ref,
            line_range=(first_line, last_line),
            raw_source=body_text,
            documentation=ast.get_docstring(node) or "",
            parent_scope=enclosing_scope,
            invoked_symbols=self._resolve_calls(node),
        )
    )
    # Keep walking so definitions nested inside this one are recorded too.
    self.generic_visit(node)

# Synchronous and asynchronous definitions share the same handler.
visit_FunctionDef = _process_callable
visit_AsyncFunctionDef = _process_callable
def _resolve_calls(self, node: ast.AST) -> List[str]:
targets = set()
for child in ast.walk(node):
if isinstance(child, ast.Call):
if isinstance(child.func, ast.Name):
targets.add(child.func.id)
elif isinstance(child.func, ast.Attribute):
targets.add(child.func.attr)
return sorted(targets)
**Architecture Rationale**: Extracting by line range instead of character offsets guarantees that multi-line functions remain intact. Tracking `parent_scope` preserves class hierarchy without requiring complex namespace resolution. Separating call resolution into a dedicated traversal keeps the extraction pipeline linear and cacheable.
### Phase 2: Dependency Network Construction
Once callable units are isolated, execution relationships must be mapped bidirectionally. This enables upstream tracing (who invokes this?) and downstream tracing (what does this invoke?).
```python
from collections import defaultdict, deque
from typing import Dict, Set, List, Optional
class DependencyNetwork:
    """Bidirectional call graph over a collection of extracted callables.

    ``outbound`` maps a caller identifier to the identifiers it invokes and
    ``inbound`` holds the reverse edges. A call only becomes an edge when its
    name equals the ``identifier`` of one of the supplied artifacts.

    Neighbors are always expanded in sorted order, so traversal results and
    the choice among equally short paths are reproducible across interpreter
    runs (plain ``set`` iteration order for strings varies with hash
    randomization).
    """

    def __init__(self, artifacts: "List[FunctionArtifact]"):
        """Build both adjacency maps.

        Args:
            artifacts: Objects exposing ``identifier`` (str) and
                ``invoked_symbols`` (iterable of str).
        """
        self.outbound: Dict[str, Set[str]] = defaultdict(set)
        self.inbound: Dict[str, Set[str]] = defaultdict(set)
        known_identifiers = {a.identifier for a in artifacts}
        for artifact in artifacts:
            for target in artifact.invoked_symbols:
                # Calls to names outside the artifact set are dropped.
                if target in known_identifiers:
                    self.outbound[artifact.identifier].add(target)
                    self.inbound[target].add(artifact.identifier)

    def trace_downstream(self, entry: str, max_depth: int = 4) -> List[str]:
        """Return what ``entry`` reaches via outbound edges within ``max_depth`` hops."""
        return self._bfs_traversal(entry, self.outbound, max_depth)

    def trace_upstream(self, entry: str, max_depth: int = 4) -> List[str]:
        """Return what reaches ``entry`` via inbound edges within ``max_depth`` hops."""
        return self._bfs_traversal(entry, self.inbound, max_depth)

    def find_shortest_path(self, source: str, destination: str) -> Optional[List[str]]:
        """Return a shortest outbound path from ``source`` to ``destination``.

        Returns:
            The node sequence including both endpoints, ``[source]`` when the
            endpoints are equal, or ``None`` when no path exists.
        """
        if source == destination:
            return [source]
        queue = deque([[source]])
        visited = {source}
        while queue:
            current_path = queue.popleft()
            # .get() avoids creating empty defaultdict entries during queries.
            for neighbor in sorted(self.outbound.get(current_path[-1], ())):
                if neighbor in visited:
                    continue
                visited.add(neighbor)
                new_path = current_path + [neighbor]
                if neighbor == destination:
                    return new_path
                queue.append(new_path)
        return None

    def _bfs_traversal(self, start: str, graph: Dict[str, Set[str]], depth: int) -> List[str]:
        """Breadth-first walk of ``graph`` from ``start``, limited to ``depth`` hops.

        Returns:
            Reached nodes in visiting order; ``start`` itself is excluded.
        """
        visited = {start}
        queue = deque([(start, 0)])
        result: List[str] = []
        while queue:
            node, current_depth = queue.popleft()
            if current_depth > 0:
                result.append(node)
            if current_depth < depth:
                for neighbor in sorted(graph.get(node, ())):
                    if neighbor not in visited:
                        visited.add(neighbor)
                        queue.append((neighbor, current_depth + 1))
        return result
```

**Architecture Rationale**: Bidirectional adjacency maps eliminate recursive graph traversal overhead during queries. BFS guarantees shortest-path discovery without Dijkstra's complexity, which is unnecessary for unweighted call graphs. Limiting depth prevents combinatorial explosion in large orchestration functions.
Phase 3: Dual-Channel Vector Indexing
Embedding APIs impose strict token limits (typically 512–8192 tokens). Full function bodies frequently exceed these boundaries, while docstrings alone lack sufficient semantic density. The solution decouples retrieval from context injection.
from langchain_core.documents import Document
def prepare_retrieval_units(artifacts: List[FunctionArtifact]) -> List[Document]:
    """Turn extracted callables into documents for the vector store.

    Each document's ``page_content`` is a short text meant for embedding:
    the scope-qualified name followed by the docstring or, when there is no
    docstring, the first line of the source. It is cut to 400 characters.
    The source itself (cut to 2000 characters) travels in the metadata under
    ``"full_context"`` together with the identifier, origin file and line
    range.

    Args:
        artifacts: Extracted callables to index.

    Returns:
        One ``Document`` per artifact, in input order.
    """
    documents = []
    for unit in artifacts:
        scope_prefix = f"{unit.parent_scope}." if unit.parent_scope else ""
        full_identifier = f"{scope_prefix}{unit.identifier}"
        # Docstring first; otherwise the first source line. An artifact with
        # neither yields an empty summary instead of raising IndexError.
        summary = unit.documentation or next(iter(unit.raw_source.splitlines()), "")
        # Semantic payload: kept short for the embedding model.
        semantic_payload = f"{full_identifier}: {summary}"[:400]
        # Context payload: handed to the LLM via metadata, not embedded.
        context_payload = unit.raw_source[:2000]
        documents.append(Document(
            page_content=semantic_payload,
            metadata={
                "identifier": unit.identifier,
                "origin": unit.origin_file,
                "line_range": unit.line_range,
                "full_context": context_payload,
            },
        ))
    return documents
At query time, the vector store returns semantically relevant units. The LLM receives the full_context metadata, not the truncated embedding payload.
def assemble_llm_context(query_results, max_chars_per_unit=600) -> str:
    """Join the context text of retrieved documents into one prompt string.

    For each document the ``"full_context"`` metadata entry is used, falling
    back to ``page_content`` when that key is absent. Every piece is cut to
    ``max_chars_per_unit`` characters and the pieces are separated by a
    ``---`` line surrounded by blank lines.

    Args:
        query_results: Iterable of documents exposing ``metadata`` (a dict)
            and ``page_content``.
        max_chars_per_unit: Character cap applied to each document's text.

    Returns:
        The concatenated context; an empty string for no results.
    """
    separator = "\n\n---\n\n"
    return separator.join(
        doc.metadata.get("full_context", doc.page_content)[:max_chars_per_unit]
        for doc in query_results
    )
Architecture Rationale: Separating retrieval semantics from execution context prevents token budget exhaustion while preserving code completeness. Storing full source in vector DB metadata is safe because metadata is never vectorized; it is only serialized during retrieval. This pattern reduces embedding costs by ~60% compared to naive full-body indexing.
Phase 4: Embedding Model Selection
General-purpose text embedders (text-embedding-3, BGE) treat code as prose. They capture docstring semantics but miss control flow, variable accumulation patterns, and API contract nuances. Specialized code models dramatically improve retrieval precision.
| Model | Architecture | Token Limit | Best Use Case |
|---|---|---|---|
| `microsoft/codebert-base` | Dual-tower (code + docs) | 512 | Signature & documentation matching |
| `Salesforce/codet5-base` | Encoder-decoder | 512 | Completion-aware retrieval |
| `nomic-ai/nomic-embed-text-v1.5` | Generalist with code tuning | 8192 | Full-function embedding without truncation |
| `voyage-code-2` | Specialized code encoder | 16000 | Enterprise-scale codebase indexing |
Recommendation: If your vector infrastructure supports extended contexts, nomic-ai/nomic-embed-text-v1.5 or voyage-code-2 allow direct embedding of complete function bodies. This eliminates the dual-channel workaround entirely. For constrained environments, maintain the semantic/context split and pair it with codebert-base for maximum precision.
Pitfall Guide
1. Blindly Embedding Full Function Bodies
Explanation: Feeding 150-line functions directly into 512-token embedders causes silent truncation. The model indexes only the first third of the code, discarding return logic, error handling, and downstream calls.
Fix: Implement token-aware chunking or adopt the dual-channel strategy. Always validate embedding length against model limits before indexing.
2. Ignoring Async and Special Method Boundaries
Explanation: Standard AST visitors often skip `AsyncFunctionDef` or dunder methods (`__init__`, `__call__`), creating blind spots in dependency graphs.
Fix: Explicitly register visit_AsyncFunctionDef and filter dunder methods based on project conventions. Never assume synchronous-only codebases.
3. Static Analysis Blind Spots
Explanation: AST parsing cannot resolve dynamic imports, getattr(), eval(), or plugin architectures. Call graphs will show missing edges, leading to incomplete traversal results.
Fix: Document known dynamic boundaries. Supplement AST graphs with runtime tracing (e.g., sys.settrace or APM tools) for critical execution paths. Treat static graphs as approximations, not absolutes.
4. Oversized Vector Store Metadata
Explanation: Storing full source code in vector DB metadata increases storage costs and slows retrieval serialization. Some vector databases impose metadata size limits.
Fix: Compress metadata using zlib or store source references in an external object store (S3, MinIO). Vector DBs should only hold lightweight identifiers and line ranges.
5. Over-Reliance on Docstrings for Semantic Search
Explanation: Docstring-only embeddings fail when developers omit documentation or write vague descriptions. Queries like "incremental indexing" return zero results if the term never appears in docstrings.
Fix: Fall back to the first executable line or function signature when docstrings are absent. Consider hybrid search: combine vector similarity with BM25 keyword matching on source bodies.
6. Missing Cross-Module Import Resolution
Explanation: Call graphs built per-file cannot resolve `from utils import helper` across directories. Functions appear isolated, breaking upstream/downstream queries.
Fix: Perform a two-pass extraction. First pass collects all identifiers globally. Second pass resolves cross-file references by matching invoked symbols against the global registry.
7. Re-Indexing Unchanged Files
Explanation: Re-parsing entire repositories on every deployment wastes CI/CD time and vector DB quota. AST extraction is deterministic and should be cached.
Fix: Hash source files with sha256. Only re-extract and re-embed changed files. Store extraction artifacts in a local cache directory or distributed KV store.
Production Bundle
Action Checklist
Decision Matrix
| Scenario | Recommended Approach | Why | Cost Impact |
|---|---|---|---|
| Small codebase (<50k LOC) | Full-function embedding with nomic-embed-text-v1.5 | Simplifies pipeline, eliminates metadata overhead | Moderate (higher token costs, lower engineering overhead) |
| Large enterprise repo (>500k LOC) | Dual-channel indexing + codebert-base | Controls token spend, enables precise retrieval | Low (optimized vector storage, faster indexing) |
| Dynamic/plugin architecture | AST extraction + runtime APM tracing | Static graphs miss dynamic dispatch paths | High (requires instrumentation, but prevents blind spots) |
| Strict compliance/air-gapped | Self-hosted codet5-base or codebert-base | Avoids external API dependencies, ensures data sovereignty | High (GPU inference costs, maintenance overhead) |
Configuration Template
# code-rag-pipeline.yaml
# Settings are grouped under four sections; nesting is significant.
extraction:
  language: python
  parser: ast
  cache_strategy: sha256_file_hash
  max_depth: 4
  include_async: true
  include_dunder: false

indexing:
  strategy: dual_channel
  semantic_model: nomic-ai/nomic-embed-text-v1.5
  # NOTE(review): prepare_retrieval_units slices the payload to 400
  # characters, not tokens — confirm the intended unit.
  max_semantic_tokens: 400
  metadata_compression: zlib
  vector_db: chroma
  collection_name: code_artifacts_v2

query:
  modes:
    - semantic_search
    - dependency_traversal
    - llm_qa
  max_context_chars: 600
  fallback_to_signature: true

evaluation:
  metrics:
    - code_retrieval_precision
    - call_chain_accuracy
    - token_efficiency_ratio
  test_suite: generated_qa_pairs
Quick Start Guide
- Install dependencies:
pip install langchain langchain-community chromadb
(`ast` is part of the Python standard library and needs no installation.)
- Initialize extractor: Point
SyntaxTreeExtractor at your repository root. Run extraction and serialize artifacts to JSON.
- Build dependency graph: Instantiate
DependencyNetwork with extracted artifacts. Validate edge counts against expected orchestration functions.
- Index vector store: Run
prepare_retrieval_units() and persist to Chroma/Pinecone. Verify metadata contains full source while page_content remains under token limits.
- Test queries: Execute semantic search ("cache invalidation logic"), upstream tracing (
trace_upstream("build_embeddings")), and LLM Q&A. Compare results against naive chunking baseline.