nAI(api_key=os.environ.get("OPENAI_API_KEY"))
class DocumentMetadata(BaseModel):
    """Filename components extracted from a document.

    All four fields are plain strings. The field descriptions ask for the
    literal string 'unknown' when the entity, date, or reference cannot be
    determined.
    """

    # Coarse document type used as the leading slug component.
    doc_category: str = Field(
        description="Category: invoice, contract, receipt, id, report, or other",
    )
    # Who the document is from or about.
    entity_name: str = Field(
        description="Vendor, company, or person name. 'unknown' if missing.",
    )
    # Main date of the document, as text in ISO form.
    date_iso: str = Field(
        description="Primary date in YYYY-MM-DD format. 'unknown' if missing.",
    )
    # Identifier printed on the document, if any.
    reference_id: str = Field(
        description="Invoice number, contract ID, or ref number. 'unknown' if missing.",
    )
class SemanticRenamer:
    """Pipeline for content-aware file renaming.

    A file is classified by MIME type, its text (or a vision-model
    description) is extracted, an LLM turns that content into structured
    metadata, and the metadata is joined into a sanitized filename.

    The methods rely on module-level names defined elsewhere in this file:
    ``client`` (OpenAI client), ``logger``, ``magic``, ``pdfplumber``,
    ``pytesseract``, ``DocumentMetadata`` and ``ValidationError``.
    """

    def __init__(self, dry_run: bool = True, max_tokens: int = 2000):
        """Configure the pipeline.

        Args:
            dry_run: When True, ``process_file`` only logs the proposed name
                and never touches the filesystem.
            max_tokens: Upper bound on how much extracted content is sent to
                the metadata model. NOTE: it is applied as a character slice
                (``raw_content[:max_tokens]``), not as a tokenizer count.
        """
        self.dry_run = dry_run
        self.max_tokens = max_tokens
        self.fallback_hash_length = 6

    def classify_file(self, file_path: Path) -> str:
        """Route a file to an extraction strategy based on its MIME type.

        Args:
            file_path: File to inspect.

        Returns:
            ``"TEXT_HEAVY"`` for a PDF whose first page yields more than 100
            characters of embedded text, ``"SCANNED_PDF"`` for any other PDF
            (including ones pdfplumber cannot open), ``"IMAGE"`` for
            ``image/*`` types, and ``"UNSUPPORTED"`` otherwise.
        """
        mime = magic.from_file(str(file_path), mime=True)
        if mime == "application/pdf":
            # Heuristic: a first page with embedded text means no OCR is needed.
            try:
                with pdfplumber.open(file_path) as pdf:
                    if pdf.pages:
                        text = pdf.pages[0].extract_text()
                        if text and len(text.strip()) > 100:
                            return "TEXT_HEAVY"
            except Exception as e:
                # Best effort: an unreadable PDF is treated as scanned, but the
                # reason is recorded instead of being silently discarded.
                logger.debug(f"Text probe failed for {file_path.name}: {e}")
            return "SCANNED_PDF"
        if mime.startswith("image/"):
            return "IMAGE"
        return "UNSUPPORTED"

    def extract_raw_content(self, file_path: Path, strategy: str) -> str:
        """Extract text or an image description according to ``strategy``.

        Args:
            file_path: File to read.
            strategy: A value returned by ``classify_file``.

        Returns:
            Extracted content, or ``""`` for an unrecognized strategy.
        """
        if strategy in ("TEXT_HEAVY", "SCANNED_PDF"):
            return self._extract_pdf_content(file_path)
        if strategy == "IMAGE":
            return self._extract_image_description(file_path)
        return ""

    def _extract_pdf_content(self, file_path: Path) -> str:
        """Extract text from a PDF with pdfplumber, falling back to OCR.

        Embedded text is used when more than 50 characters are found across
        all pages. Otherwise the first page is rasterized and passed to
        Tesseract.

        Returns:
            The extracted text, or ``""`` when both approaches fail.
        """
        try:
            with pdfplumber.open(file_path) as pdf:
                pages_text = [p.extract_text() or "" for p in pdf.pages]
            combined = "\n".join(pages_text).strip()
            if len(combined) > 50:
                return combined
        except Exception as e:
            logger.warning(f"pdfplumber failed for {file_path.name}: {e}")
        # Fallback to OCR for scanned PDFs (first page only).
        try:
            import pdf2image

            images = pdf2image.convert_from_path(file_path, first_page=1, last_page=1)
            if images:
                return pytesseract.image_to_string(images[0])
        except Exception as e:
            logger.error(f"OCR fallback failed for {file_path.name}: {e}")
        return ""

    def _extract_image_description(self, file_path: Path) -> str:
        """Ask the vision model to describe an image for filename generation.

        The image is sent inline as a base64 ``data:`` URL.

        Returns:
            The model's description, or ``""`` if reading the file or the API
            call fails.
        """
        import base64
        import mimetypes

        try:
            with open(file_path, "rb") as img_file:
                # The data URL declares base64, so the bytes must actually be
                # base64-encoded; raw bytes decoded as latin-1 are not valid.
                encoded_image = base64.b64encode(img_file.read()).decode("ascii")
            # Label the payload with the real image type when the extension
            # reveals it; keep JPEG as the default for unknown extensions.
            image_mime, _ = mimetypes.guess_type(file_path.name)
            if not image_mime or not image_mime.startswith("image/"):
                image_mime = "image/jpeg"
            response = client.chat.completions.create(
                model="gpt-4o",
                messages=[
                    {
                        "role": "user",
                        "content": [
                            {"type": "text", "text": "Describe this image to generate a filename. Identify document type, entity names, dates, and reference numbers."},
                            {"type": "image_url", "image_url": {"url": f"data:{image_mime};base64,{encoded_image}", "detail": "low"}},
                        ],
                    }
                ],
                max_tokens=200,
            )
            return response.choices[0].message.content.strip()
        except Exception as e:
            logger.error(f"Vision model failed for {file_path.name}: {e}")
            return ""

    @staticmethod
    def _fallback_metadata() -> "DocumentMetadata":
        """Return the placeholder metadata used when nothing could be inferred."""
        return DocumentMetadata(
            doc_category="file",
            entity_name="unknown",
            date_iso="unknown",
            reference_id="unknown",
        )

    def infer_metadata(self, raw_content: str) -> "DocumentMetadata":
        """Send content to the LLM and parse its structured output.

        Args:
            raw_content: Text or description extracted from the file.

        Returns:
            The parsed metadata, or the placeholder metadata
            (``doc_category="file"``, everything else ``"unknown"``) when the
            content is blank, the API call fails, or the response does not
            validate.
        """
        if not raw_content.strip():
            return self._fallback_metadata()
        # NOTE: this cap counts characters, not model tokens.
        excerpt = raw_content[: self.max_tokens]
        prompt = (
            "\n"
            "Extract metadata from the following content. "
            "Return ONLY valid JSON matching the schema.\n"
            "Fields: doc_category, entity_name, date_iso, reference_id.\n"
            "Use 'unknown' for missing fields.\n"
            "Content:\n"
            f"{excerpt}\n"
        )
        try:
            response = client.chat.completions.create(
                model="gpt-4o-mini",
                messages=[{"role": "user", "content": prompt}],
                temperature=0,
                response_format={"type": "json_object"},
            )
            content = response.choices[0].message.content
            return DocumentMetadata.model_validate_json(content)
        except Exception as e:
            # Deliberately broad: API errors and validation errors both
            # degrade to the placeholder so one file cannot abort a batch.
            logger.warning(f"Metadata inference failed: {e}. Using fallback.")
            return self._fallback_metadata()

    def generate_slug(self, metadata: "DocumentMetadata", original_path: Path) -> str:
        """Build a sanitized filename from metadata.

        Components are joined with ``_`` in the order category, entity, date,
        reference. Values equal to ``"unknown"`` (any case) are skipped, and so
        is the placeholder category ``"file"`` produced by failed inference.
        When no component remains, the name falls back to the original stem
        plus a short hash of that stem.

        Args:
            metadata: Object exposing ``doc_category``, ``entity_name``,
                ``date_iso`` and ``reference_id`` string attributes.
            original_path: Path of the file being renamed; supplies the
                extension (lower-cased) and the fallback stem.

        Returns:
            The new filename including the extension.
        """
        ext = original_path.suffix.lower()
        candidates = (
            # "file" is the category of the inference fallback; treating it as
            # a real category would name every failed file "file.<ext>".
            (metadata.doc_category, ("unknown", "file")),
            (metadata.entity_name, ("unknown",)),
            (metadata.date_iso, ("unknown",)),
            (metadata.reference_id, ("unknown",)),
        )
        clean_parts = []
        for value, placeholders in candidates:
            if not value or value.lower() in placeholders:
                continue
            # Collapse filesystem-reserved characters, control characters and
            # whitespace into underscores, then cap each component's length.
            sanitized = re.sub(r'[<>:"/\\|?*\x00-\x1f\s]+', "_", value)[:40]
            if sanitized:
                clean_parts.append(sanitized)
        if not clean_parts:
            # Fallback: original stem + short hash of the stem. The hash is
            # derived from the name, not from the file's content.
            stem = original_path.stem
            stem_hash = hashlib.md5(stem.encode()).hexdigest()[: self.fallback_hash_length]
            clean_parts = [stem, stem_hash]
        return "_".join(clean_parts) + ext

    def _resolve_collision(self, source: Path, target: Path) -> Path:
        """Return a destination for ``source`` that does not clobber another file.

        ``target`` is returned unchanged when it is free or is ``source``
        itself. Otherwise a 4-character hash of the source's size is appended,
        and if that name is also taken a numeric counter is added until an
        unused name is found.
        """
        if target == source or not target.exists():
            return target
        size_hash = hashlib.md5(str(source.stat().st_size).encode()).hexdigest()[:4]
        base = f"{target.stem}_{size_hash}"
        candidate = target.with_name(f"{base}{target.suffix}")
        counter = 1
        # Files of equal size share the same hash, so keep probing; renaming
        # onto an existing file would silently replace it on POSIX systems.
        while candidate.exists() and candidate != source:
            candidate = target.with_name(f"{base}_{counter}{target.suffix}")
            counter += 1
        return candidate

    def process_file(self, file_path: str) -> Optional[str]:
        """Execute the full pipeline for a single file.

        Args:
            file_path: Path of the file to rename.

        Returns:
            The new path as a string (the proposed path in dry-run mode), or
            ``None`` when the file is missing, unsupported, or the rename
            fails.
        """
        path = Path(file_path)
        if not path.exists():
            logger.error(f"File not found: {path}")
            return None
        strategy = self.classify_file(path)
        if strategy == "UNSUPPORTED":
            logger.warning(f"Unsupported file type: {path.name}")
            return None
        logger.info(f"Processing {path.name} via {strategy}...")
        raw_content = self.extract_raw_content(path, strategy)
        metadata = self.infer_metadata(raw_content)
        new_slug = self.generate_slug(metadata, path)
        new_path = path.parent / new_slug
        if self.dry_run:
            logger.info(f"[DRY RUN] {path.name} -> {new_slug}")
            return str(new_path)
        try:
            final_path = self._resolve_collision(path, new_path)
            if final_path != new_path:
                logger.warning(f"Collision resolved: {final_path.name}")
            path.rename(final_path)
            logger.info(f"Renamed: {path.name} -> {final_path.name}")
            return str(final_path)
        except OSError as e:
            logger.error(f"Rename failed for {path.name}: {e}")
            return None
**Rationale**
* **`pdfplumber` over `PyPDF2`:** `pdfplumber` handles complex layouts and tables more robustly, reducing extraction errors that confuse the LLM.
* **`response_format={"type": "json_object"}`:** This OpenAI parameter enforces JSON output, reducing parsing failures compared to prompt-only instructions.
* **Collision Resolution:** The `generate_slug` method includes a fallback hash, and `process_file` detects existing files to append a size-based hash. This prevents data loss in batch operations.
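* **Token Capping:** `raw_content[:self.max_tokens]` limits how much content is sent to the model, controlling costs and latency without sacrificing accuracy for most documents. Note that the slice counts characters, not tokens, so the default of 2,000 is a character cap and corresponds to fewer than 2,000 tokens.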
### Pitfall Guide
1. **Context Window Exhaustion**
* *Explanation:* Feeding multi-page PDFs into the LLM can exceed token limits or dilute signal with irrelevant footer text.
* *Fix:* Truncate content to a fixed budget (the pipeline keeps the first 2,000 characters via `max_tokens`). Most critical fields (vendor, date, ID) appear on the first page of invoices and contracts.
2. **Hallucinated Identifiers**
* *Explanation:* LLMs may invent invoice numbers or dates when information is ambiguous.
* *Fix:* Implement regex validation in `infer_metadata`. If `reference_id` does not match expected patterns (e.g., alphanumeric sequences), flag it as `unknown` or request a confidence score.
3. **Filename Collisions in Batches**
* *Explanation:* Two different files may yield identical metadata (e.g., two invoices from the same vendor on the same day).
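* *Fix:* Always check `new_path.exists()` before renaming. Append a suffix derived from a content checksum to resolve collisions; file size alone is not unique, because two different files can have the same size, so re-check that the final name is free before renaming.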
4. **OCR Noise from Headers/Footers**
* *Explanation:* Scanned documents often contain repetitive headers/footers that dominate the OCR output, distracting the LLM.
* *Fix:* Use region-of-interest extraction in `pdfplumber` to crop headers/footers, or instruct the LLM prompt to ignore boilerplate text.
5. **Cost Spirals with Vision Models**
* *Explanation:* Routing all files through `gpt-4o` vision is expensive and slow.
* *Fix:* Implement a strict router. Use OCR for any PDF or image with high text density. Reserve vision models for photographs and mixed-media inputs.
6. **Sanitization Edge Cases**
* *Explanation:* Unicode characters or emojis in entity names can break filesystem operations on certain OSes.
* *Fix:* Use a strict regex that allows only alphanumeric characters, hyphens, and underscores. Strip all other characters.
7. **Dry Run Neglect**
* *Explanation:* Running the pipeline without preview can result in mass renaming errors that are difficult to reverse.
* *Fix:* Default `dry_run=True`. Require an explicit flag to execute renames. Log all proposed changes for audit.
### Production Bundle
#### Action Checklist
- [ ] Install system dependencies: `tesseract-ocr`, `poppler-utils` (for `pdf2image`).
- [ ] Set environment variable `OPENAI_API_KEY` with a key that has access to `gpt-4o` and `gpt-4o-mini`.
- [ ] Install Python packages: `pip install pdfplumber pdf2image pytesseract Pillow openai pydantic python-magic`.
- [ ] Run pipeline with `dry_run=True` on a test directory containing 10–20 representative files.
- [ ] Review logs for extraction errors and hallucination patterns.
- [ ] Configure retry logic and rate limiting for API calls if processing >100 files.
- [ ] Implement file backup strategy before executing production renames.
- [ ] Monitor API costs and latency metrics during initial batch processing.
#### Decision Matrix
| Scenario | Recommended Approach | Why | Cost Impact |
| :--- | :--- | :--- | :--- |
| **High-volume scanned invoices** | OCR + `gpt-4o-mini` | Text extraction is reliable; mini model is sufficient for structured data. | Low ($0.15/1k) |
| **Mixed photo dump (receipts, IDs)** | Vision + `gpt-4o` | OCR fails on photos; vision model understands context and reads text in situ. | Medium ($1.20/1k) |
| **Legal contracts (text-heavy)** | OCR + `gpt-4o-mini` | Contracts contain dense text; OCR preserves structure; mini model extracts parties/dates. | Low ($0.20/1k) |
| **Low-budget batch processing** | OCR + Local LLM | If API costs are prohibitive, use local models like `Llama-3` with structured output. | Near Zero (Compute only) |
| **Real-time upload renaming** | Vision + `gpt-4o-mini` | Balance speed and accuracy; mini model with vision capability reduces latency. | Medium ($0.40/1k) |
#### Configuration Template
Use a YAML configuration to manage pipeline parameters without code changes.
```yaml
# rename_config.yaml
pipeline:
  dry_run: true
  max_tokens: 2000
  fallback_hash_length: 6
models:
  vision: "gpt-4o"
  inference: "gpt-4o-mini"
  temperature: 0
routing:
  text_density_threshold: 100  # Characters for TEXT_HEAVY classification
  ocr_fallback: true
sanitization:
  max_slug_length: 150
  allowed_chars: "a-zA-Z0-9_-"
```

#### Quick Start Guide

1. **Setup Environment:**

   ```bash
   pip install pdfplumber pdf2image pytesseract Pillow openai pydantic python-magic
   export OPENAI_API_KEY="sk-proj-..."
   ```

2. **Create Script:**
   Save the `SemanticRenamer` class code to `renamer.py`.

3. **Run Dry Run:**

   ```python
   from renamer import SemanticRenamer
   from pathlib import Path

   renamer = SemanticRenamer(dry_run=True)
   test_dir = Path("./test_files")
   for file in test_dir.iterdir():
       renamer.process_file(str(file))
   ```

4. **Verify and Execute:**
   Review the log output. If filenames are correct, set `dry_run=False` and rerun.

5. **Monitor:**
   Check the target directory for renamed files. Verify that collisions were handled and no data was lost.