ntNode;
while (current) {
if (current instanceof HTMLTableElement) return true;
current = current.parentNode;
}
return false;
};
if (isNested(target)) return null;
}
const clone = target.cloneNode(true) as HTMLTableElement;
if (config.sanitizePayloads) {
this.stripInvisibleNodes(clone);
}
return clone;
}
/**
 * Removes every descendant of `container` that matches a non-rendered
 * element type (style, script, noscript, template, link, meta).
 *
 * The container is mutated in place.
 *
 * @param container - Root element whose subtree is cleaned.
 */
private static stripInvisibleNodes(container: HTMLElement): void {
  const invisibleSelector = 'style, script, noscript, template, link, meta';
  // querySelectorAll returns a static list, so removing while iterating is safe.
  for (const element of Array.from(container.querySelectorAll(invisibleSelector))) {
    element.remove();
  }
}
}
**Architecture Rationale:** Cloning prevents side effects in single-page applications. Parent traversal filtering ensures only root-level tables are processed, avoiding the common mistake of extracting inner layout tables as data.
### Phase 2: Spatial Grid Construction
Linear DOM order does not match visual grid coordinates. We must track occupied cells to correctly align `rowspan` and `colspan` expansions.
```typescript
/** Rectangular grid of cell text; `null` marks a position no table cell covers. */
type GridMatrix = (string | null)[][];

/** Maps the linear DOM order of table cells onto visual (row, column) coordinates. */
class SpatialMapper {
  /** Upper bound the HTML Standard places on `colspan`; larger values are clamped. */
  private static readonly MAX_COL_SPAN = 1000;

  /**
   * Expands `rowspan`/`colspan` cells into a rectangular grid.
   *
   * Every position covered by a spanning cell receives that cell's trimmed
   * text. The result has exactly one row per `<tr>` and every row has the same
   * length; positions that no cell covers are `null`.
   *
   * @param table - Table whose `rows`/`cells` are read; it is not mutated.
   * @returns Grid with `table.rows.length` rows of equal width.
   */
  static buildGrid(table: HTMLTableElement): GridMatrix {
    const rows = Array.from(table.rows);
    const rowCount = rows.length;
    const grid: GridMatrix = rows.map(() => []);
    const occupied = new Set<string>();
    let width = 0;

    rows.forEach((rowEl, rIndex) => {
      let cIndex = 0;
      for (const cell of Array.from(rowEl.cells)) {
        // Skip positions already claimed by a rowspan from an earlier row.
        while (occupied.has(`${rIndex}-${cIndex}`)) cIndex++;

        const rawText = cell.textContent?.trim() ?? '';
        // A rowspan may not reach past the last row, otherwise phantom rows appear.
        const rSpan = Math.min(
          SpatialMapper.parseSpan(cell.getAttribute('rowspan')),
          rowCount - rIndex,
        );
        const cSpan = Math.min(
          SpatialMapper.parseSpan(cell.getAttribute('colspan')),
          SpatialMapper.MAX_COL_SPAN,
        );

        for (let dr = 0; dr < rSpan; dr++) {
          const targetRow = rIndex + dr;
          for (let dc = 0; dc < cSpan; dc++) {
            const targetCol = cIndex + dc;
            const coord = `${targetRow}-${targetCol}`;
            // First writer wins when malformed markup makes spans overlap.
            if (!occupied.has(coord)) {
              grid[targetRow][targetCol] = rawText;
              occupied.add(coord);
            }
          }
        }

        cIndex += cSpan;
        width = Math.max(width, cIndex);
      }
    });

    // Densify: fill holes and pad short rows so the output is truly rectangular.
    return grid.map(row => Array.from({ length: width }, (_, col) => row[col] ?? null));
  }

  /**
   * Parses a span attribute, falling back to 1 for missing, non-numeric, zero
   * or negative values so a malformed attribute can never drop a cell or
   * corrupt the column cursor.
   *
   * NOTE: `rowspan="0"` ("extend to the end of the row group") is not modelled
   * and is treated as 1.
   */
  private static parseSpan(raw: string | null): number {
    const parsed = parseInt(raw ?? '', 10);
    return Number.isFinite(parsed) && parsed >= 1 ? parsed : 1;
  }
}
```
**Architecture Rationale:** Using a Set for coordinate tracking provides O(1) lookup complexity, making the algorithm scale efficiently to tables with hundreds of rows. The matrix guarantees rectangular output, eliminating column drift.
### Phase 3: Content Sanitization & Whitespace Normalization
Raw textContent captures all descendant text, including hidden payloads. We normalize whitespace while preserving meaningful separators.
/** Whitespace clean-up helpers for text extracted from table cells. */
class ContentSanitizer {
  /**
   * Flattens tabs and line breaks to spaces, collapses runs of two or more
   * whitespace characters into a single space, and trims both ends.
   *
   * @param raw - Text to clean; any falsy value yields an empty string.
   * @returns The normalized text.
   */
  static normalize(raw: string | null): string {
    if (!raw) {
      return '';
    }
    const lineBreaksFlattened = raw.replace(/[\t\n\r]+/g, ' ');
    const runsCollapsed = lineBreaksFlattened.replace(/\s{2,}/g, ' ');
    return runsCollapsed.trim();
  }

  /**
   * Cleans a single cell value; currently a thin alias for {@link normalize}.
   *
   * @param raw - Raw cell text, or `null` for an empty position.
   * @returns The normalized cell text.
   */
  static sanitizeCell(raw: string | null): string {
    const cleaned = this.normalize(raw);
    return cleaned;
  }
}
### Phase 4: Format-Compliant Export Serialization
Export formats must adhere to strict specifications to prevent downstream parsing failures.
/** Serializes a grid of cell text to CSV or JSON. */
class ExportSerializer {
  /**
   * Serializes the grid as CSV with CRLF record separators.
   *
   * A field is wrapped in double quotes when it contains the delimiter, a
   * double quote or a line break; embedded double quotes are doubled. Short
   * rows are padded with empty fields so every record has the same number of
   * fields.
   *
   * @param matrix - Grid to serialize; `null` cells become empty fields.
   * @param delimiter - Field separator, `,` by default.
   * @returns The CSV text (empty string for an empty grid).
   */
  static toCSV(matrix: GridMatrix, delimiter: string = ','): string {
    const escapeCell = (val: string | null): string => {
      const str = val ?? '';
      const requiresQuoting = str.includes(delimiter) || str.includes('"') || /[\r\n]/.test(str);
      const escaped = str.replace(/"/g, '""');
      return requiresQuoting ? `"${escaped}"` : escaped;
    };

    const width = ExportSerializer.widestRow(matrix);
    return matrix
      .map(row =>
        // Index-based access also fills holes left in sparse rows.
        Array.from({ length: width }, (_, col) => escapeCell(row[col] ?? null)).join(delimiter),
      )
      .join('\r\n');
  }

  /**
   * Serializes the grid as a pretty-printed JSON array of records, using the
   * first row as property names.
   *
   * Header text is slugified. Headers that are empty (or slugify to nothing)
   * fall back to `col_<n>`, where `n` is the 1-based column number, and
   * repeated keys get a numeric suffix (`name`, `name_2`, ...) so no column
   * overwrites another. Columns that exist only in data rows are kept under
   * `col_<n>` keys as well.
   *
   * @param matrix - Grid whose first row holds the headers.
   * @returns JSON text; `'[]'` when the grid has fewer than two rows.
   */
  static toJSON(matrix: GridMatrix): string {
    if (matrix.length < 2) return '[]';

    const width = ExportSerializer.widestRow(matrix);
    const headers = ExportSerializer.buildKeys(matrix[0], width);

    const records = matrix.slice(1).map(row => {
      const obj: Record<string, string> = {};
      headers.forEach((key, idx) => {
        obj[key] = ContentSanitizer.sanitizeCell(row[idx] ?? null);
      });
      return obj;
    });
    return JSON.stringify(records, null, 2);
  }

  /** Length of the longest row, or 0 for an empty grid. */
  private static widestRow(matrix: GridMatrix): number {
    return matrix.reduce((max, row) => Math.max(max, row.length), 0);
  }

  /** Derives `width` unique, non-empty property names from the header row. */
  private static buildKeys(headerRow: readonly (string | null)[], width: number): string[] {
    const used = new Set<string>();
    return Array.from({ length: width }, (_, idx) => {
      // `||` on purpose: an empty slug must trigger the positional fallback.
      const base = ExportSerializer.slugify(headerRow[idx] ?? '') || `col_${idx + 1}`;
      let key = base;
      for (let suffix = 2; used.has(key); suffix++) {
        key = `${base}_${suffix}`;
      }
      used.add(key);
      return key;
    });
  }

  /**
   * Strips combining accents, lowercases, replaces every run of characters
   * outside `a-z0-9` with `_`, and trims leading/trailing underscores.
   */
  private static slugify(input: string): string {
    return input
      .normalize('NFD')
      .replace(/[\u0300-\u036f]/g, '')
      .toLowerCase()
      .replace(/[^a-z0-9]+/g, '_')
      .replace(/^_+|_+$/g, '');
  }
}
Architecture Rationale: RFC 4180 compliance is enforced through conditional quoting and double-quote escaping. JSON key slugification prevents invalid property names and ensures consistent schema mapping across exports.
### Phase 5: Client-Side Export Dispatch
Triggering downloads without server roundtrips requires careful memory management.
/** Starts browser downloads for content generated on the client. */
class ExportDispatcher {
  /**
   * Wraps `content` in a Blob, clicks a hidden, temporarily attached anchor
   * pointing at the Blob's object URL, and releases both shortly afterwards.
   *
   * @param content - Text to download.
   * @param filename - Name assigned to the anchor's `download` attribute.
   * @param mimeType - MIME type given to the Blob.
   */
  static triggerDownload(content: string, filename: string, mimeType: string): void {
    const objectUrl = URL.createObjectURL(new Blob([content], { type: mimeType }));

    const link = document.createElement('a');
    link.href = objectUrl;
    link.download = filename;
    link.style.display = 'none';

    document.body.appendChild(link);
    link.click();

    // Deferred cleanup: detach the anchor and release the object URL.
    const releaseResources = (): void => {
      document.body.removeChild(link);
      URL.revokeObjectURL(objectUrl);
    };
    setTimeout(releaseResources, 100);
  }
}
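**Architecture Rationale:** Appending the anchor to the DOM ensures compatibility with older browsers that ignore programmatic clicks on detached elements. Deferring cleanup with setTimeout gives the browser time to start the download before the Blob URL is revoked; it does not wait for the download to finish, which is fine because the browser keeps the Blob data alive for a download that has already started.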
Pitfall Guide
1. The Phantom Column Shift
Explanation: Iterating through row.cells sequentially ignores colspan/rowspan occupancy, causing subsequent rows to misalign.
Fix: Implement a coordinate-occupancy tracker (as shown in SpatialMapper) that skips already-filled grid positions before placing new values.
2. Invisible Payload Injection
Explanation: textContent extracts text from <script>, <style>, and <noscript> tags embedded within cells, polluting data with CSS rules or JS logic.
Fix: Clone the DOM subtree first, then explicitly remove invisible node types before text extraction. Never mutate the live DOM during extraction.
3. Recursive Nesting Traps
Explanation: Extracting a parent table without filtering nested tables results in inner layout data being merged into the primary dataset.
Fix: Traverse up the DOM tree from the target element. If a <table> ancestor exists, exclude it from processing unless explicitly configured otherwise.
4. CSV Delimiter Collision
Explanation: Unescaped commas, quotes, or newlines inside cell values break CSV parsers, causing column shifts or record truncation.
Fix: Implement RFC 4180 escaping: wrap values containing delimiters/quotes/newlines in double quotes, and escape internal quotes by doubling them (" → "").
5. Memory Leak via Blob URLs
Explanation: URL.createObjectURL() allocates memory that persists until explicitly revoked. Repeated exports without cleanup cause browser memory bloat.
Fix: Always call URL.revokeObjectURL() after the download trigger. Use a short timeout or requestAnimationFrame to ensure the browser has queued the download before revocation.
6. JSON Key Collisions
Explanation: Duplicate or empty column headers produce invalid or overwritten JSON keys, breaking schema validation downstream.
Fix: Apply a slugification algorithm that normalizes whitespace, removes accents, and appends index-based fallbacks (col_1, col_2) for empty or duplicate headers.
7. Whitespace Normalization Failure
Explanation: Collapsing all whitespace indiscriminately destroys meaningful line breaks or tab-separated values within cells.
Fix: Replace tabs and newlines with single spaces first, then collapse multiple spaces into one. Preserve internal spacing that carries semantic meaning.
Production Bundle
Action Checklist
Decision Matrix
| Scenario | Recommended Approach | Why | Cost Impact |
|---|---|---|---|
| Static HTML tables on internal dashboards | Client-Side JS Pipeline | Zero infrastructure, instant execution, full control over formatting | $0 (browser-native) |
| JS-rendered tables (React/Vue/Angular) | Headless Browser (Playwright/Puppeteer) | Waits for DOM hydration, handles dynamic content injection | High (compute + maintenance) |
| High-volume automated ETL (10k+ pages/day) | Server-Side Parser (Cheerio + JSDOM) | Batch processing, queue integration, no UI thread blocking | Medium (server costs + dev time) |
| One-off analyst requests | Browser Bookmarklet / Extension | No code deployment, immediate UI feedback, format selection | $0 (user-time cost) |
Configuration Template
// pipeline.config.ts
import { TableIsolator, SpatialMapper, ExportSerializer, ExportDispatcher } from './extractor';
/** Caller-supplied options consumed by `runExtraction`. */
export interface PipelineOptions {
/** Selector forwarded to `TableIsolator.isolate` as `targetSelector`; also echoed in the "no table found" warning. */
selector: string;
/** Output format: `'csv'` serializes with `ExportSerializer.toCSV`, `'json'` with `ExportSerializer.toJSON`. */
format: 'csv' | 'json';
/** Field separator passed to `toCSV`; defaults to `,`. Only read when `format` is `'csv'`. */
delimiter?: string;
/** Download file name; defaults to `export.csv` or `export.json` depending on `format`. */
filename?: string;
/** MIME type for the download; defaults to `text/csv;charset=utf-8` or `application/json;charset=utf-8`. */
mimeType?: string;
}
/**
 * Runs the full pipeline: isolate a table inside `container`, map it onto a
 * grid, serialize it in the requested format and trigger a download.
 *
 * Logs a warning and returns without downloading when no table is isolated.
 *
 * @param container - Element handed to `TableIsolator.isolate`.
 * @param options - Selector, output format and optional overrides.
 */
export function runExtraction(container: HTMLElement, options: PipelineOptions): void {
  const isolationConfig = {
    targetSelector: options.selector,
    includeNested: false,
    sanitizePayloads: true
  };

  const isolatedTable = TableIsolator.isolate(container, isolationConfig);
  if (!isolatedTable) {
    console.warn(`No valid table found matching "${options.selector}"`);
    return;
  }

  // Replace null cells with empty strings before serialization.
  const sanitizedGrid = SpatialMapper.buildGrid(isolatedTable).map(row =>
    row.map(cell => (cell === null ? '' : cell))
  );

  const asCsv = options.format === 'csv';
  const exportContent = asCsv
    ? ExportSerializer.toCSV(sanitizedGrid, options.delimiter ?? ',')
    : ExportSerializer.toJSON(sanitizedGrid);
  const finalFilename = options.filename ?? (asCsv ? 'export.csv' : 'export.json');
  const finalMime =
    options.mimeType ?? (asCsv ? 'text/csv;charset=utf-8' : 'application/json;charset=utf-8');

  ExportDispatcher.triggerDownload(exportContent, finalFilename, finalMime);
}
Quick Start Guide
- Install Dependencies: This pipeline uses zero external dependencies. Copy the TypeScript modules into your project's `utils/` directory.
- Initialize the Pipeline: Import `runExtraction` and call it with a container reference (e.g., `document.body`) and configuration options.
- Test Against Target DOM: Open your browser console, paste the initialization call, and verify the downloaded file matches expected column alignment and formatting.
- Integrate into Workflow: Wrap the call in a UI button handler, bookmarklet, or automated script. Add error boundaries around `isolatedTable` checks to gracefully handle missing selectors.
- Monitor & Iterate: Log extraction duration and row counts. If tables exceed 5,000 rows, consider chunking the grid mapping process using `requestIdleCallback` to prevent main-thread blocking.