nsureRow(rIndex);
let cIndex = 0;
Array.from(rowEl.cells).forEach(cell => {
cIndex = this.findNextAvailableColumn(rIndex, cIndex);
const spanData = this.parseCellSpan(cell);
this.markOccupied(rIndex, cIndex, spanData);
cIndex += spanData.colSpan;
});
});
return this.grid.filter(row => row.some(cell => cell !== null));
}
/**
 * Guarantees that `this.grid` holds a row at `index`.
 *
 * An existing (truthy) row is left untouched; otherwise an empty array is
 * stored at that position.
 *
 * @param index - Row index that must exist in the grid.
 */
private ensureRow(index: number): void {
  const existingRow = this.grid[index];
  if (existingRow) {
    return;
  }
  this.grid[index] = [];
}
/**
 * Finds the first free column in `row`, scanning rightwards from `col`.
 *
 * A column counts as taken when `this.occupied` contains the key
 * `"<row>-<col>"`, the same key format that `markOccupied` writes.
 *
 * @param row - Row index to scan.
 * @param col - Column index at which the scan starts.
 * @returns `col` itself when it is free, otherwise the next larger free index.
 */
private findNextAvailableColumn(row: number, col: number): number {
  let candidate = col;
  // The key has to be a template literal: without the backticks the
  // expression `${row}-${col}` is not valid TypeScript.
  while (this.occupied.has(`${row}-${candidate}`)) {
    candidate++;
  }
  return candidate;
}
/**
 * Reads the span attributes and the sanitized text of one table cell.
 *
 * A falsy `rowSpan` or `colSpan` (for example `0`) falls back to `1`.
 *
 * @param cell - The table cell to inspect.
 * @returns The cell's row span, column span and the text produced by
 *   `this.sanitizeNode(cell)`.
 */
private parseCellSpan(cell: HTMLTableCellElement): CellSpan {
  const rowSpan = cell.rowSpan || 1;
  const colSpan = cell.colSpan || 1;
  const text = this.sanitizeNode(cell);
  return { rowSpan, colSpan, text };
}
/**
 * Records a cell's footprint in the occupancy set and the grid.
 *
 * Every coordinate inside the `span.rowSpan` x `span.colSpan` rectangle whose
 * top-left corner is (`r`, `c`) is added to `this.occupied` under the key
 * `"<row>-<col>"`. The cell text is written into each grid slot that is
 * currently falsy, so a slot that already holds non-empty text is kept.
 *
 * @param r - Row index of the cell's top-left corner.
 * @param c - Column index of the cell's top-left corner.
 * @param span - Row/column span and text of the cell.
 */
private markOccupied(r: number, c: number, span: CellSpan): void {
  for (let dr = 0; dr < span.rowSpan; dr++) {
    const targetR = r + dr;
    for (let dc = 0; dc < span.colSpan; dc++) {
      const targetC = c + dc;
      // Spanned rows may not exist yet, so create them on demand.
      this.ensureRow(targetR);
      // The key has to be a template literal: without the backticks the
      // expression `${targetR}-${targetC}` is not valid TypeScript.
      this.occupied.add(`${targetR}-${targetC}`);
      if (!this.grid[targetR][targetC]) {
        this.grid[targetR][targetC] = span.text;
      }
    }
  }
}
}
**Architecture Rationale:**
- Using a `Set` for occupied coordinates provides O(1) lookup, avoiding nested loops during alignment.
- The grid is lazily initialized per row, preventing sparse array overhead.
- Separation of parsing (`parseCellSpan`) and marking (`markOccupied`) keeps the extraction loop clean and testable.
### Step 2: DOM Sanitization & Text Normalization
`textContent` captures everything, including injected scripts and hidden layout nodes. Cloning and pruning ensures only visible, meaningful data is extracted.
```typescript
/**
 * Extracts normalized text from an element without touching the live DOM.
 *
 * The element is deep-cloned, descendants whose tag name is listed in
 * `EXCLUDED_TAGS` are removed from the clone, and the remaining text content
 * is whitespace-normalized.
 */
class DOMSanitizer {
  /** Upper-case tag names whose elements are removed before text is read. */
  private readonly EXCLUDED_TAGS = new Set(['STYLE', 'SCRIPT', 'NOSCRIPT', 'TEMPLATE', 'LINK']);

  /**
   * Returns the cleaned text of `element`.
   *
   * @param element - Source element; it is cloned and never mutated.
   * @returns The clone's text content with whitespace runs collapsed to a
   *   single space and both ends trimmed.
   */
  public clean(element: HTMLElement): string {
    const detachedCopy = element.cloneNode(true) as HTMLElement;
    this.pruneInvisible(detachedCopy);
    const rawText = detachedCopy.textContent || '';
    return this.normalizeWhitespace(rawText);
  }

  /**
   * Removes every descendant element of `node` whose tag is excluded.
   *
   * Matches are collected first and removed afterwards, so the tree is not
   * modified while the walker is still traversing it.
   */
  private pruneInvisible(node: Node): void {
    const walker = document.createTreeWalker(node, NodeFilter.SHOW_ELEMENT);
    const doomed: Element[] = [];
    for (let current = walker.nextNode(); current !== null; current = walker.nextNode()) {
      const el = current as Element;
      if (this.EXCLUDED_TAGS.has(el.tagName)) {
        doomed.push(el);
      }
    }
    for (const el of doomed) {
      el.remove();
    }
  }

  /** Collapses each run of whitespace to one space and trims both ends. */
  private normalizeWhitespace(raw: string): string {
    return raw.trim().split(/\s+/).join(' ');
  }
}
```

**Architecture Rationale:**
- `TreeWalker` is significantly faster than `querySelectorAll` for deep DOM traversal.
- Cloning prevents mutation of the live DOM, which is critical when running in production environments or browser extensions.
- Whitespace normalization handles line breaks, tabs, and non-breaking spaces uniformly.
### Step 3: Format Serialization
Export formats require strict compliance. CSV must follow RFC 4180; JSON requires deterministic key generation.
/**
 * Serializes a string matrix (rows of cells) to CSV or JSON text.
 */
class FormatAdapter {
  /**
   * Renders the matrix as CSV with CRLF record separators.
   *
   * @param matrix - Rows of cell values.
   * @param delimiter - Field separator; defaults to a comma.
   * @returns The CSV text; an empty matrix yields an empty string.
   */
  public static toCSV(matrix: string[][], delimiter: string = ','): string {
    return matrix
      .map(row => row.map(cell => this.escapeCSVCell(cell, delimiter)).join(delimiter))
      .join('\r\n');
  }

  /**
   * Escapes one CSV field.
   *
   * Embedded double quotes are always doubled. The field is wrapped in double
   * quotes only when it contains the delimiter, a double quote, CR or LF.
   * `null` and `undefined` become an empty field.
   */
  private static escapeCSVCell(value: string | null, delimiter: string): string {
    if (value == null) return '';
    const str = String(value);
    const requiresQuoting = str.includes(delimiter) || /["\r\n]/.test(str);
    const escaped = str.replace(/"/g, '""');
    return requiresQuoting ? `"${escaped}"` : escaped;
  }

  /**
   * Renders the matrix as a pretty-printed JSON array of records.
   *
   * The first row supplies the keys and every later row becomes one object.
   * Keys are sanitized and then made unique, so two columns with the same (or
   * an equivalent) header no longer overwrite each other's values.
   *
   * @param matrix - Header row followed by data rows.
   * @returns JSON text, or `'[]'` when there are fewer than two rows.
   */
  public static toJSON(matrix: string[][]): string {
    const headerRow = matrix[0];
    if (headerRow === undefined || matrix.length < 2) return '[]';
    const headers = this.uniqueKeys(headerRow.map((h, i) => this.sanitizeKey(h, i)));
    const records = matrix.slice(1).map(row => {
      const obj: Record<string, string> = {};
      headers.forEach((key, idx) => {
        // Rows shorter than the header row get empty strings.
        obj[key] = row[idx] ?? '';
      });
      return obj;
    });
    return JSON.stringify(records, null, 2);
  }

  /**
   * Converts a header into a lowercase ASCII key.
   *
   * Combining accents are stripped, every other run of characters outside
   * `a-z0-9` becomes one underscore, and leading or trailing underscores are
   * removed. An empty result falls back to `col_<1-based index>`.
   */
  private static sanitizeKey(raw: string, fallbackIndex: number): string {
    const cleaned = (raw || '')
      .normalize('NFD')
      .replace(/[\u0300-\u036f]/g, '')
      .toLowerCase()
      .replace(/[^a-z0-9]+/g, '_')
      .replace(/^_+|_+$/g, '');
    return cleaned || `col_${fallbackIndex + 1}`;
  }

  /**
   * Makes a key list collision-free while preserving order.
   *
   * The first occurrence of a key is kept as-is; each later clash receives
   * the smallest unused suffix `_2`, `_3`, … so the result is deterministic.
   */
  private static uniqueKeys(keys: readonly string[]): string[] {
    const used = new Set<string>();
    return keys.map(key => {
      let candidate = key;
      for (let n = 2; used.has(candidate); n++) {
        candidate = `${key}_${n}`;
      }
      used.add(candidate);
      return candidate;
    });
  }
}
**Architecture Rationale:**
- RFC 4180 compliance prevents spreadsheet import failures caused by unescaped delimiters or newlines.
- Key sanitization uses Unicode normalization to strip accents, ensuring cross-platform compatibility.
- Fallback keys (`col_N`) guarantee a usable JSON key when a header is empty; duplicated headers additionally require an explicit uniqueness check, because identical keys would otherwise overwrite each other's values.
### Step 4: Client-Side Export Mechanism
Generating files without server roundtrips relies on the Blob API and programmatic anchor activation.
/**
 * Triggers a browser file download from in-memory text.
 */
class ExportEngine {
  /**
   * Wraps `content` in a Blob and downloads it through a temporary hidden
   * anchor element.
   *
   * @param content - Text to place in the file.
   * @param filename - Value assigned to the anchor's `download` attribute.
   * @param mimeType - MIME type given to the Blob.
   */
  public static triggerDownload(content: string, filename: string, mimeType: string): void {
    const objectUrl = URL.createObjectURL(new Blob([content], { type: mimeType }));

    const link = document.createElement('a');
    link.href = objectUrl;
    link.download = filename;
    link.style.display = 'none';
    document.body.appendChild(link);
    link.click();

    // Cleanup is deferred by 100 ms: the anchor is detached and the object
    // URL is revoked only after the click has been dispatched.
    const cleanUp = (): void => {
      document.body.removeChild(link);
      URL.revokeObjectURL(objectUrl);
    };
    setTimeout(cleanUp, 100);
  }
}
**Architecture Rationale:**
- `URL.createObjectURL` handles large payloads efficiently compared to `data:` URIs, which can exceed browser string limits.
- Appending and removing the anchor prevents CSP violations in strict environments.
- Deferred cleanup via `setTimeout` ensures the download event fires before memory release.
## Pitfall Guide
| Pitfall | Explanation | Fix |
|---|---|---|
| Linear Index Drift | Ignoring rowSpan/colSpan causes subsequent cells to shift left, corrupting column alignment. | Implement a coordinate matrix that tracks occupied cells and skips indices during iteration. |
| `innerText` vs `textContent` | `innerText` respects CSS layout and triggers reflow, causing performance degradation and missing hidden data. | Use `textContent` on cloned nodes after pruning invisible elements. |
| RFC 4180 Violations | Unescaped commas or quotes in CSV values break spreadsheet parsers and database imports. | Wrap fields containing delimiters/newlines in quotes and double-escape internal quotes. |
| Nested Table Contamination | Layout tables inside data cells duplicate rows or inject unrelated metadata. | Traverse parent nodes to detect nesting; filter out any `<table>` whose ancestor chain contains another `<table>`. |
| Memory Leaks from ObjectURLs | Forgetting to call revokeObjectURL accumulates blob references, causing browser memory bloat. | Always pair createObjectURL with revokeObjectURL in a cleanup callback or finally block. |
| Synchronous DOM Blocking | Parsing tables with 10k+ rows on the main thread freezes the UI. | Offload extraction to a Web Worker or use requestIdleCallback for chunked processing. |
| Header Key Collisions | Duplicate or empty column names produce invalid JSON objects with overwritten values. | Sanitize keys, append fallback indices, and validate uniqueness before serialization. |
## Production Bundle
### Action Checklist
### Decision Matrix
| Scenario | Recommended Approach | Why | Cost Impact |
|---|---|---|---|
| Static internal dashboard | Client-side Matrix Parser | Zero infrastructure, instant execution, low maintenance | $0 |
| Dynamic SPA with virtual scrolling | Headless Browser (Playwright) | Waits for JS rendering, handles infinite scroll | Server/CI costs |
| One-off data export via bookmarklet | Client-side Matrix Parser | Self-contained, no dependencies, runs in console | $0 |
| Enterprise ETL pipeline | Server-side Parser + Queue | Scalable, auditable, integrates with data warehouses | Infrastructure + dev time |
| Legacy HTML with heavy layout tables | Client-side Parser + Nesting Filter | Avoids layout contamination, preserves data integrity | Low |
### Configuration Template
// table-extractor.config.ts

/**
 * Shape of the extractor configuration.
 *
 * NOTE(review): none of the code shown alongside this template reads these
 * fields, so the per-field notes below are inferred from the names and the
 * default values and should be confirmed against the consumer.
 */
export interface ExtractionConfig {
  /** Selector for the table(s) to extract; presumably a CSS selector. */
  targetSelector: string;
  /** Serialization format to produce. */
  outputFormat: 'csv' | 'json';
  /** Optional field separator; presumably only relevant for CSV output. */
  delimiter?: string;
  /** Whether a header row is part of the output. */
  includeHeaders: boolean;
  /** Optional row limit; presumably an upper bound on processed rows. */
  maxRows?: number;
  /** Switches for the text clean-up stage. */
  sanitizeOptions: {
    removeScripts: boolean;
    removeStyles: boolean;
    normalizeWhitespace: boolean;
  };
}

/**
 * Default settings: comma-delimited CSV with headers for `table.data-table`,
 * a `maxRows` of 10000, and every sanitize option enabled.
 */
export const defaultConfig: ExtractionConfig = {
  targetSelector: 'table.data-table',
  outputFormat: 'csv',
  delimiter: ',',
  includeHeaders: true,
  maxRows: 10000,
  sanitizeOptions: {
    removeScripts: true,
    removeStyles: true,
    normalizeWhitespace: true,
  },
};
### Quick Start Guide
- Initialize the extractor: Instantiate `MatrixExtractor` and `DOMSanitizer` in your environment.
- Select the target: Pass the `HTMLTableElement` to `extractor.extract(table)`.
- Format the output: Pipe the resulting matrix into `FormatAdapter.toCSV()` or `FormatAdapter.toJSON()`.
- Trigger download: Call `ExportEngine.triggerDownload(content, 'export.csv', 'text/csv;charset=utf-8')`.
- Validate: Open the output in a spreadsheet or JSON viewer to verify alignment and escaping.
This architecture scales from console snippets to production data pipelines. By decoupling structural alignment, content sanitization, and format serialization, you maintain testable, reusable components that withstand the unpredictability of real-world HTML.