const maxCols = new Set<number>();
rows.forEach((rowEl, rowIndex) => {
if (!matrix[rowIndex]) matrix[rowIndex] = [];
let colIndex = 0;
Array.from(rowEl.cells).forEach((cell) => {
// Skip coordinates already occupied by previous rowspans
while (matrix[rowIndex][colIndex] !== undefined) {
colIndex++;
}
const rowSpan = Math.max(1, parseInt(cell.getAttribute('rowspan') || '1', 10));
const colSpan = Math.max(1, parseInt(cell.getAttribute('colspan') || '1', 10));
const rawText = sanitizeCellContent(cell);
// Fill the rectangular region occupied by this cell
for (let r = 0; r < rowSpan; r++) {
const targetRow = rowIndex + r;
if (!matrix[targetRow]) matrix[targetRow] = [];
for (let c = 0; c < colSpan; c++) {
const targetCol = colIndex + c;
if (matrix[targetRow][targetCol] === undefined) {
matrix[targetRow][targetCol] = rawText;
}
}
}
colIndex += colSpan;
maxCols.add(colIndex);
});
});
// Normalize row lengths to prevent jagged arrays
const width = Math.max(...maxCols, 0);
return matrix.map(row => {
while (row.length < width) row.push(null);
return row.slice(0, width);
});
}
**Architectural Rationale:**
- Using a `Set` to track maximum column width prevents assumptions about uniform row lengths.
- Filling the rectangular region ensures that `rowspan`/`colspan` cells propagate correctly without overwriting adjacent data.
- Normalizing row lengths after construction guarantees a rectangular matrix, which simplifies downstream serialization.
### Phase 2: Content Sanitization
Raw `textContent` captures everything, including injected styles, scripts, and formatting artifacts. Production parsers must isolate visible text while preserving semantic structure.
```typescript
/**
 * Extracts the text of a table cell with non-content nodes removed and
 * whitespace normalized.
 *
 * The cell is deep-cloned first, so the element passed in is never mutated.
 *
 * @param cell - The cell element to read.
 * @returns The cell text on a single line, trimmed, with whitespace runs
 *   collapsed to one space and zero-width characters removed.
 */
function sanitizeCellContent(cell: HTMLElement): string {
  const clone = cell.cloneNode(true) as HTMLElement;

  // Drop nodes whose text would otherwise leak into `textContent`
  // (style/script source, template content) along with media/link nodes.
  const removableSelectors = 'style, script, noscript, template, link, svg, img';
  clone.querySelectorAll(removableSelectors).forEach(el => el.remove());

  return (clone.textContent ?? '')
    // Strip zero-width characters BEFORE collapsing whitespace: removing them
    // afterwards would leave the two spaces around "a \u200B b" uncollapsed.
    .replace(/[\u200B\u200C\u200D]/g, '')
    // `\s` already matches U+00A0 (no-break space), so it becomes a regular
    // space here instead of gluing adjacent words together.
    .replace(/\s+/g, ' ')
    .trim();
}
```

**Architectural Rationale:**
- Cloning prevents mutation of the live DOM, which is critical in bookmarklets or extensions running on third-party pages.
- Explicit removal of executable and media nodes prevents XSS vectors and visual noise from leaking into data exports.
- Stripping zero-width characters and collapsing whitespace (including non-breaking spaces) handles the invisible formatting characters that frequently break CSV parsers and database imports.
### Phase 3: Serialization
Data must be serialized according to strict specifications. CSV requires RFC 4180 compliance; JSON requires deterministic key generation.
/**
 * Renders a table matrix as delimiter-separated text using RFC 4180 quoting.
 *
 * @param matrix - Rows of cell values; `null`/`undefined` cells become empty fields.
 * @param delimiter - Field separator placed between cells (defaults to a comma).
 * @returns The records joined with CRLF, with no trailing line break.
 */
function serializeToRFC4180(matrix: TableMatrix, delimiter: string = ','): string {
  // A field is wrapped in quotes only when it contains the delimiter, a double
  // quote, or a line break; embedded quotes are always doubled.
  const encodeField = (cell: string | null | undefined): string => {
    const text = String(cell ?? '');
    const doubled = text.replace(/"/g, '""');
    if (text.includes(delimiter) || /["\r\n]/.test(text)) {
      return `"${doubled}"`;
    }
    return doubled;
  };

  const encodeRecord = (row: TableMatrix[number]): string =>
    row.map(cell => encodeField(cell)).join(delimiter);

  return matrix.map(row => encodeRecord(row)).join('\r\n');
}
/**
 * Converts a table matrix into a pretty-printed JSON array of records.
 *
 * The first row supplies the property names, which are sanitized and then made
 * unique so that two columns never write to the same key. Every following row
 * becomes one object; missing or `null` cells are emitted as empty strings.
 *
 * @param matrix - Grid of cell values whose first row holds the header cells.
 * @returns A JSON string indented by two spaces, or `'[]'` when the matrix has
 *   fewer than two rows.
 */
function serializeToJSON(matrix: TableMatrix): string {
  if (matrix.length < 2) return '[]';

  const headers = dedupeHeaderKeys(matrix[0].map((h, i) => sanitizeHeaderKey(h, i)));
  const records = matrix.slice(1).map(row => {
    const record: Record<string, string> = {};
    headers.forEach((key, idx) => {
      record[key] = row[idx] ?? '';
    });
    return record;
  });

  return JSON.stringify(records, null, 2);
}

/**
 * Turns a raw header cell into a lowercase snake_case key.
 *
 * Diacritics are stripped (NFD decomposition, then removal of combining marks),
 * every run of characters outside `a-z0-9` becomes a single underscore, and
 * leading/trailing underscores are trimmed.
 *
 * @param raw - Header text, or `null` for a missing cell.
 * @param fallbackIndex - Zero-based column index used to build the fallback name.
 * @returns The sanitized key, or `column_<fallbackIndex + 1>` when the header is
 *   missing, blank, or contains nothing that survives sanitization (e.g. `"###"`).
 */
function sanitizeHeaderKey(raw: string | null, fallbackIndex: number): string {
  const fallback = `column_${fallbackIndex + 1}`;
  if (!raw) return fallback;

  const key = raw
    .normalize('NFD')
    .replace(/[\u0300-\u036f]/g, '')
    .toLowerCase()
    // Runs are collapsed here, so no separate "squeeze underscores" pass is needed.
    .replace(/[^a-z0-9]+/g, '_')
    .replace(/^_+|_+$/g, '');

  // A header made only of symbols or non-Latin text sanitizes to '' — fall back
  // to the positional name rather than emitting an empty key.
  return key || fallback;
}

/**
 * Makes a list of keys unique while preserving order: the first occurrence is
 * kept as-is and later duplicates receive a numeric suffix (`_2`, `_3`, …).
 */
function dedupeHeaderKeys(keys: readonly string[]): string[] {
  const taken = new Set<string>();
  return keys.map(key => {
    let candidate = key;
    // Keep counting until the suffixed name is itself unused.
    for (let n = 2; taken.has(candidate); n++) {
      candidate = `${key}_${n}`;
    }
    taken.add(candidate);
    return candidate;
  });
}
Architectural Rationale:
- RFC 4180 compliance prevents delimiter collisions and quote escaping failures that corrupt spreadsheet imports.
- Header sanitization guarantees valid JSON keys by removing accents, normalizing case, and collapsing underscores.
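- Fallback indexing (`column_N`) guarantees a usable key when a header cell is empty. Note that `sanitizeHeaderKey` does not disambiguate duplicated headers on its own; colliding keys must be made unique before records are built, otherwise later columns overwrite earlier ones.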
Phase 4: Client-Side Delivery
Browser-based extraction must manage memory carefully. Object URLs consume heap space until explicitly revoked.
/**
 * Offers `content` to the user as a file download by clicking a temporary,
 * hidden anchor that points at an object URL for the data.
 *
 * @param content - Text to place in the downloaded file.
 * @param filename - Name assigned to the anchor's `download` attribute.
 * @param mimeType - MIME type given to the Blob.
 */
function triggerClientDownload(content: string, filename: string, mimeType: string): void {
  const objectUrl = URL.createObjectURL(new Blob([content], { type: mimeType }));

  const link = document.createElement('a');
  link.style.display = 'none';
  link.download = filename;
  link.href = objectUrl;

  document.body.appendChild(link);
  link.click();

  // Cleanup is deferred by 100 ms rather than run synchronously after the click:
  // detach the temporary anchor, then release the object URL.
  const releaseResources = (): void => {
    document.body.removeChild(link);
    URL.revokeObjectURL(objectUrl);
  };
  setTimeout(releaseResources, 100);
}
Architectural Rationale:
- Appending the anchor to the DOM ensures click events fire reliably across browsers.
- Delayed cleanup prevents race conditions where the download cancels before the blob is fully streamed.
- Explicit revocation prevents heap accumulation during batch exports.
Pitfall Guide
1. Linear Column Assumption
Explanation: Assuming every row contains the same number of cells ignores colspan and missing <td> elements. This causes column drift and misaligned exports.
Fix: Always construct a coordinate-tracked matrix. Calculate maximum column width dynamically and pad shorter rows with null values.
2. Script/Style Leakage
Explanation: textContent extracts content inside <script> and <style> tags, injecting JavaScript code or CSS rules into data exports.
Fix: Clone the cell node, remove executable and styling selectors, then extract text. Never mutate the live DOM during extraction.
3. CSV Delimiter Collisions
Explanation: Unquoted commas, newlines, or double quotes in cell values break naive join(',') exports, corrupting spreadsheet imports.
Fix: Implement RFC 4180 quoting logic. Escape internal quotes by doubling them, and wrap values containing delimiters or line breaks in quotes.
4. Memory Leaks from Object URLs
Explanation: URL.createObjectURL allocates memory that persists until explicitly revoked. Batch exports without cleanup cause heap exhaustion.
Fix: Always pair createObjectURL with revokeObjectURL, deferring the revocation with a short timeout so the download has started before the URL is released. Remove temporary DOM elements after triggering downloads.
5. Header Key Collisions
Explanation: Duplicate or empty headers generate invalid or conflicting JSON keys, breaking downstream schema validation.
Fix: Sanitize headers to snake_case, strip accents, and append fallback indices when keys are empty or collide. Validate uniqueness before serialization.
6. Nested Table Contamination
Explanation: Extracting all <table> elements without filtering captures sub-tables used for UI components, polluting the primary dataset.
Fix: Traverse the DOM upward from each table element. If an ancestor <table> exists, classify the inner table as nested and exclude it from extraction.
7. Main Thread Blocking on Large Tables
Explanation: Synchronous DOM traversal and string concatenation on tables with 10,000+ rows freezes the UI and triggers browser watchdog timeouts.
Fix: Chunk extraction using requestAnimationFrame or offload to a Web Worker. Stream CSV output incrementally instead of building a single massive string.
Production Bundle
Action Checklist
Decision Matrix
| Scenario | Recommended Approach | Why | Cost Impact |
|---|---|---|---|
| One-off dashboard export | Client-side matrix parser | Zero infrastructure, instant execution, handles spans natively | $0 (browser-native) |
| Recurring weekly reports | Server-side headless scraper | Bypasses CORS, handles JS-rendered tables, schedulable | Moderate (compute + browser runtime) |
| Enterprise ETL pipeline | Structured API + validation layer | Guarantees schema compliance, audit trails, versioning | High (development + maintenance) |
| Legacy system with no API | Bookmarklet + matrix parser | Non-invasive, works on static HTML, requires no deployment | $0 (developer time only) |
Configuration Template
// table-extractor.config.ts
/** Options accepted by the table extractor. */
export interface ExtractionConfig {
  /** Selector for the table(s) to extract; must be non-empty (checked by `validateConfig`). */
  targetSelector: string;
  /** Serialization format of the export. */
  outputFormat: 'csv' | 'json';
  /** Field separator; presumably only relevant to CSV output — confirm against the consumer. */
  delimiter?: string;
  /** Header handling flag; its consumer is not part of this module. */
  includeHeaders: boolean;
  /** Header sanitization flag; its consumer is not part of this module. */
  sanitizeHeaders: boolean;
  /** Optional row limit; enforcement is not part of this module. */
  maxRows?: number;
}

/** Baseline values applied to every option the caller does not supply. */
export const defaultConfig: ExtractionConfig = {
  targetSelector: 'table',
  outputFormat: 'csv',
  delimiter: ',',
  includeHeaders: true,
  sanitizeHeaders: true,
  maxRows: 10000,
};

/**
 * Returns a shallow copy of `source` without the properties whose value is `undefined`.
 */
function withoutUndefined<T extends object>(source: T): Partial<T> {
  const result: Partial<T> = {};
  for (const key of Object.keys(source) as (keyof T)[]) {
    if (source[key] !== undefined) result[key] = source[key];
  }
  return result;
}

/**
 * Merges a partial configuration over `defaultConfig` and validates the result.
 *
 * Properties explicitly set to `undefined` are treated as "not provided": object
 * spread copies `undefined` values, so without this step `{ delimiter: undefined }`
 * would erase the default delimiter instead of inheriting it.
 *
 * @param config - Caller-supplied overrides.
 * @returns A new, fully populated configuration object.
 * @throws Error if the merged `targetSelector` is empty or `outputFormat` is
 *   neither `'csv'` nor `'json'`.
 */
export function validateConfig(config: Partial<ExtractionConfig>): ExtractionConfig {
  const merged: ExtractionConfig = { ...defaultConfig, ...withoutUndefined(config) };
  if (!merged.targetSelector) throw new Error('Target selector is required');
  if (!['csv', 'json'].includes(merged.outputFormat)) {
    throw new Error('Output format must be csv or json');
  }
  return merged;
}
Quick Start Guide
- Identify the target table: Use browser DevTools to locate the primary `<table>` element. Verify it contains the data you need and isn't a nested UI component.
- Inject the parser: Paste the matrix builder and sanitization functions into the console, or package them as a bookmarklet. Ensure the script targets the correct selector.
- Execute extraction: Call `buildVirtualGrid(document.querySelector('table'))` to generate the coordinate-mapped matrix. Verify alignment by logging the first three rows.
- Serialize and export: Pass the matrix to `serializeToRFC4180()` or `serializeToJSON()`. Trigger the download using `triggerClientDownload()` with the appropriate MIME type.
- Validate output: Open the exported file in a spreadsheet or JSON validator. Check for column drift, quote escaping, and header consistency. Adjust sanitization rules if artifacts appear.
This pipeline transforms fragile DOM traversal into a deterministic, production-hardened extraction process. By decoupling structural reconstruction from content sanitization and enforcing strict serialization standards, you eliminate the silent data corruption that plagues naive implementations. Deploy this pattern across automation scripts, internal tools, or lightweight ETL agents to achieve reliable, repeatable table exports without infrastructure overhead.