aragraphs(docB)
};
}
/**
 * Opens the uploaded .docx archive and returns the main document part as text.
 *
 * @param file The .docx file to read.
 * @returns The contents of `word/document.xml`, decoded as UTF-8.
 * @throws Error when the archive has no `word/document.xml` entry.
 */
private static async readDocumentXml(file: File): Promise<string> {
  const archive = await JSZip.loadAsync(file);
  const documentPart = archive.file('word/document.xml');

  if (!documentPart) {
    throw new Error('Invalid .docx structure: missing document.xml');
  }

  // Read raw bytes and decode explicitly so the encoding is under our control.
  const bytes = await documentPart.async('uint8array');
  const decoder = new TextDecoder('utf-8');
  return decoder.decode(bytes);
}
/**
 * Extracts the visible text of every non-empty paragraph (`w:p`) in a
 * WordprocessingML document, in document order.
 *
 * Text comes from `w:t` elements. Tabs (`w:tab`) and breaks (`w:br`, `w:cr`)
 * that sit directly inside a run are emitted as whitespace so the words on
 * either side are not glued together; `normalizeWhitespace` then collapses
 * them to single spaces.
 *
 * @param xmlString Raw XML of `word/document.xml`.
 * @returns One whitespace-normalized string per paragraph that has text.
 * @throws Error when the XML is not well-formed.
 */
private static tokenizeParagraphs(xmlString: string): string[] {
  const doc = new DOMParser().parseFromString(xmlString, 'application/xml');

  // DOMParser does not throw on malformed XML; it reports the failure by
  // placing a <parsererror> element in the returned document.
  if (doc.getElementsByTagName('parsererror').length > 0) {
    throw new Error('Invalid .docx structure: document.xml is not well-formed XML');
  }

  const paragraphs = doc.getElementsByTagName('w:p');
  const result: string[] = [];

  for (let i = 0; i < paragraphs.length; i++) {
    // '*' yields every descendant element in document order.
    const descendants = paragraphs[i].getElementsByTagName('*');
    let text = '';

    for (let k = 0; k < descendants.length; k++) {
      const node = descendants[k];
      // Tab-stop definitions (w:pPr/w:tabs/w:tab) reuse the w:tab name, so
      // only tabs and breaks that are direct children of a run count as content.
      const insideRun = node.parentNode?.nodeName === 'w:r';

      if (node.tagName === 'w:t') {
        text += node.textContent || '';
      } else if (insideRun && node.tagName === 'w:tab') {
        text += '\t';
      } else if (insideRun && (node.tagName === 'w:br' || node.tagName === 'w:cr')) {
        text += '\n';
      }
    }

    if (text.trim().length > 0) {
      result.push(this.normalizeWhitespace(text));
    }
  }

  return result;
}
/**
 * Collapses every run of whitespace into a single space and strips
 * leading and trailing whitespace.
 *
 * @param input Text to normalize.
 * @returns The normalized text.
 */
private static normalizeWhitespace(input: string): string {
  const collapsed = input.replace(/\s+/g, ' ');
  return collapsed.trim();
}
}
**Rationale:** `JSZip` handles ZIP decompression efficiently without external WASM dependencies. `DOMParser` provides native XML traversal. We explicitly filter empty paragraphs and normalize whitespace to prevent alignment drift caused by formatting artifacts.
### Phase 2: Hybrid Paragraph Alignment
Pure Longest Common Subsequence (LCS) fails when paragraphs are reordered, merged, or split during editing. We implement a similarity threshold combined with index mapping to align structural blocks before diffing.
```typescript
/**
 * Aligns the paragraphs of two document revisions using token-set (Jaccard)
 * similarity, so that reordered paragraphs can still be paired up.
 */
export class ParagraphAligner {
  /** Minimum similarity score for two paragraphs to be paired at all. */
  private static similarityThreshold = 0.65;

  /**
   * Pairs each original paragraph with its most similar unused revised paragraph.
   *
   * Matching is greedy in original order: each original paragraph claims the
   * best-scoring revised paragraph that is still free and scores at or above
   * the threshold. Pairs scoring below 0.95 are reported as `modify`, the
   * rest as `match`. Original paragraphs with no partner become `delete`;
   * leftover revised paragraphs become `insert` and are appended at the end.
   *
   * @param original Paragraphs of the original document.
   * @param revised Paragraphs of the revised document.
   * @returns One entry per original paragraph (in original order) followed by
   *          one entry per unmatched revised paragraph. For `modify` and
   *          `insert` entries `text` is the revised paragraph; for `match`
   *          and `delete` it is the original paragraph.
   */
  static align(original: string[], revised: string[]): Array<{
    status: 'match' | 'insert' | 'delete' | 'modify';
    originalIndex: number | null;
    revisedIndex: number | null;
    text: string;
  }> {
    const alignmentMap: ReturnType<typeof ParagraphAligner.align> = [];
    const usedRevised = new Set<number>();

    for (let i = 0; i < original.length; i++) {
      let bestMatch = -1;
      let bestScore = 0;

      for (let j = 0; j < revised.length; j++) {
        if (usedRevised.has(j)) continue;
        const score = this.computeSimilarity(original[i], revised[j]);
        // Strict '>' keeps the earliest candidate when scores tie.
        if (score > bestScore && score >= this.similarityThreshold) {
          bestScore = score;
          bestMatch = j;
        }
      }

      if (bestMatch !== -1) {
        usedRevised.add(bestMatch);
        const isModified = bestScore < 0.95;
        alignmentMap.push({
          status: isModified ? 'modify' : 'match',
          originalIndex: i,
          revisedIndex: bestMatch,
          text: isModified ? revised[bestMatch] : original[i]
        });
      } else {
        alignmentMap.push({
          status: 'delete',
          originalIndex: i,
          revisedIndex: null,
          text: original[i]
        });
      }
    }

    for (let j = 0; j < revised.length; j++) {
      if (!usedRevised.has(j)) {
        alignmentMap.push({
          status: 'insert',
          originalIndex: null,
          revisedIndex: j,
          text: revised[j]
        });
      }
    }

    return alignmentMap;
  }

  /**
   * Jaccard similarity of the two paragraphs' token sets, in the range [0, 1].
   *
   * Both sides are de-duplicated before counting. Counting the intersection
   * over the raw token list while the union is a set lets repeated words push
   * the score above 1 and misclassify dissimilar paragraphs as matches.
   *
   * @param a First paragraph (tokens are separated by single spaces).
   * @param b Second paragraph.
   * @returns |A ∩ B| / |A ∪ B|.
   */
  private static computeSimilarity(a: string, b: string): number {
    const tokensA = new Set(a.split(' '));
    const tokensB = new Set(b.split(' '));

    let shared = 0;
    tokensA.forEach(token => {
      if (tokensB.has(token)) shared++;
    });

    const union = tokensA.size + tokensB.size - shared;
    return union === 0 ? 1 : shared / union;
  }
}
```
**Rationale:** Jaccard similarity provides a fast, token-based heuristic for paragraph matching. The threshold (0.65) filters noise while catching rephrased or partially edited blocks. Unmatched paragraphs are flagged as inserts or deletes. Unlike pure LCS, this hybrid approach tolerates reordered paragraphs while maintaining structural integrity. Note that the pairwise comparison is itself O(N×M) in the number of paragraphs, so very large documents should run alignment off the UI thread.
### Phase 3: Word-Level Redline Rendering
Once paragraphs are aligned, we apply a word-level diff to modified blocks and generate semantic HTML. The output uses <ins> and <del> tags for accessibility and styling compatibility.
/**
 * Renders an alignment produced by `ParagraphAligner.align` as semantic HTML
 * using `<ins>` and `<del>` elements.
 */
export class RedlineRenderer {
  /**
   * Builds the redline markup for an aligned pair of documents.
   *
   * Each block becomes `<div class="diff-block diff-{status}">`. Matched
   * blocks are plain text, deleted blocks are wrapped in `<del>`, inserted
   * blocks in `<ins>`, and modified blocks get a word-level diff.
   *
   * @param alignment Result of `ParagraphAligner.align`.
   * @param originalParagraphs The original paragraph array that was passed to
   *        `align`. A `modify` block only carries the revised text, so the
   *        original text is looked up here through `originalIndex`. When it
   *        is omitted or has no entry for a block, that block is rendered as
   *        its revised text without word-level markup.
   * @returns The serialized HTML of all blocks, in alignment order.
   */
  static generateMarkup(
    alignment: ReturnType<typeof ParagraphAligner.align>,
    originalParagraphs: readonly string[] = []
  ): string {
    const fragment = document.createDocumentFragment();

    for (const block of alignment) {
      const wrapper = document.createElement('div');
      wrapper.className = `diff-block diff-${block.status}`;

      if (block.status === 'match') {
        wrapper.textContent = block.text;
      } else if (block.status === 'delete') {
        const del = document.createElement('del');
        del.textContent = block.text;
        wrapper.appendChild(del);
      } else if (block.status === 'insert') {
        const ins = document.createElement('ins');
        ins.textContent = block.text;
        wrapper.appendChild(ins);
      } else if (block.status === 'modify') {
        const originalText: string | undefined =
          block.originalIndex === null ? undefined : originalParagraphs[block.originalIndex];

        if (originalText === undefined) {
          // No original text to compare against: show the revised text as-is.
          wrapper.textContent = block.text;
        } else {
          // computeWordDiff escapes every token, so this assignment is safe.
          wrapper.innerHTML = this.computeWordDiff(originalText, block.text);
        }
      }

      fragment.appendChild(wrapper);
    }

    // DocumentFragment has no innerHTML; serialize through a detached element.
    const container = document.createElement('div');
    container.appendChild(fragment);
    return container.innerHTML;
  }

  /**
   * Positional word diff: compares the i-th word of each text and marks
   * mismatches with `<del>` (original word) and `<ins>` (revised word).
   *
   * This is deliberately naive; an inserted or removed word shifts every
   * later position and marks the rest of the paragraph as changed.
   *
   * @param original Text of the original paragraph.
   * @param revised Text of the revised paragraph.
   * @returns HTML in which every word has been HTML-escaped.
   */
  private static computeWordDiff(original: string, revised: string): string {
    const origTokens = original.split(' ');
    const revTokens = revised.split(' ');
    const result: string[] = [];
    const maxLen = Math.max(origTokens.length, revTokens.length);

    for (let i = 0; i < maxLen; i++) {
      const o = origTokens[i] || '';
      const r = revTokens[i] || '';
      if (o === r) {
        result.push(this.escapeHtml(o));
      } else {
        if (o) result.push(`<del>${this.escapeHtml(o)}</del>`);
        if (r) result.push(`<ins>${this.escapeHtml(r)}</ins>`);
      }
    }

    return result.join(' ');
  }

  /** Escapes the characters that are significant in HTML text and attributes. */
  private static escapeHtml(value: string): string {
    return value
      .replace(/&/g, '&amp;')
      .replace(/</g, '&lt;')
      .replace(/>/g, '&gt;')
      .replace(/"/g, '&quot;')
      .replace(/'/g, '&#39;');
  }
}
Rationale: DocumentFragment prevents layout thrashing during DOM construction. Word-level diffing is intentionally simplified for performance; production systems should swap computeWordDiff with a Myers diff or Patience diff for higher accuracy. The semantic tags ensure screen readers and CSS theming work without custom JavaScript overlays.
Pitfall Guide
1. Namespace Stripping in DOMParser
Explanation: Browsers may strip w: prefixes when parsing OOXML, causing getElementsByTagName('w:p') to return empty results.
Fix: Use getElementsByTagNameNS('http://schemas.openxmlformats.org/wordprocessingml/2006/main', 'p') or fallback to regex-based extraction when namespace resolution fails.
2. Naive LCS on Reordered Paragraphs
Explanation: Pure LCS assumes sequential order. Legal drafts frequently move clauses, causing LCS to mark entire sections as deleted/inserted.
Fix: Implement similarity scoring with index tracking. Only fall back to LCS when similarity exceeds 0.90, indicating minor edits rather than structural moves.
3. Memory Spikes on Large Archives
Explanation: JSZip.loadAsync() buffers the entire file in memory. A 50MB contract can trigger GC pauses or OOM errors on low-end devices.
Fix: Enforce a client-side file size limit (e.g., 25MB). For larger files, implement chunked reading via ReadableStream or delegate to a Web Worker with explicit memory budgeting.
4. Whitespace Normalization Drift
Explanation: OOXML marks significant whitespace with the xml:space="preserve" attribute on <w:t> elements and encodes soft returns as <w:br/>. Blindly collapsing all whitespace destroys intentional line breaks and indentation.
Fix: Parse <w:br/> and <w:tab/> elements explicitly. Replace them with \n or \t before tokenization. Preserve preserve-space attributes during extraction.
5. DOM Rendering Bottlenecks
Explanation: Injecting thousands of <ins>/<del> nodes triggers reflow/repaint cycles, freezing the UI thread.
Fix: Batch DOM updates using DocumentFragment. If rendering exceeds 500 blocks, implement virtual scrolling or render only the viewport. Offload diff computation to a Web Worker.
6. Cross-Browser XML Parsing Inconsistencies
Explanation: Safari and Firefox handle malformed XML differently. Missing closing tags or invalid entities cause silent parse failures.
Fix: Validate XML structure before parsing. Use TextDecoder with {fatal: true} to catch encoding errors early. Implement a fallback parser that strips invalid characters before DOM injection.
7. Formatting Loss Across Runs
Explanation: Text in .docx is split across <w:r> elements with varying styles. Concatenating runs without tracking style boundaries loses bold/italic context.
Fix: If formatting preservation is required, store run metadata alongside text tokens. Apply style classes during redline rendering instead of plain text concatenation.
Production Bundle
Action Checklist
Decision Matrix
| Scenario | Recommended Approach | Why | Cost Impact |
|---|---|---|---|
| High compliance / air-gapped network | Browser-Side | Zero data exfiltration, offline capable | $0 infrastructure |
| Real-time collaborative editing | Server-Side | Requires state synchronization and conflict resolution | High (compute + storage) |
| Complex tables / embedded objects | Server-Side | Browser parsers lack full OOXML schema support | Medium (library licensing) |
| Budget-constrained / high volume | Browser-Side | Eliminates per-request compute costs | $0 marginal cost |
| Legacy browser support required | Server-Side | Fallback for environments without modern JS APIs | Medium (CDN + compute) |
Configuration Template
// diff-engine.config.ts
/** Settings for the browser-side diff engine. */
export interface DiffEngineConfig {
  /** Largest accepted input file, in megabytes. */
  maxFileSizeMB: number;
  /** Similarity score threshold; `validateConfig` requires 0.5 to 0.95. */
  similarityThreshold: number;
  /** Whether processing should be offloaded to a Web Worker. */
  enableWebWorker: boolean;
  /** Rendering strategy. NOTE(review): presumably 'virtual' means viewport-only rendering; confirm with the renderer. */
  renderMode: 'fragment' | 'virtual';
  /** NOTE(review): presumably enables a fallback when namespaced element lookup fails; confirm with the extractor. */
  namespaceFallback: boolean;
}

/** Defaults applied to any setting the caller does not supply. */
export const defaultConfig: DiffEngineConfig = {
  maxFileSizeMB: 25,
  similarityThreshold: 0.65,
  enableWebWorker: true,
  renderMode: 'fragment',
  namespaceFallback: true
};

/**
 * Merges caller overrides onto `defaultConfig` and validates the result.
 *
 * A setting that is absent, `undefined` or `null` falls back to its default,
 * so an explicit `undefined` cannot wipe out a default value.
 *
 * @param config Partial overrides; may be omitted.
 * @returns A complete configuration object (a fresh copy, never `defaultConfig` itself).
 * @throws Error when `similarityThreshold` is not a finite number in [0.5, 0.95],
 *         or when `maxFileSizeMB` is NaN or not positive.
 */
export function validateConfig(config: Partial<DiffEngineConfig> = {}): DiffEngineConfig {
  const merged: DiffEngineConfig = {
    maxFileSizeMB: config.maxFileSizeMB ?? defaultConfig.maxFileSizeMB,
    similarityThreshold: config.similarityThreshold ?? defaultConfig.similarityThreshold,
    enableWebWorker: config.enableWebWorker ?? defaultConfig.enableWebWorker,
    renderMode: config.renderMode ?? defaultConfig.renderMode,
    namespaceFallback: config.namespaceFallback ?? defaultConfig.namespaceFallback
  };

  // NaN fails every ordered comparison, so it has to be rejected explicitly.
  if (
    !Number.isFinite(merged.similarityThreshold) ||
    merged.similarityThreshold < 0.5 ||
    merged.similarityThreshold > 0.95
  ) {
    throw new Error('Similarity threshold must be between 0.5 and 0.95');
  }

  if (Number.isNaN(merged.maxFileSizeMB) || merged.maxFileSizeMB <= 0) {
    throw new Error('Max file size must be a positive number of megabytes');
  }

  if (merged.maxFileSizeMB > 50) {
    console.warn('File size limit exceeds recommended threshold. Consider Web Worker offloading.');
  }

  return merged;
}
Quick Start Guide
- Install dependencies:
npm install jszip (JSZip ships its own TypeScript typings, so @types/jszip is not needed)
- Initialize the engine: Import
ArchiveExtractor, ParagraphAligner, and RedlineRenderer. Pass two .docx File objects to ArchiveExtractor.extractTextPayloads().
- Align and diff: Feed the extracted paragraph arrays into
ParagraphAligner.align(). Pipe the result into RedlineRenderer.generateMarkup().
- Render output: Inject the returned HTML string into a container element. Apply CSS rules for
.diff-insert, .diff-delete, and .diff-modify to visualize changes.
- Optimize for production: Wrap the extraction and alignment steps in a Web Worker. Set
enableWebWorker: true in the configuration template to prevent UI thread blocking during large document processing.