ce causal delivery and detect missing dependencies.
3. Separation of Concerns: Document state, presence data, and metadata are handled through distinct channels. Presence updates (cursors, selections) use last-write-wins semantics with TTLs, avoiding unnecessary CRDT merge overhead.
4. Event Log + Immutable Snapshots: The server maintains an append-only event log for auditability and replay. Periodic snapshots are stored as immutable checkpoints to bound reconciliation time without breaking the event chain.
5. WebSocket Relay with Pub/Sub Scaling: A lightweight relay handles bi-directional streaming. Redis Pub/Sub or a similar broker enables horizontal scaling across multiple relay instances while preserving per-document ordering.
Implementation: Client Sync Manager
The client manages local state, offline queuing, and network synchronization. It wraps the CRDT engine and handles reconnection logic.
import * as Y from 'yjs';
import { IndexedDBAdapter } from './storage/IndexedDBAdapter';
import { SyncProtocol } from './protocol/SyncProtocol';
/**
 * Client-side sync manager.
 *
 * Wraps a Yjs document, queues local updates through the IndexedDB adapter,
 * and exchanges updates with the server through the sync protocol. Local
 * edits are always queued; they are additionally broadcast while the engine
 * believes it is online.
 */
export class DocumentSyncEngine {
  /**
   * Transaction origin attached to every update that comes from the server,
   * so the local `update` listener can tell remote changes from local edits.
   */
  private static readonly REMOTE_ORIGIN = 'remote';

  private readonly doc: Y.Doc;
  private readonly adapter: IndexedDBAdapter;
  private readonly protocol: SyncProtocol;
  private isOnline = false;

  /**
   * @param docId    Identifier of the document to synchronize.
   * @param clientId Identifier of this client, forwarded to the sync protocol.
   */
  constructor(docId: string, clientId: string) {
    this.doc = new Y.Doc();
    this.adapter = new IndexedDBAdapter(docId);
    this.protocol = new SyncProtocol(docId, clientId);
    // A constructor cannot await, so initialization runs in the background.
    // Surface failures (e.g. the local store being unreadable) instead of
    // leaving an unhandled promise rejection.
    this.initialize().catch((err: unknown) => {
      console.error('Document sync initialization failed', err);
    });
  }

  /**
   * Restores locally persisted state, wires the update listeners, and then
   * attempts the first connection.
   */
  private async initialize(): Promise<void> {
    // Restore local state before the listener below is attached, so the
    // restored state is not queued again as a new local change.
    const localState = await this.adapter.loadDocumentState();
    if (localState) {
      Y.applyUpdate(this.doc, localState);
    }

    // Observe local changes and queue them.
    this.doc.on('update', (update: Uint8Array, origin: unknown) => {
      // Updates that came from the server must not be queued or sent back;
      // doing so would echo every remote change to the relay again.
      if (origin === DocumentSyncEngine.REMOTE_ORIGIN) {
        return;
      }
      this.adapter.queueUpdate(update);
      if (this.isOnline) {
        this.protocol.broadcast(update);
      }
    });

    // Apply incoming remote updates, tagged so the listener above skips them.
    this.protocol.onRemoteUpdate((update: Uint8Array) => {
      Y.applyUpdate(this.doc, update, DocumentSyncEngine.REMOTE_ORIGIN);
    });

    await this.connect();
  }

  /**
   * Connects to the server, flushes the locally queued updates, and pulls the
   * server-side changes this client has not seen yet. Any failure leaves the
   * engine in offline mode; the queue is only cleared after every pending
   * update was sent successfully.
   */
  private async connect(): Promise<void> {
    try {
      await this.protocol.connect();
      this.isOnline = true;

      // Flush updates that were queued locally.
      const pending = await this.adapter.getPendingUpdates();
      if (pending.length > 0) {
        for (const update of pending) {
          await this.protocol.send(update);
        }
        await this.adapter.clearQueue();
      }

      // Request the server-side delta relative to our state vector.
      // Y.Doc has no getStateVector() method; the Yjs API for this is
      // Y.encodeStateVector(doc).
      const serverDelta = await this.protocol.requestDelta(Y.encodeStateVector(this.doc));
      if (serverDelta) {
        Y.applyUpdate(this.doc, serverDelta, DocumentSyncEngine.REMOTE_ORIGIN);
      }
    } catch (err: unknown) {
      // Keep the cause visible; the engine keeps working against local state.
      console.warn('Sync connection failed, operating offline', err);
      this.isOnline = false;
    }
  }

  /**
   * Inserts text into the shared `content` text type.
   *
   * @param index   Position at which to insert.
   * @param content Text to insert.
   */
  public insertText(index: number, content: string): void {
    const text = this.doc.getText('content');
    text.insert(index, content);
  }

  /**
   * Deletes text from the shared `content` text type.
   *
   * @param index  Position of the first character to delete.
   * @param length Number of characters to delete.
   */
  public deleteText(index: number, length: number): void {
    const text = this.doc.getText('content');
    text.delete(index, length);
  }
}
Implementation: Server Relay & Event Sourcing
The server acts as a deterministic relay. It validates permissions, enforces causal delivery, and persists updates to an event log.
import { FastifyInstance } from 'fastify';
import WebSocket from 'ws';
import { EventStore } from './persistence/EventStore';
import { PermissionValidator } from './auth/PermissionValidator';
import { CausalRouter } from './routing/CausalRouter';
/**
 * WebSocket relay for document sync.
 *
 * For each connection on `/sync/:docId` it validates write permission,
 * registers the client with the causal router, streams the events the client
 * is missing, and then persists and rebroadcasts every `update` message.
 */
export class SyncRelay {
  private readonly eventStore: EventStore;
  private readonly causalRouter: CausalRouter;
  private readonly permissionValidator: PermissionValidator;

  /**
   * @param server Fastify instance on which the websocket plugin and the sync
   *               route are registered.
   */
  constructor(server: FastifyInstance) {
    this.eventStore = new EventStore();
    this.causalRouter = new CausalRouter();
    this.permissionValidator = new PermissionValidator();
    this.setupWebSocket(server);
  }

  /** Registers the websocket plugin and the `/sync/:docId` route. */
  private setupWebSocket(server: FastifyInstance): void {
    // Register the plugin on the server itself. Registering it inside its own
    // encapsulated child plugin would scope its route hook to that child, and
    // the sibling plugin declaring the route below would not get websocket
    // support.
    server.register(require('@fastify/websocket'));

    server.register(async (fastify) => {
      fastify.get('/sync/:docId', { websocket: true }, (socket, req) => {
        const { docId } = req.params as { docId: string };
        const { clientId, token, stateVector } = req.query as Record<string, string>;

        // Set once the client is in the routing table, so the close handler
        // only removes clients that were actually added.
        let joined = false;
        let closed = false;

        // Listeners are attached synchronously, before any await, so frames
        // and close events arriving during the handshake are not lost.
        socket.on('close', () => {
          closed = true;
          if (joined) {
            this.causalRouter.removeClient(docId, clientId);
          }
        });

        // Handshake: validate, join the router, stream missing events.
        // Resolves to true when the client may send updates.
        const handshake = async (): Promise<boolean> => {
          if (!clientId || !token) {
            socket.close(4000, 'Missing clientId or token');
            return false;
          }
          // Validate permissions before joining.
          if (!(await this.permissionValidator.validate(docId, token, 'write'))) {
            socket.close(4001, 'Unauthorized');
            return false;
          }
          // The socket may have closed while permissions were being checked.
          if (closed) {
            return false;
          }
          // Join causal routing table.
          this.causalRouter.addClient(docId, clientId, socket);
          joined = true;

          // Stream pending events to the new client.
          const missingEvents = await this.eventStore.getDelta(docId, stateVector);
          for (const event of missingEvents) {
            if (closed) {
              break;
            }
            socket.send(JSON.stringify({ type: 'delta', payload: event }));
          }
          return true;
        };

        // Messages are chained behind the handshake and behind each other, so
        // they are persisted in arrival order and never before authorization.
        let pipeline: Promise<boolean> = handshake().catch((err: unknown) => {
          console.error('Sync handshake failed', err);
          socket.close(1011, 'Internal error');
          return false;
        });

        socket.on('message', (raw: WebSocket.Data) => {
          pipeline = pipeline.then(async (admitted) => {
            if (!admitted) {
              return false;
            }
            try {
              await this.handleUpdateMessage(docId, clientId, raw);
            } catch (err: unknown) {
              // One failed message must not break the chain or crash the
              // process through an unhandled rejection.
              console.error('Failed to process sync message', err);
            }
            return true;
          });
        });
      });
    });
  }

  /**
   * Parses one raw frame and, when it is an `update` message, appends it to
   * the event log and broadcasts it to the other clients of the document.
   * Frames that are not valid JSON objects or have another `type` are ignored.
   *
   * @param docId    Document the connection belongs to.
   * @param clientId Client that sent the frame.
   * @param raw      Raw frame as delivered by the socket.
   * @throws Whatever `eventStore.append` or `causalRouter.broadcast` throws.
   */
  private async handleUpdateMessage(
    docId: string,
    clientId: string,
    raw: WebSocket.Data
  ): Promise<void> {
    let parsed: unknown;
    try {
      parsed = JSON.parse(raw.toString());
    } catch {
      console.warn('Ignoring malformed sync message');
      return;
    }
    if (typeof parsed !== 'object' || parsed === null) {
      return;
    }
    const msg = parsed as { type?: unknown; payload?: unknown; vector?: unknown };
    if (msg.type !== 'update') {
      return;
    }

    // Persist to event log before broadcasting.
    await this.eventStore.append(docId, {
      clientId,
      payload: msg.payload,
      timestamp: Date.now(),
      vector: msg.vector
    });
    // Broadcast to other clients in causal order.
    this.causalRouter.broadcast(docId, clientId, msg.payload, msg.vector);
  }
}
Implementation: Snapshot & Reconciliation Strategy
To keep bootstrap and replay time bounded as the event log grows, the system periodically generates immutable snapshots. Clients use these to bootstrap quickly, while the event log remains append-only for audit and replay.
/**
 * Creates snapshot checkpoints of a document every `snapshotInterval`
 * operations by reconstructing the state from the event store and saving it
 * back together with a content fingerprint.
 */
export class SnapshotManager {
  private readonly eventStore: EventStore;
  private readonly snapshotInterval: number; // ops

  /**
   * @param eventStore       Store used to reconstruct state and save snapshots.
   * @param snapshotInterval Number of operations between snapshots; defaults
   *                         to 500.
   * @throws RangeError when `snapshotInterval` is not a positive integer.
   */
  constructor(eventStore: EventStore, snapshotInterval: number = 500) {
    if (!Number.isInteger(snapshotInterval) || snapshotInterval <= 0) {
      throw new RangeError('snapshotInterval must be a positive integer');
    }
    this.eventStore = eventStore;
    this.snapshotInterval = snapshotInterval;
  }

  /**
   * Saves a snapshot when `opCount` is a positive multiple of the interval;
   * otherwise does nothing.
   *
   * @param docId   Document to snapshot.
   * @param opCount Current operation count of the document.
   */
  public async evaluateAndSnapshot(docId: string, opCount: number): Promise<void> {
    // 0 % interval === 0, so zero (and negative counts) must be excluded
    // explicitly or a document with no operations would be snapshotted.
    if (opCount <= 0 || opCount % this.snapshotInterval !== 0) return;

    const currentState = await this.eventStore.reconstructState(docId);
    const snapshotHash = this.computeHash(currentState);

    // Store as immutable checkpoint
    await this.eventStore.saveSnapshot(docId, {
      hash: snapshotHash,
      payload: currentState,
      createdAt: new Date().toISOString(),
      opCount
    });
  }

  /**
   * Computes a 64-bit FNV-1a fingerprint over every byte of the state and
   * returns it as 16 lowercase hex characters.
   *
   * Unlike encoding only a prefix of the state, this changes whenever any
   * byte changes. FNV-1a is not a cryptographic hash: it detects accidental
   * differences but offers no tamper resistance.
   */
  private computeHash(state: Uint8Array): string {
    const offsetBasis = 0xcbf29ce484222325n;
    const prime = 0x100000001b3n;
    const mask64 = 0xffffffffffffffffn;

    let hash = offsetBasis;
    for (const byte of state) {
      hash ^= BigInt(byte);
      hash = (hash * prime) & mask64;
    }
    return hash.toString(16).padStart(16, '0');
  }
}
Why these choices work:
- The client sync manager isolates offline logic, ensuring UI remains responsive during network partitions.
- The server relay avoids transformation complexity by relying on CRDT merge semantics and causal routing.
- Event sourcing with immutable snapshots provides auditability, fast recovery, and bounded replay time; storage growth is controlled separately through the event log retention policy.
- Separating presence from document state prevents unnecessary merge overhead and reduces bandwidth.
Pitfall Guide
1. Ignoring Causal Ordering in Broadcasts
Explanation: Broadcasting operations without version vectors or logical clocks causes clients to apply updates out of causal order, leading to divergent states and silent data corruption.
Fix: Attach a version vector to every operation. The server must queue operations until all causal dependencies are satisfied before broadcasting. Implement a causal router that tracks client vectors and delays delivery until prerequisites arrive.
2. Unbounded Local Operation Queues
Explanation: Offline users can accumulate thousands of local edits. When connectivity returns, the sync storm overwhelms the relay and causes client-side memory pressure.
Fix: Implement periodic local compaction. Merge consecutive edits into delta batches, enforce a maximum queue depth, and prioritize critical updates during reconnection. Use exponential backoff for retry attempts.
3. Blocking the Main Thread with CRDT Merges
Explanation: Applying large CRDT updates on the main thread freezes the UI, especially in rich-text editors with complex formatting.
Fix: Offload CRDT state application to a Web Worker. Use postMessage for update transmission and requestIdleCallback for non-critical rendering. Keep the main thread focused on input handling and visual updates.
4. Treating Presence as Part of the Document CRDT
Explanation: Presence data (cursors, selections, typing indicators) doesn't require merge semantics. Including it in the CRDT bloats state size and forces unnecessary reconciliation.
Fix: Separate presence into a lightweight stream with last-write-wins semantics. Attach TTLs to presence payloads and discard stale updates. Use a dedicated WebSocket channel or multiplexed message type for presence traffic.
5. Skipping Backpressure on WebSocket Relays
Explanation: Malicious or misbehaving clients can flood the relay with updates, causing memory exhaustion and cascading failures across connected users.
Fix: Implement token bucket rate limiting per connection. Set queue depth thresholds and drop or buffer messages when limits are exceeded. Monitor relay health with circuit breakers that temporarily suspend new connections during overload.
6. Naive Snapshot Overwrites
Explanation: Replacing the entire event log with a snapshot breaks audit trails, prevents replay, and complicates debugging when synchronization issues arise.
Fix: Store snapshots as immutable checkpoints alongside the append-only event log. Clients request the latest snapshot, then apply only the events recorded after the snapshot's op count. Preserve the event log for compliance and forensic analysis.
7. Weak Authorization on Sync Endpoints
Explanation: Authenticating users during the initial HTTP request but not validating permissions at the WebSocket layer allows token reuse or privilege escalation across documents.
Fix: Validate document-level ACLs during the WebSocket handshake. Re-check permissions periodically for long-lived connections. Use short-lived tokens scoped to specific documents and revoke access immediately when permissions change.
Production Bundle
Action Checklist
Decision Matrix
| Scenario | Recommended Approach | Why | Cost Impact |
|---|---|---|---|
| Small team (<10 concurrent editors) | CRDT + Single Relay | Simpler deployment, lower infrastructure overhead, fast iteration | Low |
| High-concurrency SaaS (>50 editors/doc) | CRDT + Pub/Sub Relay + Snapshots | Horizontal scaling, bounded memory, predictable latency under load | Medium |
| Offline-first field application | CRDT + IndexedDB + Delta Sync | Resilient to network partitions, deterministic merge, minimal server dependency | Low-Medium |
| Compliance-heavy enterprise | Event-Sourced CRDT + Immutable Snapshots | Full audit trail, replay capability, regulatory alignment | Medium-High |
Configuration Template
// sync.config.ts

/** Relay limits: connection cap, per-connection rate limit, queue depth, heartbeat. */
const relaySettings = {
  maxConnectionsPerDoc: 100,
  rateLimit: { tokensPerSecond: 30, burstSize: 50 },
  queueDepthThreshold: 500,
  heartbeatInterval: 15000, // presumably milliseconds; confirm against the relay
};

/** Snapshot cadence, event log retention, and storage backends. */
const persistenceSettings = {
  snapshotIntervalOps: 500,
  eventLogRetentionDays: 90,
  snapshotStorage: 's3',
  eventLogStorage: 'postgresql',
};

/** Client offline queue size, reconnect backoff, presence TTL, worker offload. */
const clientSettings = {
  offlineQueueMaxSize: 2000,
  reconnectBackoff: { initial: 1000, max: 30000, multiplier: 1.5 },
  presenceTTL: 5000,
  workerOffload: true,
};

/** Token validation, document scoping, transport, and message signing. */
const securitySettings = {
  tokenValidation: 'jwt',
  docScopeEnforcement: true,
  tlsRequired: true,
  messageSigning: 'hmac-sha256',
};

/** Complete sync configuration, assembled from the sections above. */
export const SyncConfig = {
  relay: relaySettings,
  persistence: persistenceSettings,
  client: clientSettings,
  security: securitySettings,
};
Quick Start Guide
- Initialize the CRDT Document: Create a new Yjs document instance, bind a text type to your editor component, and attach an update observer that queues changes locally.
- Establish WebSocket Connection: Connect to the sync relay endpoint with a document-scoped JWT. Send the client's current state vector and apply any missing deltas the server returns.
- Configure Offline Persistence: Set up IndexedDB to store pending updates and local document state. Implement a reconnection handler that flushes the queue and fetches server deltas.
- Deploy the Relay: Run the WebSocket relay with Redis Pub/Sub for horizontal scaling. Configure event log persistence and snapshot intervals according to your concurrency targets.
- Validate Convergence: Open two browser windows, edit simultaneously, simulate network loss, and verify that both clients converge to identical state after reconnection.