```typescript
import { createHmac, timingSafeEqual } from 'crypto';
import { Request, Response } from 'express';
import { WebhookRepository } from './repositories/WebhookRepository';

export class WebhookController {
  constructor(private readonly repo: WebhookRepository) {}

  async handleIncoming(req: Request, res: Response): Promise<void> {
    const signature = req.headers['x-provider-signature'] as string;
    // Signature verification must run over the raw bytes, so mount this route
    // with express.raw() rather than express.json().
    const rawBody = (req.body as Buffer).toString('utf8');
    if (!signature || !this.verifyPayload(rawBody, signature)) {
      res.status(401).json({ error: 'Invalid signature' });
      return;
    }
    const event = JSON.parse(rawBody);
    try {
      await this.repo.insertRaw(event.id, event.type, rawBody);
      res.status(200).json({ status: 'acknowledged' });
    } catch (error) {
      // Return 500 to trigger provider retry. The PK constraint prevents duplicates.
      res.status(500).json({ error: 'Storage failure' });
    }
  }

  private verifyPayload(payload: string, signature: string): boolean {
    const expected = createHmac('sha256', process.env.WEBHOOK_SECRET!)
      .update(payload)
      .digest('hex');
    // Constant-time comparison guards against timing attacks on the signature.
    const provided = Buffer.from(signature);
    const computed = Buffer.from(expected);
    return provided.length === computed.length && timingSafeEqual(provided, computed);
  }
}
```
**Architecture Rationale:** By isolating verification and insertion, the endpoint remains stateless and fast. Returning `500` on storage failure is intentional—it signals the provider to retry, while the primary key constraint ensures the retry won't create duplicate rows.
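The repository behind `insertRaw` can lean on the table's primary key directly. A minimal sketch, assuming a `pg` connection pool and the `incoming_events` table defined below; the `ON CONFLICT DO NOTHING` clause is what lets a duplicate delivery succeed silently instead of surfacing as a storage failure:

```typescript
import { Pool } from 'pg';

export class WebhookRepository {
  constructor(private readonly db: Pool) {}

  // Duplicate event_ids hit the primary key and are dropped by the database
  // itself, so a retried delivery still gets a 200 from the handler.
  async insertRaw(eventId: string, eventType: string, payload: string): Promise<void> {
    await this.db.query(
      `INSERT INTO incoming_events (event_id, event_type, raw_payload)
       VALUES ($1, $2, $3::JSONB)
       ON CONFLICT (event_id) DO NOTHING`,
      [eventId, eventType, payload],
    );
  }
}
```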
### Layer 2: Idempotency Enforcement
Deduplication must occur at the database constraint level, not in application logic. Relying on `SELECT` checks before `INSERT` introduces race conditions under concurrent delivery.
```sql
CREATE TABLE incoming_events (
    event_id      VARCHAR(255) PRIMARY KEY,
    event_type    VARCHAR(100) NOT NULL,
    raw_payload   JSONB NOT NULL,
    status        VARCHAR(20) DEFAULT 'pending',
    attempts      INT DEFAULT 0,
    next_retry_at TIMESTAMPTZ,
    processed_at  TIMESTAMPTZ,
    error_message TEXT,
    created_at    TIMESTAMPTZ DEFAULT NOW()
);

CREATE INDEX idx_events_status_retry ON incoming_events (status, next_retry_at);
```

The `PRIMARY KEY` on `event_id` guarantees that duplicate deliveries from the provider are silently ignored by the database engine (via `ON CONFLICT DO NOTHING` in the insert above). Application-level deduplication is fragile; constraint-level deduplication is deterministic.
### Layer 3: Asynchronous Worker with Backoff & Dead-Letter Queue
Business logic executes in a separate process that polls the storage table. This worker implements exponential backoff, concurrency control, and a dead-letter mechanism for poison events.
```typescript
import { Pool } from 'pg';
import { EventProcessor } from './processors/EventProcessor';

export class WebhookWorker {
  // Retry delays in seconds: 10s, 1m, 5m, 30m, 2h (capped thereafter).
  private readonly backoffSchedule = [10, 60, 300, 1800, 7200];
  private readonly maxAttempts = 10;

  constructor(private readonly db: Pool, private readonly processor: EventProcessor) {}

  async run(): Promise<void> {
    while (true) {
      const client = await this.db.connect();
      try {
        await client.query('BEGIN');
        const result = await client.query(
          `SELECT event_id, event_type, raw_payload, attempts
             FROM incoming_events
            WHERE status = 'pending'
              AND (next_retry_at IS NULL OR next_retry_at <= NOW())
            ORDER BY created_at ASC
            LIMIT 1
            FOR UPDATE SKIP LOCKED`,
        );
        if (result.rows.length === 0) {
          await client.query('COMMIT');
          await this.sleep(2000);
          continue;
        }
        const row = result.rows[0];
        // Claim the row before releasing the lock; committing without this
        // would let another worker pick up the same event mid-processing.
        // (Rows stuck in 'processing' after a crash need a sweeper or timeout.)
        await client.query(
          `UPDATE incoming_events SET status = 'processing' WHERE event_id = $1`,
          [row.event_id],
        );
        await client.query('COMMIT');
        await this.processEvent(row);
      } catch (error) {
        await client.query('ROLLBACK');
        console.error('Worker transaction failure', error);
      } finally {
        client.release();
      }
    }
  }

  private async processEvent(row: any): Promise<void> {
    const attempt = row.attempts + 1;
    const isTerminal = attempt >= this.maxAttempts;
    try {
      await this.processor.execute(row.event_type, row.raw_payload);
      await this.db.query(
        `UPDATE incoming_events SET status = 'completed', processed_at = NOW() WHERE event_id = $1`,
        [row.event_id],
      );
    } catch (error) {
      const delay = this.backoffSchedule[Math.min(attempt - 1, this.backoffSchedule.length - 1)];
      const newStatus = isTerminal ? 'dead_letter' : 'pending';
      await this.db.query(
        `UPDATE incoming_events
            SET attempts = $1, status = $2, error_message = $3,
                next_retry_at = NOW() + ($4 || ' seconds')::INTERVAL
          WHERE event_id = $5`,
        [attempt, newStatus, (error as Error).message, delay, row.event_id],
      );
      if (isTerminal) {
        await this.notifyDeadLetter(row.event_id, error as Error);
      }
    }
  }

  // Alerting hook; wire this to your incident channel of choice.
  private async notifyDeadLetter(eventId: string, error: Error): Promise<void> {
    console.error(`Event ${eventId} exhausted retries and moved to dead_letter`, error);
  }

  private sleep(ms: number): Promise<void> {
    return new Promise(resolve => setTimeout(resolve, ms));
  }
}
```
**Architecture Rationale:** `FOR UPDATE SKIP LOCKED` enables multiple worker instances to run concurrently without contention. The backoff schedule prevents cascading failures during downstream outages. Events exceeding the attempt threshold move to `dead_letter`, triggering alerts instead of infinite retry loops.
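Horizontal scaling then needs nothing more than starting several loops against the same pool. A minimal entrypoint sketch; the `workers/main.ts` path, the `EventProcessor` constructor, and the environment variable names (taken from the compose template below) are assumptions:

```typescript
// workers/main.ts (hypothetical entrypoint; wiring will vary per project)
import { Pool } from 'pg';
import { EventProcessor } from '../processors/EventProcessor';
import { WebhookWorker } from './WebhookWorker';

async function main(): Promise<void> {
  const concurrency = Number(process.env.WORKER_CONCURRENCY ?? '1');
  const pool = new Pool({ connectionString: process.env.DATABASE_URL });

  // Each loop claims at most one row at a time; SKIP LOCKED keeps the
  // instances from ever contending for the same event.
  await Promise.all(
    Array.from({ length: concurrency }, () =>
      new WebhookWorker(pool, new EventProcessor()).run(),
    ),
  );
}

main().catch(err => {
  console.error('Worker pool crashed', err);
  process.exit(1);
});
```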
### Layer 4: Deterministic Reconciliation
Async processing handles real-time events, but it cannot recover from provider retry expiration or manual portal changes. A scheduled reconciliation job compares provider state against local state and repairs divergence.
```typescript
import { ProviderClient } from './clients/ProviderClient';
import { UserRepository } from './repositories/UserRepository';
import { AlertService } from './services/AlertService';

export class StateReconciler {
  constructor(
    private readonly provider: ProviderClient,
    private readonly users: UserRepository,
    private readonly alerting: AlertService,
  ) {}

  async syncSubscriptions(): Promise<void> {
    const providerSubs = await this.provider.listActiveSubscriptions();
    const driftReport: Array<{ userId: string; expected: string; actual: string }> = [];

    for (const sub of providerSubs) {
      const localUser = await this.users.findByProviderId(sub.customerId);
      if (!localUser) continue;

      // Provider state is the source of truth; any divergence is drift.
      if (localUser.planTier !== sub.planKey) {
        driftReport.push({
          userId: localUser.id,
          expected: sub.planKey,
          actual: localUser.planTier,
        });
        await this.users.updatePlanTier(localUser.id, sub.planKey);
      }
    }

    if (driftReport.length > 0) {
      await this.alerting.sendReconciliationReport(driftReport);
    }
  }
}
```
**Architecture Rationale:** Reconciliation runs on a fixed schedule (e.g., daily at low-traffic hours). It acts as a deterministic safety net, correcting state that slipped through the async pipeline due to expired retries, customer portal modifications, or silent worker failures.
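Because an external scheduler owns the cadence, the job itself can be a one-shot process. A minimal entrypoint sketch matching `dist/jobs/StateReconciler.js` in the compose template below; the constructor signatures for `ProviderClient`, `UserRepository`, and `AlertService` are assumptions:

```typescript
// jobs/main.ts (hypothetical entrypoint; constructor arguments are illustrative)
import { ProviderClient } from '../clients/ProviderClient';
import { UserRepository } from '../repositories/UserRepository';
import { AlertService } from '../services/AlertService';
import { StateReconciler } from '../StateReconciler';

async function main(): Promise<void> {
  const reconciler = new StateReconciler(
    new ProviderClient(process.env.PROVIDER_API_KEY!),
    new UserRepository(process.env.DATABASE_URL!),
    new AlertService(),
  );
  await reconciler.syncSubscriptions();
}

// Runs once and exits; the external scheduler owns the cadence, which is why
// the compose service uses restart: "no".
main().catch(err => {
  console.error('Reconciliation failed', err);
  process.exit(1);
});
```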
## Pitfall Guide
| Pitfall | Explanation | Fix |
|---|---|---|
| Synchronous Side-Effects in HTTP Handlers | Executing DB updates, email dispatch, or analytics calls inside the webhook request increases latency and causes provider timeouts. | Restrict the HTTP handler to signature verification and raw event insertion. Offload all business logic to a background worker. |
| Application-Level Deduplication | Using `SELECT` before `INSERT` to check for duplicates creates race conditions. Concurrent deliveries will bypass the check and create duplicate rows. | Enforce deduplication at the database layer using a `PRIMARY KEY` or `UNIQUE` constraint on the provider's event ID. |
| Tying Workers to Web Process Lifecycles | Running webhook consumers inside the same process as the HTTP server means workers die during deployments, scaling events, or container restarts. | Deploy workers as independent processes or services. Use process managers (PM2, systemd) or container orchestrators to guarantee uptime. |
| Ignoring Provider Retry Windows | Assuming events will eventually arrive ignores the hard cutoff (e.g., 72 hours for Stripe). After expiration, the provider discards the event permanently. | Implement a reconciliation cron that fetches provider state directly via API. This catches events lost after retry expiration. |
| Non-Idempotent Business Logic | Granting access, sending receipts, or processing refunds without idempotency checks causes duplicate charges or access violations when retries occur. | Design all side-effects to be safe for repeated execution. Use idempotency keys for financial operations. Wrap state changes in transactions that check current state before mutating. |
| Reconciliation Without Atomic Updates | Running reconciliation queries that read and write without transactions can corrupt state if the job is interrupted or runs concurrently. | Use database transactions for reconciliation updates. Prefer `UPDATE ... WHERE plan != $1` to avoid unnecessary writes and reduce lock contention. |
| Silent Dead-Letter Accumulation | Failing events that hit the retry limit are often logged but never monitored, leading to unnoticed state corruption. | Route dead-letter events to a dedicated alerting channel. Implement a dashboard for `dead_letter` status and require manual review or automated retry policies. |
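The last two rows share one pattern: make the write itself state-aware. A minimal sketch of the conditional update, assuming a `users` table with a `plan_tier` column (both names are illustrative, based on the reconciler's `updatePlanTier`):

```typescript
import { Pool } from 'pg';

// Re-running this is safe: once local state matches the provider, the WHERE
// clause matches no rows and nothing is written.
export async function updatePlanTier(db: Pool, userId: string, planKey: string): Promise<boolean> {
  const result = await db.query(
    `UPDATE users
        SET plan_tier = $1
      WHERE id = $2
        AND plan_tier IS DISTINCT FROM $1`,
    [planKey, userId],
  );
  return (result.rowCount ?? 0) > 0; // true only when drift was actually corrected
}
```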
## Production Bundle
### Decision Matrix
| Scenario | Recommended Approach | Why | Cost Impact |
|---|---|---|---|
| Low volume (<100 events/day) | Single worker + daily cron | Simplicity reduces operational overhead. Single process handles load without contention. | Minimal (single VM/container) |
| Medium volume (100-10k events/day) | Multi-worker pool + DB-backed queue | `SKIP LOCKED` enables horizontal scaling. DB queue avoids external dependencies. | Moderate (read replicas, connection pooling) |
| High volume (>10k events/day) | Message broker (SQS/RabbitMQ) + async consumers | Decouples storage from processing. Enables fan-out, prioritization, and advanced retry policies. | Higher (broker infrastructure, monitoring) |
| Strict financial compliance | Synchronous verification + async processing + hourly reconciliation | Ensures audit trail, immediate acknowledgment, and frequent state correction. | Higher (compliance tooling, dedicated reconciliation jobs) |
### Configuration Template
```yaml
# docker-compose.worker.yml
version: '3.8'

services:
  webhook-worker:
    build: .
    command: node dist/workers/WebhookWorker.js
    environment:
      - DATABASE_URL=postgresql://app_user:secure_pass@db:5432/webhooks
      - WEBHOOK_SECRET=${WEBHOOK_SECRET}
      - WORKER_CONCURRENCY=3
      - MAX_RETRY_ATTEMPTS=10
    depends_on:
      - db
    restart: unless-stopped
    deploy:
      resources:
        limits:
          memory: 256M

  reconciliation-cron:
    build: .
    command: node dist/jobs/StateReconciler.js
    environment:
      - DATABASE_URL=postgresql://app_user:secure_pass@db:5432/webhooks
      - PROVIDER_API_KEY=${PROVIDER_API_KEY}
    depends_on:
      - db
    restart: "no"  # Run via external scheduler (cron, GitHub Actions, or cloud scheduler)
```
### Quick Start Guide
- **Initialize Storage:** Run the DDL script to create the `incoming_events` table with a PRIMARY KEY on `event_id` and indexes for status/retry filtering.
- **Deploy Acknowledgment Endpoint:** Implement the HTTP handler to verify signatures and insert raw payloads. Test with provider CLI tools to confirm `200 OK` responses under load.
- **Launch Worker Process:** Start the async worker with `FOR UPDATE SKIP LOCKED` polling. Verify it processes pending rows, applies backoff on failure, and routes exhausted events to `dead_letter`.
- **Schedule Reconciliation:** Configure a daily cron job to fetch provider state, diff against local records, and apply corrections. Validate drift detection by manually altering a test record and running the job.
- **Monitor & Alert:** Wire dead-letter status and reconciliation reports to your incident management system. Confirm end-to-end flow by triggering test events (see the smoke-test sketch below) and verifying state synchronization within 5 minutes.
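A minimal smoke-test sketch for the last step, assuming the handler is mounted at `http://localhost:3000/webhooks` (URL and port are placeholders) and shares the handler's `WEBHOOK_SECRET`; it also replays the same event to confirm idempotent handling:

```typescript
import { createHmac } from 'crypto';

// Sends a signed test event, then replays it with the same id.
async function sendTestEvent(): Promise<void> {
  const body = JSON.stringify({ id: `test_${Date.now()}`, type: 'test.ping' });
  const signature = createHmac('sha256', process.env.WEBHOOK_SECRET!)
    .update(body)
    .digest('hex');

  for (const attempt of ['initial', 'replay']) {
    const res = await fetch('http://localhost:3000/webhooks', {
      method: 'POST',
      headers: { 'content-type': 'application/json', 'x-provider-signature': signature },
      body,
    });
    // Both deliveries should return 200: the first inserts the row, the
    // replay is absorbed by the primary key constraint.
    console.log(`${attempt}: ${res.status}`);
  }
}

sendTestEvent();
```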