rror: 'Invalid HMAC' });
}
// Acknowledge immediately to prevent platform redelivery
res.status(200).json({ status: 'accepted' });
try {
const payload: WebhookPayload = JSON.parse(rawBody.toString());
await ingestQueue.add('process-event', {
eventId: req.get('X-Shopify-Webhook-Id') || crypto.randomUUID(),
shopDomain: req.get('X-Shopify-Shop-Domain') || 'unknown',
topic,
payload,
ingestedAt: new Date().toISOString()
}, {
attempts: 5,
backoff: { type: 'exponential', delay: 2000 },
removeOnComplete: true,
removeOnFail: false // Preserve for DLQ routing
});
} catch (queueError) {
// Fallback: queue infrastructure failure
await routeToFailureArchive({
eventId: req.get('X-Shopify-Webhook-Id') || 'unknown',
topic,
payload: JSON.parse(rawBody.toString()),
error: 'Ingestion queue unavailable',
details: queueError.message
});
}
});
/**
 * Checks a webhook body against the base64 HMAC-SHA256 signature supplied by the sender.
 *
 * @param body - Raw request bytes exactly as received (before any JSON parsing).
 * @param header - Base64-encoded signature taken from the request header.
 * @param secret - Shared secret used as the HMAC key.
 * @returns `true` only when `header` equals the digest computed over `body`;
 *          `false` for a missing, empty, wrong-length, or mismatching header.
 */
function verifySignature(body: Buffer, header: string, secret: string): boolean {
  // The header comes from an untrusted request and may be absent at runtime
  // even though the signature says `string`.
  if (typeof header !== 'string' || header.length === 0) {
    return false;
  }

  const expected = Buffer.from(
    crypto.createHmac('sha256', secret).update(body).digest('base64')
  );
  const received = Buffer.from(header);

  // timingSafeEqual throws a RangeError when the byte lengths differ, which
  // would surface as an unhandled error instead of a rejected signature.
  // The expected length is fixed and public, so this check leaks nothing.
  if (received.length !== expected.length) {
    return false;
  }

  return crypto.timingSafeEqual(expected, received);
}
**Architecture Rationale:** Immediate acknowledgment prevents platform-side timeout loops. HMAC verification happens synchronously because it's CPU-bound and fast. Queue insertion failures are caught and routed directly to the failure archive, ensuring zero data loss even if the primary queue is down.
### 2. Processing Worker & Error Triage
The worker consumes jobs from the primary queue. It must distinguish between transient infrastructure failures (network timeouts, rate limits, deadlocks) and permanent business logic failures (invalid payload schema, missing merchant configuration). Transient errors trigger BullMQ's built-in backoff. Permanent errors bypass retries and move directly to the DLQ.
```typescript
import { Worker } from 'bullmq';
import { FailureArchive } from '../storage/failure-archive';
import { executeBusinessLogic } from '../services/event-handler';
/**
 * Consumer for the `webhook-ingest` queue.
 *
 * Each job carries `{ eventId, topic, payload, shopDomain }`. The handler runs
 * the business logic and triages any failure:
 *  - transient failures (per `isTransientFailure`) are rethrown so the queue's
 *    retry/backoff settings apply;
 *  - everything else is written to the failure archive and the job returns
 *    normally, so no further queue retries are spent on it.
 */
const worker = new Worker('webhook-ingest', async (job) => {
  const { eventId, topic, payload, shopDomain } = job.data;

  try {
    await executeBusinessLogic(topic, payload, shopDomain);
    return { success: true, eventId };
  } catch (error: unknown) {
    if (isTransientFailure(error)) {
      throw error; // BullMQ handles exponential backoff
    }

    // Anything can be thrown in JavaScript; normalise so the archive always
    // receives a real message and stack instead of `undefined`.
    const failure = error instanceof Error ? error : new Error(String(error));

    // Single timestamp: this is the first and (so far) last failure, so both
    // fields must be identical rather than a few microseconds apart.
    const failedAt = new Date().toISOString();

    // Permanent failure: bypass retries, archive immediately
    await FailureArchive.record({
      eventId,
      topic,
      shopDomain,
      payload,
      error: failure.message,
      stack: failure.stack,
      attemptCount: job.attemptsMade,
      firstFailureAt: failedAt,
      lastFailureAt: failedAt
    });

    return { success: false, routedToArchive: true };
  }
}, { connection: { host: 'redis', port: 6379 } });
/**
 * Decides whether a failure is worth retrying.
 *
 * A failure is transient when any of the following holds:
 *  - its `code` is `ECONNREFUSED`, `ETIMEDOUT`, or `ECONNRESET`;
 *  - its `statusCode` is 429, 502, 503, or 504;
 *  - its `message` contains `deadlock` or `pool exhausted`.
 *
 * @param error - Whatever was thrown; may be `null`, `undefined`, or a non-Error value.
 * @returns Always a strict boolean; `false` for anything that matches none of the rules.
 */
function isTransientFailure(error: unknown): boolean {
  // `throw null` / `throw undefined` are legal; reading properties off them is not.
  if (error == null) {
    return false;
  }

  const { code, statusCode, message } = error as {
    code?: unknown;
    statusCode?: unknown;
    message?: unknown;
  };

  const transientCodes: readonly unknown[] = ['ECONNREFUSED', 'ETIMEDOUT', 'ECONNRESET'];
  const httpStatuses: readonly unknown[] = [429, 502, 503, 504];

  if (transientCodes.includes(code) || httpStatuses.includes(statusCode)) {
    return true;
  }

  // Only inspect the message when it really is a string; this also guarantees
  // a boolean result instead of leaking `undefined` from optional chaining.
  return (
    typeof message === 'string' &&
    (message.includes('deadlock') || message.includes('pool exhausted'))
  );
}
Architecture Rationale: Error classification prevents wasting retry attempts on unfixable payloads. By throwing transient errors, BullMQ manages backoff natively. Permanent errors are archived immediately, freeing queue capacity and reducing latency for healthy events.
3. Failure Archive & Retry Orchestrator
The DLQ is implemented as a PostgreSQL table with JSONB payload storage. An upsert strategy prevents duplicate entries during concurrent failures. A background scheduler polls the archive, respects a maximum retry cap, and applies dynamic backoff.
import { Pool } from 'pg';
import { WebhookFailure } from '../types';
const db = new Pool({ connectionString: process.env.DATABASE_URL });
/**
 * Data-access layer for the `event_failures` table, which acts as the
 * dead-letter store for webhook events.
 */
export class FailureArchive {
  /**
   * Inserts a failure row, or updates the existing row for the same `event_id`.
   *
   * On conflict the stored attempt count is incremented by one (the supplied
   * `attemptCount` is only used for the initial insert), the latest error
   * details and `last_failure_at` are overwritten, and the row is put back
   * into the `pending` state.
   *
   * @param data - Failure details; `payload` is serialised with `JSON.stringify`.
   */
  static async record(data: WebhookFailure): Promise<void> {
    // `status = 'pending'` on conflict matters: the retry orchestrator flips a
    // row to 'processing' before retrying, and fetchPending() only reads
    // 'pending' rows. Without the reset, a row whose retry fails would stay in
    // 'processing' and never be retried again.
    const query = `
      INSERT INTO event_failures (
        event_id, topic, shop_domain, payload,
        error_message, stack_trace, attempt_count,
        first_failure_at, last_failure_at
      ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9)
      ON CONFLICT (event_id) DO UPDATE SET
        attempt_count = event_failures.attempt_count + 1,
        last_failure_at = $9,
        error_message = $5,
        stack_trace = $6,
        status = 'pending'
      RETURNING id;
    `;
    await db.query(query, [
      data.eventId, data.topic, data.shopDomain, JSON.stringify(data.payload),
      data.error, data.stack, data.attemptCount,
      data.firstFailureAt, data.lastFailureAt
    ]);
  }

  /**
   * Reads up to `batchSize` rows whose status is `pending`, ordered by
   * `last_failure_at` ascending.
   *
   * NOTE(review): rows are returned exactly as stored (column names such as
   * `event_id`), and this query does not lock or claim them — confirm that
   * callers handle both.
   *
   * @param batchSize - Maximum number of rows to return (default 50).
   */
  static async fetchPending(batchSize: number = 50): Promise<WebhookFailure[]> {
    const result = await db.query(
      `SELECT * FROM event_failures
       WHERE status = 'pending'
       ORDER BY last_failure_at ASC
       LIMIT $1`,
      [batchSize]
    );
    return result.rows;
  }

  /**
   * Marks the row for `eventId` as `resolved`, stamping `resolved_at` with the
   * database clock and storing a free-text note.
   *
   * @param eventId - Value of the row's `event_id` column.
   * @param resolutionNote - Text written to the `notes` column.
   */
  static async markResolved(eventId: string, resolutionNote: string): Promise<void> {
    await db.query(
      `UPDATE event_failures
       SET status = 'resolved', resolved_at = NOW(), notes = $2
       WHERE event_id = $1`,
      [eventId, resolutionNote]
    );
  }
}
The retry orchestrator runs on a cron schedule. It locks records to prevent concurrent processing, applies business logic, and enforces a hard cap on retries.
import { FailureArchive } from '../storage/failure-archive';
import { executeBusinessLogic } from '../services/event-handler';
/**
 * Runs one pass of the failure-archive retry loop.
 *
 * Fetches up to 50 pending failures and, for each one:
 *  1. claims it by moving it from `pending` to `processing`;
 *  2. re-runs the business logic;
 *  3. marks it resolved on success.
 *
 * When the retry fails, the record is either abandoned (once the next attempt
 * number reaches the cap of 10, which also triggers a notification) or
 * re-recorded with the new error and released back to `pending` so a later
 * cycle can pick it up.
 */
export async function runRetryCycle(): Promise<void> {
  const maxAttempts = 10;
  const batch = await FailureArchive.fetchPending(50);

  for (const record of batch) {
    try {
      // Compare-and-set claim: only a row that is still 'pending' is taken, so
      // two overlapping cycles cannot both process the same event.
      const claim = await db.query(
        'UPDATE event_failures SET status = $1 WHERE event_id = $2 AND status = $3',
        ['processing', record.event_id, 'pending']
      );
      if (claim.rowCount === 0) {
        continue; // already claimed or finished elsewhere
      }

      await executeBusinessLogic(record.topic, record.payload, record.shop_domain);
      await FailureArchive.markResolved(record.event_id, 'Auto-recovered via retry cycle');
    } catch (retryError: unknown) {
      // Normalise non-Error throwables so message and stack are always defined.
      const failure = retryError instanceof Error ? retryError : new Error(String(retryError));
      const nextAttempt = record.attempt_count + 1;

      if (nextAttempt >= maxAttempts) {
        await db.query('UPDATE event_failures SET status = $1 WHERE event_id = $2', ['abandoned', record.event_id]);
        await notifyPermanentFailure(record);
      } else {
        await FailureArchive.record({
          eventId: record.event_id,
          topic: record.topic,
          shopDomain: record.shop_domain,
          payload: record.payload,
          error: failure.message,
          stack: failure.stack,
          attemptCount: nextAttempt,
          firstFailureAt: record.first_failure_at,
          lastFailureAt: new Date().toISOString()
        });

        // Release the claim. Without this the row stays in 'processing', is
        // never returned by fetchPending() again, and never reaches the cap.
        await db.query(
          'UPDATE event_failures SET status = $1 WHERE event_id = $2 AND status = $3',
          ['pending', record.event_id, 'processing']
        );
      }
    }
  }
}
Architecture Rationale: PostgreSQL JSONB enables flexible payload inspection without rigid schema migrations. The ON CONFLICT upsert prevents duplicate archive entries during race conditions. The retry cap prevents infinite loops on fundamentally broken payloads. Status transitions (pending → processing → resolved/abandoned) provide clear operational state tracking.
Pitfall Guide
1. Synchronous Processing Blocking the HTTP Response
Explanation: Performing database writes, third-party API calls, or heavy transformations inside the webhook endpoint delays the 200 OK response. Shopify's timeout threshold is strict; delayed responses trigger duplicate deliveries and queue storms.
Fix: Always acknowledge receipt immediately. Push payloads to a message broker or job queue. Process asynchronously in isolated workers.
2. Ignoring Idempotency in DLQ Replays
Explanation: DLQ retries replay identical payloads. Without idempotency checks, downstream systems process duplicate orders, double-charge customers, or corrupt inventory counts.
Fix: Implement idempotency keys using the platform's webhook_id or a composite hash of payload fields. Use database UNIQUE constraints or distributed locks to reject duplicate processing attempts.
3. Treating All Failures as Transient
Explanation: Routing every error through exponential backoff wastes compute resources and delays recovery for healthy events. A malformed payload or missing merchant configuration will never succeed regardless of retry count.
Fix: Implement error classification logic. Route network timeouts, rate limits, and deadlocks to backoff. Route validation errors, missing dependencies, and business rule violations directly to the DLQ.
4. Unbounded DLQ Growth Without Tombstoning
Explanation: DLQs accumulate indefinitely if abandoned records are never purged. Storage costs balloon, query performance degrades, and monitoring dashboards become noisy.
Fix: Implement automated tombstoning. Archive records older than 30 days to cold storage (S3/Glacier). Purge resolved records after 7 days. Enforce retention policies via scheduled jobs.
5. Alert Fatigue from High-Volume Failure Streams
Explanation: Triggering an alert for every DLQ entry overwhelms on-call engineers. Critical failures get buried in noise, delaying response to actual outages.
Fix: Implement alert aggregation. Trigger notifications only when DLQ depth exceeds thresholds (e.g., >100 pending), when specific high-value topics fail (e.g., orders/paid), or when abandonment rates spike. Use sliding windows and deduplication.
6. Missing Database Indexes on JSONB Payloads
Explanation: Querying DLQ records by topic, shop domain, or failure time without indexes causes full table scans. As the archive grows, retry cycles slow down and database connections exhaust.
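Fix: Create B-tree indexes on status, topic, shop_domain, first_failure_at, and last_failure_at (the retry poll filters on status and orders by last_failure_at, so a composite index on those two columns serves it directly). Use GIN indexes on JSONB columns if querying nested payload fields. Partition tables by month for high-volume environments.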
7. Hardcoding Retry Intervals Instead of Dynamic Backoff
Explanation: Fixed retry intervals (e.g., every 5 minutes) ignore downstream system recovery patterns. Aggressive retries during cascading failures worsen outages.
Fix: Implement dynamic backoff that scales with attempt count. Add jitter to prevent thundering herd effects. Pause retries for specific error codes (e.g., 429 Too Many Requests) until rate limit windows reset.
Production Bundle
Action Checklist
Decision Matrix
| Scenario | Recommended Approach | Why | Cost Impact |
|---|---|---|---|
| High-volume store (>10k events/day) | Redis + BullMQ + PostgreSQL partitioning | Decouples ingestion, scales horizontally, prevents DB lock contention | Moderate infrastructure cost, high ROI on data integrity |
| Low-volume store (<500 events/day) | SQLite/PostgreSQL single table + cron retry | Simplifies deployment, reduces operational overhead | Minimal cost, acceptable for low failure rates |
| Multi-tenant SaaS platform | AWS SQS + DynamoDB DLQ + Step Functions | Native cloud scaling, managed retries, audit compliance | Higher cloud spend, reduces engineering maintenance |
| Payment-critical workflows | DLQ with synchronous verification + manual review queue | Prevents financial discrepancies, enables compliance auditing | Operational overhead, mandatory for PCI/financial accuracy |
Configuration Template
// config/dlq-policy.ts
// Retry settings applied to jobs on the primary ingest queue.
const primaryQueuePolicy = {
  maxAttempts: 5,
  backoffType: 'exponential',
  initialDelayMs: 2 * 1000,
  removeOnComplete: true,
  removeOnFail: false
};

// Settings for the failure archive: retry cap, polling, and retention.
const failureArchivePolicy = {
  maxRetryAttempts: 10,
  batchSize: 50,
  retryIntervalMs: 5 * 60 * 1000, // 5 minutes
  retentionDays: 30,
  tombstoneBatchSize: 1000
};

// Thresholds and routing for operational alerts.
const alertingPolicy = {
  depthThreshold: 100,
  criticalTopics: ['orders/paid', 'fulfillments/update'],
  slackChannel: '#webhook-ops',
  deduplicationWindowMs: 15 * 60 * 1000 // 15 minutes
};

/** Central policy object for the webhook queue, failure archive, and alerting. */
export const DLQ_POLICY = {
  primaryQueue: primaryQueuePolicy,
  failureArchive: failureArchivePolicy,
  alerting: alertingPolicy
};
Quick Start Guide
- Initialize Infrastructure: Deploy Redis and PostgreSQL. Create the `event_failures` table using the schema from the Core Solution section. Add required indexes.
- Deploy Ingestion Endpoint: Run the Express server with HMAC verification and async queue routing. Test with Shopify's webhook simulator or `curl` replay tools.
- Start Worker & Scheduler: Launch the BullMQ worker and the retry orchestrator cron job. Verify transient errors trigger backoff and permanent errors route to the archive.
- Validate Observability: Query the `event_failures` table. Confirm status transitions, retry counts, and payload persistence. Configure Slack alerts using the depth threshold.
- Run Chaos Test: Simulate downstream API failure. Verify DLQ captures payloads, retry cycle pauses, and alert triggers at threshold. Restore service and confirm auto-recovery.