e stockRepository: StockRepository,
private channels: ChannelAdapter[]
) {}
/**
 * Processes a single inventory mutation: skips it when its transaction ID
 * has already been recorded, decrements stock through the CAS repository,
 * fans the mutation out to every channel except the one it originated
 * from, and finally records the transaction ID as processed.
 *
 * A failing channel never aborts processing, but each rejected propagation
 * is logged so that it is not dropped silently.
 *
 * NOTE(review): the isProcessed / markProcessed pair is not atomic in this
 * method. If markProcessed fails after the decrement succeeded, a
 * redelivery of the same mutation would decrement stock again — confirm
 * what the idempotency store or transport guarantees for that window.
 *
 * @param mutation - The stock change to apply and propagate.
 * @returns Resolves once the transaction ID has been recorded, or
 *          immediately when the mutation was already processed.
 * @throws Whatever the idempotency store or `decrementWithCas` rejects with.
 */
async process(mutation: InventoryMutation): Promise<void> {
  // 1. Idempotency check prevents double-decrements on retries
  if (await this.idempotencyStore.isProcessed(mutation.transactionId)) {
    return;
  }

  // 2. Atomic stock update with optimistic locking
  await this.stockRepository.decrementWithCas(mutation.sku, mutation.delta);

  // 3. Fan-out to every connected channel except the originating one
  const targets = this.channels.filter(ch => ch.id !== mutation.sourceChannelId);
  const outcomes = await Promise.allSettled(
    targets.map(ch => ch.applyMutation(mutation))
  );

  // allSettled never rejects, so failures must be surfaced explicitly;
  // outcomes are index-aligned with `targets`.
  outcomes.forEach((outcome, index) => {
    if (outcome.status === 'rejected') {
      console.error(
        `Propagation of transaction ${mutation.transactionId} to channel ${targets[index]?.id} failed`,
        outcome.reason
      );
    }
  });

  // 4. Mark transaction complete
  await this.idempotencyStore.markProcessed(mutation.transactionId);
}
}
**Rationale:** Idempotency is critical. In distributed systems, network blips may cause message redelivery. Without transaction ID checks, a retry could decrement stock twice, causing an undersell. The `Promise.allSettled` ensures that a failure in one channel does not block propagation to others.
#### 2. Optimistic Concurrency Control (CAS)
Race conditions occur when multiple orders target the same SKU simultaneously. Pessimistic locking serializes requests, creating bottlenecks. Optimistic locking using Compare-And-Swap (CAS) allows high concurrency with safe conflict resolution.
```typescript
/**
 * Snapshot of a SKU's stock level together with the version token used for
 * compare-and-swap updates.
 */
interface VersionedStock {
// Identifier of the stock-keeping unit this snapshot belongs to.
sku: string;
// Quantity currently available; compared against the requested quantity before decrementing.
availableQty: number;
// Version token handed to compareAndSwap so a concurrent modification can be detected.
version: number;
}
class StockRepository {
  /**
   * Decrements the available quantity of a SKU using optimistic
   * concurrency: the stock is read with its version, and the write only
   * succeeds if that version is still current. On a version mismatch the
   * stock is re-read and the update retried.
   *
   * @param sku - SKU whose stock should be decremented.
   * @param qty - Quantity to subtract from the available stock.
   * @param maxRetries - Maximum number of read/compare-and-swap attempts
   *                     before giving up (defaults to 5).
   * @returns The stock as it looks after the successful update.
   * @throws RangeError if `qty` is not a finite number (NaN would slip past
   *         the stock check and be written through the CAS).
   * @throws InsufficientStockError if the available quantity is lower than `qty`.
   * @throws ConcurrencyLimitExceededError if every attempt lost the race.
   */
  async decrementWithCas(
    sku: string,
    qty: number,
    maxRetries = 5
  ): Promise<VersionedStock> {
    if (!Number.isFinite(qty)) {
      throw new RangeError(`qty must be a finite number, received ${qty}`);
    }

    for (let attempt = 0; attempt < maxRetries; attempt++) {
      // Fetch a fresh snapshot on every attempt so the retry uses the latest version
      const current = await this.getVersionedStock(sku);
      if (current.availableQty < qty) {
        throw new InsufficientStockError(sku, current.availableQty, qty);
      }

      const remaining = current.availableQty - qty;

      // Attempt atomic update; fails if the version changed since the read
      const swapped = await this.compareAndSwap(sku, current.version, remaining);
      if (swapped) {
        // NOTE(review): assumes compareAndSwap bumps the stored version by exactly one — confirm.
        return { ...current, availableQty: remaining, version: current.version + 1 };
      }
      // Version mismatch indicates concurrent modification; loop and retry
    }

    throw new ConcurrencyLimitExceededError(sku);
  }
}
```
**Rationale:** CAS avoids database row locks. If two requests read the same version, only one CAS succeeds; the other re-reads the stock and retries with the new version, up to the retry limit. This maximizes throughput while guaranteeing consistency.
#### 3. Resilient Propagation with Dead Letter Queues
Channel APIs rate-limit, timeout, or return errors. Silently dropping failed propagations leads to inventory drift. A Dead Letter Queue (DLQ) with exponential backoff ensures eventual consistency.
/**
 * Record of one failed attempt to propagate a mutation to a channel, as
 * stored in the dead letter queue.
 */
interface PropagationFailure {
// The mutation that could not be propagated.
mutation: InventoryMutation;
// Channel the propagation was addressed to.
channelId: string;
// Description of the failure (presumably the error message — confirm with the producer).
error: string;
// Earliest retry time; the propagator sets it to Date.now() plus the backoff, i.e. epoch milliseconds.
retryAfter: number;
// Number of attempts so far; drives the backoff and is incremented on each re-enqueue.
attemptCount: number;
}
/**
 * Puts failed channel propagations back on the dead letter queue with an
 * exponentially growing, capped delay and counts every retry.
 */
class ResilientPropagator {
  constructor(
    private dlq: DeadLetterQueue,
    private metrics: MetricsCollector
  ) {}

  /**
   * Schedules another attempt for a failed propagation.
   *
   * @param failure - The failure to reschedule; it is not modified, a copy
   *                  with updated `retryAfter` and `attemptCount` is enqueued.
   * @returns Resolves after the entry was enqueued and the retry counted.
   */
  async handleFailure(failure: PropagationFailure): Promise<void> {
    const delayMs = this.calculateBackoff(failure.attemptCount);

    // Copy of the failure carrying the next due time and the bumped attempt counter
    const rescheduled = {
      ...failure,
      retryAfter: Date.now() + delayMs,
      attemptCount: failure.attemptCount + 1
    };
    await this.dlq.enqueue(rescheduled);

    const tags = {
      channel: failure.channelId,
      sku: failure.mutation.sku
    };
    this.metrics.increment('propagation_retry', tags);
  }

  /**
   * Delay before the next attempt: 1000 ms doubled per previous attempt,
   * never more than 30000 ms.
   */
  private calculateBackoff(attempts: number): number {
    const baseMs = 1000;
    const ceilingMs = 30000;
    const uncappedMs = baseMs * 2 ** attempts;
    return Math.min(uncappedMs, ceilingMs);
  }
}
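**Rationale:** Exponential backoff eases the load on a struggling channel by spacing retries further apart after each failed attempt. Backoff alone, however, keeps failures that occurred together retrying together; adding random jitter to each delay is what prevents a thundering herd when the channel recovers. The DLQ acts as a buffer, allowing the system to absorb transient failures without losing data integrity.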
#### 4. Dynamic Carrier Routing
Relying on a single carrier introduces single-point-of-failure risk. Dynamic routing evaluates multiple carriers based on cost, delivery estimates, and reliability scores.
/**
 * One carrier's offer for shipping an order, as evaluated by the router.
 */
interface CarrierQuote {
// Identifier of the quoting carrier.
carrierId: string;
// Quoted shipping price; divided by reliabilityScore when quotes are ranked.
cost: number;
// Estimated delivery time; must not be later than the order's required delivery date.
estimatedDelivery: Date;
reliabilityScore: number; // 0.0 to 1.0
}
/**
 * Chooses a carrier for an order from a set of quotes by trading off price
 * against reliability.
 */
class SmartCarrierRouter {
  /**
   * Returns the quote with the lowest reliability-adjusted cost among the
   * quotes that meet the order's required delivery date. On a tie the
   * earlier quote in `quotes` wins.
   *
   * @param order - Order to ship; its `requiredDeliveryDate` is the delivery promise.
   * @param quotes - Candidate quotes from the available carriers.
   * @returns The best viable quote.
   * @throws NoCarrierAvailableError if no quote meets the delivery promise.
   */
  async selectBestCarrier(
    order: Order,
    quotes: CarrierQuote[]
  ): Promise<CarrierQuote> {
    const promisedDelivery = order.requiredDeliveryDate;

    // Filter by delivery promise
    const viableQuotes = quotes.filter(
      q => q.estimatedDelivery <= promisedDelivery
    );
    if (viableQuotes.length === 0) {
      throw new NoCarrierAvailableError(order.id);
    }

    // Lowest reliability-adjusted cost wins
    return viableQuotes.reduce((best, current) =>
      this.effectiveCost(current) < this.effectiveCost(best) ? current : best
    );
  }

  /**
   * Cost divided by reliability. A quote whose reliability is not positive,
   * or whose ratio is NaN, is scored as +Infinity: a NaN score (e.g. 0 / 0)
   * compares false against everything and would otherwise make such a quote
   * impossible to displace once it is the running best.
   */
  private effectiveCost(quote: CarrierQuote): number {
    if (!(quote.reliabilityScore > 0)) {
      return Number.POSITIVE_INFINITY;
    }
    const score = quote.cost / quote.reliabilityScore;
    return Number.isNaN(score) ? Number.POSITIVE_INFINITY : score;
  }
}
Rationale: This approach optimizes for total cost of ownership, not just shipping price. A cheaper carrier with low reliability increases customer support costs and chargebacks. The routing logic dynamically adapts to carrier performance degradation.
#### 5. Sync Lag Monitoring
Instrumentation must focus on tail latency. Average sync lag masks critical delays affecting specific SKUs or channels.
/**
 * Reports how long each channel took to reflect a mutation and raises an
 * alert for any observation slower than five seconds.
 */
class SyncLagMonitor {
  constructor(private metrics: MetricsCollector) {}

  /**
   * Records one lag sample per propagation result.
   *
   * @param mutation - The mutation that was propagated; its `timestamp` is the start of the measurement.
   * @param results - One entry per channel, carrying the channel ID and completion time.
   */
  observeSyncCompletion(
    mutation: InventoryMutation,
    results: PropagationResult[]
  ): void {
    results.forEach(({ channelId, completedAt }) => {
      const lagMs = completedAt - mutation.timestamp;
      const sku = mutation.sku;

      // Histogram sample, tagged per channel and SKU, for percentile analysis
      this.metrics.histogram('sync_lag_ms', lagMs, { channel: channelId, sku });

      // Nothing more to do unless the sample is above the critical threshold
      if (!(lagMs > 5000)) {
        return;
      }
      this.metrics.alert('sync_lag_critical', { lagMs, channel: channelId, sku });
    });
  }
}
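**Rationale:** Alerts should trigger on p99 lag exceeding 5 seconds. The snippet above checks every individual observation against that threshold for simplicity; in production, define the alert on the p99 of the `sync_lag_ms` histogram so that one isolated slow sync does not trigger it. A p99 above this threshold indicates that the sync architecture is degrading and may cause oversells during peak load. Monitoring per SKU and channel allows for granular diagnosis.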
Pitfall Guide
- **Silent Failure Drops**
  - Explanation: Catching propagation errors and logging them without retrying leads to inventory drift. Over time, stock counts diverge across channels.
  - Fix: Implement a DLQ with exponential backoff. Every failure must be retried until success or a maximum attempt limit is reached.
- **Polling Frequency Escalation**
  - Explanation: When polling lag becomes unacceptable, teams often increase polling frequency. This linearly increases API costs and database load, creating a scaling wall.
  - Fix: Migrate to event-driven architecture. Sync only on state changes, so that cost scales with the number of actual inventory changes rather than with polling frequency and catalog size.
- **Idempotency Gaps**
  - Explanation: Retrying failed mutations without checking transaction IDs causes double decrements. This results in undersells and lost revenue.
  - Fix: Maintain an idempotency store keyed by transaction ID. Check existence before processing any mutation.
- **Pessimistic Locking Bottlenecks**
  - Explanation: Using database row locks for stock updates serializes all requests for a SKU. Under high concurrency, this creates a queue and increases latency.
  - Fix: Use optimistic concurrency control (CAS). Allow concurrent reads and resolve conflicts via retries.
- **Average Latency Blindness**
  - Explanation: Monitoring average sync lag hides outliers. A p50 of 100ms with a p99 of 10s indicates a broken system for a subset of requests.
  - Fix: Instrument histograms and alert on p99 or p999 latency thresholds. Focus on tail behavior.
- **Single Carrier Dependency**
  - Explanation: Hardcoding a default carrier exposes the business to rate limits, service outages, and price hikes.
  - Fix: Implement dynamic carrier routing that evaluates multiple options per order based on cost, ETA, and reliability.
- **Ignoring Version Conflicts**
  - Explanation: CAS retries must handle version mismatches correctly. If the retry logic does not fetch the latest version, it will fail repeatedly.
  - Fix: Ensure the retry loop fetches the current versioned stock before each CAS attempt.
Production Bundle
Action Checklist
Decision Matrix
| Scenario | Recommended Approach | Why | Cost Impact |
|---|---|---|---|
| High-Volume SKU | CAS + Event-Driven | Maximizes throughput; prevents oversells during spikes. | Higher initial complexity; lower marginal cost. |
| Low-Volume SKU | Polling with Reconciliation | Simpler implementation; low concurrency risk. | Lower dev cost; API costs scale linearly. |
| Critical Outbound | DLQ + Retry | Ensures data integrity; prevents inventory drift. | Storage cost for DLQ; negligible compute. |
| Multi-Channel Sales | Fan-out with allSettled | Isolates channel failures; maintains availability. | No direct cost; improves reliability. |
| Rate-Limited Channels | Backoff + Throttling | Prevents API bans; manages retry storms. | No direct cost; preserves channel access. |
Configuration Template
```yaml
inventory_sync:
  strategy: event_driven
  concurrency:
    max_retries: 5
    cas_timeout_ms: 2000
  propagation:
    dlq_enabled: true
    backoff_base_ms: 1000
    backoff_max_ms: 30000
    max_attempts: 10
  monitoring:
    p99_lag_alert_ms: 5000
    oversell_threshold: 0
    metrics_interval_s: 10
  channels:
    - id: shopify
      rate_limit_rps: 100
      timeout_ms: 3000
    - id: amazon
      rate_limit_rps: 50
      timeout_ms: 5000
```
Quick Start Guide
- Define Mutation Schema: Create a typed interface for `InventoryMutation` including SKU, delta, source channel, and transaction ID.
- Implement CAS Repository: Build a stock repository that supports `getVersionedStock` and `compareAndSwap` operations.
- Wire Event Bus: Set up an event processor that consumes mutations, checks idempotency, updates stock via CAS, and fans out to channels.
- Add DLQ Handler: Implement a retry mechanism that captures propagation failures and re-enqueues them with exponential backoff.
- Deploy Monitoring: Instrument sync lag histograms and configure alerts for p99 latency exceeding 5 seconds.