rviceAntiPattern {
// ANTI-PATTERN: Storing state in memory prevents scaling
private userSessions: Map<string, any> = new Map();
}
/**
 * Session store backed by Redis, so that no session state is held in the
 * memory of an individual API instance.
 *
 * Sessions are stored as JSON strings under the key `session:<sessionId>`.
 */
export class StatelessAuthService {
  private redis: RedisClientType;

  /**
   * Creates the Redis client and starts connecting in the background.
   *
   * @param redisUrl - Connection URL handed to `createClient`.
   */
  constructor(redisUrl: string) {
    this.redis = createClient({ url: redisUrl });

    // node-redis documents that a client with no 'error' listener rethrows
    // emitted errors, which would take the whole process down.
    this.redis.on('error', (error: unknown) => {
      console.error('Redis client error:', error);
    });

    // A constructor cannot await, so the connection attempt is started here
    // and its failure is logged instead of becoming an unhandled rejection.
    this.redis.connect().catch((error: unknown) => {
      console.error('Failed to connect to Redis:', error);
    });

    // Routers receive the middleware as a bare function reference
    // (`app.use(service.loadSessionMiddleware)`); binding keeps `this` intact.
    this.loadSessionMiddleware = this.loadSessionMiddleware.bind(this);
  }

  /**
   * Loads a session from Redis.
   *
   * @param sessionId - Identifier appended to the `session:` key prefix.
   * @returns The parsed session payload, or `null` when the key is missing
   *   (or holds an empty string).
   * @throws SyntaxError when the stored value is not valid JSON.
   */
  async getSession(sessionId: string): Promise<any | null> {
    const session = await this.redis.get(`session:${sessionId}`);
    return session ? JSON.parse(session) : null;
  }

  /**
   * Stores a session as JSON with an expiry.
   *
   * @param sessionId - Identifier appended to the `session:` key prefix.
   * @param data - Payload to serialise with `JSON.stringify`.
   * @param ttlSeconds - Expiry passed to Redis as the `EX` option.
   */
  async setSession(sessionId: string, data: any, ttlSeconds: number): Promise<void> {
    await this.redis.set(
      `session:${sessionId}`,
      JSON.stringify(data),
      { EX: ttlSeconds }
    );
  }

  /**
   * Middleware that attaches the session referenced by the `x-session-id`
   * request header to `req.user`.
   *
   * Requests without the header, or whose session is not found, pass through
   * untouched. Lookup failures are forwarded to `next(error)` so the
   * framework's error handling sees them instead of a rejected promise.
   */
  async loadSessionMiddleware(req: Request, res: Response, next: NextFunction) {
    // A header that was sent more than once can arrive as an array; use the
    // first value rather than stringifying the whole list into the key.
    const header = req.headers['x-session-id'];
    const sessionId = Array.isArray(header) ? header[0] : header;
    if (!sessionId) return next();

    try {
      const session = await this.getSession(sessionId);
      if (session) {
        req.user = session;
      }
    } catch (error) {
      return next(error);
    }

    // Kept outside the try block so errors thrown by downstream handlers are
    // not mistaken for session lookup failures.
    next();
  }
}
### 2. Asynchronous Event-Driven Decoupling
Synchronous calls between services create tight coupling. Replace direct calls with events. Producers publish events; consumers process them independently. This introduces backpressure handling and fault tolerance.
**Implementation Strategy:**
* Use a message broker (Kafka, RabbitMQ, or AWS SQS).
* Implement idempotent consumers to handle duplicate messages.
* Use dead-letter queues (DLQ) for poison messages.
**TypeScript Implementation:**
```typescript
import { Producer, Consumer, Kafka, logLevel } from 'kafkajs';
/**
 * Payload that `EventDrivenOrderService.publishOrderCreated` serialises with
 * `JSON.stringify` and sends to the `orders.created` topic, and that
 * subscribers receive after `JSON.parse`.
 */
interface OrderCreatedEvent {
/** Order identifier; also used as the Kafka message key when publishing. */
orderId: string;
/** Identifier of a user — presumably the one who placed the order; confirm with producers. */
userId: string;
/** Order amount. NOTE(review): currency and unit (major vs. minor) are not visible here — confirm. */
amount: number;
/** Event time. NOTE(review): presumably an epoch timestamp; seconds vs. milliseconds is not visible here — confirm. */
timestamp: number;
}
/**
 * Thin wrapper around a KafkaJS producer/consumer pair for the
 * `orders.created` topic.
 */
export class EventDrivenOrderService {
  private kafka: Kafka;
  private producer: Producer;
  private consumer: Consumer;
  private dlqTopic?: string;

  /**
   * @param brokers - Kafka bootstrap brokers.
   * @param groupId - Consumer group id used by the consumer.
   * @param dlqTopic - Optional dead-letter topic. When set, messages whose
   *   payload is not valid JSON are forwarded there; when omitted they are
   *   only logged and skipped.
   */
  constructor(brokers: string[], groupId: string, dlqTopic?: string) {
    this.dlqTopic = dlqTopic;
    this.kafka = new Kafka({
      brokers,
      logLevel: logLevel.WARN,
    });
    this.producer = this.kafka.producer({
      retry: { retries: 5, initialRetryTime: 100 },
    });
    this.consumer = this.kafka.consumer({
      groupId,
      maxBytesPerPartition: 1048576,
    });
  }

  /** Connects the producer first, then the consumer. */
  async connect() {
    await this.producer.connect();
    await this.consumer.connect();
  }

  /**
   * Publishes an order-created event to `orders.created`, keyed by
   * `event.orderId`.
   *
   * A fresh `correlation-id` header is generated per call.
   * NOTE(review): this relies on a global `crypto` object exposing
   * `randomUUID()`; confirm the target runtime provides it.
   */
  async publishOrderCreated(event: OrderCreatedEvent): Promise<void> {
    await this.producer.send({
      topic: 'orders.created',
      messages: [
        {
          key: event.orderId,
          value: JSON.stringify(event),
          headers: {
            'correlation-id': crypto.randomUUID(),
            'event-version': '1.0',
          },
        },
      ],
    });
  }

  /**
   * Subscribes to `orders.created` (new messages only) and invokes `handler`
   * for every message that carries a JSON payload.
   *
   * - Messages without a value are ignored.
   * - Messages whose value is not valid JSON can never succeed on retry, so
   *   they are logged, optionally forwarded to the dead-letter topic, and
   *   skipped instead of being rethrown.
   * - Errors thrown by `handler` are logged and rethrown so the consumer's
   *   own retry behaviour applies.
   *
   * Only JSON syntax is checked; the payload shape is not validated against
   * `OrderCreatedEvent`.
   */
  async subscribe(handler: (event: OrderCreatedEvent) => Promise<void>) {
    await this.consumer.subscribe({ topic: 'orders.created', fromBeginning: false });
    await this.consumer.run({
      eachMessage: async ({ message }) => {
        if (!message.value) return;

        let event: OrderCreatedEvent;
        try {
          event = JSON.parse(message.value.toString()) as OrderCreatedEvent;
        } catch (error) {
          console.error(`Skipping malformed message ${message.offset}:`, error);
          if (this.dlqTopic) {
            // Forward the raw bytes untouched so the payload can be inspected later.
            await this.producer.send({
              topic: this.dlqTopic,
              messages: [
                {
                  key: message.key,
                  value: message.value,
                  headers: {
                    ...message.headers,
                    'dlq-reason': error instanceof Error ? error.message : String(error),
                    'original-offset': message.offset,
                  },
                },
              ],
            });
          }
          return;
        }

        try {
          await handler(event);
        } catch (error) {
          // Idempotency check and DLQ logic should be implemented here
          console.error(`Failed to process message ${message.offset}:`, error);
          throw error;
        }
      },
    });
  }
}
```
### 3. CQRS Pattern for Read/Write Separation
High-read workloads degrade write performance when using a single database model. CQRS separates the write model (optimized for consistency and integrity) from the read model (optimized for query performance).
Implementation Strategy:
- Write commands update the write model and emit events.
- Event handlers update the read model (materialized view).
- Queries hit the read model directly.
- Accept eventual consistency for read operations.
TypeScript Implementation:
// Write Model
/**
 * Write side of the CQRS split.
 *
 * Placeholder implementation: nothing is persisted yet.
 */
export class WriteOrderRepository {
  /**
   * Creates an order and resolves with its identifier.
   *
   * @param orderData - Order payload; currently unused by this stub.
   * @returns Always the fixed placeholder id `'order-uuid'`.
   */
  async createOrder(orderData: any): Promise<string> {
    // Transactional write to primary DB
    // Emits 'OrderCreated' event upon success
    const placeholderOrderId = 'order-uuid';
    return placeholderOrderId;
  }
}
// Read Model (Materialized View)
/**
 * Read side of the CQRS split (materialized view).
 *
 * Placeholder implementation: no data source is queried yet.
 */
export class ReadOrderRepository {
  /**
   * Resolves with dashboard statistics.
   *
   * @returns A new object with `totalRevenue` and `activeUsers`, both
   *   hard-coded to `0` in this stub.
   */
  async getDashboardStats(): Promise<any> {
    // Optimized query against read replica or search index
    // Pre-aggregated data for O(1) or fast retrieval
    const stats = { totalRevenue: 0, activeUsers: 0 };
    return stats;
  }
}
// Event Handler to sync Read Model
/**
 * Keeps the read model in step with order events delivered by the event bus.
 */
export class OrderReadModelSync {
  private readRepo: ReadOrderRepository;
  private eventBus: EventDrivenOrderService;

  /**
   * @param readRepo - Read-side repository; stored but not yet used by the
   *   placeholder handler.
   * @param eventBus - Event service whose `subscribe` delivers order events.
   */
  constructor(readRepo: ReadOrderRepository, eventBus: EventDrivenOrderService) {
    this.readRepo = readRepo;
    this.eventBus = eventBus;
  }

  /**
   * Registers the sync handler on the event bus and resolves once the
   * subscription call has completed.
   */
  async initialize() {
    await this.eventBus.subscribe(async ({ orderId }) => {
      // Update read model based on event
      // This runs asynchronously, decoupling read latency from write latency
      console.log(`Syncing read model for ${orderId}`);
    });
  }
}
Architecture Decisions and Rationale
- Redis for State: Redis provides O(1) access times for sessions and caching. It supports persistence and clustering, ensuring state survives service restarts and scales horizontally.
- Kafka for Events: Kafka offers high throughput, persistence, and consumer groups. It allows replaying events for debugging or rebuilding read models, which is critical for data integrity in distributed systems.
- TypeScript for Safety: Distributed systems require strict contracts. TypeScript enforces type safety across service boundaries when combined with shared interfaces, reducing runtime errors caused by payload mismatches.
- Idempotency: All write operations must be idempotent. Network retries can cause duplicate messages. Implementing idempotency keys ensures safe retries without data corruption.
Pitfall Guide
1. Synchronous Chaining Across Services
Mistake: Service A calls Service B, which calls Service C, which calls Service D synchronously.
Impact: Latencies add up along the chain. If Service D degrades, requests back up in every upstream service until Service A's thread pool is exhausted, causing a cascading failure.
Fix: Replace chains with events. Service A publishes an event; B, C, and D consume independently.
2. Database as a Message Queue
Mistake: Using a database table with polling to implement task queues.
Impact: Lock contention on the table, high CPU usage from polling, and inability to handle high throughput.
Fix: Use a dedicated message broker (Kafka, RabbitMQ, SQS) designed for high-throughput message passing.
3. Ignoring Backpressure
Mistake: Producers publish events faster than consumers can process them.
Impact: Consumer memory exhaustion, message drops, or broker disk full errors.
Fix: Implement consumer prefetch limits, auto-scaling based on queue depth, and dead-letter queues.
4. Premature Sharding
Mistake: Sharding the database before hitting capacity limits.
Impact: Unnecessary complexity in query routing, rebalancing hot partitions, and cross-shard transactions.
Fix: Scale vertically first, then add read replicas. Shard only when single-node capacity is exhausted and query patterns allow clean partitioning.
5. Stateful API Servers
Mistake: Storing user tokens, rate limits, or temporary data in server memory.
Impact: Horizontal scaling impossible; session loss during deployments; uneven load distribution.
Fix: Externalize all state to distributed caches or databases.
6. N+1 Query Patterns in Microservices
Mistake: Fetching a list of items, then making individual calls to fetch details for each item.
Impact: Network overhead explodes; latency increases linearly with list size.
Fix: Use batch endpoints, GraphQL, or data loaders to fetch related data in bulk.
7. Lack of Circuit Breaking
Mistake: No protection against downstream service failures.
Impact: Requests pile up waiting for failed dependencies, consuming resources and causing timeouts.
Fix: Implement circuit breakers that fail fast when error thresholds are exceeded, allowing the system to recover.
Production Bundle
Action Checklist
Decision Matrix
| Scenario | Recommended Approach | Why | Cost Impact |
|---|---|---|---|
| Startup MVP | Monolith + Vertical Scale | Fastest time to market; low operational overhead. | Low |
| High Read Traffic | CQRS + Read Replicas | Decouples reads from writes; scales reads independently. | Medium |
| Bursty Traffic | Event-Driven + Auto-scaling | Absorbs spikes via queues; scales workers based on demand. | Variable |
| Global Users | Geo-Distributed + CDN | Reduces latency for international users; improves availability. | High |
| Strict Consistency | Sync + Distributed Transactions | Ensures data integrity across services; acceptable for financial ops. | Medium |
| High Throughput | Async + Sharding | Maximizes throughput; partitions data to eliminate bottlenecks. | High |
Configuration Template
scalable-server.config.ts
/**
 * Parses a comma-separated broker list such as the `KAFKA_BROKERS` variable.
 *
 * Entries are trimmed and empty entries dropped, so values like `""`,
 * `"a:9092, b:9092"` or a trailing comma never produce an empty broker
 * address. Falls back to a copy of `fallback` when nothing usable remains.
 *
 * @param raw - Raw variable value, or `undefined` when unset.
 * @param fallback - Brokers to use when `raw` yields no entries.
 * @returns A non-empty list of broker addresses (given a non-empty fallback).
 */
function parseKafkaBrokers(
  raw: string | undefined,
  fallback: readonly string[] = ['localhost:9092']
): string[] {
  const brokers = (raw ?? '')
    .split(',')
    .map((entry) => entry.trim())
    .filter((entry) => entry.length > 0);
  return brokers.length > 0 ? brokers : [...fallback];
}

/**
 * Configuration template for the Redis, Kafka, circuit-breaker and
 * rate-limiting settings used by the service.
 *
 * Environment variables are read once, when this module is evaluated.
 */
export const ScalableConfig = {
  redis: {
    // `||` is intentional: an empty REDIS_URL also falls back to the default.
    url: process.env.REDIS_URL || 'redis://localhost:6379',
    poolSize: 10,
    // Linear backoff of 50 per attempt, capped at 2000 (presumably milliseconds — confirm).
    retryStrategy: (times: number) => Math.min(times * 50, 2000),
  },
  kafka: {
    brokers: parseKafkaBrokers(process.env.KAFKA_BROKERS),
    consumer: {
      groupId: 'backend-service-group',
      maxBytesPerPartition: 1048576,
      sessionTimeout: 30000,
    },
    producer: {
      idempotent: true,
      transactionalId: 'order-service-txn',
      retry: { retries: 5, initialRetryTime: 100 },
    },
  },
  circuitBreaker: {
    timeout: 5000,
    errorThresholdPercentage: 50,
    resetTimeout: 30000,
  },
  rateLimiting: {
    windowMs: 60000,
    maxRequests: 100,
    redisStore: true,
  },
};
Quick Start Guide
- **Scaffold Stateless API:**
  Create a new TypeScript project using Fastify or Express. Remove all in-memory state. Add a Redis client using the configuration template.
  ```bash
  npm init -y
  npm install fastify redis @types/redis
  ```
- **Add Message Broker Integration:**
  Install KafkaJS. Implement a producer service to publish events and a consumer service to handle them. Configure idempotency checks in the consumer.
  ```bash
  npm install kafkajs
  ```
- **Implement CQRS Router:**
  Create separate routes for commands (POST/PUT) and queries (GET). Commands should write to the primary DB and emit events. Queries should read from a read-optimized store or replica.
- **Run Load Test:**
  Use autocannon or k6 to simulate traffic. Verify that p99 latency remains stable under load and that the system scales horizontally when instances are added.
  ```bash
  npm install -g autocannon
  autocannon -c 100 -d 60 http://localhost:3000/health
  ```
- **Verify Observability:**
  Ensure logs include correlation IDs. Check that Redis and Kafka metrics are exposed. Validate that circuit breakers trigger correctly by simulating downstream failures.
This article provides the architectural foundation for scalable backend systems. Implementation requires rigorous testing, observability, and iterative refinement based on production metrics. Adhere to the pitfalls guide to avoid common scalability traps.