a Script (Atomic Token Bucket)
This script performs the token bucket logic atomically within Redis, ensuring no race conditions.
-- Atomic token bucket: refill the bucket, then try to consume, in one server-side step.
-- KEYS[1] = rate_limit_key
-- ARGV[1] = capacity (max tokens)
-- ARGV[2] = refill_rate (tokens per second)
-- ARGV[3] = current_time (unix timestamp ms)
-- ARGV[4] = requested_tokens
-- Returns: {allowed: 0/1, remaining_tokens, retry_after_ms}
--   retry_after_ms is 0 when the request is allowed; on denial it is the time
--   until `requested_tokens` will have refilled.
local key = KEYS[1]
local capacity = tonumber(ARGV[1])
local refill_rate = tonumber(ARGV[2])
local now = tonumber(ARGV[3])
local requested = tonumber(ARGV[4])

-- Reject unusable arguments before touching the key: a zero refill rate would
-- divide by zero in the expiry and retry calculations below.
if not capacity or not refill_rate or not now or not requested or refill_rate <= 0 then
  return redis.error_reply('ERR invalid rate limiter arguments')
end

local bucket = redis.call('HMGET', key, 'tokens', 'last_refill')
-- A missing key means a brand-new bucket: start full, with the refill clock at `now`.
local tokens = tonumber(bucket[1]) or capacity
local last_refill = tonumber(bucket[2]) or now

-- Calculate refill. Clamp elapsed at 0 so a caller whose clock is behind cannot remove tokens.
local elapsed = math.max(0, now - last_refill)
local new_tokens = math.min(capacity, tokens + (elapsed * refill_rate / 1000))

local allowed = 0
if new_tokens >= requested then
  new_tokens = new_tokens - requested
  allowed = 1
end
local remaining = math.floor(new_tokens)

-- Persist on both outcomes: the refill has been folded into `new_tokens`, so
-- `last_refill` must advance even on denial or that time would be counted twice.
redis.call('HMSET', key, 'tokens', new_tokens, 'last_refill', now)
-- Set expiry to auto-cleanup (2x the time to refill an empty bucket, plus slack)
redis.call('EXPIRE', key, math.ceil(capacity / refill_rate) * 2 + 10)

-- Only a denied caller has to wait, and only for the shortfall, not for a full bucket.
local retry_after = 0
if allowed == 0 then
  retry_after = math.ceil((requested - new_tokens) * 1000 / refill_rate)
end

return {allowed, remaining, retry_after}
2. TypeScript Hybrid Limiter Class
This class fronts Redis with an in-process L1 cache: a cache hit is answered locally, and a miss falls through to the Lua script in Redis.
import Redis from 'ioredis';
import { LRUCache } from 'lru-cache';
/** Settings consumed by HybridRateLimiter. */
interface RateLimitConfig {
// Maximum number of tokens in a bucket; sent to the Lua script as ARGV[1].
capacity: number;
// Sent to the Lua script as ARGV[2].
refillRate: number; // tokens per second
// Passed as the `ttl` option of the local LRU cache.
localCacheTTL: number; // ms
// Prepended to the caller's identifier, separated by ':', to form the Redis key.
redisKeyPrefix: string;
}
/** Signature of the custom command that `defineCommand('ratelimit', …)` attaches to the client. */
type RateLimitScript = (
  key: string,
  capacity: number,
  refillRate: number,
  nowMs: number,
  requested: number,
) => Promise<unknown>;

/** A denial remembered locally so an already-throttled client does not keep hitting Redis. */
interface CachedDenial {
  remaining: number;
  /** Epoch milliseconds at which the Lua script said enough tokens will have refilled. */
  retryAt: number;
}

/**
 * Token-bucket rate limiter backed by a Redis Lua script, with a small in-process
 * cache in front of it.
 *
 * Only denials are cached. Caching an "allowed" decision would let a client send
 * any number of requests for the cache TTL while spending a single token, so every
 * request that might be allowed is evaluated (and charged) in Redis.
 */
export class HybridRateLimiter {
  private redis: Redis;
  private luaScript: string;
  private localCache: LRUCache<string, CachedDenial>;
  private config: RateLimitConfig;

  /**
   * @param redis Client on which the `ratelimit` script command is registered.
   * @param config Bucket parameters, L1 TTL and key prefix.
   * @throws RangeError if `capacity` or `refillRate` is not a positive number
   *   (the Lua script divides by the refill rate).
   */
  constructor(redis: Redis, config: RateLimitConfig) {
    if (!(config.capacity > 0) || !(config.refillRate > 0)) {
      throw new RangeError('HybridRateLimiter: capacity and refillRate must be positive numbers');
    }
    this.redis = redis;
    this.config = config;
    // L1 cache of recent denials. Expired entries must never be served, so stale reads stay off.
    this.localCache = new LRUCache<string, CachedDenial>({
      max: 50000,
      ttl: config.localCacheTTL,
      allowStale: false,
    });
    // Pre-load Lua script
    this.luaScript = `
${this.getLuaScriptContent()}
`;
    this.redis.defineCommand('ratelimit', {
      numberOfKeys: 1,
      lua: this.luaScript,
    });
  }

  /**
   * Consumes one token for `identifier` if one is available.
   *
   * @param identifier Caller identity; combined with `redisKeyPrefix` to form the bucket key.
   * @returns The decision plus rate-limit response headers. If Redis fails or returns
   *   an unusable reply, the request is allowed (fail-open) and the headers contain
   *   only `X-RateLimit-Fallback`.
   */
  async isAllowed(identifier: string): Promise<{ allowed: boolean; headers: Record<string, string> }> {
    const key = `${this.config.redisKeyPrefix}:${identifier}`;
    const now = Date.now();

    // Fast path: this client was denied recently and its retry time has not passed yet.
    const denial = this.localCache.get(key);
    if (denial !== undefined && denial.retryAt > now) {
      return {
        allowed: false,
        headers: this.buildHeaders(denial.remaining, denial.retryAt - now),
      };
    }

    // Slow path: Redis evaluation
    try {
      // defineCommand installs the script as a method on the client. `redis.call(name, …)`
      // would instead send a raw command with that name, which Redis does not know.
      const client = this.redis as Redis & { ratelimit: RateLimitScript };
      const reply = await client.ratelimit(
        key,
        this.config.capacity,
        this.config.refillRate,
        now,
        1, // requested tokens
      );
      const [allowedFlag, remaining, retryAfterMs] = this.parseReply(reply);
      const allowed = allowedFlag === 1;
      if (!allowed) {
        this.rememberDenial(key, remaining, now, retryAfterMs);
      }
      return {
        allowed,
        headers: this.buildHeaders(remaining, retryAfterMs),
      };
    } catch (error) {
      // Fallback: Open or Close based on risk profile
      // Defaulting to allow on Redis failure for availability (Fail-Open)
      console.error('Rate limiter Redis failure, failing open', error);
      return { allowed: true, headers: { 'X-RateLimit-Fallback': 'true' } };
    }
  }

  /** Validates the script reply and returns it as `[allowed, remaining, retryAfterMs]`. */
  private parseReply(reply: unknown): [number, number, number] {
    if (!Array.isArray(reply)) {
      throw new Error('Unexpected rate limiter reply: not an array');
    }
    const values = reply.slice(0, 3).map((value) => Number(value));
    const [allowedFlag, remaining, retryAfterMs] = values;
    if (
      allowedFlag === undefined ||
      remaining === undefined ||
      retryAfterMs === undefined ||
      values.some((value) => !Number.isFinite(value))
    ) {
      throw new Error('Unexpected rate limiter reply: expected three numbers');
    }
    return [allowedFlag, remaining, retryAfterMs];
  }

  /** Caches a denial for at most the configured TTL, and never past its retry time. */
  private rememberDenial(key: string, remaining: number, now: number, retryAfterMs: number): void {
    const ttl = Math.floor(Math.min(this.config.localCacheTTL, retryAfterMs));
    if (!(ttl >= 1)) {
      return; // L1 disabled (non-positive TTL) or nothing to wait for
    }
    this.localCache.set(key, { remaining, retryAt: now + retryAfterMs }, { ttl });
  }

  /** Builds the `X-RateLimit-*` headers, plus `Retry-After` (in whole seconds) when there is a wait. */
  private buildHeaders(remaining: number, retryAfterMs?: number): Record<string, string> {
    const headers: Record<string, string> = {
      'X-RateLimit-Limit': String(this.config.capacity),
      'X-RateLimit-Remaining': String(remaining),
    };
    if (retryAfterMs !== undefined && retryAfterMs > 0) {
      headers['Retry-After'] = String(Math.ceil(retryAfterMs / 1000));
    }
    return headers;
  }

  /**
   * Lua source of the atomic token-bucket step.
   * KEYS[1] is the bucket key; ARGV is capacity, refill rate (tokens/s), now (ms), requested tokens.
   * The script returns `{allowed, remaining, retry_after}` with `retry_after` in milliseconds.
   */
  private getLuaScriptContent(): string {
    return `
local key = KEYS[1]
local capacity = tonumber(ARGV[1])
local refill_rate = tonumber(ARGV[2])
local now = tonumber(ARGV[3])
local requested = tonumber(ARGV[4])
local bucket = redis.call('HMGET', key, 'tokens', 'last_refill')
local tokens = tonumber(bucket[1]) or capacity
local last_refill = tonumber(bucket[2]) or now
local elapsed = math.max(0, now - last_refill)
local new_tokens = math.min(capacity, tokens + (elapsed * refill_rate / 1000))
local allowed = 0
local remaining = 0
if new_tokens >= requested then
new_tokens = new_tokens - requested
allowed = 1
remaining = math.floor(new_tokens)
else
remaining = math.floor(new_tokens)
end
redis.call('HMSET', key, 'tokens', new_tokens, 'last_refill', now)
redis.call('EXPIRE', key, math.ceil(capacity / refill_rate) * 2 + 10)
local retry_after = 0
if allowed == 0 then
retry_after = math.ceil((requested - new_tokens) * 1000 / refill_rate)
end
return {allowed, remaining, retry_after}
`;
  }
}
Architecture Rationale
- Lua Atomicity: The Lua script ensures that the read-calculate-write cycle happens in a single server-side operation. This eliminates race conditions without requiring distributed locks, which are expensive.
- LRU Cache: The `lru-cache` library provides a memory-efficient L1 layer. By caching results, we reduce the frequency of Redis calls for rapid-fire requests from the same client, though the primary scaling win comes from the async write-back pattern in high-throughput variants.
- Fail-Open Strategy: The `catch` block defaults to allowing traffic. In high-scale systems, availability often trumps strict enforcement. If Redis goes down, blocking all traffic is usually worse than allowing potential abuse. This should be configurable based on the API's risk profile.
- Header Propagation: Returning standard headers (`X-RateLimit-*`) allows clients to implement backoff logic, reducing unnecessary load on the server.
Pitfall Guide
1. The Thundering Herd on the Limiter
Mistake: Every request hits Redis synchronously. Under a DDoS, the rate limiter cluster becomes the bottleneck, causing latency spikes that cascade to the application.
Fix: Implement L1 caching and async updates. Use local token buckets that only sync to Redis periodically or on state change.
2. Clock Skew in Distributed Systems
Mistake: Relying on Date.now() in application code for window calculations. Different nodes may have skewed clocks, allowing users to bypass limits by routing to specific nodes.
Fix: Always use server-side time or Redis TIME command for window calculations. Never trust client timestamps.
3. Boundary Spikes in Fixed Windows
Mistake: Using a fixed window counter allows a user to send `limit` requests at the end of window A and another `limit` requests at the start of window B, effectively doubling the rate across the boundary.
Fix: Use Sliding Window Log or Sliding Window Counter algorithms. Alternatively, use Token Bucket which has no boundaries.
4. Ignoring Multi-Tenancy and Hierarchy
Mistake: Implementing a single global rate limit per API key without considering tenant-level or IP-level aggregation.
Fix: Design a hierarchy of limits. Enforce per-key, per-tenant, and per-IP limits simultaneously. A request must pass all checks.
5. Blocking Thread on Redis Latency
Mistake: Using synchronous blocking calls or not setting timeouts. If Redis latency spikes to 500ms, the request thread hangs, exhausting the thread pool.
Fix: Always set aggressive timeouts on Redis calls (e.g., 5-10ms). If the timeout hits, trigger the fallback policy immediately.
6. Leaky Bucket vs. Token Bucket Confusion
Mistake: Implementing Leaky Bucket when Token Bucket is required. Leaky bucket smooths output but doesn't allow bursts, which can degrade user experience for legitimate traffic spikes.
Fix: Choose Token Bucket for APIs that should allow controlled bursts. Choose Leaky Bucket for smoothing traffic to downstream services.
7. Storage Explosion
Mistake: Storing a log of every request for Sliding Window Log. This consumes massive memory and degrades performance.
Fix: Use Sliding Window Counter (approximate) or Token Bucket. If exact counting is needed, use Redis Sorted Sets but implement aggressive cleanup and TTLs.
Production Bundle
Action Checklist
Decision Matrix
| Scenario | Recommended Approach | Why | Cost Impact |
|---|---|---|---|
| Public API with Abuse Risk | Hybrid (L1 + Redis Sliding Window) | Balances accuracy and performance; prevents boundary attacks. | Medium (Redis + Cache infra) |
| Internal Microservice Mesh | Local In-Memory Token Bucket | Latency is critical; global fairness less important. | Low (No external deps) |
| High-Frequency Trading / IoT | Edge-Node Local Counters + Batch Sync | Massive throughput; eventual consistency acceptable. | Low |
| Strict Compliance (PCI/HIPAA) | Centralized Redis with Sync Check | Auditability and strict enforcement override latency concerns. | High |
| Cost-Sensitive Startup | Nginx/Envoy Rate Limiting | Offload to proxy layer; zero app overhead. | Low (Proxy resources) |
Configuration Template
Use this YAML structure for a centralized rate limiting configuration service.
# Centralized rate limiter configuration.
# NOTE(review): the nesting below was reconstructed from a flattened listing. In
# particular, confirm whether `fallback_policy` belongs at this level or under `storage`.
rate_limiter:
  algorithm: token_bucket
  storage:
    type: redis_cluster
    endpoints:
      - redis-node-1:6379
      - redis-node-2:6379
    timeout_ms: 5
    max_retries: 1
  fallback_policy: allow # or deny
  # One bucket definition per plan.
  # NOTE(review): how `burst` relates to `capacity` is not defined here; confirm with the consumer.
  tiers:
    free:
      capacity: 100
      refill_rate: 10 # tokens/sec
      burst: 20
    pro:
      capacity: 1000
      refill_rate: 100
      burst: 200
    enterprise:
      capacity: 10000
      refill_rate: 1000
      burst: 5000
  headers:
    enabled: true
    include_retry_after: true
  monitoring:
    metrics_prefix: api.ratelimit
    alert_threshold:
      hit_rate: 0.8 # Alert if 80% of requests are rate limited
      latency_p99_ms: 10
Quick Start Guide
- Deploy Redis: Provision a Redis Cluster (e.g., AWS ElastiCache or self-hosted Cluster). Ensure network latency to app servers is <1ms.
- Install Dependencies:
npm install ioredis lru-cache
- Initialize Limiter: Instantiate the `HybridRateLimiter` class with your Redis client and configuration. The constructor loads the Lua script via `defineCommand`.
- Integrate Middleware: Add the limiter check to your request pipeline (e.g., Express middleware, Koa middleware, or Gateway filter).
// Rate-limit middleware: checks the limiter, copies its headers onto the response,
// and answers 429 when the request is denied.
app.use(async (req, res, next) => {
  try {
    // `req.user` may be absent (e.g. unauthenticated requests), so guard the lookup
    // instead of throwing. Requests with no usable identity share one 'anonymous' bucket.
    const key = req.ip || req.user?.id || 'anonymous';
    const result = await limiter.isAllowed(key);
    for (const [name, value] of Object.entries(result.headers)) {
      res.setHeader(name, value);
    }
    if (!result.allowed) {
      return res.status(429).json({ error: 'Too Many Requests' });
    }
    next();
  } catch (err) {
    // Hand unexpected failures to the error handler rather than leaving a rejected
    // promise from this async middleware unhandled.
    next(err);
  }
});
- Verify: Send requests and check response headers. Use a tool like `wrk` or `k6` to simulate load and verify that p99 latency remains stable and limits are enforced.