, Promise<unknown>>;
/**
 * Shares one pending call between concurrent callers that use the same key.
 *
 * While a call registered under a key has not settled, further `execute`
 * calls with that key receive the already-pending promise and their own `fn`
 * is never invoked. The key is released as soon as the call settles, whether
 * it fulfilled or rejected, so the next `execute` starts a fresh call.
 */
class RequestCoalescer {
  private inFlight: InFlightCache = new Map();

  /**
   * Runs `fn`, or joins the pending call already registered under `key`.
   *
   * @param key Identifier under which concurrent calls are merged.
   * @param fn  Factory for the underlying call; skipped when a call is pending.
   * @returns The result of the pending (or newly started) call.
   */
  async execute<T>(key: RequestKey, fn: () => Promise<T>): Promise<T> {
    const pending = this.inFlight.get(key);
    if (pending !== undefined) {
      // The map is typed as Promise<unknown>; the caller's T is taken on trust.
      return pending as Promise<T>;
    }

    const tracked = fn().finally(() => {
      this.inFlight.delete(key);
    });
    this.inFlight.set(key, tracked);
    return tracked;
  }
}
### Step 2: Response Caching with Validation
Static caching fails with dynamic APIs. Implement cache-control parsing and conditional requests using `ETag` and `Last-Modified` headers to validate freshness without re-downloading payloads.
```typescript
/**
 * One stored response together with the metadata ValidatedCache records for it.
 */
interface CacheEntry {
// Cached payload, stored exactly as passed to ValidatedCache.set; typed unknown, so readers must narrow it.
data: unknown;
// Value of the `etag` header at insertion time; undefined when the header was absent.
etag?: string;
// Value of the `last-modified` header at insertion time; undefined when the header was absent.
lastModified?: string;
// Epoch-millisecond deadline, computed as Date.now() + ttlMs on insertion.
// ValidatedCache.get drops the entry once Date.now() is greater than this value.
expiresAt: number;
}
/**
 * In-memory, string-keyed store of responses with a per-entry expiry time.
 *
 * Each entry also keeps the `etag` and `last-modified` header values seen at
 * insertion. This class only stores them; it does not issue conditional
 * requests itself.
 */
class ValidatedCache {
  private store = new Map<string, CacheEntry>();

  /**
   * Looks up an entry that has not yet expired.
   *
   * @param key Cache key.
   * @returns The stored entry, or `undefined` when the key is unknown or the
   *          entry's `expiresAt` has passed (the entry is removed in that case).
   */
  get(key: string): CacheEntry | undefined {
    const cached = this.store.get(key);
    if (cached && !(Date.now() > cached.expiresAt)) {
      return cached;
    }
    // Missing or expired: make sure nothing stale remains under this key.
    this.store.delete(key);
    return undefined;
  }

  /**
   * Stores `data` under `key`, replacing any previous entry.
   *
   * @param key     Cache key.
   * @param data    Payload to keep.
   * @param headers Response headers; `etag` and `last-modified` are copied if present.
   * @param ttlMs   Lifetime in milliseconds, counted from now.
   */
  set(key: string, data: unknown, headers: Headers, ttlMs: number) {
    const record: CacheEntry = {
      data,
      etag: headers.get('etag') ?? undefined,
      lastModified: headers.get('last-modified') ?? undefined,
      expiresAt: Date.now() + ttlMs,
    };
    this.store.set(key, record);
  }
}
```

### Step 3: Adaptive Batching & Rate-Aware Queuing
Fixed-size batching causes latency spikes during low traffic. Adaptive batching groups requests within a time window, dynamically adjusting size based on throughput and provider rate limits.
/**
 * Groups individual requests and dispatches them to the provider as one batch.
 *
 * A batch is sent when either `maxBatchSize` requests are waiting or
 * `flushInterval` milliseconds have passed since the first request of the
 * current batch was queued, whichever comes first. Both limits are fixed at
 * construction; this class does not itself react to throughput or provider
 * rate limits.
 *
 * Subclasses supply the provider call by overriding `executeBatch`.
 *
 * @typeParam T Parameters of a single request.
 * @typeParam R Result of a single request.
 */
class AdaptiveBatcher<T, R> {
  private queue: Array<{ params: T; resolve: (r: R) => void; reject: (e: Error) => void }> = [];
  // ReturnType<typeof setTimeout> type-checks under both Node and DOM typings.
  private flushTimer: ReturnType<typeof setTimeout> | null = null;
  private readonly flushInterval: number;
  private readonly maxBatchSize: number;

  /**
   * @param flushInterval Maximum time, in milliseconds, a request waits before its batch is sent.
   * @param maxBatchSize  Number of queued requests that triggers an immediate send.
   */
  constructor(flushInterval: number, maxBatchSize: number) {
    this.flushInterval = flushInterval;
    this.maxBatchSize = maxBatchSize;
  }

  /**
   * Queues one request.
   *
   * @param params Parameters for this request.
   * @returns A promise settled with this request's own result. It rejects with
   *          an `Error` when the batch call fails or returns a result list
   *          whose length differs from the batch.
   */
  enqueue(params: T): Promise<R> {
    return new Promise<R>((resolve, reject) => {
      this.queue.push({ params, resolve, reject });
      if (this.queue.length >= this.maxBatchSize) {
        // flush() reports failures through each item's reject, never by rejecting itself.
        void this.flush();
      } else if (this.flushTimer === null) {
        this.flushTimer = setTimeout(() => void this.flush(), this.flushInterval);
      }
    });
  }

  /** Sends everything currently queued as a single batch and settles each waiting promise. */
  private async flush(): Promise<void> {
    if (this.flushTimer !== null) clearTimeout(this.flushTimer);
    this.flushTimer = null;

    // Detach the batch synchronously so requests queued during the await start a new one.
    const batch = this.queue.splice(0, this.queue.length);
    if (batch.length === 0) return;

    try {
      const results = await this.executeBatch(batch.map((item) => item.params));
      if (results.length !== batch.length) {
        // Results are matched to requests by position; a different count means
        // some caller would be handed `undefined` (or a misaligned result).
        throw new Error(
          `executeBatch returned ${results.length} results for ${batch.length} requests`
        );
      }
      // Index is in range: lengths were checked just above.
      batch.forEach((item, i) => item.resolve(results[i] as R));
    } catch (err: unknown) {
      // Callers are promised an Error; wrap anything else that was thrown.
      const error = err instanceof Error ? err : new Error(String(err));
      for (const item of batch) item.reject(error);
    }
  }

  /**
   * Performs the provider call for one batch. Must be overridden.
   *
   * @param params Request parameters, in queue order.
   * @returns One result per entry of `params`, in the same order.
   * @throws Error Always, in this base implementation.
   */
  protected async executeBatch(params: T[]): Promise<R[]> {
    // Provider-specific batch API call
    throw new Error('Implement provider batch execution');
  }
}
### Step 4: Payload Compression & Field Selection
Compressing responses reduces egress costs and latency, but only when the payload is compressible. Combine with GraphQL-like field selection or provider-specific fields parameters to minimize payload size before compression.
import { gzip, gunzip } from 'zlib';
import { promisify } from 'util';
const gzipAsync = promisify(gzip);
const gunzipAsync = promisify(gunzip);

/**
 * Serializes `data` as JSON and gzips it when that makes the payload smaller.
 *
 * The result is either a gzip stream or the plain UTF-8 JSON bytes;
 * `decompressPayload` accepts both forms.
 *
 * @param data Any JSON-serializable value.
 * @returns The smaller of the gzip-compressed and the raw UTF-8 encoding.
 * @throws TypeError If `data` has no JSON representation (e.g. `undefined` or a function).
 */
async function compressPayload(data: unknown): Promise<Buffer> {
  const json: string | undefined = JSON.stringify(data);
  if (json === undefined) {
    throw new TypeError('compressPayload: value is not JSON-serializable');
  }

  // Compare byte counts on both sides: string length counts UTF-16 code units,
  // which undercounts multi-byte text and would discard worthwhile compression.
  const raw = Buffer.from(json, 'utf8');
  const compressed = await gzipAsync(raw);
  return compressed.length < raw.length ? compressed : raw;
}

/**
 * Reverses `compressPayload`.
 *
 * @param buffer Output of `compressPayload`: a gzip stream or raw UTF-8 JSON.
 * @returns The parsed JSON value.
 * @throws If the gzip stream is corrupt or the text is not valid JSON.
 */
async function decompressPayload(buffer: Buffer): Promise<unknown> {
  // A gzip stream always starts with bytes 0x1f 0x8b. JSON text cannot start
  // with 0x1f (a control character), so this check is unambiguous.
  const isGzip = buffer.length >= 2 && buffer[0] === 0x1f && buffer[1] === 0x8b;
  const jsonBytes = isGzip ? await gunzipAsync(buffer) : buffer;
  return JSON.parse(jsonBytes.toString('utf8'));
}
### Step 5: Architecture Decisions & Rationale
- Application-layer middleware vs. API Gateway: Implement optimization at the application layer when business logic dictates caching rules, field selection, or fallback behavior. Use API gateways only for universal rate limiting and TLS termination. Application-layer control enables per-endpoint cost strategies.
- LRU cache vs. Redis: For single-node or containerized deployments, in-memory LRU caching eliminates network hops and Redis serialization overhead. Distribute cache state via Redis only when horizontal scaling demands shared state.
- Adaptive vs. Fixed Batching: Fixed batching introduces artificial latency during low traffic. Adaptive batching respects provider rate limits while maintaining responsiveness, critical for user-facing APIs.
- Observability Hooks: Instrument every optimization layer with metrics (cache hit rate, batch size distribution, compression ratio, cost per call). Without measurement, optimization is guesswork.
Pitfall Guide
- Caching without validation headers: Storing responses indefinitely causes stale data delivery and compliance violations. Always parse `Cache-Control`, `ETag`, and `Last-Modified`. Use conditional `If-None-Match` requests to validate freshness before returning cached data.
- Aggressive compression on incompressible payloads: Compressing already-compressed media, encrypted payloads, or small JSON objects increases CPU usage while yielding negligible bandwidth savings. Implement a size threshold (e.g., >1KB) and a compression ratio check before storing compressed data.
- Blind exponential backoff with cost multiplier: Retry logic that ignores cost-per-call creates retry storms during partial outages. Implement circuit breakers that track failure rates and cost exposure, then degrade gracefully instead of retrying indefinitely.
- Optimizing only high-frequency endpoints: Long-tail APIs often carry premium pricing or data egress fees. A rarely called geocoding endpoint charging $0.05 per request can outspend a high-frequency public API charging $0.001. Audit all outbound calls, not just top traffic routes.
- Ignoring data transfer pricing tiers: Compression reduces payload size, but cloud providers charge by egress volume with tiered pricing. Moving from $0.12/GB to $0.08/GB requires crossing volume thresholds. Track cumulative egress and align optimization targets with pricing tier breakpoints.
- Over-relying on client-side caching: Client caches break in serverless, containerized, or multi-tenant environments where requests originate from different nodes. Implement server-side or edge caching for shared state, reserving client caching only for user-specific, non-sensitive data.
- Missing cost observability: Without per-endpoint cost tracking, optimization efforts cannot be validated. Instrument OpenTelemetry or custom metrics to record request count, payload size, cache hits, and estimated cost per call. Correlate these with business metrics to calculate true ROI.
Production Bundle
Action Checklist
Decision Matrix
| Scenario | Recommended Approach | Why | Cost Impact |
|---|---|---|---|
| High-volume read APIs (public data, static catalogs) | Edge caching + aggressive TTL | Redundant calls dominate spend; data changes infrequently | 60–75% reduction |
| Real-time write APIs (payments, transactions) | Request deduplication + idempotency keys | Writes cannot be cached; duplicate submissions cause double billing | 30–45% reduction |
| Third-party AI/ML APIs (LLM inference, embeddings) | Prompt caching + response streaming + batch embeddings | Token pricing scales non-linearly; caching repeated prompts yields compounding savings | 50–80% reduction |
| Data-heavy geospatial/telemetry APIs | Field selection + Brotli compression + adaptive batching | Large payloads drive egress costs; compression + batching optimizes transfer efficiency | 40–65% reduction |
Configuration Template
// api-cost-optimizer.config.ts
/**
 * Settings for the optimization layers this guide describes: caching, compression,
 * batching, coalescing, circuit breaking, and observability.
 *
 * NOTE(review): nothing in the visible part of this document reads these fields, so the
 * per-field notes below are inferred from the names and the surrounding prose. Confirm
 * them against the middleware that consumes this config.
 */
export interface ApiOptimizerConfig {
// Response-cache settings.
cache: {
enabled: boolean;
// Presumably the entry lifetime, in milliseconds, used when a call site gives none — TODO confirm.
defaultTtlMs: number;
// Presumably toggles ETag-based conditional revalidation — TODO confirm.
validateWithEtag: boolean;
// Presumably an upper bound on stored entries; eviction policy is not shown here.
maxEntries: number;
};
// Payload-compression settings.
compression: {
enabled: boolean;
// Presumably payloads smaller than this many bytes are left uncompressed — TODO confirm.
minSizeBytes: number;
algorithm: 'gzip' | 'brotli' | 'auto';
// NOTE(review): direction of the ratio (compressed/original vs. original/compressed) is not defined here — confirm.
minCompressionRatio: number;
};
// Request-batching settings; names mirror the AdaptiveBatcher constructor arguments.
batching: {
enabled: boolean;
flushIntervalMs: number;
maxBatchSize: number;
// NOTE(review): no rate-limit handling is visible in the batching code in this document — confirm what this enables.
rateLimitAware: boolean;
};
// In-flight request coalescing settings.
coalescing: {
enabled: boolean;
// NOTE(review): the visible RequestCoalescer has no TTL; confirm how this value is used.
ttlMs: number;
};
// Circuit-breaker settings.
circuitBreaker: {
enabled: boolean;
// Presumably the number of failures that opens the breaker — TODO confirm window/semantics.
failureThreshold: number;
costExposureLimitPerMinute: number; // in USD
// Presumably the time, in milliseconds, before a tripped breaker is retried — TODO confirm.
resetTimeoutMs: number;
};
// Metric-collection switches.
observability: {
trackCostPerCall: boolean;
trackCacheHitRate: boolean;
trackCompressionRatio: boolean;
// Presumably prepended to emitted metric names (the default is 'api.cost') — TODO confirm separator.
metricsPrefix: string;
};
}
/**
 * Baseline settings with every optimization layer switched on.
 * Durations are in milliseconds; the cost limit is in USD.
 */
export const defaultConfig: ApiOptimizerConfig = {
  cache: {
    enabled: true,
    defaultTtlMs: 5 * 60 * 1000, // 5 minutes
    validateWithEtag: true,
    maxEntries: 10000,
  },
  compression: {
    enabled: true,
    minSizeBytes: 1024, // 1 KiB
    algorithm: 'auto',
    minCompressionRatio: 0.85,
  },
  batching: {
    enabled: true,
    flushIntervalMs: 50,
    maxBatchSize: 50,
    rateLimitAware: true,
  },
  coalescing: {
    enabled: true,
    ttlMs: 2 * 1000, // 2 seconds
  },
  circuitBreaker: {
    enabled: true,
    failureThreshold: 5,
    costExposureLimitPerMinute: 1.5,
    resetTimeoutMs: 30 * 1000, // 30 seconds
  },
  observability: {
    trackCostPerCall: true,
    trackCacheHitRate: true,
    trackCompressionRatio: true,
    metricsPrefix: 'api.cost',
  },
};
Quick Start Guide
- Install dependencies: `npm install @opentelemetry/api @opentelemetry/sdk-metrics zod`
- Create the optimizer wrapper: Copy the `ApiOptimizerConfig` template and implement the middleware class using the coalescing, caching, batching, and compression modules provided in the Core Solution.
- Instrument your fetch/axios layer: Replace direct API calls with the optimizer wrapper. Pass endpoint-specific TTLs and field selection parameters. Enable metrics collection for `api.cost.requests`, `api.cost.cache_hits`, and `api.cost.bytes_compressed`.
- Validate in staging: Run load tests comparing baseline vs. optimized traffic. Verify cache hit rates, batch distribution, and compression ratios. Adjust `flushIntervalMs` and `maxBatchSize` to match provider rate limits.
- Deploy with feature flags: Roll out optimization behind a toggle. Monitor cost-per-call metrics in production. Gradually increase coverage to high-spend endpoints. Disable circuit breaker cost limits until baseline exposure is established.