e<void> {
this.remoteSync = setInterval(async () => {
try {
const response = await fetch(this.syncEndpoint);
const updates: ReleaseConfig[] = await response.json();
updates.forEach(config => this.configs.set(config.featureId, config));
this.emit('configUpdated', updates);
} catch (error) {
console.error('[ReleaseManager] Sync failed:', error);
}
}, 5000);
}
/**
 * Reports whether a feature is switched on, optionally for one user.
 *
 * - A feature with no stored config, or whose config is not `enabled`, is off.
 * - When no (or an empty) `userId` is supplied, an enabled feature is on and
 *   the rollout percentage is not consulted.
 * - Otherwise the user's hash is reduced to a bucket in 0..99 and the feature
 *   is on when that bucket is strictly below `rolloutPercentage`.
 *
 * @param featureId Key used to look up the feature's config.
 * @param userId Optional user identifier used for percentage bucketing.
 * @returns `true` when the feature should be treated as active.
 */
public isEnabled(featureId: string, userId?: string): boolean {
  const feature = this.configs.get(featureId);

  if (feature && feature.enabled) {
    if (userId) {
      // Same user id always lands in the same bucket, so the answer is stable.
      const bucket = this.hashCode(userId) % 100;
      return bucket < feature.rolloutPercentage;
    }
    return true;
  }

  return false;
}
/**
 * Computes a deterministic, non-negative integer hash of a string.
 *
 * Each UTF-16 code unit is folded in as `acc * 31 + unit`, written with a
 * shift (`(acc << 5) - acc`), and the accumulator is truncated to a signed
 * 32-bit integer after every step.
 *
 * @param str Text to hash; the empty string hashes to 0.
 * @returns The absolute value of the final 32-bit accumulator.
 */
private hashCode(str: string): number {
  let acc = 0;
  let idx = 0;

  while (idx < str.length) {
    const shifted = acc << 5;
    // `| 0` wraps the running value back into the signed 32-bit range.
    acc = (shifted - acc + str.charCodeAt(idx)) | 0;
    idx += 1;
  }

  return Math.abs(acc);
}
/**
 * Stops the periodic remote sync, if one is currently scheduled.
 *
 * Does nothing when `remoteSync` holds no timer handle.
 */
public destroy(): void {
  const timer = this.remoteSync;
  if (!timer) {
    return;
  }
  clearInterval(timer);
}
}
### Step 2: Implement Graceful Shutdown and Connection Draining
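Kubernetes sends `SIGTERM` to a pod's containers when it terminates the pod, and follows up with `SIGKILL` once the termination grace period (30 seconds by default) expires. Applications must stop accepting new requests, drain active connections, and close database pools before that deadline.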
```typescript
// graceful-shutdown.ts
import { Server } from 'http';
import { Pool } from 'pg';
/**
 * Coordinates an orderly process shutdown on `SIGTERM` / `SIGINT`.
 *
 * Sequence: stop accepting new HTTP connections, wait for in-flight requests
 * to finish (bounded by `drainTimeoutMs`), close the database pool, then exit.
 *
 * The wait ends as soon as the server reports it has closed; the timeout is
 * only an upper bound. Keep `drainTimeoutMs` below the pod's
 * `terminationGracePeriodSeconds`, otherwise the process can be killed before
 * the database pool is closed.
 */
export class GracefulLifecycle {
  private readonly server: Server;
  private readonly dbPool: Pool;
  private readonly drainTimeoutMs: number;
  private isShuttingDown = false;

  /**
   * @param server HTTP server to stop and drain.
   * @param dbPool Database pool to close once the server has drained.
   * @param drainTimeoutMs Maximum time to wait for active requests, in
   *   milliseconds. Defaults to 30000.
   */
  constructor(server: Server, dbPool: Pool, drainTimeoutMs = 30000) {
    this.server = server;
    this.dbPool = dbPool;
    this.drainTimeoutMs = drainTimeoutMs;
    this.registerSignals();
  }

  /** Hooks the termination signals up to {@link shutdown}. */
  private registerSignals(): void {
    // `shutdown` handles its own errors, so the promise is intentionally not awaited.
    process.on('SIGTERM', () => {
      void this.shutdown('SIGTERM');
    });
    process.on('SIGINT', () => {
      void this.shutdown('SIGINT');
    });
  }

  /**
   * Runs the shutdown sequence once; repeated calls are ignored.
   *
   * Exits with code 0 on success and 1 when any step fails, so a failed
   * cleanup never leaves the process hanging until it is force-killed.
   *
   * @param signal Name of the trigger, used only for logging.
   */
  public async shutdown(signal: string): Promise<void> {
    if (this.isShuttingDown) return;
    this.isShuttingDown = true;
    console.log(`[${signal}] Initiating graceful shutdown...`);

    let exitCode = 0;
    try {
      const drained = await this.drainServer();
      if (!drained) {
        console.warn(
          `[Shutdown] Active requests did not finish within ${this.drainTimeoutMs}ms; continuing`,
        );
      }

      // Close database connections
      await this.dbPool.end();
      console.log('[Shutdown] Database pool drained');
    } catch (error: unknown) {
      exitCode = 1;
      console.error('[Shutdown] Graceful shutdown failed:', error);
    }

    process.exit(exitCode);
  }

  /**
   * Stops accepting new connections and waits for the server to close.
   *
   * @returns `true` when the server closed before the timeout, `false` when
   *   the timeout elapsed first.
   */
  private drainServer(): Promise<boolean> {
    return new Promise<boolean>((resolve) => {
      const timer = setTimeout(() => resolve(false), this.drainTimeoutMs);

      this.server.close((err?: Error) => {
        // Clear the timer so it cannot keep the event loop alive after draining.
        clearTimeout(timer);
        if (err) {
          // Typically "server is not running": nothing is left to drain.
          console.warn('[Shutdown] HTTP server close reported:', err.message);
        } else {
          console.log('[Shutdown] HTTP server closed');
        }
        resolve(true);
      });
    });
  }
}
```

### Step 3: Separate Liveness and Readiness Probes

Kubernetes routing relies on readiness probes to determine traffic eligibility. Liveness probes must not overlap with readiness logic to prevent restart loops during transient failures.

```typescript
// health-endpoints.ts
import { FastifyInstance } from 'fastify';
/**
 * Registers the liveness and readiness endpoints on a Fastify instance.
 *
 * - `GET /health/live` always answers with `status: 'alive'`; it checks no
 *   dependencies.
 * - `GET /health/ready` runs `SELECT 1` against `app.dbPool` and pings
 *   `app.redisClient`. It answers `status: 'ready'` only when the query
 *   returns one row and the ping replies `PONG`; any other outcome, including
 *   a check that throws, is logged and turned into a 503 via `app.httpErrors`.
 *
 * @param app Fastify instance; expected to be decorated with `dbPool`,
 *   `redisClient` and `httpErrors` before requests arrive.
 */
export function registerHealthRoutes(app: FastifyInstance): void {
  // Liveness: checks if process is alive
  app.get('/health/live', async () => {
    return { status: 'alive', timestamp: new Date().toISOString() };
  });

  // Readiness: checks if dependencies are healthy
  app.get('/health/ready', async () => {
    let isReady = false;
    let failure: unknown;

    try {
      const dbHealthy = await app.dbPool.query('SELECT 1');
      const cacheHealthy = await app.redisClient.ping();
      isReady = dbHealthy.rowCount === 1 && cacheHealthy === 'PONG';
    } catch (error: unknown) {
      // An unreachable dependency means "not ready", not an internal error.
      failure = error;
    }

    if (!isReady) {
      // The logger lives on the Fastify instance; `app.server` is the raw HTTP server.
      if (failure === undefined) {
        app.log.warn('[Readiness] Dependency check failed');
      } else {
        app.log.warn({ err: failure }, '[Readiness] Dependency check failed');
      }
      throw app.httpErrors.serviceUnavailable('Dependencies unhealthy');
    }

    return { status: 'ready', timestamp: new Date().toISOString() };
  });
}
```

### Step 4: Enforce Expand/Contract Database Migration Pattern

Zero-downtime schema changes require a two-phase migration strategy. Phase 1 expands the schema without breaking existing code. Phase 2 contracts legacy columns after all instances run the new version.

```sql
-- Phase 1: Expand (Deploy with v1.0)
-- Safe to re-run: the column is only added when missing, and the backfill
-- only touches rows whose copy differs from the legacy value.
ALTER TABLE users ADD COLUMN IF NOT EXISTS email_verified_v2 BOOLEAN DEFAULT FALSE;
UPDATE users
SET email_verified_v2 = email_verified
WHERE email_verified_v2 IS DISTINCT FROM email_verified;

-- Phase 2: Contract (Deploy after v1.1 rollout)
-- Guarded on the presence of email_verified_v2: once the rename has happened,
-- a second run must not drop the (now renamed) live email_verified column.
-- The DO block executes as a single statement, so drop and rename apply together.
DO $$
BEGIN
  IF EXISTS (
    SELECT 1
    FROM information_schema.columns
    WHERE table_schema = current_schema()
      AND table_name = 'users'
      AND column_name = 'email_verified_v2'
  ) THEN
    ALTER TABLE users DROP COLUMN IF EXISTS email_verified;
    ALTER TABLE users RENAME COLUMN email_verified_v2 TO email_verified;
  END IF;
END
$$;
```

### Architecture Decisions and Rationale
The solution prioritizes stateless application design, explicit health boundaries, and contract-compatible migrations. Traffic shifting is delegated to the infrastructure layer (service mesh or load balancer) to avoid application-level routing complexity. Feature toggles decouple deployment velocity from release timing, enabling rollback without redeployment. Database migrations follow the expand/contract pattern to eliminate schema locks and prevent write failures during version transitions. Readiness probes gate traffic entry, ensuring new pods only receive requests after dependency validation. Graceful shutdown handlers prevent request drops during pod termination, aligning Kubernetes lifecycle events with application state.
Pitfall Guide
1. Non-Idempotent Database Migrations
Running migrations twice during failed rollbacks corrupts data or creates duplicate constraints. Migrations must be idempotent by design. Use conditional column additions, IF NOT EXISTS clauses, and version tracking tables. Never assume linear execution order in distributed environments.
2. Session Affinity Breaking During Traffic Shift
Stateful applications storing sessions in-memory lose user context when traffic routes to new pods. Replace in-memory sessions with externalized stores (Redis, DynamoDB) or enable sticky sessions at the load balancer level. Validate session serialization compatibility across versions.
3. Health Check Misconfiguration
Overly aggressive readiness probes cause traffic oscillation. Overly lenient probes route requests to degraded instances. Configure readiness checks to validate critical dependencies only. Set appropriate initialDelaySeconds to account for cold starts. Use separate endpoints for liveness and readiness to prevent restart loops.
4. Ignoring Downstream Cache Invalidation
New code versions often change response schemas or caching keys. Old caches return stale or incompatible data, causing serialization errors. Implement cache versioning, TTL alignment with deployment windows, and explicit invalidation triggers during rollout. Monitor cache hit rate degradation as a leading indicator of rollout failure.
5. Feature Flag Explosion Without Lifecycle Management
Unmanaged toggles accumulate technical debt, increase conditional complexity, and degrade performance. Enforce flag expiration dates, automate cleanup via CI pipelines, and limit active toggles per service. Use flag analytics to track usage and remove dead code paths within two release cycles.
6. Cold Start Latency in Containerized Environments
JIT compilation, dependency loading, and connection pool initialization cause 5-15 second latency spikes during pod startup. Pre-warm containers, use AOT compilation where applicable, and implement connection pool sharing via sidecars. Configure horizontal pod autoscaler to maintain warm capacity during deployments.
7. Inadequate Rollback Automation
Manual rollback decisions delay recovery and increase user impact. Automate rollback triggers using error rate thresholds, latency percentiles, and business metric degradation. Ensure rollback paths are idempotent and tested in staging. Never rely on manual intervention for production traffic reversal.
Production Bundle
Action Checklist
Decision Matrix
| Scenario | Recommended Approach | Why | Cost Impact |
|---|---|---|---|
| Monolithic legacy app with tight DB coupling | Blue-Green | Guarantees instant rollback, isolates schema migration risks, requires minimal code changes | High (2x compute during transition) |
| Microservices with independent release cycles | Canary + Feature Toggles | Minimizes blast radius, enables gradual validation, supports logical release control independent of deployment | Medium (1.3x compute + observability overhead) |
| High-traffic e-commerce checkout flow | Feature Toggles | Zero infrastructure overhead, instant logical rollback, prevents user-facing errors during schema transitions | Low (1.1x compute, flag management cost) |
| Stateful workloads with session affinity | Blue-Green with Sticky Sessions | Preserves user context, avoids session migration complexity, maintains consistent routing during validation | High (2x compute + load balancer licensing) |
| AI/ML model serving with cold start latency | Canary with Pre-warming | Validates model inference accuracy gradually, mitigates cold start impact, enables A/B testing against baseline | Medium (1.4x compute + GPU pre-warm cost) |
### Configuration Template

```yaml
# kubernetes-canary-deployment.yaml
# Stable track: three replicas of the current release (1.2.0).
apiVersion: apps/v1
kind: Deployment
metadata:
  name: api-service-stable
spec:
  replicas: 3
  selector:
    matchLabels:
      app: api-service
      track: stable
  template:
    metadata:
      labels:
        app: api-service
        track: stable
    spec:
      containers:
        - name: api
          image: registry/api-service:1.2.0
          ports:
            - containerPort: 3000
          # Readiness gates traffic: three consecutive failures remove the pod from rotation.
          readinessProbe:
            httpGet:
              path: /health/ready
              port: 3000
            initialDelaySeconds: 10
            periodSeconds: 5
            failureThreshold: 3
          # Liveness uses its own endpoint so dependency outages do not trigger restarts.
          livenessProbe:
            httpGet:
              path: /health/live
              port: 3000
            initialDelaySeconds: 5
            periodSeconds: 15
---
# Canary track: a single replica of the candidate release (1.3.0).
apiVersion: apps/v1
kind: Deployment
metadata:
  name: api-service-canary
spec:
  replicas: 1
  selector:
    matchLabels:
      app: api-service
      track: canary
  template:
    metadata:
      labels:
        app: api-service
        track: canary
    spec:
      containers:
        - name: api
          image: registry/api-service:1.3.0
          ports:
            - containerPort: 3000
          readinessProbe:
            httpGet:
              path: /health/ready
              port: 3000
            initialDelaySeconds: 10
            periodSeconds: 5
            failureThreshold: 3
          # Same liveness probe as the stable track, so a hung canary is
          # restarted instead of lingering and skewing rollout metrics.
          livenessProbe:
            httpGet:
              path: /health/live
              port: 3000
            initialDelaySeconds: 5
            periodSeconds: 15
---
# Selects pods from both tracks (no `track` label in the selector).
apiVersion: v1
kind: Service
metadata:
  name: api-service
spec:
  selector:
    app: api-service
  ports:
    - protocol: TCP
      port: 80
      targetPort: 3000
---
# istio-virtual-service.yaml
# Weighted split between the two subsets; the weights must add up to 100.
apiVersion: networking.istio.io/v1alpha3
kind: VirtualService
metadata:
  name: api-service-routing
spec:
  hosts:
    - api-service
  http:
    - route:
        - destination:
            host: api-service
            subset: stable
          weight: 90
        - destination:
            host: api-service
            subset: canary
          weight: 10
---
# Maps each subset name used by the VirtualService to pods via their `track` label.
apiVersion: networking.istio.io/v1alpha3
kind: DestinationRule
metadata:
  name: api-service-destination
spec:
  host: api-service
  subsets:
    - name: stable
      labels:
        track: stable
    - name: canary
      labels:
        track: canary
```

### Quick Start Guide
- Initialize the deployment manifests: Apply the stable and canary Kubernetes deployments simultaneously. The canary replica runs the new image version with identical resource requests and environment variables.
- Configure traffic routing: Deploy the Istio VirtualService and DestinationRule to split traffic 90/10 between stable and canary subsets. Verify routing distribution using `istioctl proxy-config routes`.
- Validate health and metrics: Monitor `/health/ready` endpoints and Prometheus metrics for error rate, latency p95, and database connection pool utilization. Ensure the canary pod passes readiness checks before proceeding.
- Progress or rollback: If error rate remains below 0.1% and latency p95 stays within baseline for 15 minutes, increment canary weight to 50%, then 100%. If thresholds breach, apply `kubectl rollout undo` and revert VirtualService weights to 100/0 immediately.