nstances that can actually process requests.
// src/middleware/health-check.ts
import { Request, Response } from 'express';
import { db } from '../infrastructure/db';
/**
 * Health endpoint handler.
 *
 * Issues a trivial `SELECT 1` through the shared `db` client and responds with:
 * - `200` and `{ status, uptime, timestamp, version }` when the query succeeds;
 * - `503` and a generic error message when it throws.
 *
 * The underlying failure is logged on the server only; the response body never
 * carries driver or connection details.
 *
 * @param req - Incoming request (not inspected).
 * @param res - Response used to send the JSON health payload.
 */
export const healthCheck = async (req: Request, res: Response): Promise<void> => {
  try {
    // Verify database connectivity
    await db.$queryRaw`SELECT 1`;
    res.status(200).json({
      status: 'healthy',
      uptime: process.uptime(),
      timestamp: new Date().toISOString(),
      // `||` rather than `??` on purpose: an empty string also falls back to 'unknown'.
      // npm only sets npm_package_version when the process is launched through an npm script.
      version: process.env.npm_package_version || 'unknown',
    });
  } catch (error) {
    // Record the cause before answering; otherwise a failing probe leaves no trace to debug.
    console.error('Health check failed:', error);
    res.status(503).json({ status: 'unhealthy', error: 'Database connection failed' });
  }
};
Graceful Shutdown with Connection Draining:
When the orchestrator sends a termination signal, the app must stop accepting new connections, finish in-flight requests, and close database pools before exiting.
// src/lifecycle/shutdown.ts
import { Server } from 'http';
import { db } from '../infrastructure/db';
/**
 * Registers SIGTERM and SIGINT handlers that shut the process down gracefully.
 *
 * On the first signal the handler:
 * 1. arms a watchdog that forces `process.exit(1)` after 10 seconds;
 * 2. calls `server.close()` so no new connections are accepted;
 * 3. once the server has closed, disconnects the database client and exits
 *    with code 0 (or 1 if the disconnect throws).
 *
 * Further signals received while a shutdown is in progress are ignored.
 *
 * @param server - The HTTP server whose connections should be drained.
 */
export const setupGracefulShutdown = (server: Server): void => {
  const FORCE_EXIT_TIMEOUT_MS = 10_000;
  let shuttingDown = false;

  const shutdown = (signal: string): void => {
    // A repeated signal (e.g. Ctrl+C pressed twice, or SIGINT followed by SIGTERM)
    // must not start a second close/disconnect sequence or a second watchdog.
    if (shuttingDown) {
      return;
    }
    shuttingDown = true;
    console.log(`[${signal}] Initiating graceful shutdown...`);

    // Force exit if drain takes too long. Armed before close() so the deadline
    // is measured from the moment the signal arrived.
    setTimeout(() => {
      console.error('Shutdown timeout exceeded. Forcing exit.');
      process.exit(1);
    }, FORCE_EXIT_TIMEOUT_MS);

    // Stop accepting new connections; the callback runs once the server has closed.
    server.close(async (closeErr?: Error) => {
      if (closeErr) {
        // Reported when the server was not running; still proceed with cleanup.
        console.error('Error while closing HTTP server:', closeErr);
      }
      console.log('HTTP server closed. Draining connections.');
      try {
        // Close database connections
        await db.$disconnect();
        console.log('Database connections closed.');
        process.exit(0);
      } catch (err) {
        console.error('Error during shutdown cleanup:', err);
        process.exit(1);
      }
    });
  };

  process.on('SIGTERM', () => shutdown('SIGTERM'));
  process.on('SIGINT', () => shutdown('SIGINT'));
};
2. Process Orchestration: PM2 vs. Systemd
The process manager ensures the application restarts on crash and utilizes multi-core architectures.
PM2 for Cluster Mode and Zero-Downtime Reloads:
PM2 is ideal when you need cluster mode to leverage all CPU cores and zero-downtime deployments via pm2 reload. It manages the process lifecycle and provides built-in monitoring.
// ecosystem.config.js
module.exports = {
apps: [{
name: 'api-gateway',
script: 'dist/main.js',
instances: 'max',
exec_mode: 'cluster',
max_memory_restart: '750M',
kill_timeout: 5000,
listen_timeout: 8000,
env_production: {
NODE_ENV: 'production',
PORT: 3000,
},
// Security: Run as non-root user
user: 'appuser',
group: 'appgroup',
}],
};
Systemd for OS-Level Integration:
Systemd is preferred for strict security isolation, integration with OS logging (journalctl), and environments where Docker is not used. It offers robust namespace isolation.
# /etc/systemd/system/api-gateway.service
[Unit]
Description=Node.js API Gateway
After=network.target postgresql.service

[Service]
# Type=simple: the service counts as started as soon as the node process is spawned.
# Type=notify would require the application to send READY=1 via sd_notify();
# a plain `node dist/main.js` never does, so startup would time out and the
# unit would be marked failed.
Type=simple
User=appuser
Group=appgroup
WorkingDirectory=/opt/api-gateway
# The script path is resolved relative to WorkingDirectory.
ExecStart=/usr/local/bin/node dist/main.js
Restart=on-failure
RestartSec=5
EnvironmentFile=/opt/api-gateway/.env.production

# Security Hardening
NoNewPrivileges=true
PrivateTmp=true
# ProtectSystem=strict mounts the file system read-only for this service;
# ReadWritePaths re-opens only the log directory for writing.
ProtectSystem=strict
ProtectHome=true
ReadWritePaths=/opt/api-gateway/logs

[Install]
WantedBy=multi-user.target
3. Edge Routing and Security with Nginx
Nginx acts as the reverse proxy, handling TLS termination, static asset serving, and security headers. This offloads I/O work from the Node.js event loop.
# /etc/nginx/conf.d/api-gateway.conf
upstream node_cluster {
    # NOTE(review): a PM2 cluster shares a single port (PORT=3000 in the
    # ecosystem file). Keep the 3001 entry only if a second instance really
    # listens there; otherwise nginx will log connect errors for it.
    server 127.0.0.1:3000;
    server 127.0.0.1:3001;
    # Maximum idle keep-alive connections to the upstream kept per worker process.
    keepalive 128;
}

server {
    listen 443 ssl http2;
    server_name api.example.com;

    # TLS Configuration
    ssl_certificate /etc/letsencrypt/live/api.example.com/fullchain.pem;
    ssl_certificate_key /etc/letsencrypt/live/api.example.com/privkey.pem;
    ssl_protocols TLSv1.2 TLSv1.3;
    ssl_ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256;
    ssl_prefer_server_ciphers off;

    # Security Headers
    # `always` also attaches the header to error responses (4xx/5xx).
    add_header Strict-Transport-Security "max-age=63072000" always;
    add_header X-Frame-Options DENY always;
    add_header X-Content-Type-Options nosniff always;
    add_header Content-Security-Policy "default-src 'self'" always;

    # Proxy Configuration
    location / {
        proxy_pass http://node_cluster;
        # HTTP/1.1 with an empty Connection header is required for upstream keep-alive.
        proxy_http_version 1.1;
        proxy_set_header Connection "";
        proxy_set_header Host $host;
        proxy_set_header X-Real-IP $remote_addr;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
        proxy_set_header X-Forwarded-Proto $scheme;

        # Timeouts aligned with application logic
        proxy_connect_timeout 10s;
        proxy_read_timeout 30s;
        proxy_send_timeout 30s;
    }

    # Static Assets: Serve directly to save Node memory
    location /assets/ {
        alias /opt/api-gateway/public/;
        expires 30d;
        # add_header directives are inherited from the server level only when
        # the location declares none of its own. This location sets
        # Cache-Control, so the security headers must be repeated here.
        add_header Strict-Transport-Security "max-age=63072000" always;
        add_header X-Frame-Options DENY always;
        add_header X-Content-Type-Options nosniff always;
        add_header Content-Security-Policy "default-src 'self'" always;
        add_header Cache-Control "public, immutable";
        access_log off;
    }

    # Gzip Compression
    gzip on;
    gzip_vary on;
    gzip_comp_level 6;
    gzip_types text/plain text/css application/json application/javascript;
}
4. Structured Observability
Console logs are insufficient for production. Use a structured logger like Pino to generate machine-readable logs that can be ingested by log aggregators.
// src/infrastructure/logger.ts
import pino from 'pino';
// Pretty, colorized output is used only when NODE_ENV is 'development';
// every other environment gets no transport override.
const isDevelopment = process.env.NODE_ENV === 'development';
const prettyTransport = { target: 'pino-pretty', options: { colorize: true } };

// Log-object paths handed to pino's `redact` option.
const redactedPaths = [
  'req.headers.authorization',
  'req.headers.cookie',
  'user.email',
  'user.password',
];

// Fields attached to every log line.
const baseFields = { service: 'api-gateway', pid: process.pid };

/** Shared application logger; level comes from LOG_LEVEL, defaulting to 'info'. */
const logger = pino({
  level: process.env.LOG_LEVEL || 'info',
  base: baseFields,
  redact: redactedPaths,
  transport: isDevelopment ? prettyTransport : undefined,
});

export default logger;
Pitfall Guide
- The Zombie Process Trap
  - Explanation: Failing to handle `SIGTERM`, or closing the HTTP server immediately without draining connections, results in dropped requests and zombie processes that consume resources.
  - Fix: Implement a shutdown handler that calls `server.close()`, waits for active connections to finish, and includes a forced timeout.
- Nginx Buffering Black Hole
  - Explanation: Default Nginx buffering can mask application errors or cause memory spikes if the response is large. It may also delay streaming responses.
  - Fix: Tune `proxy_buffer_size` and `proxy_buffers`. For streaming endpoints, use `proxy_buffering off;`.
- PM2 Cluster Memory Leaks
  - Explanation: In cluster mode, each worker is a separate process. A memory leak in one worker does not corrupt the others, but without limits it can exhaust the host's memory and bring down the whole server.
  - Fix: Configure `max_memory_restart` in the PM2 ecosystem file to automatically restart workers that exceed memory thresholds.
- Static File Leakage
  - Explanation: Serving static files through Node.js consumes event loop cycles and memory for every request.
  - Fix: Configure Nginx to serve static assets directly using `alias` or `root` directives, bypassing the application entirely.
- Environment Drift
  - Explanation: Differences between development and production environment variables lead to runtime errors that are hard to reproduce.
  - Fix: Use schema validation (e.g., Zod) at application startup to enforce required variables and types. Fail fast if configuration is invalid.
- SSL Renewal Failures
  - Explanation: Certbot may fail to renew certificates if the Nginx configuration is invalid or if the renewal hook is misconfigured.
  - Fix: Regularly test renewal with `certbot renew --dry-run`. Ensure the Nginx config is valid before renewal attempts. Monitor renewal status via systemd timers.
- Deployment Without Health Verification
  - Explanation: Restarting the application without verifying it starts correctly can leave the service in a broken state.
  - Fix: Include a health check step in the deployment script that polls the `/health` endpoint and verifies a 200 OK response before considering the deployment successful.
Production Bundle
Action Checklist
Decision Matrix
| Scenario | Recommended Approach | Why | Cost Impact |
|---|---|---|---|
| Small VPS / Low Traffic | Systemd + Nginx | Minimal overhead; OS-level security; no extra process manager memory. | Low (Free) |
| High Traffic / Multi-Core | PM2 Cluster + Nginx | Maximizes CPU utilization; zero-downtime reloads; easy monitoring. | Low (Free) |
| Strict Security Compliance | Systemd + Nginx + AppArmor/SELinux | Granular namespace isolation; integration with OS security frameworks. | Medium (Config complexity) |
| Microservices / Scale | Docker + Kubernetes | Container isolation; orchestration; auto-scaling; service mesh. | High (Infrastructure complexity) |
Configuration Template: Zero-Downtime Deployment Script
This script includes rollback capability and health verification to ensure safe deployments.
#!/bin/bash
# deploy.sh — Zero-downtime deployment with rollback
#
# Snapshots the current release, pulls origin/main, builds, runs database
# migrations, reloads the PM2 app and polls the health endpoint.
# Any failure after the snapshot restores the previous files and reloads PM2.
# NOTE: migrations that were already applied are NOT reverted by the rollback.
set -euo pipefail

APP_DIR="/opt/api-gateway"
BACKUP_DIR="/opt/api-gateway-backup"
HEALTH_URL="http://localhost:3000/health"
MAX_RETRIES=5
RETRY_INTERVAL=2

# Restore the pre-deployment snapshot, reload the app and exit non-zero.
# $1: human-readable reason printed before rolling back.
rollback() {
  local reason="$1"
  # Disarm the trap so a failure inside the rollback cannot re-enter it.
  trap - ERR
  echo "❌ ${reason} Rolling back..."
  # Leave APP_DIR before deleting it; pm2 needs a valid working directory.
  cd /
  rm -rf "$APP_DIR"
  cp -a "$BACKUP_DIR" "$APP_DIR"
  pm2 reload api-gateway
  echo "🔄 Rollback complete. Check logs for details."
  exit 1
}

echo "🚀 Starting deployment..."

# 1. Backup current version (-a preserves permissions, ownership and symlinks)
rm -rf "$BACKUP_DIR"
cp -a "$APP_DIR" "$BACKUP_DIR"

# From here on a snapshot exists, so any failing step triggers a rollback
# instead of leaving a half-deployed tree behind.
trap 'rollback "Deployment step failed."' ERR

# 2. Pull latest code
cd "$APP_DIR"
git fetch origin
git reset --hard origin/main

# 3. Install dependencies and build
# The build needs devDependencies (e.g. the TypeScript compiler), so install
# everything first and drop the dev packages only after the build succeeded.
npm ci
npm run build
npm prune --production

# 4. Run migrations
npx prisma migrate deploy

# 5. Restart application
pm2 reload api-gateway

# 6. Verify health
echo "⏳ Waiting for application to stabilize..."
sleep 3
for i in $(seq 1 "$MAX_RETRIES"); do
  # --max-time keeps a hung endpoint from stalling the deployment indefinitely.
  if curl -sf --max-time 5 "$HEALTH_URL" > /dev/null; then
    echo "✅ Health check passed. Deployment successful!"
    exit 0
  fi
  echo "⚠️ Health check attempt $i failed. Retrying in ${RETRY_INTERVAL}s..."
  sleep "$RETRY_INTERVAL"
done

# Rollback on failure
rollback "Health check failed after $MAX_RETRIES attempts."
Quick Start Guide
- Initialize Project: Set up your Node.js application with TypeScript and configure `package.json` scripts for build and start.
- Configure Process Manager: Create an `ecosystem.config.js` for PM2 or a systemd service file. Ensure the user has appropriate permissions.
- Set Up Nginx: Install Nginx, create a server block for your domain, and configure the reverse proxy to your application port.
- Enable SSL: Run `certbot --nginx` to obtain and configure TLS certificates. Verify the auto-renewal timer.
- Deploy: Run the deployment script to push code, build assets, restart the service, and verify health. Monitor logs using `pm2 logs` or `journalctl`.