DK } from '@opentelemetry/sdk-node';
import { OTLPTraceExporter } from '@opentelemetry/exporter-trace-otlp-http';
import { BatchSpanProcessor } from '@opentelemetry/sdk-trace-base';
import { Resource } from '@opentelemetry/resources';
import { SEMRESATTRS_SERVICE_NAME, SEMRESATTRS_SERVICE_VERSION } from '@opentelemetry/semantic-conventions';
import { TraceIdRatioBasedSampler } from '@opentelemetry/sdk-trace-base';
// Bootstraps tracing for the payment gateway: resource identity, OTLP export
// through a batching span processor, and 5% head-based sampling.
const sdk = new NodeSDK({
  resource: new Resource({
    [SEMRESATTRS_SERVICE_NAME]: 'payment-gateway',
    [SEMRESATTRS_SERVICE_VERSION]: '2.4.1',
  }),
  // A single exporter carries the endpoint configuration and is wrapped by the
  // batch processor. Previously the configured exporter was passed separately
  // while the processor wrapped a second, unconfigured exporter, so the URL
  // below was not on the processor's export path.
  spanProcessor: new BatchSpanProcessor(
    new OTLPTraceExporter({
      // The `url` option is used as given, so the value must already include
      // the `/v1/traces` path.
      url: process.env.OTEL_EXPORTER_OTLP_ENDPOINT || 'http://localhost:4318/v1/traces',
    }),
  ),
  sampler: new TraceIdRatioBasedSampler(0.05), // 5% head-based sampling
});

sdk.start();
**Architecture Rationale:** We use `BatchSpanProcessor` to aggregate spans in memory before network transmission. This reduces HTTP round-trips and keeps export work off the request path. The `TraceIdRatioBasedSampler` derives its decision from the trace ID itself, so every service that applies the same ratio reaches the same keep-or-drop decision for a given trace. If services use different ratios, or if downstream services should simply honor the upstream decision, wrap it in a `ParentBasedSampler`; otherwise traces can end up partially sampled.
### Step 2: Context-Aware Request Instrumentation
Auto-instrumentation covers HTTP frameworks, but business-critical paths require manual span creation with semantic attributes. We wrap the request handler to capture routing parameters, user context, and downstream dependencies.
```typescript
import { trace, SpanStatusCode } from '@opentelemetry/api';
import { Request, Response, NextFunction } from 'express';
const tracer = trace.getTracer('payment-gateway');
/**
 * Express middleware that wraps each request in a span.
 *
 * The span is made active for the duration of `next()`, so code further down
 * the chain that reads `context.active()` sees it as the current span. The
 * span is ended exactly once, when the response either finishes or the
 * connection closes early.
 *
 * @param req  Incoming request; the span is also attached as `req.span`.
 * @param res  Outgoing response, observed for completion.
 * @param next Continues the Express middleware chain.
 */
export function instrumentRequest(req: Request, res: Response, next: NextFunction) {
  const method = req.method.toLowerCase();

  // Start with a route-agnostic name: the matched route is not known until
  // routing has run, and the concrete path would be unbounded in cardinality.
  tracer.startActiveSpan(`http.${method}`, (span) => {
    if (req.ip !== undefined) {
      span.setAttribute('http.client_ip', req.ip);
    }
    span.setAttribute('app.environment', process.env.NODE_ENV || 'development');

    // Kept for handlers that read the span directly off the request object.
    Object.assign(req, { span });

    let ended = false;
    const finalize = (): void => {
      // `close` also fires after a normal `finish`; end the span only once.
      if (ended) {
        return;
      }
      ended = true;

      // By now routing has run, so the matched pattern (if any) is available.
      const matchedRoute = req.route?.path;
      if (matchedRoute) {
        const pattern = String(matchedRoute);
        span.updateName(`http.${method}.${pattern}`);
        span.setAttribute('http.route', pattern);
      } else {
        span.setAttribute('http.route', req.path);
      }

      span.setAttribute('http.status_code', res.statusCode);

      const contentLength = res.getHeader('content-length');
      if (contentLength !== undefined) {
        span.setAttribute('http.response_content_length', String(contentLength));
      }

      // The connection went away before the response was fully written.
      const aborted = !res.writableFinished;
      if (aborted) {
        span.setStatus({
          code: SpanStatusCode.ERROR,
          message: 'connection closed before response completed',
        });
      } else if (res.statusCode >= 400) {
        span.setStatus({ code: SpanStatusCode.ERROR });
      } else {
        span.setStatus({ code: SpanStatusCode.OK });
      }

      span.end();
    };

    res.once('finish', finalize);
    res.once('close', finalize);

    next();
  });
}
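```
**Architecture Rationale:** Span names should be built from the HTTP method and the route pattern (for example `/payments/:id`), never the concrete request path, to prevent cardinality explosion. We avoid embedding dynamic values like `user_id` or `transaction_id` in span names; instead, we attach them as attributes. Note that the `finish` event fires only once the response has been fully handed off; if the client disconnects or the socket times out first, only `close` fires. Span closure must therefore also be handled on `close` to cover aborted requests.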
Step 3: Downstream Dependency Tracing
External calls and database queries must inherit the parent span context. We demonstrate a simplified HTTP client wrapper that propagates trace headers automatically.
import { context, propagation } from '@opentelemetry/api';
import { SpanKind } from '@opentelemetry/api';
/** Returns the host portion of `endpoint` for use in a span name, or 'unknown' if it cannot be parsed. */
function downstreamSpanTarget(endpoint: string): string {
  try {
    return new URL(endpoint).host;
  } catch {
    return 'unknown';
  }
}

/**
 * POSTs `payload` as JSON to `endpoint` inside a CLIENT span and returns the
 * parsed JSON response body.
 *
 * The outgoing request carries the trace context of the CLIENT span itself, so
 * the remote service's spans are parented to this call rather than to the
 * caller's span.
 *
 * @param endpoint Absolute URL of the downstream service.
 * @param payload  Value serialized with `JSON.stringify` as the request body.
 * @returns The response body parsed as JSON.
 * @throws Re-throws any error from `fetch` or JSON parsing after recording it on the span.
 */
async function callDownstreamService(endpoint: string, payload: object) {
  const activeContext = context.active();

  // The name uses only the host; the full URL stays in the `http.url`
  // attribute so span names remain low-cardinality.
  const childSpan = tracer.startSpan(
    `downstream.http.${downstreamSpanTarget(endpoint)}`,
    {
      kind: SpanKind.CLIENT,
      attributes: { 'http.url': endpoint },
    },
    activeContext,
  );

  // Inject the context that contains the child span, not the parent context.
  const headers: Record<string, string> = {};
  propagation.inject(trace.setSpan(activeContext, childSpan), headers);

  try {
    const response = await fetch(endpoint, {
      method: 'POST',
      headers: { ...headers, 'Content-Type': 'application/json' },
      body: JSON.stringify(payload),
    });
    childSpan.setAttribute('http.status_code', response.status);
    if (!response.ok) {
      childSpan.setStatus({ code: SpanStatusCode.ERROR, message: response.statusText });
    }
    return await response.json();
  } catch (err) {
    // Thrown values are not guaranteed to be Error instances.
    const message = err instanceof Error ? err.message : String(err);
    childSpan.recordException(err instanceof Error ? err : message);
    childSpan.setStatus({ code: SpanStatusCode.ERROR, message });
    throw err;
  } finally {
    childSpan.end();
  }
}
Architecture Rationale: Context propagation is non-negotiable in distributed systems. The propagation.inject() call embeds traceparent and tracestate headers into outbound requests. Downstream services instrumented with OTel will automatically extract these headers and continue the trace. We use SpanKind.CLIENT to explicitly mark outbound calls, enabling the backend to distinguish between server-side processing and network latency.
Step 4: Backend Routing & Visualization
The OpenTelemetry Collector receives OTLP signals and routes them to specialized engines:
- Jaeger stores and indexes traces for distributed debugging.
- Prometheus scrapes or receives metrics for time-series aggregation.
- Loki ingests structured logs with lightweight indexing.
- Grafana unifies querying across all three sources.
This separation allows each backend to optimize for its signal type. Jaeger persists spans in a pluggable storage backend (such as Cassandra, Elasticsearch, or OpenSearch) and indexes them for lookup by trace ID, service, and operation. Prometheus compresses time-series samples in its TSDB; long-term downsampling requires a companion system such as Thanos or Mimir. Loki relies on log line compression and label-based indexing rather than full-text search, drastically reducing storage costs.
Pitfall Guide
1. Unbounded Metric Cardinality
Explanation: Attaching high-cardinality fields like user_id, session_id, or request_id to Prometheus metrics creates millions of unique time series. This exhausts memory, slows queries, and can crash the metrics backend.
Fix: Restrict metric labels to low-cardinality dimensions (service, endpoint, status_code, region). Use trace attributes for high-cardinality data. Enforce limits in the Prometheus scrape configuration (`label_limit` and `sample_limit` under `scrape_configs`) so that a misbehaving target fails its scrape instead of flooding the TSDB.
2. Silent Context Loss
Explanation: When spawning background workers, message queues, or async tasks, the active trace context is often lost. Subsequent spans become orphaned, breaking the request flow visualization.
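Fix: Carry the trace context across the boundary explicitly. For message queues and other out-of-process work, serialize the active context into the message headers or payload with `propagation.inject()` and restore it on the consumer side with `propagation.extract()`. Then use `context.with()` to bind the restored parent context to the new execution scope before creating child spans.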
3. Over-Sampling in Production
Explanation: Running 100% head-based sampling in production generates terabytes of trace data, inflating storage costs and degrading collector throughput. Most successful requests provide zero diagnostic value.
Fix: Implement dynamic sampling. Use 1-5% for healthy traffic, but configure tail-based sampling to retain 100% of traces containing errors, high latency, or specific business events. The OTel Collector supports this via the tail_sampling processor.
4. Volatile Span Names
Explanation: Naming spans with dynamic values (e.g., db.query.users.12345) creates cardinality explosions in Jaeger and Grafana. The UI becomes unusable as span name dropdowns contain thousands of entries.
Fix: Name spans by operation and resource type (db.query.users, http.client.payment-api). Embed dynamic identifiers as span attributes, not span names.
5. Alert Threshold Misalignment
Explanation: Setting static thresholds (e.g., CPU > 80%) without considering traffic patterns or deployment cycles triggers alert fatigue. Engineers begin ignoring notifications, missing real incidents.
Fix: Align alerts with service-level objectives (SLOs). Use rate-of-change thresholds, multi-metric conditions (e.g., error rate AND latency), and evaluation windows that account for deployment rollouts. Attach runbooks directly to alert definitions.
6. Sensitive Data Leakage in Attributes
Explanation: Developers frequently log request bodies, authentication tokens, or PII as span attributes. This violates compliance requirements (GDPR, HIPAA) and creates security liabilities.
Fix: Implement a span processor that sanitizes attributes before export. Maintain a denylist of sensitive keys (password, token, ssn, credit_card). Use data classification tags to enforce policy at the collector level.
7. Dashboard Sprawl Without Runbooks
Explanation: Teams build dozens of Grafana panels but lack documented diagnostic procedures. When an alert fires, engineers waste time guessing which panel to inspect.
Fix: Tie every alert to a runbook. Structure runbooks with: symptom, suspected root causes, exact Grafana panel URLs, Loki query templates, and step-by-step remediation. Review runbooks quarterly during incident post-mortems.
Production Bundle
Action Checklist
Decision Matrix
| Scenario | Recommended Approach | Why | Cost Impact |
|---|---|---|---|
| High-throughput public API | Head-based sampling (1-3%) + Tail-based error retention | Reduces storage volume while preserving failure diagnostics | Lowers trace storage costs by 60-80% |
| Event-driven microservices | Context propagation via message headers + async span linking | Maintains trace continuity across message brokers | Minimal overhead; requires header serialization |
| Compliance-heavy fintech | Attribute sanitization processor + PII denylist | Prevents regulatory violations and audit failures | Adds ~2% CPU overhead for sanitization |
| Multi-region deployment | Region label in metrics + geographic span attributes | Enables latency comparison and regional failure isolation | Increases metric cardinality slightly; manageable with label limits |
| Legacy monolith migration | Auto-instrumentation first + manual span enrichment for critical paths | Accelerates rollout while ensuring business-critical visibility | Low initial effort; scales with incremental refinement |
Configuration Template
# otel-collector-config.yaml
# Receives OTLP over gRPC and HTTP, then routes traces to Jaeger, metrics to
# Prometheus, and logs to Loki.
receivers:
  otlp:
    protocols:
      grpc:
        endpoint: 0.0.0.0:4317
      http:
        endpoint: 0.0.0.0:4318
processors:
  batch:
    timeout: 5s
    # send_batch_max_size must be >= send_batch_size, whose default (8192)
    # would otherwise make this configuration fail validation.
    send_batch_size: 1000
    send_batch_max_size: 1000
  tail_sampling:
    policies:
      # Keep every trace that contains an error status.
      - name: error-policy
        type: status_code
        status_code: { status_codes: [ERROR] }
      # Keep every trace slower than 500 ms.
      - name: latency-policy
        type: latency
        latency: { threshold_ms: 500 }
exporters:
  otlp/jaeger:
    # Jaeger accepts OTLP over gRPC on 4317; 14250 is its legacy
    # jaeger.proto collector port and does not speak OTLP.
    endpoint: jaeger:4317
    tls:
      insecure: true
  prometheus:
    endpoint: 0.0.0.0:8889
  loki:
    endpoint: http://loki:3100/loki/api/v1/push
service:
  pipelines:
    traces:
      receivers: [otlp]
      # Sampling runs before batching so only retained traces are batched.
      processors: [tail_sampling, batch]
      exporters: [otlp/jaeger]
    metrics:
      receivers: [otlp]
      processors: [batch]
      exporters: [prometheus]
    logs:
      receivers: [otlp]
      processors: [batch]
      exporters: [loki]
Quick Start Guide
- Deploy the backend stack: Run `docker compose up -d` with services for Jaeger, Prometheus, Loki, and Grafana. Verify endpoints are reachable on ports 16686, 9090, 3100, and 3000.
- Configure the collector: Mount `otel-collector-config.yaml` into the OTel Collector container. Ensure the `OTEL_EXPORTER_OTLP_ENDPOINT` environment variable points to `http://collector:4318/v1/traces`.
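- Instrument your service: Initialize the OTel SDK at startup, apply the request middleware, and propagate context to downstream calls. Set `NODE_ENV=production` so spans carry the correct `app.environment` attribute; the 5% sampler configured in the SDK setup applies regardless of `NODE_ENV`.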
- Validate signals: Send test requests and trigger fault injection (`?fault=latency` or `?fault=error`). Confirm traces appear in Jaeger, metrics in Prometheus, and logs in Loki. Verify Grafana dashboards update within 15 seconds.
- Attach runbooks: Create a Grafana alert for p95 latency > 300ms over 5 minutes. Link the alert to a markdown runbook containing diagnostic queries, escalation contacts, and rollback procedures. Test the alert by simulating sustained latency.