'deployment.environment',
TEAM: 'team.owner',
COST_CENTER: 'cost.center',
DATA_TYPE: 'telemetry.type' // 'metrics' | 'logs' | 'traces'
} as const;
/**
 * Stamps a span with the cost-attribution attributes used for showback,
 * budget alerts, and policy routing.
 *
 * The attribute keys come from `OBSERVABILITY_TAGS`. The telemetry type is
 * always recorded as `'traces'` because this helper only tags spans.
 *
 * @param span - Span that receives the attributes.
 * @param service - Name of the service emitting the span.
 * @param team - Owning team; also used to derive the cost center as `<team>-cc`.
 */
export function attachCostTags(span: Span, service: string, team: string): void {
  span.setAttributes({
    [OBSERVABILITY_TAGS.SERVICE]: service,
    [OBSERVABILITY_TAGS.TEAM]: team,
    // Must be a template literal: a bare ${team}-cc is a syntax error.
    [OBSERVABILITY_TAGS.COST_CENTER]: `${team}-cc`,
    // `||` (not `??`) is deliberate so an empty NODE_ENV also falls back.
    [OBSERVABILITY_TAGS.ENV]: process.env.NODE_ENV || 'production',
    [OBSERVABILITY_TAGS.DATA_TYPE]: 'traces'
  });
}
Without attribution, optimization is guesswork. Cost tags enable budget alerts, showback reporting, and policy routing.
### Step 2: Adaptive Trace Sampling
Uniform sampling discards critical failure paths. Implement context-aware sampling that adjusts probability based on error status, latency thresholds, and business priority.
```typescript
// adaptive-sampler.ts
import { Sampler, SamplingResult } from '@opentelemetry/sdk-trace-base';
import { Span, SpanAttributes, SpanKind } from '@opentelemetry/api';
/**
 * Sampler that always keeps spans flagged as server errors, slow, or belonging
 * to a high-value endpoint, and keeps a deterministic fraction of everything
 * else based on the trace id.
 *
 * NOTE(review): samplers are consulted when a span is created, so the
 * `http.status_code` / `http.response_time` checks only fire when the caller
 * supplies those attributes at span start. Decisions that depend on the final
 * status or duration need tail-based sampling — confirm against your setup.
 */
export class AdaptiveTraceSampler implements Sampler {
  /** Numeric value of the SDK's "record and sample" decision. */
  private static readonly RECORD_AND_SAMPLED = 2;
  /** Numeric value of the SDK's "do not record" (drop) decision. */
  private static readonly NOT_RECORD = 0;

  /** Despite its name, this is the fraction (0..1) of routine traces that are kept. */
  private readonly errorBudget: number;
  /** Spans whose `http.response_time` exceeds this value are always kept. */
  private readonly latencyThreshold: number;
  /** Span names that are always kept. */
  private readonly highValueEndpoints: Set<string>;

  /**
   * @param config.errorBudget - Baseline keep ratio for routine traces (default 0.05).
   * @param config.latencyThreshold - Slow-span threshold, in the unit of `http.response_time` (default 500).
   * @param config.highValueEndpoints - Span names that must always be sampled (default none).
   */
  constructor(config: { errorBudget?: number; latencyThreshold?: number; highValueEndpoints?: string[] }) {
    this.errorBudget = config.errorBudget ?? 0.05;
    this.latencyThreshold = config.latencyThreshold ?? 500;
    this.highValueEndpoints = new Set(config.highValueEndpoints ?? []);
  }

  /**
   * Decides whether a span is recorded.
   *
   * Precedence: error/slow, then high-value endpoint, then a probabilistic
   * decision derived from the first 8 hex characters of the trace id, so every
   * span of one trace gets the same outcome.
   *
   * @returns A sampling result whose `sampling.reason` attribute explains why
   * the span was kept; dropped spans carry no attributes.
   */
  shouldSample(context: unknown, traceId: string, name: string, spanKind: SpanKind, attributes: SpanAttributes): SamplingResult {
    const statusCode = AdaptiveTraceSampler.numericAttribute(attributes, 'http.status_code') ?? 0;
    const responseTime = AdaptiveTraceSampler.numericAttribute(attributes, 'http.response_time') ?? 0;

    if (statusCode >= 500 || responseTime > this.latencyThreshold) {
      return AdaptiveTraceSampler.keep('error_or_slow');
    }
    if (this.highValueEndpoints.has(name)) {
      return AdaptiveTraceSampler.keep('high_value');
    }

    // Map the leading 32 bits of the trace id onto [0, 2^32) and keep the
    // lowest `errorBudget` share. A malformed id parses to NaN and is dropped.
    const hash = parseInt(traceId.slice(0, 8), 16);
    const threshold = Math.floor((2 ** 32) * this.errorBudget);
    return hash < threshold
      ? AdaptiveTraceSampler.keep('probabilistic')
      : { decision: AdaptiveTraceSampler.NOT_RECORD, attributes: {} };
  }

  /** Human-readable description required by the `Sampler` contract. */
  toString(): string {
    return `AdaptiveTraceSampler{ratio=${this.errorBudget}, latencyThreshold=${this.latencyThreshold}}`;
  }

  /** Builds a "keep" result tagged with the reason the span was sampled. */
  private static keep(reason: string): SamplingResult {
    return {
      decision: AdaptiveTraceSampler.RECORD_AND_SAMPLED,
      attributes: { 'sampling.reason': reason }
    };
  }

  /** Reads an attribute as a number, accepting numeric strings; returns undefined otherwise. */
  private static numericAttribute(attributes: SpanAttributes, key: string): number | undefined {
    const raw = attributes[key];
    if (typeof raw === 'number') {
      return raw;
    }
    if (typeof raw === 'string' && raw.trim() !== '') {
      const parsed = Number(raw);
      return Number.isNaN(parsed) ? undefined : parsed;
    }
    return undefined;
  }
}
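```
This sampler preserves 100% of error and slow paths, guarantees sampling for critical endpoints, and drops routine successful requests probabilistically. It reduces trace storage by 60-80% without losing incident context. Note that an SDK sampler runs when a span is created, so the error and latency checks only apply when those attributes are already known at span start; to decide on the final status code or duration, pair it with tail-based sampling in the collector.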
### Step 3: Log Tiering & Pre-Ingestion Filtering
Logs are the largest cost driver. Implement structured logging with severity routing, regex filtering, and compression before export.
// log-router.ts
import { createLogger, format, transports } from 'winston';
import { CloudWatchLogTransport } from 'winston-aws-cloudwatch';
// NOTE(review): not referenced anywhere in this snippet; presumably relative
// per-level cost weights intended for a budget calculation — confirm or remove.
const LOG_LEVEL_COST = { error: 1, warn: 0.7, info: 0.3, debug: 0.1 };

// Substrings that mark a log line as liveness noise not worth paying to ingest.
const NOISE_MARKERS = ['healthcheck', 'heartbeat'];

/**
 * winston format that drops low-value entries before they reach a paid
 * backend: roughly 90% of debug entries, plus anything mentioning a health
 * check or heartbeat. Returning `false` tells winston to discard the entry.
 */
const dropLowValueLogs = format((info) => {
  if (info.level === 'debug' && Math.random() > 0.1) return false;
  // Messages are not guaranteed to be strings (objects, numbers, errors);
  // only string messages can be scanned for noise markers.
  const message = typeof info.message === 'string' ? info.message : '';
  if (NOISE_MARKERS.some((marker) => message.includes(marker))) return false;
  return info;
});

/**
 * Application logger with cost-aware routing: `info` and above go to the
 * console, while only `warn` and above are shipped to CloudWatch after noise
 * filtering.
 */
export const costAwareLogger = createLogger({
  format: format.combine(
    format.timestamp(),
    format.errors({ stack: true }),
    format.json()
  ),
  defaultMeta: { service: process.env.SERVICE_NAME },
  transports: [
    new transports.Console({ level: 'info' }),
    new CloudWatchLogTransport({
      logGroupName: `/prod/${process.env.SERVICE_NAME}`,
      logStreamName: 'application',
      // NOTE(review): assuming this transport applies winston's usual level
      // filtering, `warn` already excludes debug entries, so the debug
      // sampling in dropLowValueLogs only matters if this level is lowered.
      level: 'warn',
      createLogGroup: true,
      awsOptions: { region: process.env.AWS_REGION },
      format: format.combine(dropLowValueLogs())
    })
  ]
});
Filtering health checks, heartbeats, and debug noise before ingestion reduces log volume by 40-60%. CloudWatch and most managed logging backends (for example, hosted Elasticsearch/ELK offerings) bill primarily on ingested GB rather than stored GB, so pre-export filtering yields immediate savings.
### Step 4: Metric Cardinality Control
Unbounded dimensions cause metric storage costs to explode. Enforce cardinality budgets and replace high-cardinality tags with aggregated histograms or derived metrics.
// metric-validator.ts
const ALLOWED_DIMENSIONS = new Set(['service', 'environment', 'method', 'status_code']);
const MAX_CARDINALITY = 1000;

/**
 * Enforces the metric dimension allowlist.
 *
 * Every key of `dimensions` must appear in `ALLOWED_DIMENSIONS`. If any key
 * does not, a warning naming the metric and the offending keys is logged and
 * an error is thrown. In production this is meant to be paired with a
 * cardinality tracker that drops metrics exceeding thresholds.
 *
 * @param metricName - Metric being emitted; used only in the warning text.
 * @param dimensions - Dimension name/value pairs attached to the metric.
 * @throws Error when at least one dimension is not on the allowlist.
 */
export function validateMetricDimensions(metricName: string, dimensions: Record<string, string>): void {
  const rejected: string[] = [];
  for (const dimension of Object.keys(dimensions)) {
    if (!ALLOWED_DIMENSIONS.has(dimension)) {
      rejected.push(dimension);
    }
  }

  // Nothing off-list: the metric passes the policy.
  if (rejected.length === 0) {
    return;
  }

  console.warn(`[METRIC_COST] Blocked high-cardinality dimensions for ${metricName}: ${rejected.join(', ')}`);
  throw new Error('Cardinality policy violation');
}
Replace per-request IDs with percentile aggregations (histogram, summary). Drop unused tags during pipeline processing. This prevents metric storage from scaling linearly with traffic.
### Step 5: Cost-Aware Routing & Budget Enforcement
Route telemetry to tiered storage based on age, severity, and business value. Implement budget alerts that throttle or drop low-priority data when thresholds are breached.
// budget-controller.ts
/**
 * Tracks cumulative telemetry spend against a fixed monthly budget and
 * signals when further spend should be refused.
 */
export class TelemetryBudgetController {
  /** Spend accepted so far; only ever increased by `isWithinBudget`. */
  private currentSpend: number = 0;
  private readonly monthlyBudget: number;

  /**
   * @param monthlyBudget - Spend ceiling. `Infinity` is accepted (no limit).
   * @throws RangeError when the budget is NaN or negative, because every
   * comparison against such a value would silently disable enforcement.
   */
  constructor(monthlyBudget: number) {
    if (Number.isNaN(monthlyBudget) || monthlyBudget < 0) {
      throw new RangeError(`monthlyBudget must be a non-negative number, got ${monthlyBudget}`);
    }
    this.monthlyBudget = monthlyBudget;
  }

  /**
   * Attempts to reserve `cost` from the remaining budget.
   *
   * Despite the query-style name, a successful call records the cost. A call
   * that would exceed the budget records nothing and triggers the throttle
   * policy.
   *
   * @param cost - Non-negative, finite cost of the telemetry about to be sent.
   * @returns `true` if the cost fit and was recorded; `false` if it was
   * refused (over budget, or the cost itself was invalid).
   */
  isWithinBudget(cost: number): boolean {
    // A NaN cost would turn currentSpend into NaN and make every later check
    // pass; a negative cost would quietly refund budget. Refuse both.
    if (!Number.isFinite(cost) || cost < 0) {
      console.warn('[BUDGET] Ignoring invalid telemetry cost');
      return false;
    }
    if (this.currentSpend + cost > this.monthlyBudget) {
      this.triggerThrottlePolicy();
      return false;
    }
    this.currentSpend += cost;
    return true;
  }

  /** Placeholder hook invoked whenever a cost is refused for exceeding the budget. */
  private triggerThrottlePolicy(): void {
    // Dynamically adjust sampling rates, downgrade log levels, pause debug exports
    console.warn('[BUDGET] Throttling non-critical telemetry');
  }
}
Budget controllers integrate with CI/CD pipelines and runtime environments to enforce financial guardrails without manual intervention.
### Architecture Decisions & Rationale
- Centralized OTel Collector as Policy Enforcement Point: Decouples instrumentation from cost logic. Collectors apply sampling, filtering, and routing before data reaches cloud providers, ensuring consistent policy application across services.
- Tiered Storage Over Uniform Retention: Hot storage (7 days) for active debugging, warm (30 days) for trend analysis, cold (180 days) for compliance. Reduces storage costs by 70% while preserving auditability.
- Pipeline-Level Filtering Over Application-Level: Filtering at the collector avoids shipping noise to expensive ingestion endpoints. Application code remains focused on business logic, not cost optimization.
- FinOps Integration: Cost tags, budget alerts, and showback dashboards align engineering behavior with financial accountability. Observability becomes a managed data product, not an unbounded utility.
## Pitfall Guide
- Uniform Sampling Across All Services: Applying a fixed 10% sample rate to every service discards error paths in low-traffic APIs while preserving noise in high-throughput endpoints. Best practice: implement adaptive sampling tied to status codes, latency, and business criticality.
- Unbounded Metric Dimensions: Tagging metrics by `user_id`, `request_id`, or dynamic endpoint values creates millions of unique time series. Storage and query costs grow multiplicatively with every added dimension. Best practice: enforce dimension allowlists, use histograms for latency, and aggregate at the collector.
- Debug Logs in Production Without Rotation: Leaving `DEBUG` level enabled in production inflates ingestion costs and degrades query performance. Best practice: route debug logs to local buffers, sample at 1-5%, and enforce environment-based log levels in deployment manifests.
- Ignoring Query Costs: Storage costs are only half the equation. Scanning cold log data or aggregating unbounded metric series during incidents triggers compute overages. Best practice: index strategically, use materialized views for common queries, and set query timeouts with cost-aware caching.
- Static Retention Policies: Keeping all telemetry at 90-day hot retention wastes budget on low-value data. Best practice: implement lifecycle policies that transition data to cheaper storage tiers based on age and access patterns.
- Assuming Open-Source Equals Zero Cost: Self-hosted stacks (Prometheus, Loki, Jaeger) shift costs to compute, storage, and engineering time. Scaling collectors, managing stateful backends, and maintaining HA configurations can exceed commercial SaaS pricing. Best practice: calculate TCO including personnel, infrastructure, and incident response time.
- Neglecting Cross-Service Trace Correlation Costs: Distributed tracing multiplies ingestion volume because each service exports spans for the same request. Without sampling or span dropping, trace costs scale with service count. Best practice: use tail-based sampling, drop internal service spans, and rely on metrics for inter-service health checks.
## Production Bundle
### Action Checklist
### Decision Matrix
| Scenario | Recommended Approach | Why | Cost Impact |
|---|---|---|---|
| High-throughput API gateway | Tail-based sampling + metric aggregation | Preserves error paths, drops routine requests | 60-75% reduction |
| Event-driven microservices | Log tiering + debug sampling | Event logs are verbose; cold storage handles replay | 40-50% reduction |
| Compliance-heavy workloads | Immutable cold storage + hot debug | Regulatory retention requires auditability; hot tier enables debugging | 30% increase in storage, 60% reduction in ingestion |
| Legacy monolith | Gradual OTel migration + static sampling | Monoliths generate dense logs; sampling stabilizes costs during refactoring | 20-35% reduction initially, scales with decomposition |
| Real-time analytics pipeline | Metric rollups + log drop | Analytics rely on aggregates, not raw telemetry | 50-70% reduction |
### Configuration Template
# otel-cost-optimized-config.yaml
# OpenTelemetry Collector configuration that applies cost controls (sampling,
# log filtering, attribute cleanup, cost tagging) before telemetry is exported.
receivers:
  otlp:
    protocols:
      grpc:
      http:

processors:
  # Guards the collector against memory exhaustion; must run first in every pipeline.
  memory_limiter:
    check_interval: 1s
    limit_mib: 1024
    spike_limit_mib: 256
  # Batches whatever survives sampling/filtering; must run last in every pipeline.
  batch:
    timeout: 5s
    # send_batch_max_size must be >= send_batch_size, so both are set explicitly.
    send_batch_size: 1000
    send_batch_max_size: 1000
  # Keeps 25% of traces; hash_seed must be identical on all collectors that
  # should make the same decision for a given trace.
  probabilistic_sampler:
    sampling_percentage: 25
    hash_seed: 22
  # Drops health-check, heartbeat, and trace-tagged debug log records by body.
  filter:
    logs:
      exclude:
        match_type: regexp
        bodies:
          - "^(healthcheck|heartbeat|debug:.*trace_id=.*)$"
  # Stamps cost-attribution tags on every resource.
  resource:
    attributes:
      - key: team.owner
        value: "platform"
        action: upsert
      - key: cost.center
        value: "cc-observability"
        action: upsert
  # Metric-only cleanup: removes per-request identifiers that explode cardinality.
  transform:
    metric_statements:
      - context: datapoint
        statements:
          - set(attributes["sampling.reason"], "adaptive") where IsMatch(attributes["http.status_code"], "^[45]")
          - delete_key(attributes, "request_id")
          - delete_key(attributes, "session_id")

exporters:
  prometheus:
    endpoint: "0.0.0.0:8889"
    namespace: "prod"
  awscloudwatchlogs:
    log_group_name: "/prod/telemetry"
    log_stream_name: "application"
    region: "${AWS_REGION}"
  awsxray:
    region: "${AWS_REGION}"

service:
  pipelines:
    traces:
      receivers: [otlp]
      # Sample before batching so dropped spans are never buffered.
      processors: [memory_limiter, probabilistic_sampler, resource, batch]
      exporters: [awsxray]
    logs:
      receivers: [otlp]
      processors: [memory_limiter, filter, resource, batch]
      exporters: [awscloudwatchlogs]
    metrics:
      receivers: [otlp]
      # transform only defines metric_statements, so it belongs in this pipeline.
      processors: [memory_limiter, resource, transform, batch]
      exporters: [prometheus]
### Quick Start Guide
- Instrument with OTel SDKs: Replace vendor-specific agents with OpenTelemetry SDKs across services. Ensure consistent `service.name`, `deployment.environment`, and `team.owner` attributes.
- Deploy the Collector: Run the OTel Collector as a sidecar or daemonset. Apply the configuration template, adjusting sampling percentages and filter regex to match your workload.
- Validate Cost Attribution: Query your telemetry backend to verify tags are propagating. Set up a simple cost dashboard grouping spend by `team.owner` and `telemetry.type`.
- Enable Budget Alerts: Configure threshold alerts in your cloud provider or FinOps tool. Test throttle policies by temporarily lowering the budget and verifying that debug logs and low-priority traces are dropped.
- Iterate on Policies: Monitor MTTR, alert noise, and query latency for 14 days. Adjust sampling rates, cardinality limits, and retention tiers based on actual incident data. Optimization is continuous, not one-time.