context typing and OpenTelemetry integration.
// toggle-engine.ts
import { MeterProvider, Meter } from '@opentelemetry/api';
/**
 * Caller-supplied facts about the request a flag is being evaluated for.
 */
export interface EvaluationContext {
  /** Deployment environment of the caller; the only mandatory field. */
  environment: 'development' | 'staging' | 'production';
  /** Identifier of the end user, when one is known. */
  userId?: string;
  /** Identifier of the tenant/organisation, when one is known. */
  tenantId?: string;
  /** Free-form scalar attributes describing the request. */
  metadata?: { [attribute: string]: string | number | boolean };
}
/** The scalar kinds a feature flag may resolve to. */
export type FlagValue =
  | boolean
  | number
  | string;
/**
 * Static description of a single feature flag.
 */
export interface FlagDefinition {
  /** Name the flag is registered and looked up under. */
  key: string;
  /** Baseline value of the flag. */
  defaultValue: FlagValue;
  /** Share of the audience targeted by the rollout, expressed as a percentage. */
  rolloutPercentage?: number;
  /** Environment names the flag is restricted to; omitted means unrestricted. */
  allowedEnvironments?: Array<string>;
  /** Extra string tags associated with the flag for telemetry purposes. */
  telemetryTags?: { [tag: string]: string };
}
/**
 * In-process feature-flag evaluator with deterministic percentage rollouts
 * and OpenTelemetry metrics.
 *
 * Flags are registered once by key and then resolved per call from an
 * {@link EvaluationContext}. Every successful evaluation increments the
 * `flag_evaluation_total` counter and records a sample in the
 * `flag_evaluation_latency_ms` histogram.
 */
export class ToggleEngine {
  // Keep the name `flags`: the sync client shown alongside this class reads
  // and writes this map via `engine['flags']`.
  private flags: Map<string, FlagDefinition> = new Map();
  private meter: Meter;
  // The OpenTelemetry Meter API only offers `create*` factories and has no
  // lookup-by-name, so the instruments are created once and held here.
  private readonly evaluationCounter: ReturnType<Meter['createCounter']>;
  private readonly evaluationLatency: ReturnType<Meter['createHistogram']>;

  /**
   * @param meterProvider Provider used to obtain the `feature-toggle-service` meter.
   */
  constructor(meterProvider: MeterProvider) {
    this.meter = meterProvider.getMeter('feature-toggle-service');
    this.evaluationCounter = this.meter.createCounter('flag_evaluation_total', {
      description: 'Total flag evaluations with outcome and environment tags'
    });
    this.evaluationLatency = this.meter.createHistogram('flag_evaluation_latency_ms', {
      description: 'Time taken to resolve flag state'
    });
  }

  /**
   * Adds a flag definition to the registry.
   *
   * @param flag Definition to store under `flag.key`.
   * @throws Error if a flag with the same key is already registered.
   */
  register(flag: FlagDefinition): void {
    if (this.flags.has(flag.key)) {
      throw new Error(`Flag "${flag.key}" is already registered.`);
    }
    this.flags.set(flag.key, flag);
  }

  /**
   * Resolves the value of a registered flag for the given context and emits
   * the evaluation metrics.
   *
   * When the flag lists `allowedEnvironments` and the context's environment is
   * not among them, rollout logic is bypassed and `defaultValue` is returned.
   * That path is still counted, so metrics cover every successful evaluation.
   *
   * @param key Key the flag was registered under.
   * @param context Caller context (environment plus optional identities).
   * @returns The resolved flag value.
   * @throws Error if no flag is registered under `key`.
   */
  evaluate(key: string, context: EvaluationContext): FlagValue {
    const start = performance.now();
    const flag = this.flags.get(key);
    if (!flag) {
      throw new Error(`Unregistered flag: ${key}`);
    }

    const environmentAllowed =
      !flag.allowedEnvironments || flag.allowedEnvironments.includes(context.environment);
    const resolved = environmentAllowed ? this.resolveRollout(flag, context) : flag.defaultValue;

    // Flag-level tags go first so the built-in attributes always win on a key clash.
    this.evaluationCounter.add(1, {
      ...flag.telemetryTags,
      flag: key,
      outcome: String(resolved),
      env: context.environment
    });
    this.evaluationLatency.record(performance.now() - start, { flag: key });
    return resolved;
  }

  /**
   * Applies the percentage rollout.
   *
   * With no percentage, or one of 100 and above, everyone gets `defaultValue`.
   * Otherwise the caller is placed in a cohort 0-99 derived from a stable hash
   * of `userId`, falling back to `tenantId`, then the literal `'anonymous'`.
   * Cohorts below the percentage get `defaultValue`; the rest get the inverted
   * value.
   */
  private resolveRollout(flag: FlagDefinition, context: EvaluationContext): FlagValue {
    if (flag.rolloutPercentage === undefined || flag.rolloutPercentage >= 100) {
      return flag.defaultValue;
    }
    const stableId = context.userId ?? context.tenantId ?? 'anonymous';
    const cohort = this.stableHash(stableId) % 100;
    return cohort < flag.rolloutPercentage ? flag.defaultValue : this.invertValue(flag.defaultValue);
  }

  /**
   * 32-bit FNV-1a hash over the UTF-16 code units of `input`.
   *
   * @returns An unsigned 32-bit integer; identical input always yields the same value.
   */
  private stableHash(input: string): number {
    let hash = 0x811c9dc5; // FNV-1a 32-bit offset basis
    for (let i = 0; i < input.length; i++) {
      hash ^= input.charCodeAt(i);
      hash = Math.imul(hash, 0x01000193); // FNV 32-bit prime, with 32-bit wraparound
    }
    return hash >>> 0; // reinterpret as unsigned so `% 100` is never negative
  }

  /**
   * Produces the "other" value for callers outside the rollout cohort:
   * booleans are negated and numbers change sign. Strings have no natural
   * opposite and are returned unchanged, so a percentage rollout has no
   * visible effect on string-valued flags.
   */
  private invertValue(value: FlagValue): FlagValue {
    if (typeof value === 'boolean') return !value;
    // Negating 0 would give -0, which fails Object.is/strict-equality checks against 0.
    if (typeof value === 'number') return value === 0 ? 0 : -value;
    return value;
  }
}
Architecture Rationale:
- Deterministic Hashing: Uses FNV-1a for consistent cohort assignment without external dependencies. Prevents session flickering.
- Explicit Context Typing: Forces callers to provide environment and identity data, reducing runtime ambiguity.
- Metric Emission on Every Evaluation: Guarantees observability coverage. Latency tracking catches evaluation bottlenecks before they impact request paths.
- Environment Allowlisting: Restricts rollout logic to explicitly listed environments, preventing accidental exposure in environments that are not ready for the feature while it is still in development.
Step 3: Remote Synchronization & Delta Gates
Flags should not be hardcoded. A remote configuration layer enables delta deploys by allowing runtime adjustments without redeployment. Implement a lightweight sync client that polls or subscribes to a flag service.
// flag-sync-client.ts
import { ToggleEngine, FlagDefinition } from './toggle-engine';
/**
 * Pulls flag definitions from a remote flag service and applies them to a
 * {@link ToggleEngine}, either on demand or on a fixed interval.
 */
export class FlagSyncClient {
  /** Handle of the active polling timer, if periodic sync is running. */
  private timer: ReturnType<typeof setInterval> | undefined;

  /**
   * @param engine Engine whose registry is updated by each sync.
   * @param endpoint Base URL of the flag service; `/flags` is appended to it.
   */
  constructor(private engine: ToggleEngine, private endpoint: string) {}

  /**
   * Fetches `${endpoint}/flags` once and upserts every returned definition:
   * known keys are overwritten in place, unknown keys are registered.
   * A missing `rolloutPercentage` in the payload is stored as `0`.
   *
   * @throws Error if the request fails, the response status is not 2xx, or the
   *   payload does not contain a `flags` array. Nothing is applied in that case.
   */
  async sync(): Promise<void> {
    const response = await fetch(`${this.endpoint}/flags`);
    if (!response.ok) {
      // Error bodies are not flag payloads; never feed them into the registry.
      throw new Error(`Flag sync failed: ${response.status} ${response.statusText}`);
    }
    const payload = await response.json();
    if (!Array.isArray(payload?.flags)) {
      throw new Error('Flag sync failed: response payload has no "flags" array.');
    }
    for (const raw of payload.flags) {
      const definition: FlagDefinition = {
        key: raw.key,
        defaultValue: raw.defaultValue,
        rolloutPercentage: raw.rolloutPercentage ?? 0,
        allowedEnvironments: raw.allowedEnvironments,
        telemetryTags: raw.telemetryTags
      };
      // `register` rejects duplicates, so existing keys are replaced directly
      // in the engine's registry map.
      if (this.engine['flags'].has(definition.key)) {
        this.engine['flags'].set(definition.key, definition);
      } else {
        this.engine.register(definition);
      }
    }
  }

  /**
   * Starts calling {@link sync} every `intervalMs` milliseconds. A failed sync
   * is logged and the schedule continues; the previously applied flags stay in
   * effect. Calling this again replaces the existing schedule.
   *
   * @param intervalMs Polling period in milliseconds (default 30000).
   */
  startPeriodicSync(intervalMs: number = 30000): void {
    this.stopPeriodicSync();
    this.timer = setInterval(() => {
      // Without this catch a single network error becomes an unhandled rejection.
      this.sync().catch((error: unknown) => {
        console.error('Periodic flag sync failed:', error);
      });
    }, intervalMs);
  }

  /** Stops the polling started by {@link startPeriodicSync}; a no-op if none is active. */
  stopPeriodicSync(): void {
    if (this.timer !== undefined) {
      clearInterval(this.timer);
      this.timer = undefined;
    }
  }
}
Delta Gate Strategy:
- Dark Launch: Deploy code with the flag registered at `0%`. Validate telemetry pipelines and error handling.
- Cohort Canary: Increase to `5–10%`. Monitor latency, error rates, and business metrics.
- Progressive Expansion: Scale to `25%`, `50%`, `100%` with automated guardrails.
- Stabilization: Lock flag state, remove conditional branching if permanent, or migrate to configuration.
Step 4: Observability Integration
Observability must be mandatory, not optional. Every flag evaluation should emit structured data that correlates with request traces, error logs, and business metrics.
// telemetry-bridge.ts
import { trace, Span } from '@opentelemetry/api';
/**
 * Helpers for attaching feature-flag state to OpenTelemetry spans.
 */
export class TelemetryBridge {
  /**
   * Records a flag evaluation on `span` as three attributes:
   * `feature.flag.key`, `feature.flag.value` (stringified) and
   * `feature.flag.context_hash` (a short hash of `context`, so the raw
   * context is not written to the span).
   *
   * @param span Span to annotate.
   * @param flagKey Key of the evaluated flag.
   * @param value Resolved flag value; converted with `String(value)`.
   * @param context Evaluation context to fingerprint; must be JSON-serializable.
   */
  static annotateSpan(span: Span, flagKey: string, value: unknown, context: Record<string, unknown>): void {
    span.setAttribute('feature.flag.key', flagKey);
    span.setAttribute('feature.flag.value', String(value));
    // Use the class name rather than `this` so the method still works when it
    // is passed around detached from the class.
    span.setAttribute('feature.flag.context_hash', TelemetryBridge.hashContext(context));
  }

  /**
   * Computes a non-cryptographic 32-bit hash of `ctx`.
   *
   * Object keys are sorted before serialization so that two contexts with the
   * same entries hash identically regardless of insertion order.
   *
   * @returns Eight lowercase hexadecimal digits.
   */
  private static hashContext(ctx: Record<string, unknown>): string {
    const serialized = JSON.stringify(ctx, (_key: string, val: unknown) => {
      if (val !== null && typeof val === 'object' && !Array.isArray(val)) {
        const source = val as Record<string, unknown>;
        const ordered: Record<string, unknown> = {};
        for (const name of Object.keys(source).sort()) {
          ordered[name] = source[name];
        }
        return ordered;
      }
      return val;
    });
    let hash = 0;
    for (let i = 0; i < serialized.length; i++) {
      const char = serialized.charCodeAt(i);
      hash = ((hash << 5) - hash) + char; // hash * 31 + char
      hash |= 0; // clamp to signed 32-bit
    }
    // Reinterpret as unsigned: a signed value would render as e.g. "-3f2a".
    return (hash >>> 0).toString(16).padStart(8, '0');
  }
}
Wire this into your request middleware to automatically attach flag states to distributed traces. This enables post-incident analysis by correlating flag exposure with latency spikes or error clusters.
Step 5: Rollback & Retirement Automation
Manual rollbacks are error-prone. Implement hard caps on rollout percentages for critical features. Require explicit approval workflows to exceed thresholds. For retirement, schedule automated deprecation alerts 14 days before flag removal. Strip conditional branches using codemods, then delete flag definitions from the registry.
Pitfall Guide
| Pitfall | Explanation | Fix |
|---|---|---|
| Non-deterministic Routing | Using Math.random() or session-based toggles causes inconsistent user experiences and breaks A/B test validity. | Implement stable hashing (FNV-1a, xxHash, or Murmur3) keyed to user/tenant ID. Cache resolved values per request lifecycle. |
| Ignoring Flag Retirement | Dead flags accumulate, increasing bundle size, evaluation latency, and cognitive overhead. New engineers inherit legacy conditionals. | Enforce lifecycle stages. Use automated cleanup scripts that scan for flags in Deprecated state for >30 days. Integrate with CI to block merges containing retired flags. |
| Client-Side Evaluation for Sensitive Logic | Exposing business rules, pricing tiers, or security toggles to browsers enables reverse engineering and unauthorized access. | Evaluate sensitive flags server-side. Use hybrid models where client receives a signed token representing resolved state, not raw rules. |
| Missing Observability Hooks | Blind rollouts hide performance degradation. Teams cannot correlate flag exposure with latency spikes or error clusters. | Mandate metric emission on every evaluation. Attach flag keys to distributed traces. Alert on evaluation latency >5ms or error rate >0.1%. |
| Testing Only the Active Path | CI pipelines skip disabled branches, allowing regressions to accumulate in unused code paths. | Parameterize unit and integration tests to run against all flag states. Use test harnesses that simulate 0%, 50%, and 100% rollout scenarios. |
| Environment Drift | Flags configured differently across staging and production cause false positives during validation. | Store flag definitions as infrastructure-as-code. Sync configurations via CI/CD pipelines. Validate parity before promotion. |
| Manual Toggle Fatigue | Relying on engineers to manually adjust percentages leads to inconsistent rollout pacing and human error. | Automate delta gates with scheduled percentage increments. Require approval only for threshold breaches or rollback triggers. |
Production Bundle
Action Checklist
Decision Matrix
| Scenario | Recommended Approach | Why | Cost Impact |
|---|---|---|---|
| High-security backend (payments, auth) | Server-side evaluation with signed tokens | Prevents rule exposure and unauthorized manipulation | Higher infrastructure cost, lower security risk |
| Low-latency mobile/web client | Client-side evaluation with periodic sync | Reduces network roundtrips, improves UX | Minimal infra cost, requires strict allowlisting |
| Cross-platform consistency (web, mobile, backend) | Hybrid model with centralized service | Ensures uniform rollout logic across all surfaces | Moderate infra cost, simplifies compliance |
| Experimental feature with rapid iteration | In-process engine with remote sync | Fast evaluation, easy rollback, low overhead | Near-zero infra cost, requires disciplined retirement |
Configuration Template
# flags-config.yaml
flags:
  - key: "new-checkout-flow"
    defaultValue: true
    rolloutPercentage: 10
    allowedEnvironments:
      - staging
      - production
    telemetryTags:
      team: "payments"
      tier: "critical"
    lifecycle: "experimental"
    successCriteria:
      maxErrorRate: 0.005
      latencyP99Threshold: 250ms
    retirement:
      enabled: true
      gracePeriodDays: 30
      cleanupScript: "scripts/remove-checkout-flags.sh"
// init-flags.ts
import { MeterProvider } from '@opentelemetry/sdk-metrics';
import { ToggleEngine } from './toggle-engine';
import { FlagSyncClient } from './flag-sync-client';
import { readFileSync } from 'fs';
import { parse } from 'yaml';
// Bootstrap: build the engine, seed it from the local YAML file, then keep it
// fresh by polling the remote flag service.
const meterProvider = new MeterProvider();
const engine = new ToggleEngine(meterProvider);

// Seed the registry from the checked-in configuration (path is relative to the
// process working directory).
const configText = readFileSync('./flags-config.yaml', 'utf8');
const config = parse(configText);
for (const { key, defaultValue, rolloutPercentage, allowedEnvironments, telemetryTags } of config.flags) {
  // Only the fields the engine understands are forwarded; any other keys in
  // the YAML entry are ignored here.
  engine.register({ key, defaultValue, rolloutPercentage, allowedEnvironments, telemetryTags });
}

// Poll the flag service every 30 s; falls back to a local service URL when
// FLAG_SERVICE_URL is unset or empty.
const flagServiceUrl = process.env.FLAG_SERVICE_URL || 'http://localhost:8080';
const syncClient = new FlagSyncClient(engine, flagServiceUrl);
syncClient.startPeriodicSync(30000);

export { engine };
Quick Start Guide
- Initialize the Engine: Import `ToggleEngine` and attach an OpenTelemetry `MeterProvider`. Register flags using the configuration template or programmatic definitions.
- Wire Evaluation into Request Path: Call `engine.evaluate('flag-key', context)` in middleware or route handlers. Pass environment, user/tenant ID, and metadata.
- Attach Telemetry: Use `TelemetryBridge.annotateSpan()` in your tracing middleware to correlate flag states with distributed traces. Verify metrics appear in your dashboard.
- Test Rollout Behavior: Run unit tests simulating `0%`, `10%`, and `100%` cohorts. Validate deterministic routing by hashing the same user ID multiple times and confirming consistent outcomes.
- Deploy & Monitor: Ship code with flags at `0%`. Validate telemetry pipelines. Increment rollout percentages using the sync client or remote API. Monitor latency and error rates before expansion.