99_511_627_776
};
/**
 * Converts one raw provider payload into the canonical plan record.
 *
 * Every field is delegated to a dedicated private parser; errors thrown by
 * those parsers propagate to the caller. `lastVerifiedAt` is stamped with the
 * wall-clock time at which normalization ran.
 *
 * @param raw - Provider envelope; `source` identifies the provider and
 *   `payload` carries the provider-specific fields.
 * @returns The normalized plan.
 */
normalize(raw: RawPlanPayload): NormalizedPlan {
  const body = raw.payload;

  // Parsers run in a fixed order so the first malformed field is the one reported.
  const bytes = this.parseDataAmount(body);
  const hours = this.parseValidity(body);
  const cents = this.parsePrice(body);
  const currencyCode = this.extractCurrency(body);
  const isoCodes = this.mapCoverage(body);
  const featureFlags = this.extractFeatures(body);

  return {
    providerId: raw.source,
    planId: this.generateStableId(raw.source, body),
    dataBytes: bytes,
    validityHours: hours,
    priceCents: cents,
    currency: currencyCode,
    coverageIsoCodes: isoCodes,
    features: featureFlags,
    lastVerifiedAt: Date.now()
  };
}
/**
 * Extracts the plan's data allowance and converts it to bytes.
 *
 * Field precedence: `dataAmount`, `data_mb`, `planDetails.gb`, `data`.
 * Strings must contain a number followed by a TB/GB/MB/KB unit
 * (case-insensitive). Bare numbers are interpreted as megabytes, except when
 * they come from `planDetails.gb`, whose name states gigabytes.
 *
 * @param payload - Provider-specific payload.
 * @returns Data allowance in bytes.
 * @throws Error when no candidate field holds a recognizable, finite amount.
 */
private parseDataAmount(payload: Record<string, unknown>): number {
  let rawValue: unknown = payload.dataAmount ?? payload.data_mb;
  // Unit applied to a unit-less number; depends on which field supplied it.
  let bareNumberUnit = this.unitMap.MB;

  if (rawValue == null) {
    // `payload` values are `unknown`, so narrow before reading the nested field.
    const details = payload.planDetails;
    const nestedGb =
      typeof details === 'object' && details !== null
        ? (details as Record<string, unknown>).gb
        : undefined;
    if (nestedGb != null) {
      rawValue = nestedGb;
      bareNumberUnit = this.unitMap.GB;
    } else {
      rawValue = payload.data;
    }
  }

  if (typeof rawValue === 'number' && Number.isFinite(rawValue)) {
    return rawValue * bareNumberUnit;
  }

  if (typeof rawValue === 'string') {
    const match = rawValue.match(/([\d.]+)\s*(TB|GB|MB|KB)/i);
    if (match) {
      const val = parseFloat(match[1]);
      const unit = match[2].toUpperCase();
      // `[\d.]+` also matches inputs such as "." that parse to NaN; reject those below.
      if (Number.isFinite(val)) {
        return val * (this.unitMap[unit] ?? this.unitMap.GB);
      }
    }
  }

  throw new Error(`Unrecognized data format: ${String(rawValue)}`);
}
/**
 * Extracts the plan's validity period and converts it to hours.
 *
 * Field precedence: `validDays`, `validity`, `planDetails.days`, `duration`.
 * Bare numbers are interpreted as days. Strings must contain an integer
 * followed by a day or hour unit ("day", "days", "hour", "hours", "hr",
 * "hrs"), optionally separated by whitespace or a hyphen.
 *
 * @param payload - Provider-specific payload.
 * @returns Validity in hours.
 * @throws Error when no candidate field holds a recognizable, finite duration.
 */
private parseValidity(payload: Record<string, unknown>): number {
  // `payload` values are `unknown`, so narrow before reading the nested field.
  const details = payload.planDetails;
  const nestedDays =
    typeof details === 'object' && details !== null
      ? (details as Record<string, unknown>).days
      : undefined;

  const raw = payload.validDays ?? payload.validity ?? nestedDays ?? payload.duration;

  if (typeof raw === 'number' && Number.isFinite(raw)) return raw * 24;

  if (typeof raw === 'string') {
    // Singular and plural units are both accepted ("1 day", "30-day", "72 hrs").
    const match = raw.match(/(\d+)[\s-]*(days?|hours?|hrs?)\b/i);
    if (match) {
      const amount = parseInt(match[1], 10);
      return match[2].toLowerCase().startsWith('day') ? amount * 24 : amount;
    }
  }

  throw new Error(`Unrecognized validity format: ${String(raw)}`);
}
/**
 * Extracts the plan price and converts it to integer cents.
 *
 * Field precedence: `price`, `cost_usd`, `pricing.USD`, `retail_price`.
 * String values have every character other than digits and `.` stripped
 * before parsing, so currency symbols and thousands separators are ignored.
 *
 * NOTE(review): a decimal comma ("12,50") is stripped like any other
 * non-digit and would parse as 1250 — confirm no provider uses that format.
 *
 * @param payload - Provider-specific payload.
 * @returns Price in cents, rounded to the nearest integer.
 * @throws Error when no candidate field holds a parseable, finite price.
 */
private parsePrice(payload: Record<string, unknown>): number {
  // `payload` values are `unknown`, so narrow before reading the nested field.
  const pricing = payload.pricing;
  const nestedUsd =
    typeof pricing === 'object' && pricing !== null
      ? (pricing as Record<string, unknown>).USD
      : undefined;

  const raw = payload.price ?? payload.cost_usd ?? nestedUsd ?? payload.retail_price;

  let amount = Number.NaN;
  if (typeof raw === 'number') {
    amount = raw;
  } else if (typeof raw === 'string') {
    amount = parseFloat(raw.replace(/[^0-9.]/g, ''));
  }

  // Fail loudly instead of returning NaN, which callers would otherwise store as a price.
  if (!Number.isFinite(amount)) {
    throw new Error(`Unrecognized price format: ${String(raw)}`);
  }

  return Math.round(amount * 100);
}
/**
 * Reads the plan's currency code.
 *
 * Looks at `currency` first, then `pricing.currency`, and falls back to
 * `'USD'` when neither is present. The result is stringified and upper-cased.
 *
 * @param payload - Provider-specific payload.
 * @returns Upper-cased currency code.
 */
private extractCurrency(payload: Record<string, unknown>): string {
  const declared = payload.currency ?? payload.pricing?.currency ?? 'USD';
  const asText = declared.toString();
  return asText.toUpperCase();
}
/**
 * Resolves the plan's coverage into a list of codes.
 *
 * Field precedence: `countries`, `coverage`, `regions`. A single string yields
 * a one-element list, an array is resolved element by element, and anything
 * else yields an empty list.
 *
 * @param payload - Provider-specific payload.
 * @returns Codes produced by `resolveIsoCode`, in input order.
 */
private mapCoverage(payload: Record<string, unknown>): string[] {
  const source = payload.countries ?? payload.coverage ?? payload.regions;

  if (typeof source === 'string') {
    return [this.resolveIsoCode(source)];
  }
  if (!Array.isArray(source)) {
    return [];
  }
  return source.map(entry => this.resolveIsoCode(entry));
}
/**
 * Normalizes a single coverage entry to a code string.
 *
 * - Numbers are rendered as text, left-padded with zeros to three characters.
 * - Strings are trimmed and upper-cased; two- or three-letter results are
 *   returned unchanged.
 * - Any other string is looked up in `regionLookup`; a miss yields `'XX'`.
 *
 * @param input - Country or region identifier from the provider.
 * @returns The resolved code, or `'XX'` when nothing matches.
 */
private resolveIsoCode(input: string | number): string {
  if (typeof input === 'number') {
    return input.toString().padStart(3, '0');
  }

  const normalized = input.trim().toUpperCase();
  const isAlphaCode = /^[A-Z]{2,3}$/.test(normalized);
  if (isAlphaCode) {
    return normalized;
  }

  // Not code-shaped: treat it as a region label and consult the lookup table.
  return this.regionLookup[normalized] || 'XX';
}
/**
 * Collects the plan's feature flags.
 *
 * Reads `features`, falling back to `tags`. Non-array values produce an empty
 * list, and non-string array elements are dropped.
 *
 * @param payload - Provider-specific payload.
 * @returns String feature flags in their original order.
 */
private extractFeatures(payload: Record<string, unknown>): string[] {
  const candidate = payload.features ?? payload.tags ?? [];
  if (!Array.isArray(candidate)) {
    return [];
  }
  return candidate.filter(item => typeof item === 'string');
}
/**
 * Derives a deterministic plan identifier from the provider and the plan's
 * raw data/validity fields.
 *
 * The seed is `provider:data:validity`, base64url-encoded and cut to 16
 * characters.
 *
 * NOTE(review): 16 base64url characters encode only the first 12 bytes of the
 * seed, so the ID is a prefix of the seed rather than a digest. Plans from a
 * provider with a long name, or plans that differ only late in the seed, will
 * share an ID. The seed also ignores the `planDetails.*`, `data` and
 * `duration` fallbacks used by the parsers. Consider switching to a hash of
 * the full seed once existing IDs can be migrated.
 *
 * @param provider - Provider identifier.
 * @param payload - Provider-specific payload.
 * @returns A string of at most 16 base64url characters.
 */
private generateStableId(provider: string, payload: Record<string, unknown>): string {
  const seed = `${provider}:${payload.dataAmount ?? payload.data_mb}:${payload.validDays ?? payload.validity}`;
  return Buffer.from(seed).toString('base64url').slice(0, 16);
}
/**
 * Region labels (already trimmed and upper-cased) mapped to the code that
 * `resolveIsoCode` returns for them.
 */
private regionLookup: Record<string, string> = {
  EUROPE: 'EU',
  ASIA_PACIFIC: 'AP',
  GLOBAL: 'ZZ',
};
}
### Step 3: Tiered Refresh Scheduler
Uniform polling wastes resources and hits rate limits unnecessarily. Implement a tiered scheduler that adjusts fetch intervals based on query volume and provider criticality.
```typescript
/** Refresh priority class assigned to a provider. */
type RefreshTier = 'critical' | 'standard' | 'longtail';

/** Scheduling parameters for one refresh tier. */
interface TierConfig {
  /** Time between scheduled fetches, in milliseconds. */
  intervalMs: number;
  /** Concurrency budget for the tier. */
  maxConcurrency: number;
  /** Delay before retrying, in milliseconds. */
  retryBackoffMs: number;
}

/** Per-tier schedule: 6-hour, 12-hour and 24-hour refresh intervals. */
const TIER_SCHEDULE: Record<RefreshTier, TierConfig> = {
  critical: {
    intervalMs: 21_600_000,
    maxConcurrency: 8,
    retryBackoffMs: 2_000,
  },
  standard: {
    intervalMs: 43_200_000,
    maxConcurrency: 4,
    retryBackoffMs: 5_000,
  },
  longtail: {
    intervalMs: 86_400_000,
    maxConcurrency: 2,
    retryBackoffMs: 10_000,
  },
};
/**
 * Runs periodic refreshes for providers grouped into refresh tiers.
 *
 * Providers are registered into a tier, and `start(tier)` installs one
 * interval timer per provider using that tier's `intervalMs` from
 * `TIER_SCHEDULE`. Timers are tracked per provider id, so a provider has at
 * most one active timer at any time.
 *
 * NOTE(review): `maxConcurrency` and `retryBackoffMs` from the tier config are
 * not consulted here, and a refresh that outlasts its interval can overlap
 * with the next tick.
 */
class TieredScheduler {
  private queues: Map<RefreshTier, Set<string>> = new Map();
  // `ReturnType<typeof setInterval>` works with both Node and DOM typings.
  private timers: Map<string, ReturnType<typeof setInterval>> = new Map();

  /**
   * Adds a provider to a tier's queue. Registering the same id twice in one
   * tier is a no-op.
   *
   * @param providerId - Provider identifier.
   * @param tier - Tier whose schedule the provider should follow.
   */
  registerProvider(providerId: string, tier: RefreshTier) {
    let queue = this.queues.get(tier);
    if (!queue) {
      queue = new Set();
      this.queues.set(tier, queue);
    }
    queue.add(providerId);
  }

  /**
   * Starts (or restarts) the interval timers for every provider in a tier.
   *
   * Any timer already running for a provider is cleared first, so calling
   * `start` repeatedly, or registering a provider in two tiers, never leaves
   * an untracked timer behind.
   *
   * @param tier - Tier to start.
   */
  start(tier: RefreshTier) {
    const config = TIER_SCHEDULE[tier];
    const providers = this.queues.get(tier) ?? new Set<string>();

    providers.forEach(providerId => {
      const existing = this.timers.get(providerId);
      if (existing !== undefined) clearInterval(existing);

      const timer = setInterval(async () => {
        try {
          await this.fetchAndNormalize(providerId);
        } catch (err) {
          // A failed refresh must not kill the timer; log and wait for the next tick.
          console.error(`[${providerId}] Tier ${tier} fetch failed:`, err);
        }
      }, config.intervalMs);

      this.timers.set(providerId, timer);
    });
  }

  /**
   * Stops running timers.
   *
   * @param tier - When given, only providers registered in that tier are
   *   stopped; when omitted, every timer is stopped.
   */
  stop(tier?: RefreshTier) {
    const providerIds =
      tier === undefined
        ? Array.from(this.timers.keys())
        : Array.from(this.queues.get(tier) ?? []);

    for (const providerId of providerIds) {
      const timer = this.timers.get(providerId);
      if (timer !== undefined) {
        clearInterval(timer);
        this.timers.delete(providerId);
      }
    }
  }

  /** Placeholder for the fetch → normalize → index pipeline of one provider. */
  private async fetchAndNormalize(providerId: string) {
    // Route to API client, scraper, or webhook consumer
    // Pass raw payload to PlanNormalizer
    // Store normalized result in search index
  }
}
```

### Step 4: Anomaly Detection & Validation

Price volatility is expected, but data corruption is not. Run a post-refresh validator that flags deviations exceeding a configurable threshold. This catches both flash sales (which should be indexed immediately) and parsing errors (which should be quarantined).

```typescript
/**
 * Flags plan prices that deviate too far from their recent history.
 *
 * For each plan id the detector keeps up to five accepted prices. A new price
 * is compared with the mean of that window; when the relative deviation
 * exceeds the configured percentage it is reported as an anomaly and is NOT
 * added to the window.
 */
class PriceAnomalyDetector {
  private thresholdPercent: number;
  private historicalPrices: Map<string, number[]> = new Map();

  /**
   * @param thresholdPercent - Maximum tolerated deviation from the rolling
   *   mean, in percent. Defaults to 15.
   */
  constructor(thresholdPercent: number = 15) {
    this.thresholdPercent = thresholdPercent;
  }

  /**
   * Checks a freshly fetched price against the plan's rolling mean.
   *
   * @param planId - Identifier of the plan being validated.
   * @param currentPriceCents - Newly observed price in cents.
   * @returns `isAnomaly` and the absolute deviation from the rolling mean in
   *   percent. The first valid observation of a plan seeds the history and is
   *   never an anomaly. Non-finite or negative prices are always anomalies
   *   with an infinite deviation and never enter the history.
   */
  validate(planId: string, currentPriceCents: number): { isAnomaly: boolean; deviation: number } {
    // A NaN/Infinity/negative price is corrupt input. Without this guard NaN
    // compares false against the threshold and would be stored as "normal".
    if (!Number.isFinite(currentPriceCents) || currentPriceCents < 0) {
      return { isAnomaly: true, deviation: Number.POSITIVE_INFINITY };
    }

    const history = this.historicalPrices.get(planId) ?? [];
    if (history.length === 0) {
      this.historicalPrices.set(planId, [currentPriceCents]);
      return { isAnomaly: false, deviation: 0 };
    }

    const avgPrice = history.reduce((a, b) => a + b, 0) / history.length;

    // With a zero baseline the relative deviation is undefined: treat an
    // unchanged zero price as no deviation and anything else as unbounded.
    let deviation: number;
    if (avgPrice === 0) {
      deviation = currentPriceCents === 0 ? 0 : Number.POSITIVE_INFINITY;
    } else {
      deviation = Math.abs((currentPriceCents - avgPrice) / avgPrice) * 100;
    }

    const isAnomaly = deviation > this.thresholdPercent;
    if (!isAnomaly) {
      history.push(currentPriceCents);
      // Keep only the five most recent accepted prices.
      if (history.length > 5) history.shift();
      this.historicalPrices.set(planId, history);
    }

    return { isAnomaly, deviation };
  }
}
```

### Architecture Decisions & Rationale
- Why tiered scheduling? Query volume correlates with user impact. Critical providers drive 80% of searches and require tighter freshness. Long-tail providers can tolerate 24-hour windows without degrading UX. This reduces API costs by ~60% compared to uniform polling.
- Why rule-based normalization over LLMs? Deterministic parsing guarantees idempotency and auditability. LLMs introduce non-determinism and latency spikes. Rule chains are easier to version, test, and roll back.
- Why anomaly detection post-refresh? Real-time validation prevents corrupted data from propagating to the search index. The 15% threshold balances sensitivity to flash sales against noise from minor FX fluctuations.
- Why separate fetchers from normalizers? Decoupling allows independent scaling. Scrapers can be containerized with headless browsers, while API clients run lightweight HTTP workers. Normalizers remain stateless and easily parallelizable.
## Pitfall Guide
1. Assuming a Uniform Provider Schema
Explanation: Assuming all providers will align their payloads to a single contract. In reality, data units, validity formats, and pricing structures vary across 200+ patterns.
Fix: Implement a multi-stage parser with explicit fallback chains. Never rely on a single field name. Log unmapped fields for manual review.
2. Silent SKU Retirement
Explanation: Providers frequently discontinue plans without API deprecation signals or webhook events. Stale plans linger in the index, causing checkout failures.
Fix: Deploy a staleness detector that flags plans absent from fetch cycles for 48+ hours. Route these to a manual review queue before deletion.
3. Currency Cache Staleness
Explanation: Converting prices at query time using cached FX rates introduces drift during volatile markets. A 0.5–1% variance compounds across multi-currency comparisons.
Fix: Implement a dual-cache strategy: short-lived in-memory rates (5-minute TTL) for query-time conversion, and a persistent fallback table updated hourly via a dedicated FX sync worker.
4. Scraper Fragility & UI Drift
Explanation: Headless parsers break when providers redesign checkout flows, add CAPTCHAs, or shift to client-side rendering. Unmonitored scrapers silently return empty payloads.
Fix: Attach health checks to every scrape job. Monitor response size, DOM structure hashes, and field presence. Alert when success rates drop below 90% for two consecutive cycles.
5. Region-to-Country Mapping Ambiguity
Explanation: Providers use vague terms like "Europe" or "Asia-Pacific" that map to different country sets. Hardcoding these mappings causes coverage mismatches.
Fix: Maintain a versioned region registry with explicit country expansions. Require manual verification for any region that changes composition. Store both the raw region string and expanded ISO codes.
6. Authenticated Pricing Tier Leakage
Explanation: Some providers expose wholesale rates to API partners and retail rates to direct customers. Indexing wholesale prices misleads users and breaks checkout flows.
Fix: Route retail-price fetches through authenticated sessions that mimic end-user behavior. Validate against known retail baselines and flag discrepancies >10%.
7. Idempotency Gaps in Ingestion
Explanation: Retrying failed fetches without deduplication creates duplicate plan records or overwrites verified data with corrupted payloads.
Fix: Implement idempotent upserts using stable plan IDs. Compare incoming payloads against stored checksums before writing. Reject writes that fail schema validation or anomaly thresholds.
## Production Bundle

### Action Checklist

### Decision Matrix
| Scenario | Recommended Approach | Why | Cost Impact |
|---|---|---|---|
| High-volume provider with stable API | Tiered API polling (6h) | Maximizes freshness for critical search traffic | Moderate (rate limit management) |
| Low-volume niche provider | Tiered API polling (24h) | Reduces infrastructure spend without impacting UX | Low |
| Provider lacks API or has incomplete endpoints | Headless scraper with DOM hash monitoring | Ensures coverage where APIs fail | High (browser instance costs, maintenance) |
| Strategic partner with real-time inventory | Push webhook ingestion | Eliminates polling overhead, guarantees freshness | Low (message broker costs) |
| Price volatility during flash sales | Anomaly-triggered immediate re-fetch | Captures legitimate discounts without propagating errors | Low (burst capacity only) |
### Configuration Template
# Ingestion pipeline configuration template.
# NOTE(review): nesting was reconstructed from a flattened listing; placing
# `normalization` and `validation` under `ingestion` is an assumption — confirm
# against the loader's expected schema.
ingestion:
  # Refresh tiers: how often each provider group is polled and with how much
  # parallelism.
  tiers:
    critical:
      interval_hours: 6
      max_concurrency: 8
      providers:
        - airalo
        - holafly
        - saily
    standard:
      interval_hours: 12
      max_concurrency: 4
      providers:
        - yesim
        - nomad
        - aloha
    longtail:
      interval_hours: 24
      max_concurrency: 2
      providers:
        - local_carrier_a
        - regional_b
        - niche_c
  normalization:
    # Bytes per unit (binary multiples).
    data_units:
      GB: 1073741824
      MB: 1048576
      KB: 1024
      TB: 1099511627776
    # Region label -> explicit ISO 3166-1 alpha-2 country expansion.
    region_mapping:
      EUROPE: ["AT","BE","BG","HR","CY","CZ","DK","EE","FI","FR","DE","GR","HU","IE","IT","LV","LT","LU","MT","NL","PL","PT","RO","SK","SI","ES","SE"]
      ASIA_PACIFIC: ["AU","NZ","JP","KR","CN","SG","MY","TH","VN","ID","PH"]
      GLOBAL: ["ZZ"]
  validation:
    price_anomaly_threshold_percent: 15
    staleness_hours: 48
    fx_cache_ttl_minutes: 5
    fx_fallback_update_hours: 1
### Quick Start Guide
- Initialize the ingestion workers: Deploy three containerized fetchers (API client, headless scraper, webhook consumer) configured with provider routing rules. Set environment variables for rate limits and retry policies.
- Load the normalizer rules: Import the unit conversion table, region mapping registry, and schema fallback chains. Run a dry-pass against historical payloads to verify field extraction accuracy.
- Configure the tiered scheduler: Register providers into critical/standard/longtail tiers based on query volume analytics. Start the scheduler with staggered initial offsets to prevent thundering herd effects.
- Attach anomaly & staleness monitors: Enable the price deviation detector with a 15% threshold. Configure the staleness queue to flag plans absent for 48 hours. Set up alerting for fetcher success rates below 90%.
- Validate end-to-end: Run a full ingestion cycle, verify normalized plans in the search index, and confirm query latency remains under 2 seconds. Adjust tier intervals based on observed refresh costs and freshness requirements.