[];
/**
 * Compiles every caller-supplied pattern source into a case-insensitive RegExp
 * and stores the result for later infrastructure classification.
 *
 * @param customInfraPatterns Regular-expression sources; an error message matching
 *   any of them is categorized as 'INFRA' by `normalize`.
 * @throws SyntaxError when a source is not a valid regular expression (raised by `RegExp`).
 */
constructor(customInfraPatterns: string[]) {
  const compileIgnoringCase = (source: string): RegExp => new RegExp(source, 'i');
  this.infraPatterns = customInfraPatterns.map(compileIgnoringCase);
}
/**
 * Converts raw CI failures into failure signatures.
 *
 * Each signature keeps the original suite, test and error text, adds a canonical
 * grouping key derived from the error, and is tagged 'INFRA' when the error matches
 * any registered infrastructure pattern, otherwise 'UNKNOWN'.
 *
 * @param rawFailures Failures as reported by the CI run.
 * @returns One signature per input failure, in input order.
 */
normalize(rawFailures: Array<{ suite: string; test: string; error: string }>): FailureSignature[] {
  const toSignature = (
    { suite, test, error }: { suite: string; test: string; error: string }
  ): FailureSignature => {
    const normalizedKey = this.extractCanonicalKey(error);
    const matchesInfra = this.infraPatterns.some(pattern => pattern.test(error));
    return {
      suite,
      test,
      rawError: error,
      normalizedKey,
      category: matchesInfra ? 'INFRA' : 'UNKNOWN'
    };
  };

  return rawFailures.map(toSignature);
}
/**
 * Builds a stable grouping key from an error message by replacing run-specific
 * values with fixed placeholders.
 *
 * Replaced, in order:
 *  1. IPv4 `host:port` pairs  -> `[HOST:PORT]`
 *  2. UUIDs (any letter case) -> `[UUID]`
 *  3. Stack-frame locations (`at ... :line:col`) -> `at [STACK_TRACE]`
 *
 * Timestamps are not rewritten by this method.
 *
 * @param error Raw error text from a failed test.
 * @returns The error text with the values above replaced and surrounding whitespace trimmed.
 */
private extractCanonicalKey(error: string): string {
  return error
    // The dots must be escaped: an unescaped `.` matches any character, which made
    // plain digit runs such as "1234567:89" look like an address and get rewritten.
    .replace(/\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d+\b/g, '[HOST:PORT]')
    .replace(/\b[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}\b/gi, '[UUID]')
    // `.` does not cross newlines, so each stack frame line is collapsed separately.
    .replace(/at\s+.*:\d+:\d+/g, 'at [STACK_TRACE]')
    .trim();
}
}
**Why this design:** Dynamic values in stack traces prevent accurate deduplication. By stripping IPs, ports, UUIDs, and line numbers, we create stable grouping keys that allow the system to recognize that 40 failing tests sharing an `ECONNREFUSED` pattern represent a single infrastructure event, not 40 independent regressions.
### Phase 2: Context Enrichment
Once failures are normalized, the orchestrator queries two external knowledge sources: a flakiness baseline and a code-change impact map. These are typically provided by dedicated MCP servers that maintain historical run data and AST-level dependency graphs.
```typescript
/** One entry of the flakiness baseline passed to `ContextEnricher.enrich`. */
interface FlakinessRecord {
/** Compared by `ContextEnricher.enrich` against the string `${suite} > ${test}`. */
testIdentifier: string;
/** Used as the failure's flakiness score when the record matches. */
failureProbability: number; // 0.0 to 1.0
/** Not read by the enrichment code shown alongside this interface. */
lastObserved: string; // ISO date
}
/** Code-change impact data passed to `ContextEnricher.enrich`. */
interface CodeImpactMap {
/** Presumably the source files modified by the change under test; not read by the enrichment code shown here — confirm with the producer. */
changedFiles: string[];
/** Test identifiers; `ContextEnricher.enrich` marks a failure as code-correlated when `${suite} > ${test}` appears in this list. */
affectedTests: string[];
}
/**
 * Adds flakiness and code-change context to failure signatures and derives a
 * risk level for each one.
 */
class ContextEnricher {
  /**
   * Enriches every signature with its flakiness score, code-correlation flag and risk level.
   *
   * Tests are identified by the string `${suite} > ${test}` in both lookup sources.
   *
   * @param signatures Failure signatures to enrich.
   * @param flakinessDB Flakiness baseline; when several records share an identifier the
   *   first one wins. A test with no record gets a flakiness score of 0.
   * @param impactMap Impact map whose `affectedTests` lists the identifiers of tests
   *   considered correlated with the change.
   * @returns One enriched failure per signature, in input order.
   */
  async enrich(
    signatures: FailureSignature[],
    flakinessDB: FlakinessRecord[],
    impactMap: CodeImpactMap
  ): Promise<EnrichedFailure[]> {
    // Index both sources once so each signature costs a constant-time lookup instead
    // of a linear scan of the baseline and of the affected-test list.
    const flakinessByTest = new Map<string, number>();
    for (const record of flakinessDB) {
      // Keep the first record per identifier, matching the first-match lookup it replaces.
      if (!flakinessByTest.has(record.testIdentifier)) {
        flakinessByTest.set(record.testIdentifier, record.failureProbability);
      }
    }
    const affectedTests = new Set(impactMap.affectedTests);

    return signatures.map(sig => {
      const testIdentifier = `${sig.suite} > ${sig.test}`;
      const flakinessScore = flakinessByTest.get(testIdentifier) ?? 0;
      const isCodeCorrelated = affectedTests.has(testIdentifier);
      return {
        ...sig,
        flakinessScore,
        isCodeCorrelated,
        riskLevel: this.computeRisk(sig.category, flakinessScore, isCodeCorrelated)
      };
    });
  }

  /**
   * Maps a failure's category, flakiness and code correlation to a risk level.
   *
   * Rules are evaluated in order; the first match wins:
   *  1. infrastructure failures            -> IGNORE
   *  2. flakiness above 0.6                -> IGNORE
   *  3. correlated with the code change    -> BLOCKER
   *  4. flakiness above 0.2                -> WARNING
   *  5. anything else                      -> BLOCKER
   *
   * @param category Failure category; only the value 'INFRA' is treated specially.
   * @param flakiness Historical failure probability of the test.
   * @param isCorrelated Whether the test is listed as affected by the change.
   * @returns The risk level for the failure.
   */
  private computeRisk(category: string, flakiness: number, isCorrelated: boolean): 'BLOCKER' | 'WARNING' | 'IGNORE' {
    if (category === 'INFRA') return 'IGNORE';
    if (flakiness > 0.6) return 'IGNORE';
    // INFRA has already returned above, so no category re-check is needed here.
    if (isCorrelated) return 'BLOCKER';
    if (flakiness > 0.2) return 'WARNING';
    return 'BLOCKER';
  }
}
```
**Why this design:** Flakiness probability must be weighted against code-change correlation. A test with a 0.73 historical failure rate is statistically unlikely to indicate a regression, even if it fails. Conversely, a test with zero flakiness history that correlates with modified source files represents a high-confidence blocker. The risk computation explicitly separates environmental noise from logical regressions.
### Phase 3: Verdict Generation
The final phase aggregates enriched signals into a release recommendation with a quantified confidence score. The output is structured for direct consumption by CI platforms, PR comments, or Slack notifications.
/** Release recommendation returned by `VerdictGenerator.generate`. */
interface ReleaseVerdict {
/** 'NO_GO' when any failure is a blocker, else 'REVIEW_REQUIRED' when any is a warning, else 'GO'. */
status: 'GO' | 'NO_GO' | 'REVIEW_REQUIRED';
/** Score computed by `VerdictGenerator` from the blocker, ignored and warning counts. */
confidence: number; // 0.0 to 1.0
/** Identifiers (`${suite} > ${test}`) of failures with risk level 'BLOCKER'. */
blockers: string[];
/** Identifiers (`${suite} > ${test}`) of failures with risk level 'IGNORE'. */
safeToIgnore: string[];
/** Human-readable one-line explanation matching `status`. */
summary: string;
}
/**
 * Turns enriched failures into a release verdict with a confidence score.
 */
class VerdictGenerator {
  /**
   * Buckets failures by risk level and derives status, confidence and summary.
   *
   * Any blocker yields 'NO_GO'; otherwise any warning yields 'REVIEW_REQUIRED';
   * otherwise the verdict is 'GO'.
   *
   * @param enrichedFailures Failures carrying a `riskLevel`, `suite` and `test`.
   * @returns The verdict; test identifiers are formatted as `${suite} > ${test}`.
   */
  generate(enrichedFailures: EnrichedFailure[]): ReleaseVerdict {
    const blocking: EnrichedFailure[] = [];
    const noise: EnrichedFailure[] = [];
    const ambiguous: EnrichedFailure[] = [];

    enrichedFailures.forEach(failure => {
      switch (failure.riskLevel) {
        case 'BLOCKER':
          blocking.push(failure);
          break;
        case 'IGNORE':
          noise.push(failure);
          break;
        case 'WARNING':
          ambiguous.push(failure);
          break;
        default:
          break;
      }
    });

    const describe = (failure: EnrichedFailure): string => `${failure.suite} > ${failure.test}`;
    const confidence = this.calculateConfidence(blocking.length, noise.length, ambiguous.length);

    let status: ReleaseVerdict['status'];
    let summary: string;
    if (blocking.length > 0) {
      status = 'NO_GO';
      summary = `${blocking.length} confirmed regression(s) correlated with code changes. Release blocked.`;
    } else if (ambiguous.length > 0) {
      status = 'REVIEW_REQUIRED';
      summary = `${ambiguous.length} ambiguous failure(s) require manual validation.`;
    } else {
      status = 'GO';
      summary = 'Pipeline clear. No blocking regressions detected.';
    }

    return {
      status,
      confidence,
      blockers: blocking.map(describe),
      safeToIgnore: noise.map(describe),
      summary
    };
  }

  /**
   * Scores how clearly the failures separate into noise versus regressions.
   *
   * With no failures the score is 1.0. Otherwise it is a base of 0.85 when blockers
   * exist (0.95 when none do) plus up to 0.1 scaled by the ignored share of all
   * failures, capped at 0.99.
   *
   * @param blockers Number of 'BLOCKER' failures.
   * @param ignored Number of 'IGNORE' failures.
   * @param warnings Number of 'WARNING' failures.
   * @returns The confidence score.
   */
  private calculateConfidence(blockers: number, ignored: number, warnings: number): number {
    const total = blockers + ignored + warnings;
    if (total === 0) {
      return 1.0;
    }
    const base = blockers > 0 ? 0.85 : 0.95;
    const noiseShare = ignored / total;
    const bonus = noiseShare * 0.1;
    return Math.min(0.99, base + bonus);
  }
}
**Architecture Rationale:** The pipeline follows a strict separation of concerns. Normalization handles log volatility, enrichment handles historical and topological context, and verdict generation handles business logic. This design enables independent scaling of each phase, straightforward testing, and seamless integration with the broader MCP ecosystem. The orchestrator delegates flakiness tracking to `flakiness-knowledge-graph-mcp`, AST impact analysis to `ast-impact-mapper-mcp`, and trace decoding to `playwright-trace-decoder-mcp`, maintaining a clean boundary between orchestration and domain-specific tooling.
Pitfall Guide
1. Treating All Failures as Equal Weight
Explanation: CI platforms often report failures without severity classification. Assuming every failed test carries equal release risk leads to unnecessary rollbacks and deployment delays.
Fix: Implement explicit risk categorization based on flakiness baselines and code correlation. Only tests with low historical failure rates and direct source-file correlation should trigger NO_GO status.
2. Stale Flakiness Baselines
Explanation: Flakiness probabilities drift as test suites are refactored, infrastructure is upgraded, or test data changes. Using a 6-month-old baseline produces false negatives on newly unstable tests.
Fix: Apply a decay window to historical data. Weight recent runs (last 30 days) at 70% and older runs at 30%. Trigger baseline recalculation when test suite composition changes by more than 15%.
3. Over-Reliance on AST Mapping
Explanation: Static analysis tools like ast-impact-mapper-mcp identify syntactic dependencies but miss runtime coupling, shared fixtures, or environment-dependent behavior. A test may fail due to a changed utility function that the AST mapper doesn't flag.
Fix: Combine AST output with execution trace analysis. Use playwright-trace-decoder-mcp or equivalent runtime profilers to validate static predictions. Maintain a fallback correlation heuristic based on shared test data directories.
4. Hardcoded Confidence Thresholds
Explanation: Fixed thresholds (e.g., always block at 0.7 confidence) break under varying pipeline loads. A high-throughput repository may tolerate lower confidence during off-hours, while a hotfix branch requires stricter gates.
Fix: Implement dynamic thresholding based on branch type, deployment window, and historical false-positive rates. Expose threshold configuration as environment variables rather than embedding them in the orchestrator logic.
5. Ignoring Custom Infrastructure Patterns
Explanation: Cloud providers emit environment-specific errors (GCP quota exceeded, AWS throttling, Kubernetes OOMKilled). Without explicit pattern registration, these are misclassified as application regressions.
Fix: Maintain a versioned pattern registry. Update it alongside infrastructure-as-code changes. Validate new patterns against a holdout set of historical runs before deploying to production triage.
6. Cascading Failure Misattribution
Explanation: A single infrastructure event (e.g., database connection pool exhaustion) can trigger dozens of seemingly unrelated test failures. Deduplication that only groups by error string may miss cross-suite correlations.
Fix: Implement temporal clustering. Group failures that occur within the same execution window and share underlying resource contention signatures. Tag the entire cluster as a single infra event.
7. No Rollback Strategy for False NO_GOs
Explanation: Automated blocking without manual override capability can halt critical deployments during false positives. Engineering teams lose trust in the system and bypass it entirely.
Fix: Implement a two-tier gate: automated NO_GO with a mandatory 5-minute review window, followed by explicit human override logging. Track override frequency to recalibrate confidence scoring.
Production Bundle
Action Checklist
Decision Matrix
| Scenario | Recommended Approach | Why | Cost Impact |
|---|---|---|---|
| Hotfix to production | Strict NO_GO with 0.85+ confidence threshold | Zero tolerance for regressions during emergency deployments | Low (prevents rollback costs) |
| Feature branch PR | Standard triage with REVIEW_REQUIRED fallback | Balances velocity with regression detection | Medium (minor delay for ambiguous cases) |
| Scheduled nightly build | Relaxed threshold, infra-only blocking | High flakiness tolerance acceptable for non-release pipelines | Low (reduces alert fatigue) |
| Multi-service monorepo | AST + trace correlation with temporal clustering | Prevents cross-service noise from blocking unrelated changes | High (requires additional compute) |
Configuration Template
{
"mcpServers": {
"ci-triage-orchestrator": {
"command": "npx",
"args": ["-y", "release-readiness-triage-mcp"],
"env": {
"FLAKINESS_DECAY_DAYS": "30",
"CONFIDENCE_THRESHOLD_HOTFIX": "0.85",
"CONFIDENCE_THRESHOLD_FEATURE": "0.70",
"INFRA_PATTERN_VERSION": "v2.4.1",
"OUTPUT_FORMAT": "markdown",
"ENABLE_TEMPORAL_CLUSTERING": "true"
}
}
}
}
Quick Start Guide
- Install the orchestrator package via `npx` and configure the MCP server definition in your IDE or CI runner configuration.
- Connect the flakiness knowledge graph and AST impact mapper endpoints. Ensure both services expose stable JSON-RPC interfaces compatible with the orchestrator's enrichment phase.
- Register your infrastructure error patterns using the environment variable registry. Validate against a historical run to confirm accurate `INFRA` classification.
- Trigger a test triage by passing raw CI output, flakiness snapshots, and the current PR diff to the orchestrator. Review the generated markdown verdict in your pull request comment.
- Monitor override frequency and confidence calibration weekly. Adjust decay windows and thresholds based on false-positive trends before enabling automated release blocking.