ecord<RiskDimension, RiskDimensionConfig> = {
security: {
weight: 0.35,
thresholds: { critical: 80, warning: 50 }
},
stability: {
weight: 0.25,
thresholds: { critical: 70, warning: 40 }
},
debt: {
weight: 0.20,
thresholds: { critical: 60, warning: 30 }
},
operational: {
weight: 0.20,
thresholds: { critical: 75, warning: 45 }
}
};
#### 2. Metric Normalization
Raw metrics must be normalized to a 0-100 scale where higher values indicate higher risk.
```typescript
/**
 * Converts raw service metrics into risk scores on a 0-100 scale, where a
 * higher value means higher risk.
 *
 * Every normalizer clamps its result to the [0, 100] range. Negative inputs
 * (which have no physical meaning for counts, rates or ratios) are scored as
 * 0 instead of leaking negative, `-Infinity` or `NaN` scores into downstream
 * weighted sums. A `NaN` input still propagates as `NaN` so that missing data
 * is not silently reported as "no risk".
 */
export class MetricNormalizer {
  /**
   * Scores an open-vulnerability count on a logarithmic curve.
   *
   * The first few vulnerabilities raise the score quickly and each additional
   * one contributes less: 1 -> 15, 3 -> 30, and the score saturates at 100
   * from roughly 101 vulnerabilities upward.
   *
   * @param count Number of vulnerabilities; values below 0 are treated as 0.
   * @returns Risk score in [0, 100].
   */
  static normalizeVulnerabilityCount(count: number): number {
    // Floor the input before taking the logarithm: log2 of a value <= 0 would
    // produce -Infinity or NaN rather than a usable score.
    const safeCount = Math.max(0, count);
    return MetricNormalizer.clampScore(Math.log2(safeCount + 1) * 15);
  }

  /**
   * Scores an error rate linearly; a 10% error rate already maps to the
   * maximum risk of 100.
   *
   * @param rate Error rate as a fraction between 0.0 and 1.0.
   * @returns Risk score in [0, 100].
   */
  static normalizeErrorRate(rate: number): number {
    return MetricNormalizer.clampScore(rate * 1000);
  }

  /**
   * Scores a technical-debt ratio (e.g. as reported by a static analysis tool
   * such as SonarQube) by converting the fraction to a percentage.
   *
   * @param ratio Debt ratio as a fraction; 1.0 or more maps to 100.
   * @returns Risk score in [0, 100].
   */
  static normalizeTechnicalDebtRatio(ratio: number): number {
    return MetricNormalizer.clampScore(ratio * 100);
  }

  /** Clamps a raw score to the [0, 100] risk scale (NaN stays NaN). */
  private static clampScore(score: number): number {
    return Math.min(100, Math.max(0, score));
  }
}
```
#### 3. Portfolio Risk Calculator
import { MetricNormalizer } from './MetricNormalizer';
import { DEFAULT_DIMENSIONS, RiskDimension } from './RiskDimensions';
import type { RiskDimensionConfig } from './RiskDimensions';
/**
 * Raw, un-normalized metrics for a single service; this is the input to
 * `PortfolioRiskCalculator.calculate`.
 */
export interface ServiceMetrics {
/** Identifier of the service; copied unchanged into the resulting RiskScore. */
serviceId: string;
/** Vulnerability count; drives the security dimension score. */
vulnCount: number;
/** Error rate as a fraction from 0.0 to 1.0; contributes 60% of the stability score. */
errorRate: number;
/** Technical-debt ratio as a fraction (multiplied by 100 to obtain the debt score). */
debtRatio: number;
/** Mean time to recovery in minutes; 120 minutes or more counts as maximum risk and contributes 40% of the stability score. */
mttrMinutes: number;
/** Drives the operational score: fewer than 1 per week scores 80, fewer than 5 scores 40, otherwise 10. */
deploymentFrequency: number; // deployments per week
}
/** Result of scoring one service with `PortfolioRiskCalculator.calculate`. */
export interface RiskScore {
/** Identifier of the scored service, taken from the input metrics. */
serviceId: string;
/** Weighted sum of the per-dimension scores, rounded to the nearest integer. */
compositeScore: number;
/** Score for each individual risk dimension, before weighting. */
dimensions: Record<RiskDimension, number>;
/** Overall classification derived from the per-dimension thresholds and the composite score. */
riskLevel: 'low' | 'medium' | 'high' | 'critical';
}
/**
 * Computes a weighted composite risk score and an overall risk level for a
 * service from its raw metrics.
 *
 * Each risk dimension (security, stability, debt, operational) is scored
 * separately, multiplied by its configured weight and summed into the
 * composite score.
 */
export class PortfolioRiskCalculator {
  private readonly dimensions: Record<RiskDimension, RiskDimensionConfig>;

  /**
   * @param configOverrides Optional per-dimension configuration that replaces
   *   the default configuration of the dimensions it names. Entries that are
   *   explicitly `undefined` are ignored, so they can no longer erase a
   *   default and cause a crash when that dimension is scored.
   */
  constructor(configOverrides?: Partial<Record<RiskDimension, RiskDimensionConfig>>) {
    const merged: Record<RiskDimension, RiskDimensionConfig> = { ...DEFAULT_DIMENSIONS };
    if (configOverrides) {
      for (const dim of Object.keys(configOverrides) as RiskDimension[]) {
        const override = configOverrides[dim];
        if (override !== undefined) {
          merged[dim] = override;
        }
      }
    }
    this.dimensions = merged;
  }

  /**
   * Scores a single service.
   *
   * Risk level rules, in priority order:
   *  1. `critical` if any dimension reaches its critical threshold;
   *  2. `high` if any dimension reaches its warning threshold;
   *  3. `medium` if no threshold is hit but the composite score is >= 50;
   *  4. `low` otherwise.
   *
   * NOTE(review): with the default weights and thresholds, the composite
   * stays below 50 whenever every dimension is under its warning threshold,
   * so `medium` appears reachable only with overridden configuration.
   * Confirm whether that is intended.
   *
   * @param service Raw metrics of the service to score.
   * @returns The composite score (rounded to an integer), the un-weighted
   *   per-dimension scores and the derived risk level.
   */
  calculate(service: ServiceMetrics): RiskScore {
    // Stability combines error rate (60%) and MTTR (40%).
    const errorRisk = MetricNormalizer.normalizeErrorRate(service.errorRate);
    // 120 minutes or more is maximum risk; a negative MTTR is floored at 0 so
    // it cannot reduce the stability score below the error-rate contribution.
    const mttrRisk = Math.max(0, Math.min(100, (service.mttrMinutes / 120) * 100));

    // Operational risk is the inverse of deployment frequency:
    // rare deployments are treated as high risk.
    let deployRisk = 10;
    if (service.deploymentFrequency < 1) {
      deployRisk = 80;
    } else if (service.deploymentFrequency < 5) {
      deployRisk = 40;
    }

    // Built as a complete record so the compiler verifies that every
    // dimension has a score (no Partial + type assertion needed).
    const dimensionScores: Record<RiskDimension, number> = {
      security: MetricNormalizer.normalizeVulnerabilityCount(service.vulnCount),
      stability: errorRisk * 0.6 + mttrRisk * 0.4,
      debt: MetricNormalizer.normalizeTechnicalDebtRatio(service.debtRatio),
      operational: deployRisk,
    };
    const scoredDimensions = Object.keys(dimensionScores) as RiskDimension[];

    // Weighted composite, rounded to the nearest integer.
    let weightedSum = 0;
    for (const dim of scoredDimensions) {
      weightedSum += dimensionScores[dim] * this.dimensions[dim].weight;
    }
    const compositeScore = Math.round(weightedSum);

    const reaches = (level: 'critical' | 'warning'): boolean =>
      scoredDimensions.some(
        (dim) => dimensionScores[dim] >= this.dimensions[dim].thresholds[level],
      );

    let riskLevel: RiskScore['riskLevel'] = 'low';
    if (reaches('critical')) {
      riskLevel = 'critical';
    } else if (reaches('warning')) {
      riskLevel = 'high';
    } else if (compositeScore >= 50) {
      riskLevel = 'medium';
    }

    return {
      serviceId: service.serviceId,
      compositeScore,
      dimensions: dimensionScores,
      riskLevel,
    };
  }
}
#### 4. Usage Example
// No overrides are passed, so the calculator uses the default dimension configuration.
const calculator = new PortfolioRiskCalculator();
// Sample raw metrics for one service.
const serviceMetrics: ServiceMetrics = {
serviceId: 'payment-gateway',
vulnCount: 12,
errorRate: 0.02,
debtRatio: 0.15,
mttrMinutes: 45,
deploymentFrequency: 3
};
const riskReport = calculator.calculate(serviceMetrics);
console.log(riskReport);
/* Output (default weights 0.35 / 0.25 / 0.20 / 0.20; security shown rounded to two decimals):
{
serviceId: 'payment-gateway',
compositeScore: 37,
dimensions: { security: 55.51, stability: 27, debt: 15, operational: 40 },
riskLevel: 'high'
}
How the numbers arise:
security    = log2(12 + 1) * 15              ~= 55.51
stability   = (0.02 * 1000) * 0.6 + (45 / 120 * 100) * 0.4 = 12 + 15 = 27
debt        = 0.15 * 100                     = 15
operational = 40 (3 deployments per week is below 5)
composite   = round(55.51 * 0.35 + 27 * 0.25 + 15 * 0.20 + 40 * 0.20) = round(37.18) = 37
riskLevel is 'high' because security (55.51) reaches its warning threshold of 50
while no dimension reaches its critical threshold.
*/
Architecture Rationale
- Modularity: Separating normalization from calculation allows swapping metric sources without altering the risk logic.
- Extensibility: New dimensions can be added by extending the `RiskDimension` type and updating the configuration.
- Performance: The calculation is O(n) relative to dimensions, making it suitable for real-time scoring of thousands of services.
Pitfall Guide
1. Equal Weighting of All Dimensions
Mistake: Assigning equal weight to security, stability, debt, and operational risk.
Explanation: Not all risks are equal. A payment service requires higher security weighting than an internal documentation tool. Static weights lead to misprioritization.
Best Practice: Implement context-aware weighting. Use service tags (e.g., tier: critical, data: pii) to dynamically adjust weights during calculation.
2. Ignoring Dependency Topology
Mistake: Assessing services in isolation without considering upstream/downstream dependencies.
Explanation: A low-risk service may become high-risk if it is a dependency for a critical path. Risk propagates through the graph.
Best Practice: Integrate with a service mesh or dependency graph. Apply a "blast radius multiplier" to the risk score based on the number of critical dependents.
3. Metric Gaming
Mistake: Developers close technical debt tickets without fixing code to lower the debt score.
Explanation: If metrics drive performance reviews, teams will optimize the metric, not the system.
Best Practice: Use lagging indicators alongside leading indicators. Correlate debt reduction with incident rates. Audit score changes against code commit quality.
4. Alert Fatigue from Static Thresholds
Mistake: Triggering alerts on every score change or using fixed thresholds for all services.
Explanation: Fixed thresholds generate noise for stable services and miss drift in volatile ones.
Best Practice: Use dynamic baselining. Alert on statistical anomalies (e.g., Z-score deviation from historical baseline) rather than absolute thresholds.
5. Lack of Remediation Context
Mistake: Reporting a risk score without actionable remediation steps.
Explanation: A score of "75" provides no guidance. Engineering teams need to know what to fix.
Best Practice: Attach "Risk Drivers" to the report. For example, security: 80 (Cause: CVE-2023-XXXX in dependency X). Link directly to remediation tickets or runbooks.
6. Over-Engineering Data Collection
Mistake: Building complex scrapers for every tool.
Explanation: Tool APIs change, and scrapers break. This creates maintenance overhead.
Best Practice: Use standardized ingestion adapters or leverage existing platforms (e.g., Backstage, Cortex) that provide unified APIs. Prioritize webhooks over polling.
7. Static Risk Models
Mistake: Defining the risk model once and never revisiting it.
Explanation: Risk landscapes evolve. New threat vectors emerge, and business priorities shift.
Best Practice: Schedule quarterly reviews of the risk model. Validate that high-risk scores correlate with actual incidents. Adjust weights and thresholds based on retrospective data.
Production Bundle
Action Checklist
Decision Matrix
| Scenario | Recommended Approach | Why | Cost Impact |
|---|---|---|---|
| Startup (<20 services) | Manual Spreadsheet + CLI Script | Low overhead; rapid iteration; sufficient visibility for small scale. | Low |
| Mid-Size (20-100 services) | Custom Risk Engine (TypeScript/Go) | Automation required; customizable weights; integrates with internal tooling. | Medium |
| Enterprise (100+ services) | Platform Engineering Tool (e.g., Backstage/Cortex) | Governance at scale; plugin ecosystem; centralized catalog management. | High |
| Highly Regulated Industry | Risk Engine with Compliance Module | Audit trails; strict weighting for compliance risks; immutable reporting. | High |
Configuration Template
Copy this YAML configuration to define your risk engine parameters.
# Risk engine configuration template.
# Each dimension declares its weight in the composite score, the metric
# sources that feed it, and the score thresholds for warning/critical levels.
# The four dimension weights below sum to 1.0.
risk_engine:
  version: "1.0"
  dimensions:
    security:
      weight: 0.35
      sources:
        - provider: "snyk"
          metric: "critical_vulns"
          normalizer: "logarithmic"
        - provider: "github"
          metric: "dependabot_alerts"
          normalizer: "linear"
      thresholds:
        critical: 80
        warning: 50
    stability:
      weight: 0.25
      sources:
        - provider: "datadog"
          metric: "error_rate_5m"
          normalizer: "exponential"
        - provider: "pagerduty"
          metric: "mttr_hours"
          normalizer: "linear"
      thresholds:
        critical: 70
        warning: 40
    debt:
      weight: 0.20
      sources:
        - provider: "sonarqube"
          metric: "technical_debt_ratio"
          normalizer: "linear"
        - provider: "jira"
          metric: "open_debt_tickets"
          normalizer: "sqrt"
      thresholds:
        critical: 60
        warning: 30
    operational:
      weight: 0.20
      sources:
        - provider: "github_actions"
          metric: "deployment_frequency"
          normalizer: "inverse_linear"
      thresholds:
        critical: 75
        warning: 45
  # Tag-based weight overrides applied to services carrying the given tag.
  context_rules:
    - tag: "tier:critical"
      dimension_weights:
        security: 0.45
        stability: 0.35
        debt: 0.10
        operational: 0.10
    - tag: "data:pii"
      # NOTE(review): "compliance" is not declared under `dimensions` above,
      # and this rule assigns no weight to "operational". Confirm whether a
      # compliance dimension must be defined before using this rule.
      dimension_weights:
        security: 0.50
        compliance: 0.30
        stability: 0.10
        debt: 0.10
  output:
    format: "json"
    destination: "s3://risk-reports"
    retention_days: 365
Quick Start Guide
- Initialize Configuration:
Create risk-config.yaml using the template above. Adjust weights and sources to match your toolchain.
cp risk-config-template.yaml risk-config.yaml
- Run First Assessment:
Execute the risk engine CLI to generate an initial report.
npx @codcompass/risk-engine assess --config risk-config.yaml --output report.json
- Review Risk Matrix:
Open report.json or pipe to a visualization tool. Identify services with riskLevel: critical or compositeScore > 70.
cat report.json | jq '.services[] | select(.riskLevel == "critical")'
- Integrate CI/CD:
Add a step to your pipeline to block deployments for services exceeding risk thresholds.
# GitHub Actions Example
- name: Check Portfolio Risk
run: |
SCORE=$(npx @codcompass/risk-engine get-score --service ${{ env.SERVICE_NAME }})
if [ "$SCORE" -gt 80 ]; then
echo "Risk score $SCORE exceeds threshold. Blocking deployment."
exit 1
fi
- Schedule Recalculation:
Set up a cron job or workflow to run assessments daily, ensuring the risk matrix stays current.
# Crontab entry for daily run at 06:00
0 6 * * * /usr/local/bin/risk-engine assess --config /etc/risk-config.yaml