ing import Dict, Any, Optional
import requests
logger = logging.getLogger("ir_orchestrator")


class IncidentRunbookExecutor:
    """Execute severity-mapped incident-response playbooks.

    A playbook is looked up in the loaded configuration by the event's
    ``severity``. Each step may require approval from an external HTTP
    approval API and may list artifact files that are hashed and recorded
    in a local evidence directory.
    """

    # Read size used while hashing artifacts; large enough that multi-GB
    # disk images are not processed in thousands of tiny reads.
    _HASH_CHUNK_SIZE = 64 * 1024

    def __init__(self, config_path: str, approval_api: str):
        """Load the playbook configuration and prepare the evidence directory.

        Args:
            config_path: Path to the JSON configuration file.
            approval_api: URL that approval requests are POSTed to.

        Raises:
            OSError: If the config cannot be read or the evidence directory
                cannot be created.
            json.JSONDecodeError: If the config is not valid JSON.
        """
        self.config = self._load_config(config_path)
        self.approval_api = approval_api
        self.evidence_dir = Path("evidence_store")
        self.evidence_dir.mkdir(exist_ok=True)

    # Backward-compatible alias: the constructor used to be spelled ``init``
    # and therefore never ran on instantiation; explicit ``obj.init(...)``
    # calls keep working.
    init = __init__

    def _load_config(self, path: str) -> Dict[str, Any]:
        """Parse the configuration file at *path* as JSON and return it.

        NOTE(review): the sample runbooks shipped alongside this code are
        written in YAML, but this loader only understands JSON -- confirm
        which format is intended before deploying.
        """
        with open(path, "r", encoding="utf-8") as f:
            return json.load(f)

    def _compute_chain_hash(self, artifact_path: str) -> str:
        """Return the hex SHA-256 digest of the file at *artifact_path*.

        The file is streamed in chunks so large artifacts are never held in
        memory at once.
        """
        sha256 = hashlib.sha256()
        with open(artifact_path, "rb") as f:
            for chunk in iter(lambda: f.read(self._HASH_CHUNK_SIZE), b""):
                sha256.update(chunk)
        return sha256.hexdigest()

    def _preserve_evidence(self, artifact_path: str) -> Dict[str, Any]:
        """Hash an artifact and write its custody metadata as JSON.

        The metadata file is written to ``<evidence_dir>/<stem>_meta.json``,
        where ``stem`` is the artifact's file name without its final suffix.
        The artifact itself is only read, never modified or copied.

        Args:
            artifact_path: Path of the artifact file to hash.

        Returns:
            The metadata dictionary that was written.

        Raises:
            OSError: If the artifact cannot be read or the metadata file
                cannot be written.
        """
        chain_hash = self._compute_chain_hash(artifact_path)
        metadata = {
            "artifact": artifact_path,
            "collected_at": datetime.now(timezone.utc).isoformat(),
            "collector": "ir_orchestrator",
            "sha256": chain_hash,
            "chain_of_custody": True,
        }
        meta_path = self.evidence_dir / f"{Path(artifact_path).stem}_meta.json"
        with open(meta_path, "w", encoding="utf-8") as f:
            json.dump(metadata, f, indent=2)
        logger.info("Evidence preserved: %s | Hash: %s", artifact_path, chain_hash)
        return metadata

    async def _request_approval(self, action: str, context: Dict[str, Any]) -> bool:
        """Ask the approval API whether *action* may be executed.

        The blocking ``requests.post`` call runs in the default executor so
        the event loop stays responsive while waiting (up to 30 seconds).
        The gate fails closed: a transport error, a non-200 status, or a
        body that is not a JSON object all count as "not approved".

        Args:
            action: Name of the action awaiting approval.
            context: Extra data sent with the request; must be
                JSON-serialisable.

        Returns:
            True only when the API answers 200 with a truthy ``approved``
            field.
        """
        import asyncio
        import functools

        payload = {
            "action": action,
            "context": context,
            "requested_at": datetime.now(timezone.utc).isoformat(),
        }
        send = functools.partial(
            requests.post, self.approval_api, json=payload, timeout=30
        )
        try:
            resp = await asyncio.get_running_loop().run_in_executor(None, send)
        except requests.RequestException:
            logger.exception(
                "Approval request failed for action %s; treating as not approved",
                action,
            )
            return False

        if resp.status_code != 200:
            return False
        try:
            body = resp.json()
        except ValueError:
            # requests raises a ValueError subclass when the body is not JSON.
            logger.error(
                "Approval API returned a non-JSON body for action %s; "
                "treating as not approved",
                action,
            )
            return False
        if not isinstance(body, dict):
            return False
        return bool(body.get("approved", False))

    async def execute_playbook(self, event: Dict[str, Any]) -> Dict[str, Any]:
        """Run the playbook mapped to the event's severity.

        Steps run in order. A step with ``requires_approval`` is executed
        only after ``_request_approval`` returns True; a denial stops the
        run and leaves the status as ``"halted_by_approval"``. Artifacts
        listed under ``preserve_artifacts`` are hashed after the step.

        Args:
            event: Incident event; ``severity`` (default ``"low"``) selects
                the playbook and ``id`` is copied into the execution log.

        Returns:
            ``{"status": "skipped", "reason": "no_mapping"}`` when no
            playbook exists for the severity; otherwise an execution log
            with ``event_id``, ``steps`` and a final ``status`` of
            ``"completed"`` or ``"halted_by_approval"``.

        Raises:
            KeyError: If a step has no ``action`` key.
            OSError: If an artifact listed for preservation cannot be read.
        """
        import asyncio

        severity = event.get("severity", "low")
        playbook = self.config.get("playbooks", {}).get(severity)
        if not playbook:
            logger.warning("No playbook mapped for severity: %s", severity)
            return {"status": "skipped", "reason": "no_mapping"}

        execution_log = {"event_id": event.get("id"), "steps": [], "status": "running"}
        for step in playbook.get("steps", []):
            action = step["action"]
            if step.get("requires_approval", False):
                approved = await self._request_approval(action, {"event": event})
                if not approved:
                    execution_log["status"] = "halted_by_approval"
                    break

            # Simulate action execution (replace with real API/CLI calls)
            step_result = {
                "step": action,
                "timestamp": datetime.now(timezone.utc).isoformat(),
                "status": "executed",
            }
            execution_log["steps"].append(step_result)

            # Auto-preserve artifacts if generated. Hashing can take a long
            # time for disk images, so it runs off the event loop.
            for artifact in step.get("preserve_artifacts") or []:
                await asyncio.get_running_loop().run_in_executor(
                    None, self._preserve_evidence, artifact
                )
        else:
            # Reached only when the loop was not interrupted by ``break``,
            # so a halted run keeps its "halted_by_approval" status.
            execution_log["status"] = "completed"

        logger.info(
            "Playbook execution finished: %s", json.dumps(execution_log, indent=2)
        )
        return execution_log
### Runbook Configuration (YAML)
```yaml
# Playbooks are keyed by event severity; each playbook runs its steps in order.
playbooks:
  critical:
    name: "Ransomware Containment & Forensics"
    steps:
      - action: "isolate_host"
        target: "{{event.host_id}}"
        requires_approval: false
        # Files hashed and recorded after this step runs.
        preserve_artifacts:
          - "/tmp/host_snapshot.mem"
          - "/tmp/network_connections.log"
      - action: "disable_compromised_credentials"
        target: "{{event.user_account}}"
        requires_approval: true
      - action: "notify_ciso_and_legal"
        channels: ["slack", "pagerduty"]
        requires_approval: false
      - action: "trigger_full_disk_imaging"
        target: "{{event.host_id}}"
        requires_approval: true
        preserve_artifacts:
          - "/evidence/disk_image.raw"
# S3 bucket that stores incident-response evidence; one bucket per environment.
# NOTE(review): no Object Lock configuration is visible on this bucket, so
# versioning alone does not make objects immutable -- confirm whether WORM
# retention is configured elsewhere.
resource "aws_s3_bucket" "ir_evidence_vault" {
  bucket = "org-ir-evidence-${var.environment}"

  # Keep every object version so overwrites and deletes remain recoverable.
  # NOTE(review): the inline `versioning` and
  # `server_side_encryption_configuration` blocks are deprecated in AWS
  # provider v4+ in favour of standalone resources -- confirm the pinned
  # provider version.
  versioning {
    enabled = true
  }

  # Encrypt objects at rest with KMS by default. No key ID is given here, so
  # presumably the account's default KMS key for S3 is used -- confirm.
  server_side_encryption_configuration {
    rule {
      apply_server_side_encryption_by_default {
        sse_algorithm = "aws:kms"
      }
    }
  }

  # Make Terraform reject any plan that would destroy the bucket.
  lifecycle {
    prevent_destroy = true
  }
}
# CloudWatch log group for the IR execution audit trail; log events are
# retained for 365 days.
resource "aws_cloudwatch_log_group" "ir_execution_audit" {
  name              = "/security/ir-execution-audit"
  retention_in_days = 365
}
# IAM role for the orchestrator, named per environment.
# Only the trust policy is defined here; no permission policies are attached
# in this snippet.
resource "aws_iam_role" "ir_orchestrator_role" {
  name = "ir-orchestrator-${var.environment}"

  # Trust policy: only the AWS Lambda service may assume this role.
  assume_role_policy = jsonencode({
    Version = "2012-10-17"
    Statement = [{
      Action    = "sts:AssumeRole"
      Effect    = "Allow"
      Principal = { Service = "lambda.amazonaws.com" }
    }]
  })
}
This architecture transforms IR planning from a static exercise into a continuously validated, observable, and compliant execution pipeline. Code becomes the single source of truth; humans govern boundaries; telemetry drives adaptation.
Pitfall Guide (7 Critical Mistakes)
1. Static Documentation Masquerading as a Plan
The Trap: Treating a 50-page PDF as a response plan. Documents rot, aren't versioned, and lack execution hooks.
Why It Fails: During an incident, analysts cannot parse prose under pressure. Manual step lookup introduces latency and error.
Mitigation: Migrate to runbook-as-code. Use YAML/JSON structures that map directly to API calls, CLI commands, and approval workflows. Store in Git with CI/CD validation.
2. Automation Without Human Governance
The Trap: Fully automating containment actions (e.g., mass account disablement, network isolation) without approval gates or rollback procedures.
Why It Fails: False positives cascade into business outages. Legal and compliance teams lose visibility.
Mitigation: Implement tiered automation. Low-severity actions auto-execute; medium/high severity require contextual approval. Always embed rollback steps and dry-run modes.
3. Ignoring Third-Party & Supply Chain Dependencies
The Trap: Planning only for internal systems while assuming vendors, SaaS providers, and contractors will handle their own response.
Why It Fails: 60%+ of modern breaches involve third-party vectors. Lack of contractual IR clauses causes coordination failures.
Mitigation: Embed vendor IR requirements into SLAs and contracts. Maintain a mapped dependency graph. Run joint tabletop exercises with critical suppliers annually.
4. Lack of Measurable Readiness Metrics
The Trap: Declaring "we are ready" without tracking MTTR, playbook execution success rate, or simulation fidelity.
Why It Fails: Unmeasured readiness is unproven readiness. Regulatory auditors demand evidence, not assertions.
Mitigation: Instrument every runbook execution. Track: time-to-triage, approval latency, automation success rate, evidence preservation compliance. Report quarterly to leadership.
5. Tabletop Exercises That Lack Realism
The Trap: Scripted scenarios with predetermined outcomes, no telemetry replay, and no time pressure.
Why It Fails: Teams memorize answers instead of practicing decision-making under ambiguity.
Mitigation: Use attack simulation platforms (e.g., Atomic Red Team, Caldera) to generate real alerts. Run exercises with live SIEM/SOAR data, inject failures, and measure actual response times.
6. Poor Evidence Preservation & Chain of Custody
The Trap: Collecting logs and memory dumps without cryptographic hashing, timestamping, or access controls.
Why It Fails: Evidence becomes inadmissible in legal proceedings or regulatory investigations. Integrity is questioned.
Mitigation: Automate evidence collection with SHA-256 hashing, immutable storage (WORM/S3 Object Lock), and audited access logs. Never modify raw artifacts.
7. Regulatory Misalignment & One-Size-Fits-All Compliance
The Trap: Drafting a single IR plan to satisfy all frameworks (GDPR, SEC, HIPAA, PCI-DSS) without mapping specific notification timelines or evidence requirements.
Why It Fails: Missing jurisdictional deadlines triggers fines. Inconsistent evidence formats fail audits.
Mitigation: Build a regulatory mapping matrix into your runbook router. Tag playbooks with compliance triggers, required artifacts, and mandatory notification windows. Automate deadline tracking.
Production Bundle
Incident Response Readiness Checklist
Pre-Incident (0-30 Days)
During Incident
Post-Incident
Escalation & Action Decision Matrix
| Severity | Business Impact | Detection Confidence | Action Required | Approval Needed | Regulatory Trigger |
|---|---|---|---|---|---|
| Low | No data loss, isolated test/dev | High | Auto-contain, log, notify SOC | No | None |
| Medium | Partial service degradation, no PII | Medium | Contain host, disable creds, notify IR lead | IR Lead | Internal tracking only |
| High | Production impact, potential data exposure | High | Isolate, image disk, notify CISO/Legal | CISO | 72-hour window (GDPR/SEC) |
| Critical | Ransomware, active exfiltration, PII breach | Confirmed | Full containment, legal hold, PR prep, regulator notification | CISO + Legal | Immediate (72h max) |
Decision Rule: If detection confidence is low but business impact is high, default to containment with rollback capability. Never let uncertainty delay isolation when critical assets are at risk.
Runbook Configuration Template
# ir-runbook-config.yaml
# Runbook template: document metadata plus playbooks keyed by event severity.
metadata:
  version: "2.1"
  last_updated: "2024-06-15T08:30:00Z"
  owner: "security-ir-team"
  compliance_tags: ["GDPR", "SEC", "NIS2"]
playbooks:
  critical:
    name: "Active Breach Containment"
    steps:
      - action: "isolate_network_segment"
        target: "{{event.segment_id}}"
        requires_approval: false
        # Name of the action that undoes this step.
        rollback: "restore_network_segment"
        preserve_artifacts:
          - "/tmp/pcap_capture.pcap"
          - "/tmp/flow_logs.json"
      - action: "revoke_active_sessions"
        target: "{{event.user_id}}"
        requires_approval: true
        approval_timeout_minutes: 15
      - action: "trigger_forensic_imaging"
        target: "{{event.host_id}}"
        requires_approval: true
        preserve_artifacts:
          - "/evidence/disk_image.raw"
          - "/evidence/memory_dump.mem"
      - action: "notify_regulatory_contact"
        channels: ["encrypted_email", "secure_portal"]
        requires_approval: false
        # Notification deadline, in hours.
        deadline_hours: 72
30-Day Quick Start Guide
Days 1-5: Foundation
1. Initialize a private Git repository for runbook-as-code
2. Map your top 5 most likely incident scenarios to MITRE ATT&CK techniques
3. Provision an immutable evidence storage bucket (S3/GCS/Azure Blob) with Object Lock
4. Draft severity classification matrix aligned with business impact
Days 6-15: Automation & Integration
5. Deploy the IR executor script; integrate with your SIEM/SOAR via webhook
6. Build approval gate API (can start with a simple Slack slash command + Lambda)
7. Implement SHA-256 evidence hashing and metadata generation
8. Create YAML runbooks for Low/Medium/High/Critical severities
Days 16-25: Validation & Governance
9. Run 3 tabletop exercises using live telemetry replay (no scripting)
10. Measure MTTR, approval latency, and evidence preservation compliance
11. Update runbooks based on execution gaps
12. Map regulatory notification windows to runbook triggers
Days 26-30: Operationalization
13. Integrate IR metrics into executive dashboard
14. Schedule quarterly simulation cadence
15. Document vendor IR escalation paths
16. Conduct leadership briefing with readiness report
Closing Perspective
Security incident response planning has outgrown the age of static documents. The organizations that survive and recover quickly are those that treat response as a continuous engineering discipline: codified, automated, governed, and validated. By shifting from paper to pipelines, from assumptions to metrics, and from silos to orchestration, you transform incident response from a crisis management exercise into a predictable, auditable, and resilient capability.
The plan is no longer what you write. It's what you run.