import json
from datetime import datetime, timedelta, timezone

import boto3
from botocore.exceptions import ClientError
class CloudCostOptimizer:
def init(self, region: str = "us-east-1"):
self.ec2 = boto3.client("ec2", region_name=region)
self.cloudwatch = boto3.client("cloudwatch", region_name=region)
self.cost_explorer = boto3.client("ce", region_name=region)
self.region = region
def get_running_instances(self) -> list:
response = self.ec2.describe_instances(
Filters=[{"Name": "instance-state-name", "Values": ["running"]}]
)
instances = []
for reservation in response["Reservations"]:
for inst in reservation["Instances"]:
instances.append({
"InstanceId": inst["InstanceId"],
"InstanceType": inst["InstanceType"],
"Tags": {t["Key"]: t["Value"] for t in inst.get("Tags", [])},
"LaunchTime": inst["LaunchTime"]
})
return instances
def get_cpu_utilization(self, instance_id: str, days: int = 14) -> float:
end = datetime.now(timezone.utc)
start = end.replace(hour=0, minute=0, second=0, microsecond=0) - __import__("datetime").timedelta(days=days)
response = self.cloudwatch.get_metric_statistics(
Namespace="AWS/EC2",
MetricName="CPUUtilization",
Dimensions=[{"Name": "InstanceId", "Value": instance_id}],
StartTime=start,
EndTime=end,
Period=86400,
Statistics=["Average"]
)
if not response["Datapoints"]:
return 0.0
return sum(d["Average"] for d in response["Datapoints"]) / len(response["Datapoints"])
def recommend_rightsizing(self, avg_cpu: float, current_type: str) -> str | None:
# Simplified mapping; production should use AWS Compute Optimizer or ML model
mapping = {
"t3.medium": "t3.small" if avg_cpu < 25 else None,
"m5.xlarge": "m5.large" if avg_cpu < 30 else None,
"c5.2xlarge": "c5.xlarge" if avg_cpu < 20 else None
}
return mapping.get(current_type)
def enforce_tags(self, instance_id: str, required_tags: dict):
missing = {k: v for k, v in required_tags.items() if k not in self.get_running_instances()[0].get("Tags", {})}
if missing:
self.ec2.create_tags(Resources=[instance_id], Tags=[{"Key": k, "Value": v} for k, v in missing.items()])
def run_optimization_cycle(self, required_tags: dict):
instances = self.get_running_instances()
report = []
for inst in instances:
avg_cpu = self.get_cpu_utilization(inst["InstanceId"])
recommendation = self.recommend_rightsizing(avg_cpu, inst["InstanceType"])
if recommendation:
report.append({
"InstanceId": inst["InstanceId"],
"CurrentType": inst["InstanceType"],
"RecommendedType": recommendation,
"AvgCPU": round(avg_cpu, 2),
"Action": "resize"
})
self.enforce_tags(inst["InstanceId"], required_tags)
return report
# Script entry point: run one optimization cycle and print the report as JSON.
if __name__ == "__main__":
    optimizer = CloudCostOptimizer()
    results = optimizer.run_optimization_cycle(
        required_tags={"Environment": "production", "Team": "platform"}
    )
    # default=str stringifies any value json cannot serialise natively.
    print(json.dumps(results, indent=2, default=str))
**Production Notes:**
- Replace static mapping with AWS Compute Optimizer API or a lightweight regression model trained on historical utilization.
- Wrap `create_tags` in idempotent checks to avoid API throttling.
- Schedule via EventBridge + Lambda for continuous execution; add CloudWatch alarms for cost anomalies.
### 2. Infrastructure-as-Code Cost Guardrails (Terraform)
Embed cost optimization directly into provisioning pipelines. This Terraform module enforces auto-scaling, storage lifecycle policies, and mandatory tagging.
```hcl
# Deployment environment; used in the ASG name and in the Environment tags.
variable "environment" {
  type    = string
  default = "production"
}
# Owning team; propagated to instances through the ASG's Team tag.
variable "team" {
  type    = string
  default = "platform"
}
# Auto Scaling group that mixes On-Demand and Spot capacity from one launch template.
# NOTE: no scaling policy (predictive or otherwise) is attached in this resource.
resource "aws_autoscaling_group" "optimized" {
  name                = "${var.environment}-app-asg"
  vpc_zone_identifier = var.subnet_ids
  target_group_arns   = [aws_lb_target_group.app.arn]

  # Fleet size bounds.
  min_size         = 2
  max_size         = 10
  desired_capacity = 3

  mixed_instances_policy {
    launch_template {
      launch_template_specification {
        launch_template_id = aws_launch_template.app.id
        version            = "$Latest"
      }
    }

    # One On-Demand instance as the base; above it, 20% On-Demand and the rest Spot.
    instances_distribution {
      on_demand_base_capacity                  = 1
      on_demand_percentage_above_base_capacity = 20
      spot_allocation_strategy                 = "capacity-optimized"
    }
  }

  # Cost-attribution tags, copied onto every instance the group launches.
  tag {
    key                 = "Environment"
    value               = var.environment
    propagate_at_launch = true
  }

  tag {
    key                 = "Team"
    value               = var.team
    propagate_at_launch = true
  }
}
# S3 Lifecycle for cost-tiered storage:
# Standard -> Standard-IA after 30 days -> Glacier after 90 days -> deleted after 365 days.
resource "aws_s3_bucket_lifecycle_configuration" "data_lifecycle" {
  bucket = aws_s3_bucket.data.id

  rule {
    id     = "archive-old-data"
    status = "Enabled"

    # An empty filter applies the rule to every object in the bucket. Stating it
    # explicitly avoids relying on the provider's implicit prefix default, which
    # recent AWS provider versions warn about when neither filter nor prefix is set.
    filter {}

    transition {
      days          = 30
      storage_class = "STANDARD_IA"
    }

    transition {
      days          = 90
      storage_class = "GLACIER"
    }

    expiration {
      days = 365
    }
  }
}
# Mandatory tagging via provider defaults: these tags are declared once here
# rather than on each individual resource.
provider "aws" {
  default_tags {
    tags = {
      Environment = var.environment
      CostCenter  = "engineering"
      ManagedBy   = "terraform"
    }
  }
}
```

**Key Architecture Decisions:**
- `spot_allocation_strategy = "capacity-optimized"` launches Spot instances from the pools with the most spare capacity, which minimizes interruption risk while still capturing the Spot discount.
- Lifecycle rules prevent indefinite storage accumulation; align retention with compliance requirements.
- Provider-level `default_tags` ensures 100% attribution coverage without developer friction.
Pitfall Guide (6)
| # | Pitfall | Why It Happens | Mitigation Strategy |
|---|---|---|---|
| 1 | Performance Degradation from Aggressive Rightsizing | Teams resize based on short-term metrics without accounting for burst capacity or seasonal spikes. | Implement rolling evaluation windows (≥30 days), retain 20% headroom, and use CloudWatch alarms to trigger automatic rollback. |
| 2 | Ignoring Data Egress & Transfer Costs | Focus remains on compute/storage while cross-AZ, cross-region, and internet egress fees compound silently. | Enable VPC Flow Logs + Cost Explorer data transfer filters; deploy CloudFront/Global Accelerator for public assets; compress payloads before cross-region replication. |
| 3 | Spot Instance Fragmentation Without Fallback | Workloads fail during spot reclamation due to missing checkpointing or single-AZ dependency. | Use multi-AZ spot pools, implement S3-backed state checkpoints, and configure ASG fallback to on-demand with priority-based allocation. |
| 4 | Tagging Enforcement Without Governance | Tags are applied inconsistently; finance cannot reconcile spend, leading to "unallocated cost" black holes. | Enforce tags via SCPs (Service Control Policies) or OPA/Conftest in CI/CD; reject deployments missing CostCenter, Environment, and Owner. |
| 5 | Treating Optimization as a One-Time Project | Initial savings erode as new services launch without cost-aware design patterns. | Embed cost gates in PR reviews, automate monthly FinOps reviews, and tie platform KPIs to cost-per-request or cost-per-active-user. |
| 6 | Overcommitting to Reserved Instances/Savings Plans | Long-term commitments lock in capacity that becomes obsolete due to architectural shifts or workload consolidation. | Start with 12-month Savings Plans (flexible across instance families), monitor coverage monthly, and utilize AWS Marketplace RI resale for unused commitments. |
Production Bundle
✅ Cloud Cost Optimization Checklist
Phase 1: Foundation (Days 1–3)
Phase 2: Automation (Days 4–7)
Phase 3: Governance & Scale (Days 8–14)
Decision Matrix: Pricing Model Selection
| Workload Characteristic | Recommended Model | Rationale | Risk Mitigation |
|---|---|---|---|
| Steady-state, predictable baseline | Savings Plans / Reserved Instances | 35–60% discount for 1–3 year commitment | Start with 12-month flexible; monitor utilization monthly |
| Fault-tolerant, batch, stateless | Spot Instances | 60–90% discount; interruptible by design | Multi-AZ pools, checkpointing, on-demand fallback |
| Variable, event-driven, short-lived | On-Demand / Serverless | Pay-per-use; no upfront commitment | Right-size function memory; enable provisioned concurrency only for critical paths |
| Seasonal, predictable spikes | Reserved + On-Demand hybrid | Base covered by RI; spikes handled on-demand | Use auto-scaling with mixed-instance policy |
| Long-term archival, infrequent access | Cold Storage / Glacier Deep Archive | < $0.002/GB/month | Set retrieval SLAs; automate lifecycle transitions |
| Cross-region replication | Data Transfer Optimized + CDN | Reduce egress via edge caching | Enable CloudFront/Cloud Armor; compress payloads |
# main.tf (excerpt)
# Instantiates the shared cost-baseline module for this environment and team.
module "cost_guardrails" {
  # NOTE(review): the git source carries no ?ref=, so it presumably tracks the
  # repository's default branch — confirm whether a pinned tag is wanted.
  source = "git::https://github.com/yourorg/terraform-cost-baseline.git"

  environment = var.environment
  team        = var.team

  # Guardrail settings passed through to the module.
  max_spot_interruption     = 0.15
  storage_archive_days      = 90
  enable_predictive_scaling = true
}
# policy.rego (OpenPolicyAgent for CI/CD)
package costguard
# Deny any aws_instance resource whose tags have no CostCenter entry.
deny[msg] {
    input.resource.type == "aws_instance"
    not input.resource.tags.CostCenter
    msg := "Missing mandatory CostCenter tag"
}
# Deny any aws_autoscaling_group whose configured on-demand percentage is above 80.
deny[msg] {
    input.resource.type == "aws_autoscaling_group"
    pct := input.resource.config.on_demand_percentage
    pct > 80
    msg := "On-demand percentage exceeds 80%; consider spot integration"
}
Integration Points:
- Run `opa test` in the CI pipeline before `terraform plan`
- Block merges violating cost policies
- Store baseline module in private registry for team reuse
Quick Start: 5-Day Rollout Plan
| Day | Objective | Deliverable | Owner |
|---|---|---|---|
| 1 | Billing visibility & tagging baseline | Centralized cost dashboard; SCP enforcing 3 mandatory tags | FinOps / Cloud Ops |
| 2 | Telemetry collection & rightsizing pilot | Python script deployed to Lambda; 10 instances evaluated | Platform Engineering |
| 3 | Auto-scaling & spot integration | ASG updated with capacity-optimized spot; fallback tested | DevOps / SRE |
| 4 | Storage lifecycle & egress audit | S3/GCS lifecycle policies applied; CDN enabled for public assets | Data Engineering |
| 5 | Governance & feedback loop | FinOps weekly sync scheduled; cost gates added to PR template; anomaly alerts active | Engineering Leadership |
Success Metrics (30-Day Post-Deployment):
- ≥25% reduction in idle compute spend
- ≥90% resource tagging coverage
- ≤5% spot interruption rate for eligible workloads
- Cost anomaly alert MTTR < 2 hours
- Finance attribution accuracy ≥95%
Cloud cost optimization is not a cost-cutting exercise; it is an engineering discipline that aligns infrastructure efficiency with business velocity. By embedding telemetry, policy-as-code, and automated remediation into your delivery pipeline, you transform cost from a reactive liability into a proactive competitive advantage. Start with visibility, automate intelligently, govern consistently, and iterate continuously. The cloud rewards precision, not perfection.