afficClass === 'serverless_preferred') {
return InferencePath.SERVERLESS_INFERENCE;
}
if (payloadSize >= this.config.batchThreshold) {
return InferencePath.BATCH_TRANSFORM;
}
return InferencePath.PROVISIONED_ENDPOINT;
}
}
**Architecture Rationale**: Centralizing routing logic eliminates drift between services. The function is stateless, making it trivial to unit test and validate against property-based invariants. Traffic classification is decoupled from payload size, allowing business logic to override technical thresholds when necessary.
### Step 2: Serverless Inference & Cold Start Mitigation
Serverless endpoints introduce a unique failure mode: `ModelNotReadyException` during initialization. The mitigation strategy combines extended timeouts, progressively increasing (linear) backoff between retries, and deterministic fallback.
```typescript
import { InvokeEndpointCommand, SageMakerClient } from '@aws-sdk/client-sagemaker-runtime';
/**
 * Invokes a SageMaker endpoint and retries while the model is still loading.
 *
 * Each attempt is bounded by a 60-second abort timer. A failure named
 * `ModelNotReadyException` is retried after a linearly growing delay
 * (3 s, 6 s, ...) until `maxRetries` retries have been used. Any other error,
 * or the error from the final attempt, is rethrown unchanged so the caller
 * can apply its own fallback.
 *
 * NOTE(review): `@aws-sdk/client-sagemaker-runtime` exports its client as
 * `SageMakerRuntimeClient`; confirm what the `SageMakerClient` name used in
 * this signature resolves to in the file's import.
 *
 * @param client - Client used to send the `InvokeEndpointCommand`.
 * @param endpointName - Name of the endpoint to invoke.
 * @param payload - Raw request body, sent as `application/octet-stream`.
 * @param maxRetries - Retries allowed after the first attempt (default 2).
 * @returns The raw response body.
 * @throws Error `Serverless inference timeout exceeded` when an attempt
 *   exceeds the per-attempt timeout, or when `maxRetries` is negative and no
 *   attempt is made.
 * @throws Error when the endpoint responds without a body.
 */
export async function invokeServerlessWithFallback(
  client: SageMakerClient,
  endpointName: string,
  payload: Uint8Array,
  maxRetries: number = 2
): Promise<Uint8Array> {
  const baseTimeout = 60_000;
  const retryDelay = 3_000;

  for (let attempt = 0; attempt <= maxRetries; attempt++) {
    // Bound every attempt; without this the declared timeout is never enforced.
    const controller = new AbortController();
    const timer = setTimeout(() => controller.abort(), baseTimeout);

    try {
      const command = new InvokeEndpointCommand({
        EndpointName: endpointName,
        Body: payload,
        ContentType: 'application/octet-stream'
      });
      const response = await client.send(command, { abortSignal: controller.signal });

      if (response.Body == null) {
        // Surface a missing body instead of returning `undefined` typed as bytes.
        throw new Error(`Endpoint ${endpointName} returned an empty response body`);
      }
      return response.Body;
    } catch (error: unknown) {
      // Read the abort state and stop the timer before any backoff sleep.
      const timedOut = controller.signal.aborted;
      clearTimeout(timer);

      if (timedOut) {
        throw new Error('Serverless inference timeout exceeded');
      }

      const errorName =
        typeof error === 'object' && error !== null
          ? (error as { name?: unknown }).name
          : undefined;

      if (errorName === 'ModelNotReadyException' && attempt < maxRetries) {
        // Linear backoff: wait longer after each consecutive cold-start failure.
        await new Promise(res => setTimeout(res, retryDelay * (attempt + 1)));
        continue;
      }
      throw error;
    } finally {
      // Never leave a pending timer behind; it would keep the process alive.
      clearTimeout(timer);
    }
  }

  // Only reachable when the loop body never runs (negative maxRetries).
  throw new Error('Serverless inference timeout exceeded');
}
Architecture Rationale: The initial timeout is set to 60 seconds to accommodate model loading. Retry logic uses progressive delays to avoid thundering herd scenarios. When retries are exhausted, the orchestration layer (Step Functions) catches the timeout and routes to batch transform, ensuring zero request loss.
Step 3: Automated Cost Controls
Cost governance requires three synchronized mechanisms: time-based scaling, financial alerting, and idle detection.
Scheduled Scaling Configuration:
# Scheduled scaling for one SageMaker endpoint variant.
# Scheduled actions are declared on the scalable target itself; CloudFormation
# has no standalone scheduled-action resource type for Application Auto Scaling.
Parameters:
  EndpointName:
    Type: String
    Description: Name of the SageMaker endpoint whose variant is scaled.
  VariantName:
    Type: String
    Default: AllTraffic
    Description: Production variant that receives the scheduled capacity bounds.
  BusinessHoursMin:
    Type: Number
    Default: 2
  OffHoursMin:
    Type: Number
    # NOTE(review): a minimum of 0 is only accepted for targets that support
    # scaling to zero; use 1 for standard instance-backed variants.
    Default: 0
  Timezone:
    Type: String
    Default: America/New_York
Resources:
  EndpointScalableTarget:
    Type: AWS::ApplicationAutoScaling::ScalableTarget
    Properties:
      ServiceNamespace: sagemaker
      ScalableDimension: sagemaker:variant:DesiredInstanceCount
      ResourceId: !Sub 'endpoint/${EndpointName}/variant/${VariantName}'
      # Widest bounds used by either schedule; the actions below narrow them.
      MinCapacity: !Ref OffHoursMin
      MaxCapacity: 8
      ScheduledActions:
        # Weekdays 09:00 local time: raise the floor for business hours.
        - ScheduledActionName: ScaleUpSchedule
          Schedule: "cron(0 9 ? * MON-FRI *)"
          Timezone: !Ref Timezone
          ScalableTargetAction:
            MinCapacity: !Ref BusinessHoursMin
            MaxCapacity: 8
        # Weekdays 18:00 local time: lower the floor and cap for off-hours.
        - ScheduledActionName: ScaleDownSchedule
          Schedule: "cron(0 18 ? * MON-FRI *)"
          Timezone: !Ref Timezone
          ScalableTargetAction:
            MinCapacity: !Ref OffHoursMin
            MaxCapacity: 2
Idle Detection Lambda:
import { CloudWatchClient, GetMetricStatisticsCommand } from '@aws-sdk/client-cloudwatch';
import {
  DeleteEndpointCommand,
  DescribeEndpointCommand,
  ListEndpointsCommand,
  ListTagsCommand,
  SageMakerClient,
  UpdateEndpointWeightsAndCapacitiesCommand
} from '@aws-sdk/client-sagemaker';
/** Length of the idle-detection window in milliseconds (one hour). */
const IDLE_WINDOW_MS = 3_600_000;

/**
 * Returns true when the resource carries the tag
 * `ProtectedFromAutoStop=true`. Tags are fetched explicitly because the
 * endpoint summaries returned by `ListEndpoints` do not include them.
 */
async function isProtectedFromAutoStop(
  smClient: SageMakerClient,
  resourceArn: string
): Promise<boolean> {
  const { Tags } = await smClient.send(new ListTagsCommand({ ResourceArn: resourceArn }));
  return (Tags ?? []).some(t => t.Key === 'ProtectedFromAutoStop' && t.Value === 'true');
}

/**
 * Sums the `Invocations` metric over the window for every given variant.
 * The metric is queried with both `EndpointName` and `VariantName`
 * dimensions, because CloudWatch only returns data for an exact dimension set.
 */
async function sumInvocations(
  cwClient: CloudWatchClient,
  endpointName: string,
  variantNames: string[],
  startTime: Date,
  endTime: Date
): Promise<number> {
  const perVariant = await Promise.all(
    variantNames.map(async variantName => {
      const metrics = await cwClient.send(new GetMetricStatisticsCommand({
        Namespace: 'AWS/SageMaker',
        MetricName: 'Invocations',
        Dimensions: [
          { Name: 'EndpointName', Value: endpointName },
          { Name: 'VariantName', Value: variantName }
        ],
        StartTime: startTime,
        EndTime: endTime,
        Period: 3600,
        Statistics: ['Sum']
      }));
      // The window may straddle a period boundary, so add up every datapoint.
      return (metrics.Datapoints ?? []).reduce((sum, dp) => sum + (dp.Sum ?? 0), 0);
    })
  );
  return perVariant.reduce((sum, value) => sum + value, 0);
}

/**
 * Scheduled Lambda entry point: deletes idle endpoints outside production.
 *
 * An endpoint is deleted only when all of the following hold:
 * - `ENVIRONMENT` is not `production`;
 * - it is `InService` and was created before the start of the window;
 * - it is not tagged `ProtectedFromAutoStop=true`;
 * - it has at least one production variant, and the variants together
 *   recorded zero invocations during the last hour.
 *
 * @param event - Trigger payload; not inspected.
 */
export async function handler(event: unknown): Promise<void> {
  // Production endpoints are never deleted, so skip every API call there.
  if (process.env.ENVIRONMENT === 'production') {
    return;
  }

  const smClient = new SageMakerClient({ region: process.env.AWS_REGION });
  const cwClient = new CloudWatchClient({ region: process.env.AWS_REGION });

  const windowEnd = new Date();
  const windowStart = new Date(windowEnd.getTime() - IDLE_WINDOW_MS);

  let nextToken: string | undefined;
  do {
    const page = await smClient.send(new ListEndpointsCommand({
      // Endpoints still being created or updated are not candidates.
      StatusEquals: 'InService',
      // A newer endpoint has not existed for a full window yet.
      CreationTimeBefore: windowStart,
      NextToken: nextToken
    }));

    for (const ep of page.Endpoints ?? []) {
      if (!ep.EndpointName || !ep.EndpointArn) {
        continue;
      }
      if (await isProtectedFromAutoStop(smClient, ep.EndpointArn)) {
        continue;
      }

      const description = await smClient.send(
        new DescribeEndpointCommand({ EndpointName: ep.EndpointName })
      );
      const variantNames = (description.ProductionVariants ?? []).flatMap(v =>
        v.VariantName ? [v.VariantName] : []
      );
      if (variantNames.length === 0) {
        // Without variants there is no metric to read; treat as not idle.
        continue;
      }

      const totalInvocations = await sumInvocations(
        cwClient,
        ep.EndpointName,
        variantNames,
        windowStart,
        windowEnd
      );
      if (totalInvocations === 0) {
        await smClient.send(new DeleteEndpointCommand({ EndpointName: ep.EndpointName }));
      }
    }

    nextToken = page.NextToken;
  } while (nextToken);
}
Architecture Rationale: Scheduled scaling aligns capacity with business hours, capturing the majority of cost savings. The idle detection Lambda runs hourly via EventBridge, checking invocation metrics over a rolling 60-minute window. Tag-based protection prevents accidental termination of critical endpoints. Production environments are exempt from deletion: the Lambda leaves production endpoints untouched, so any capacity reduction there comes from scheduled scaling, preserving data integrity.
Step 4: CI/CD Gating & Multi-Region State
Deployment automation requires strict stage ordering and credential federation. GitHub Actions with OIDC eliminates long-lived access keys.
# id-token: write lets the job request an OIDC token for AWS role assumption.
permissions:
  id-token: write
  contents: read
jobs:
  validate:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Lint Templates
        # Quoted so cfn-lint expands the recursive glob itself; unquoted, bash
        # without globstar treats ** as * and only matches one directory level.
        run: cfn-lint "infra/**/*.yaml"
      - name: Run Property Tests
        run: pytest tests/ --cov=src --cov-fail-under=80
      - name: Security Compliance
        run: cfn-guard validate -r rules/ -d infra/
      - name: Static Analysis
        run: bandit -r src/ && pip-audit
Multi-Region State Replication:
Task tokens and execution metadata are stored in DynamoDB Global Tables. Cross-region clients automatically route to the nearest healthy region. When a region fails, Route 53 health checks trigger Step Functions failover workflows, preserving RPO near-zero for Tier 1 disaster recovery.
Pitfall Guide
-
Ignoring Cold Start Fallback Chains
- Explanation: Assuming serverless inference will always respond within SLA. Cold starts can exceed 45 seconds during model initialization or traffic spikes.
- Fix: Implement explicit catch blocks in orchestration layers that route to batch transform or queue requests when timeout thresholds are breached.
-
Deploying Billing Alarms Outside us-east-1
- Explanation: AWS publishes estimated charges in the `AWS/Billing` namespace exclusively in US East (N. Virginia). Deploying alarms in other regions results in missing metrics.
- Fix: Always provision billing alarm stacks in `us-east-1`, regardless of workload region. Use cross-account roles to centralize monitoring.
-
Attempting to Scale Standard Endpoints to Zero
- Explanation: Production variant-based SageMaker endpoints require at least one running instance. Only endpoints using Inference Components with `ManagedInstanceScaling.MinInstanceCount=0` support true zero scaling.
- Fix: Verify endpoint architecture before configuring auto-stop policies. Default to `MinCapacity=1` for standard endpoints or switch to inference components for zero-capacity requirements.
-
Bypassing Staging Smoke Tests
- Explanation: Promoting unvalidated infrastructure changes directly to production causes cascading failures. Manual approval gates are ineffective without automated validation.
- Fix: Enforce staging deployment with automated health checks. Block production promotion until smoke tests return successful HTTP 200 responses and metric baselines are met.
-
Mismatching DR Tier Definitions
- Explanation: Treating all workloads as Tier 1 (RPO near-zero, RTO <5min) inflates costs unnecessarily. Tier 2 and Tier 3 workloads tolerate higher data loss and longer recovery windows.
- Fix: Classify workloads by business impact. Use synchronous replication for Tier 1, asynchronous for Tier 2, and backup/restore for Tier 3. Align infrastructure spend with classification.
-
Hardcoding AWS Credentials in CI/CD
- Explanation: Long-lived access keys increase blast radius if compromised. Rotation is manual and error-prone.
- Fix: Use OIDC federation between GitHub and AWS IAM. Assume deployment roles via short-lived session tokens. Revoke access instantly by removing the trust policy.
-
Missing Invariant Validation in Routing Logic
- Explanation: Conditional routing branches drift over time, causing unpredictable path selection.
- Fix: Implement property-based tests that verify deterministic output for all input combinations. Validate that exactly one path is selected per request and that fallback chains are exhaustive.
Production Bundle
Action Checklist
Decision Matrix
| Scenario | Recommended Approach | Why | Cost Impact |
|---|---|---|---|
| Steady traffic >500 req/min | Provisioned Endpoint | Predictable latency, no cold starts | ~$215/mo baseline + instance scaling |
| Sporadic traffic <50 req/hour | Serverless Inference | Pay-per-request, scales to zero | $0 when idle, ~$0.00001667/GB-sec |
| Large datasets >10GB | Batch Transform | Optimized for throughput, async processing | Per-job pricing, no always-on cost |
| Multi-region compliance | DynamoDB Global Tables + Route 53 | Automatic failover, near-zero RPO | ~$0.25/GB replicated + Route 53 health checks |
| Off-hours cost reduction | Scheduled Scaling + Auto-Stop Lambda | Aligns capacity with demand | Up to 70% reduction during idle periods |
Configuration Template
AWSTemplateFormatVersion: '2010-09-09'
Parameters:
  Environment:
    Type: String
    AllowedValues: [staging, production]
  EnableServerlessFallback:
    Type: String
    Default: 'true'
  BatchThreshold:
    Type: Number
    Default: 1000
Conditions:
  IsProduction: !Equals [!Ref Environment, production]
  ServerlessEnabled: !Equals [!Ref EnableServerlessFallback, 'true']
Resources:
  # Routing configuration keyed by configKey, with a lookup index on trafficClass.
  RoutingConfigTable:
    Type: AWS::DynamoDB::Table
    Properties:
      TableName: !Sub '${AWS::StackName}-routing-config'
      AttributeDefinitions:
        - AttributeName: configKey
          AttributeType: S
        # Every attribute used in an index key schema must be declared here.
        - AttributeName: trafficClass
          AttributeType: S
      KeySchema:
        - AttributeName: configKey
          KeyType: HASH
      BillingMode: PAY_PER_REQUEST
      GlobalSecondaryIndexes:
        - IndexName: trafficClass-index
          KeySchema:
            - AttributeName: trafficClass
              KeyType: HASH
          Projection:
            ProjectionType: ALL
  # Created only when Environment is production.
  # NOTE(review): a nested stack is created in the parent stack's region;
  # confirm the parent is deployed where the billing metrics are published.
  BillingAlarmStack:
    Type: AWS::CloudFormation::Stack
    Condition: IsProduction
    Properties:
      TemplateURL: https://s3.amazonaws.com/infra-templates/billing-alarm.yaml
      Parameters:
        WarningThreshold: 100
        CriticalThreshold: 200
        EmergencyThreshold: 500
        NotificationEmail: ops-team@company.com
Quick Start Guide
- Initialize Routing Configuration: Deploy the DynamoDB table and populate initial thresholds based on your payload distribution. Set `BatchThreshold` to match your historical 95th percentile file count.
- Provision Cost Controls: Apply the scheduled scaling template aligned with your team's business hours. Deploy billing alarms in `us-east-1` and configure SNS topics for escalation.
- Enable CI/CD Gating: Fork the repository, configure GitHub OIDC trust policies, and run the 4-stage pipeline against a staging environment. Verify smoke tests pass before approving production promotion.
- Validate Multi-Region Failover: Enable DynamoDB Global Tables for your target regions. Simulate a regional outage by disabling Route 53 health checks and verify that Step Functions automatically routes to the secondary region without data loss.