h open(config_path, 'r') as f:
config = yaml.safe_load(f)
required_keys = ['source_dir', 'replacements', 'metadata']
for key in required_keys:
if key not in config:
raise ValueError(f"Missing required config key: {key}")
return config
def apply_transformations(content: str, replacements: Dict[str, str]) -> str:
    """Run every regex substitution from ``replacements`` over ``content``.

    Args:
        content: Text to transform.
        replacements: Mapping of regular-expression pattern to replacement
            string. Entries are applied one after another in the mapping's
            iteration order, each one operating on the previous result.

    Returns:
        The text after all substitutions have been applied.
    """
    result = content
    for regex, substitute in replacements.items():
        result = re.compile(regex).sub(substitute, result)
    return result
def process_file(file_path: Path, config: Dict[str, Any], dry_run: bool) -> bool:
    """Transform one file and report whether its content changed.

    The file is read as UTF-8, passed through ``apply_transformations`` with
    ``config['replacements']`` and, when ``config['metadata']['inject_header']``
    is truthy, prefixed with an "Auto-generated" HTML comment unless the text
    already starts with that exact header.

    Args:
        file_path: File to process.
        config: Loaded configuration; ``replacements`` and ``metadata`` are read.
        dry_run: When true, only log what would be written.

    Returns:
        True if the resulting content differs from what was read (also in
        dry-run mode, where nothing is written). False if the content is
        unchanged or if any exception occurred; failures are logged, not raised.
    """
    try:
        before = file_path.read_text(encoding='utf-8')
        after = apply_transformations(before, config['replacements'])

        meta = config['metadata']
        if meta.get('inject_header', False):
            banner = f"<!-- Auto-generated: {meta.get('project', 'unknown')} -->\n"
            # Only prepend when absent so repeated runs do not stack headers.
            if not after.startswith(banner):
                after = banner + after

        if after == before:
            return False

        if dry_run:
            logger.info(f"[DRY RUN] Would update: {file_path}")
        else:
            file_path.write_text(after, encoding='utf-8')
            logger.info(f"Updated: {file_path}")
        return True
    except Exception as e:
        logger.error(f"Failed to process {file_path}: {e}")
        return False
@click.command()
@click.option('--config', '-c', type=Path, required=True, help='Path to YAML config file.')
@click.option('--dry-run', is_flag=True, help='Preview changes without writing.')
def main(config: Path, dry_run: bool):
    """Run document transformation pipeline."""
    # The docstring is left as a one-liner because click uses it as help text.
    #
    # Flow: load the config, recursively collect files under ``source_dir``
    # matching ``file_pattern`` (default ``*.md``), run ``process_file`` on
    # each, and log how many reported a change.
    #
    # Exit status: any exception raised here (bad config, missing source
    # directory, ...) is logged and turned into exit code 1. Per-file errors
    # are handled inside ``process_file`` (it logs and returns False), so they
    # do not affect the exit status.
    try:
        cfg = load_config(config)
        source_dir = Path(cfg['source_dir'])
        if not source_dir.is_dir():
            raise NotADirectoryError(f"Source dir not found: {source_dir}")
        pattern = cfg.get('file_pattern', '*.md')
        files = list(source_dir.rglob(pattern))
        logger.info(f"Found {len(files)} files matching {pattern}")
        modified_count = sum(1 for f in files if process_file(f, cfg, dry_run))
        logger.info(f"Completed. Modified {modified_count} files.")
    except Exception as e:
        logger.error(f"Pipeline failed: {e}")
        # Raise SystemExit directly instead of calling the ``exit()`` helper,
        # which is injected by the ``site`` module for interactive sessions
        # and is not guaranteed to exist (e.g. under ``python -S``).
        raise SystemExit(1) from e
# Script entry point: invoke the CLI only when this file is executed directly,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
#### 3. Orchestration: Robust Makefile
The Makefile serves as the contract for the automation stack. It validates prerequisites, manages dependencies, and provides a uniform interface.
**File:** `Makefile`
```makefile
SHELL := /bin/bash
# Run each recipe in a single shell invocation, and have that shell abort on
# the first failing command, unset variable, or failed pipeline stage.
.ONESHELL:
.SHELLFLAGS := -eu -o pipefail -c

# Configuration (CONFIG_FILE can be overridden: make transform CONFIG_FILE=other.yaml)
CONFIG_FILE ?= automation.yaml
PYTHON := python3
SCRIPTS_DIR := scripts

# Every target here is a command, not a file to build. dry-run is listed too,
# so a file or directory named "dry-run" cannot make the target look up to date.
.PHONY: all check-env transform lint dry-run clean

# Default target
all: check-env lint transform

# Validate environment: interpreter on PATH, Python deps importable, config present
check-env:
	@echo ">> Checking environment..."
	@command -v $(PYTHON) >/dev/null 2>&1 || { echo "Error: $(PYTHON) not found"; exit 1; }
	@$(PYTHON) -c "import yaml, click" || { echo "Error: Missing Python dependencies. Run: pip install -r requirements.txt"; exit 1; }
	@test -f $(CONFIG_FILE) || { echo "Error: Config file $(CONFIG_FILE) not found"; exit 1; }
	@echo ">> Environment OK."

# Lint scripts
lint: check-env
	@echo ">> Running linters..."
	@$(PYTHON) -m flake8 $(SCRIPTS_DIR) --max-line-length=120
	@echo ">> Linting passed."

# Run transformation
transform: check-env
	@echo ">> Running document transformer..."
	@$(PYTHON) $(SCRIPTS_DIR)/doc_transformer.py --config $(CONFIG_FILE)
	@echo ">> Transformation complete."

# Dry run for safety: same command as transform, plus --dry-run
dry-run: check-env
	@echo ">> Running dry-run..."
	@$(PYTHON) $(SCRIPTS_DIR)/doc_transformer.py --config $(CONFIG_FILE) --dry-run
	@echo ">> Dry-run complete."

# Clean artifacts
clean:
	@echo ">> Cleaning artifacts..."
	@rm -rf __pycache__ .pytest_cache
	@find . -type f -name "*.pyc" -delete
	@echo ">> Clean complete."
```
#### 4. Database Safety: Schema Guard
Database operations require strict transaction handling and idempotency. The following pattern uses a version table to track applied migrations and ensures atomicity.
**File:** `scripts/schema_guard.py`
#!/usr/bin/env python3
"""
Idempotent schema migration guard.
Applies pending migrations within transactions.
"""
import psycopg2
import psycopg2.extras
import logging
from pathlib import Path
logger = logging.getLogger(__name__)
def ensure_version_table(conn):
    """Create the ``schema_version`` bookkeeping table when it is missing.

    Executes a ``CREATE TABLE IF NOT EXISTS`` statement on a cursor obtained
    from ``conn`` and then commits.

    Args:
        conn: Open database connection providing ``cursor()`` and ``commit()``.
    """
    # Statement text is kept flush-left so the literal matches the original.
    create_sql = """
CREATE TABLE IF NOT EXISTS schema_version (
version INTEGER PRIMARY KEY,
applied_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);
"""
    with conn.cursor() as cursor:
        cursor.execute(create_sql)
    conn.commit()
def get_applied_versions(conn):
    """Return the versions recorded in ``schema_version``, in ascending order.

    Args:
        conn: Open psycopg2 connection.

    Returns:
        A list holding the ``version`` column of every row, ordered by the
        query's ``ORDER BY version``.
    """
    dict_cursor = psycopg2.extras.RealDictCursor
    with conn.cursor(cursor_factory=dict_cursor) as cursor:
        cursor.execute("SELECT version FROM schema_version ORDER BY version;")
        rows = cursor.fetchall()
    return [record['version'] for record in rows]
def apply_migration(conn, version, sql_content):
    """Run one migration and record its version, committing both together.

    The migration SQL and the ``schema_version`` insert are executed on the
    same cursor and committed in a single ``commit()`` call.

    Args:
        conn: Open database connection.
        version: Version number to record in ``schema_version``.
        sql_content: SQL text of the migration.

    Raises:
        Exception: Whatever the failing step raised, re-raised unchanged after
            the connection has been rolled back and the error logged.
    """
    try:
        with conn.cursor() as cursor:
            cursor.execute(sql_content)
            cursor.execute("INSERT INTO schema_version (version) VALUES (%s);", (version,))
        conn.commit()
        logger.info(f"Applied migration {version}")
    except Exception as exc:
        conn.rollback()
        logger.error(f"Failed to apply migration {version}: {exc}")
        raise
def run_migrations(migrations_dir: Path, db_url: str):
    """Apply every ``*.sql`` migration that is not yet recorded as applied.

    A migration's version is the integer before the first underscore in the
    file name (``12_add_index.sql`` -> 12). Migrations run in ascending
    numeric version order, and versions already present in ``schema_version``
    are skipped.

    Args:
        migrations_dir: Directory containing the ``*.sql`` migration files.
        db_url: Connection string passed to ``psycopg2.connect``.

    Raises:
        ValueError: If a file name does not start with an integer prefix. All
            names are parsed before any migration is applied.
        Exception: Anything raised while connecting or applying a migration.
            The connection is closed in all cases once it has been opened.
    """
    conn = psycopg2.connect(db_url)
    try:
        ensure_version_table(conn)
        applied = set(get_applied_versions(conn))

        # Order by the parsed integer, not by path: a plain path sort is
        # lexicographic and would run 10_*.sql before 2_*.sql.
        migrations = sorted(
            ((int(path.stem.split('_')[0]), path) for path in migrations_dir.glob("*.sql")),
            key=lambda entry: (entry[0], entry[1].name),
        )

        for version, path in migrations:
            if version in applied:
                logger.debug(f"Skipping already applied migration {version}")
                continue
            logger.info(f"Applying migration {version} from {path.name}")
            apply_migration(conn, version, path.read_text())
        logger.info("Schema guard complete. No pending migrations.")
    finally:
        conn.close()
# Script entry point: the click CLI is only built when the file is executed
# directly, so importing this module does not require click.
if __name__ == '__main__':
    import click

    # Two positional arguments, in order: the migrations directory (converted
    # with Path) and the database URL, both forwarded to run_migrations.
    @click.command()
    @click.argument('migrations_dir', type=Path)
    @click.argument('db_url')
    def cli(migrations_dir, db_url):
        run_migrations(migrations_dir, db_url)

    cli()
#### 5. CI/CD Enforcement: Matrix Build with Caching
Continuous Integration must replicate local automation and add matrix testing and caching to accelerate feedback loops.
**File:** `.github/workflows/automation.yml`
name: Automation Pipeline

on:
  push:
    branches: [main, develop]
  pull_request:
    branches: [main]

jobs:
  # Lint, dry-run and test on every supported Python version.
  validate:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: ['3.9', '3.10', '3.11']
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Setup Python ${{ matrix.python-version }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}

      # The cache key changes whenever any requirements.txt changes;
      # restore-keys falls back to the newest cache for the same OS.
      - name: Cache pip dependencies
        uses: actions/cache@v4
        with:
          path: ~/.cache/pip
          key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements.txt') }}
          restore-keys: |
            ${{ runner.os }}-pip-

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt

      - name: Run linting
        run: make lint

      - name: Run dry-run validation
        run: make dry-run

      - name: Run tests
        run: pytest tests/ -v --cov=scripts

  # Package scripts + config, only for main and only after validate succeeds.
  deploy-artifact:
    needs: validate
    if: github.ref == 'refs/heads/main'
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Build artifact
        run: |
          mkdir -p dist
          cp -r scripts dist/
          cp automation.yaml dist/
          tar -czf automation-bundle-${{ github.sha }}.tar.gz -C dist .

      # v4 is used because v3 of upload-artifact has been retired by GitHub.
      - name: Upload artifact
        uses: actions/upload-artifact@v4
        with:
          name: automation-bundle
          path: automation-bundle-${{ github.sha }}.tar.gz
Pitfall Guide
-
Non-Idempotent Scripts
- Explanation: Scripts that fail or produce incorrect results when run multiple times. This breaks CI pipelines and retry logic.
- Fix: Always check state before action. Use `CREATE TABLE IF NOT EXISTS`, check file existence before overwriting, and design transformations to be convergent.
-
Hardcoded Secrets and Paths
- Explanation: Embedding credentials or absolute paths in scripts makes them insecure and non-portable.
- Fix: Use environment variables for secrets and configuration files for paths. Validate inputs at runtime. Never commit secrets to version control.
-
Silent Failures
- Explanation: Scripts that exit with code 0 even when errors occur, masking failures in pipelines.
- Fix: Use `set -e` in Bash, handle exceptions explicitly in Python, and return non-zero exit codes on failure. Implement structured logging to capture error context.
-
Script Sprawl and Duplication
- Explanation: Creating new scripts for every minor variation leads to maintenance nightmares and inconsistent behavior.
- Fix: Parameterize scripts using CLI arguments and config files. Use Makefile variables to pass context. Centralize common logic in shared modules.
-
Lack of Dry-Run Capability
- Explanation: Scripts that perform destructive actions without a preview mode increase risk during development and testing.
- Fix: Implement a `--dry-run` flag that logs intended actions without side effects. This is critical for file transformations and database operations.
-
Ignoring Cross-Platform Compatibility
- Explanation: Using OS-specific commands or path separators breaks automation on different developer machines or CI runners.
- Fix: Use `pathlib` in Python for path manipulation. Avoid Bashisms in shared scripts. Test automation on multiple OSes in CI matrix builds.
-
Missing Rollback Mechanisms
- Explanation: Automation that modifies state without a way to revert changes can cause prolonged outages.
- Fix: Implement backup steps before mutations. Version artifacts with commit hashes. Ensure database migrations are reversible where possible.
Production Bundle
Action Checklist
Decision Matrix
| Scenario | Recommended Approach | Why | Cost Impact |
|---|---|---|---|
| One-off data fix | Python script with dry-run | Fast execution, precise control, low setup overhead | Low dev cost, medium risk if not reviewed |
| Build orchestration | Makefile + Python scripts | Declarative dependencies, portable, consistent interface | Medium setup, high long-term ROI |
| Multi-env deployment | CI/CD pipeline with artifacts | Reproducible, auditable, enforces gates, rollback support | High setup cost, low operational risk |
| Database migrations | Schema guard with version table | Idempotent, transactional, tracks state across envs | Medium dev cost, critical for data integrity |
| Log aggregation | Bash glue + external tool | Leverages system utilities, low resource usage | Low cost, requires robust error handling |
Configuration Template
Use this template for automation.yaml to standardize script behavior.
# automation.yaml
# Central configuration for automation scripts
# NOTE(review): the document transformer's config loader rejects a file that
# lacks top-level `source_dir`, `replacements` and `metadata` keys, and it
# reads `file_pattern` from the top level as well. Here `source_dir` sits
# under `paths:` and the others follow `transform:` - confirm the intended
# nesting against the loader before using this template as-is.
project:
name: "my-service"
version: "1.0.0"
paths:
source_dir: "./docs"
output_dir: "./build"
migrations_dir: "./migrations"
transform:
file_pattern: "*.md"
# Keys are regular-expression patterns, values are the replacement text.
replacements:
"OLD_PROJECT_NAME": "NEW_PROJECT_NAME"
"v1\\.0": "v2.0"
metadata:
inject_header: true
project: "my-service"
database:
url_env: "DATABASE_URL"
timeout_seconds: 30
logging:
level: "INFO"
format: "json"
Quick Start Guide
-
Initialize Environment:
pip install -r requirements.txt
cp automation.example.yaml automation.yaml
# Edit automation.yaml with your project settings
-
Validate Configuration:
make check-env
-
Preview Changes:
make dry-run
# Review logs to ensure transformations are correct
-
Execute Pipeline:
make all
# Runs linting and transformation
-
Verify Results:
git diff
# Inspect changes before committing