rvalMs: number;
maxScrollAttempts: number;
fixedElementSelectors: string[];
lazyImageSelectors: string[];
}
/**
 * Baseline capture settings. Callers may override any subset of these;
 * unspecified fields fall back to the values below.
 */
const DEFAULT_CONFIG: CaptureConfig = {
  // Viewport width in CSS pixels; also used as the width of the screenshot clip.
  viewportWidth: 1280,

  // Distance scrolled per step, in pixels.
  scrollStepPx: 400,

  // Pause between consecutive scroll steps, in milliseconds.
  scrollIntervalMs: 100,

  // Number of scroll steps allowed before the scroll routine gives up.
  maxScrollAttempts: 50,

  // Candidates that are checked for `position: fixed` before capture.
  fixedElementSelectors: [
    '[class*="fixed"]',
    '[class*="sticky"]',
    'header',
    'footer',
  ],

  // Images that may need their real source promoted from `data-src`.
  lazyImageSelectors: [
    'img[data-src]',
    'img[loading="lazy"]',
  ],
};
/**
 * Captures a full-height PNG of a page in four stages: neutralise fixed
 * elements, scroll to trigger lazy content, measure the real content height,
 * then screenshot with an explicit clip rectangle.
 */
export class ScreenshotOrchestrator {
  /**
   * Hard ceiling on the total number of scroll steps, expressed as a multiple
   * of `maxScrollAttempts`. The per-growth attempt counter is reset whenever
   * the page gets taller, so without this ceiling an endlessly growing feed
   * would keep the scroll routine running forever.
   */
  private static readonly TOTAL_STEP_FACTOR = 10;

  private browser: Browser;
  private config: CaptureConfig;

  /**
   * @param browser Browser instance used to open one page per capture.
   * @param config  Overrides merged on top of `DEFAULT_CONFIG`.
   */
  constructor(browser: Browser, config: Partial<CaptureConfig> = {}) {
    this.browser = browser;
    this.config = { ...DEFAULT_CONFIG, ...config };
  }

  /**
   * Opens `url` in a new page and returns a PNG of its full content height.
   * The page is always closed, whether the capture succeeds or fails.
   *
   * @param url Address to navigate to.
   * @returns The encoded PNG image.
   */
  async capture(url: string): Promise<Buffer> {
    const page = await this.browser.newPage();
    try {
      // Inside the try block so a viewport failure still closes the page.
      await page.setViewport({ width: this.config.viewportWidth, height: 800 });
      await page.goto(url, { waitUntil: 'networkidle0', timeout: 30000 });

      // Stage 1: Normalize DOM to prevent ghosting
      await this.normalizeFixedElements(page);

      // Stage 2: Trigger lazy loads and infinite scroll
      await this.executeScrollRoutine(page);

      // Stage 3: Calculate true content height
      const height = await this.calculateTrueHeight(page);

      // Stage 4: Capture with precision
      await page.setViewport({ width: this.config.viewportWidth, height });
      const buffer = await page.screenshot({
        type: 'png',
        clip: { x: 0, y: 0, width: this.config.viewportWidth, height },
      });
      return buffer;
    } finally {
      await page.close();
    }
  }

  /**
   * Switches every configured element whose computed position is `fixed` to
   * `absolute`, recording the original value in `data-capture-orig-pos`.
   */
  private async normalizeFixedElements(page: Page): Promise<void> {
    // An empty list would join to '' and make querySelectorAll throw.
    if (this.config.fixedElementSelectors.length === 0) {
      return;
    }
    const selectors = this.config.fixedElementSelectors.join(',');
    await page.evaluate((sel) => {
      document.querySelectorAll(sel).forEach((el) => {
        // Only HTML and SVG elements expose an inline `style` declaration.
        if (!(el instanceof HTMLElement || el instanceof SVGElement)) {
          return;
        }
        const style = window.getComputedStyle(el);
        if (style.position === 'fixed') {
          el.setAttribute('data-capture-orig-pos', style.position);
          el.style.position = 'absolute';
        }
      });
    }, selectors);
  }

  /**
   * Scrolls down the page step by step, then promotes `data-src` image
   * sources and waits (best effort, up to 10 s) for all images to finish.
   * The page is scrolled back to the top before this method returns.
   */
  private async executeScrollRoutine(page: Page): Promise<void> {
    const { scrollStepPx, scrollIntervalMs, maxScrollAttempts } = this.config;
    const scrollParams = {
      scrollStepPx,
      scrollIntervalMs,
      maxScrollAttempts,
      maxTotalSteps: maxScrollAttempts * ScreenshotOrchestrator.TOTAL_STEP_FACTOR,
    };

    await page.evaluate((cfg) => {
      return new Promise<void>((resolve) => {
        let attempts = 0;
        let totalSteps = 0;
        let lastHeight = document.body.scrollHeight;

        const finish = (): void => {
          window.scrollTo(0, 0);
          resolve();
        };

        const scrollStep = (): void => {
          window.scrollBy(0, cfg.scrollStepPx);
          attempts++;
          totalSteps++;

          // `attempts` restarts on growth; `totalSteps` never does, which
          // guarantees termination on pages that grow without end.
          if (attempts >= cfg.maxScrollAttempts || totalSteps >= cfg.maxTotalSteps) {
            finish();
            return;
          }

          const currentHeight = document.body.scrollHeight;
          if (currentHeight > lastHeight) {
            lastHeight = currentHeight;
            attempts = 0; // Reset attempts if content grows
          }

          if (window.scrollY + window.innerHeight >= document.body.scrollHeight) {
            finish();
          } else {
            setTimeout(scrollStep, cfg.scrollIntervalMs);
          }
        };

        scrollStep();
      });
    }, scrollParams);

    // Handle data-src pattern. This runs before the image wait so that the
    // sources assigned here are included in that wait.
    if (this.config.lazyImageSelectors.length > 0) {
      await page.evaluate((selectors) => {
        document.querySelectorAll(selectors).forEach((el) => {
          const img = el as HTMLImageElement;
          if (img.dataset.src && !img.src) {
            img.src = img.dataset.src;
          }
        });
      }, this.config.lazyImageSelectors.join(','));
    }

    // Wait for images triggered by scroll to complete
    await page
      .waitForFunction(
        () => Array.from(document.images).every((img) => img.complete),
        { timeout: 10000 },
      )
      .catch(() => {
        // Timeout implies some images failed or are still loading; proceed with capture
      });
  }

  /**
   * Returns the largest of the standard body/html height metrics and the
   * lowest bottom edge of any element (rounded up), so content positioned
   * outside the normal flow is still inside the capture.
   */
  private async calculateTrueHeight(page: Page): Promise<number> {
    return page.evaluate(() => {
      const body = document.body;
      const html = document.documentElement;
      const flowHeight = Math.max(
        body.scrollHeight,
        body.offsetHeight,
        html.clientHeight,
        html.scrollHeight,
        html.offsetHeight,
      );

      // Check for absolutely positioned elements extending beyond flow
      let maxBottom = 0;
      document.querySelectorAll('*').forEach((el) => {
        const bottom = el.getBoundingClientRect().bottom + window.scrollY;
        if (bottom > maxBottom) {
          maxBottom = bottom;
        }
      });

      return Math.max(flowHeight, Math.ceil(maxBottom));
    });
  }
}
#### Rationale
* **`fixedElementSelectors`**: The orchestrator accepts specific selectors rather than using a universal query. This reduces DOM traversal overhead. If selectors are unknown, a broader pattern can be passed, but targeted lists are preferred for performance.
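* **Scroll Routine**: The scroll step uses `400px` increments with a `100ms` delay. The pause gives Intersection Observer callbacks and lazy-load handlers time to run between steps. Faster scrolling may skip lazy-load triggers; slower scrolling increases execution time unnecessarily. The routine resets the attempt counter when content height increases, accommodating infinite scroll patterns. Because every reset extends the run, pair it with an overall step or time limit so that a feed that never stops growing cannot keep the routine running indefinitely.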
* **Height Aggregation**: The `calculateTrueHeight` method compares `scrollHeight`, `offsetHeight`, and `clientHeight` across both `body` and `documentElement`. This handles discrepancies caused by CSS resets or framework-specific styling. The secondary check using `getBoundingClientRect` ensures absolutely positioned elements that break out of the document flow are included in the capture bounds.
* **`clip` vs `fullPage`**: Using `clip` with a calculated height avoids the internal tiling logic of `fullPage: true`, which can introduce seams or artifacts on very tall pages.
### Pitfall Guide
Production screenshot automation requires anticipating failure modes. The following pitfalls represent common mistakes observed in automated pipelines.
1. **The Universal Selector Trap**
* *Explanation:* Using `querySelectorAll('*')` to find fixed elements or measure height forces the browser to traverse every node in the DOM. On pages with large trees, this causes significant latency and can trigger memory pressure.
* *Fix:* Use targeted selectors for known patterns (e.g., `.navbar`, `#sticky-footer`). For height measurement, rely on aggregated metrics rather than iterating all elements unless absolutely positioned outliers are suspected.
2. **Ignoring the Texture Ceiling**
* *Explanation:* Chrome enforces a maximum texture size, typically 16,384 pixels. If a page exceeds this, the screenshot will silently clip or the browser may crash. Developers often assume `fullPage: true` handles arbitrary lengths.
* *Fix:* Implement a height check before capture. If the calculated height exceeds the limit, switch to a segmented capture strategy where the page is captured in chunks and stitched together programmatically.
3. **Lazy Load Race Conditions**
* *Explanation:* Scrolling too quickly prevents Intersection Observers from firing, leaving images as placeholders. Conversely, waiting indefinitely for network idle after every scroll step can cause timeouts on pages with continuous background requests.
* *Fix:* Use a controlled scroll interval (e.g., 100ms). After scrolling, wait for `Array.from(document.images).every(img => img.complete)` rather than network idle (`document.images` is an `HTMLCollection`, so it must be converted to an array before calling `every`). This focuses on asset loading without being blocked by telemetry or analytics requests.
4. **Overflow Lock Persistence**
* *Explanation:* Many sites apply `overflow: hidden` via CSS classes to lock scrolling when modals or menus are open. Overriding inline styles may not work if the class re-applies dynamically or if the style is enforced by a stylesheet with higher specificity.
* *Fix:* Remove the specific classes responsible for overflow locking (e.g., `no-scroll`, `modal-open`) in addition to setting inline styles to `visible`. Run this normalization immediately before capture to minimize the window for re-application.
5. **Height Measurement Drift**
* *Explanation:* Relying solely on `document.documentElement.scrollHeight` can result in cut-offs if the page contains absolutely positioned elements that extend beyond the document flow, or if margins collapse differently in headless mode.
* *Fix:* Aggregate multiple height metrics using `Math.max`. For critical accuracy, perform a bounding rect scan of all elements to find the true bottom edge. Because that scan is expensive, reserve it for cases where the standard metrics fail.
6. **Iframe Synchronization**
* *Explanation:* Iframes load asynchronously and may not be ready when the main page capture occurs. This results in blank rectangles where embedded content should appear.
* *Fix:* Wait for iframe load events or specific selectors within frames before capturing. Use `page.frames()` to iterate and ensure critical iframes have reached a stable state. Note that cross-origin iframes may have restrictions; ensure the capture environment has appropriate permissions.
### Production Bundle
#### Action Checklist
- [ ] **Define Selectors:** Identify fixed-position and lazy-load selectors for target sites to optimize DOM queries.
- [ ] **Set Texture Limit:** Implement a height threshold check (e.g., 16,000px) to trigger segmented capture for long pages.
- [ ] **Configure Scroll Timing:** Adjust scroll step and interval based on target site behavior; default to 400px/100ms.
- [ ] **Handle Overflow Classes:** List CSS classes that lock overflow and include them in the normalization routine.
- [ ] **Verify Iframe Readiness:** Add waits for critical iframe content to ensure embedded media renders correctly.
- [ ] **Test Height Metrics:** Validate height calculation against known pages to ensure no cut-offs or excessive blank space.
- [ ] **Implement Segmentation:** For pages exceeding texture limits, develop a stitching pipeline to combine multiple captures.
#### Decision Matrix
| Scenario | Recommended Approach | Why | Cost Impact |
| :--- | :--- | :--- | :--- |
| **Standard Marketing Page** | DOM Normalize + Scroll + Clip | Resolves fixed elements and lazy loads efficiently. | Low CPU, Fast execution. |
| **Infinite Scroll Feed** | Segmented Capture + Stitch | Bypasses texture limits; handles dynamic content growth. | High CPU/IO, Slower execution. |
| **Pixel-Perfect Archive** | RectCalc + Clip | Ensures absolutely positioned elements are included. | Medium CPU, Reliable accuracy. |
| **High-Volume Batch** | Targeted Selectors + Optimized Scroll | Reduces DOM traversal overhead for throughput. | Low CPU, Scalable. |
#### Configuration Template
Use this map of named profiles (each a `Partial<CaptureConfig>`) to configure the orchestrator for different site profiles.
```typescript
/**
 * Named configuration presets. Each entry is a partial override that the
 * orchestrator merges on top of its defaults.
 */
const siteProfiles: Record<string, Partial<CaptureConfig>> = {
  // General-purpose settings.
  default: {
    viewportWidth: 1280,
    scrollStepPx: 400,
    scrollIntervalMs: 100,
    fixedElementSelectors: [
      '[class*="fixed"]',
      '[class*="sticky"]',
    ],
    lazyImageSelectors: [
      'img[data-src]',
      'img[loading="lazy"]',
    ],
  },

  // Smaller steps, longer pauses and a higher attempt budget.
  aggressiveLazyLoad: {
    scrollStepPx: 200,
    scrollIntervalMs: 150,
    maxScrollAttempts: 100,
  },

  // Explicit selectors for a site's own pinned elements.
  stickyHeaderSite: {
    fixedElementSelectors: [
      '.site-header',
      '#nav-bar',
      '.floating-action',
    ],
  },
};
```
#### Quick Start Guide
1. **Install Dependencies:** Run `npm install puppeteer typescript @types/puppeteer`.
2. **Initialize Browser:** Create a browser instance with `puppeteer.launch({ headless: 'new' })`.
3. **Instantiate Orchestrator:** Create a `ScreenshotOrchestrator` with the desired configuration profile.
4. **Execute Capture:** Call `orchestrator.capture(url)` and save the returned buffer to a file.
5. **Cleanup:** Close the browser instance to release resources.
/**
 * Example entry point: launches a browser, captures https://example.com with
 * the default profile, and writes the PNG to `capture.png`.
 *
 * A failed capture is logged and marks the process as failed via
 * `process.exitCode`; the browser is closed in every case.
 */
async function main(): Promise<void> {
  const browser = await puppeteer.launch({ headless: 'new' });
  try {
    const orchestrator = new ScreenshotOrchestrator(browser, siteProfiles.default);
    const buffer = await orchestrator.capture('https://example.com');

    // Imported locally so this snippet does not depend on a file-level import.
    const { writeFile } = await import('node:fs/promises');
    await writeFile('capture.png', buffer);
    console.log('Capture saved.');
  } catch (error) {
    console.error('Capture failed:', error);
    process.exitCode = 1;
  } finally {
    await browser.close();
  }
}

// Launching or closing the browser happens outside main's own try/catch, so
// handle a rejection here instead of leaving a floating promise.
main().catch((error: unknown) => {
  console.error('Capture failed:', error);
  process.exitCode = 1;
});