Last active
September 25, 2024 05:21
-
-
Save prescience-data/f0b16d053bda3e43ddeb4ef3556fd2d9 to your computer and use it in GitHub Desktop.
Strip Page With Puppeteer
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import { Buffer } from "buffer"
import { createHash } from "crypto"
import { HTTPRequest, Protocol } from "puppeteer-core"
import { isPuppeteerPage, Page } from "../types"
/**
 * Shorthand for the result type of the CDP `DOMSnapshot.captureSnapshot` command.
 */
type CaptureSnapshotResponse = Protocol.DOMSnapshot.CaptureSnapshotResponse
/**
 * Asynchronous listener that receives a single puppeteer request.
 */
type RequestHook = (request: HTTPRequest) => Promise<void>
/**
 * A single captured image.
 * Exported because it is reachable through the return type of `stripPage`.
 */
export interface ImageSnapshot {
  /** Hex md5 digest of `url`; also used as the key in the image map. */
  hash: string
  /** Url the image was requested from. */
  url: string
  /** Response body encoded as a base64 string. */
  data: string
}
/**
 * Result of stripping a page: the url that was loaded, the raw DOM snapshot
 * and every image captured while loading it.
 * Exported so callers of `stripPage` can name its return type.
 */
export interface PageSnapshot {
  /** Url that was passed to `stripPage`. */
  url: string
  /** Raw `DOMSnapshot.captureSnapshot` response. */
  dom: CaptureSnapshotResponse
  /** Captured images, keyed by the md5 hash of their url. */
  images: Map<string, ImageSnapshot>
}
/**
 * Determines if a value is an image url.
 *
 * Looks at the first `.ext` segment (3-4 word characters) that is followed by
 * the end of the string, a query string (`?`) or a fragment (`#`), and compares
 * it case-insensitively against the supported image extensions.
 *
 * @param {string} value
 * @return {boolean}
 */
const isImage = (value: string): boolean => {
  // Capture only the extension itself. The terminator must stay outside the
  // compared text, otherwise `.png?v=1` yields `.png?` and is never recognised.
  const extension: string =
    value.match(/(\.\w{3,4})(?:$|[?#])/)?.[1]?.toLowerCase() ?? ``
  return [".jpg", ".jpeg", ".png", ".gif"].includes(extension)
}
/**
 * Generates the hex-encoded md5 digest of a string.
 * Used in this file to derive a compact map key from an image url;
 * it is an identifier, not a security measure.
 *
 * @param {string} value
 * @return {string}
 */
const md5 = (value: string): string =>
  createHash("md5").update(value).digest("hex")
/**
 * Captures a complete snapshot of the DOM using CDP.
 * Note: A `DOMSnapshot` is represented as a complex table,
 * so reconstitution may be desired prior to persisting to database.
 *
 * @see https://chromedevtools.github.io/devtools-protocol/tot/DOMSnapshot/#method-captureSnapshot
 * @param {Page} page
 * @return {Promise<CaptureSnapshotResponse>}
 * @throws {Error} When the capture command resolves without a snapshot.
 */
const captureSnapshot = async (
  page: Page
): Promise<CaptureSnapshotResponse> => {
  // Use raw CDP to capture a snapshot of the DOM.
  // Wait for the domain to be enabled before asking for the snapshot, rather
  // than dispatching both commands at once and relying on their ordering.
  await page.client().send(`DOMSnapshot.enable`)
  const snapshot = await page.client().send(`DOMSnapshot.captureSnapshot`, {
    computedStyles: [], // Add styles to capture inline here...
    includeDOMRects: false,
    includePaintOrder: false
  })
  if (!snapshot) {
    throw new Error(`Failed to capture snapshot.`)
  }
  return snapshot
}
/**
 * Request listener to push all image responses to a provided map as base64.
 *
 * Entries are keyed by the md5 hash of the request url, so each url is stored
 * at most once. Requests whose url is not an image, that have no response, or
 * whose body cannot be read are skipped.
 *
 * @param {Map<string, ImageSnapshot>} images
 * @return {RequestHook}
 */
const captureImages =
  (images: Map<string, ImageSnapshot>): RequestHook =>
  async (request: HTTPRequest): Promise<void> => {
    const url: string = request.url()
    // Filter first: hashing every non-image request would be wasted work.
    if (!isImage(url)) {
      return
    }
    const hash: string = md5(url)
    if (images.has(hash)) {
      return
    }
    let buffer: Buffer | undefined
    try {
      buffer = await (await request.response())?.buffer()
    } catch {
      // Reading the body can reject. This runs as an event listener, so a
      // rejection here would surface as an unhandled rejection instead of
      // reaching any caller. Capturing is best-effort: skip this image.
      return
    }
    if (buffer) {
      const data: string = buffer.toString("base64")
      images.set(hash, {
        hash,
        url,
        data
      })
    }
  }
/**
 * Captures a snapshot of specified url including DOM and images (as base64 strings).
 *
 * The `requestfinished` listener is attached only for the duration of the call
 * and is removed even when navigation or the DOM capture fails. Image bodies
 * that are still being read when the DOM has been captured are awaited before
 * the result is returned.
 *
 * @param {Page} page
 * @param {string} url
 * @return {Promise<PageSnapshot>}
 */
export const stripPage = async (
  page: Page,
  url: string
): Promise<PageSnapshot> => {
  // Create a map of all images received by the page.
  const images: Map<string, ImageSnapshot> = new Map<string, ImageSnapshot>()
  // Track every in-flight capture so the result is not returned half-filled.
  const pending: Promise<void>[] = []
  const capture: RequestHook = captureImages(images)
  const onRequestFinished = (request: HTTPRequest): void => {
    // A single unreadable image must neither abort the snapshot nor become an
    // unhandled rejection, so failures are dropped per request.
    pending.push(capture(request).catch((): void => undefined))
  }
  // Listen to all finished requests and capture images to the image map.
  // A similar approach can be implemented for any other asset type, however the DOMSnapshot has the ability to inline styles.
  page.on("requestfinished", onRequestFinished)
  let dom: CaptureSnapshotResponse
  try {
    // Load the intended url.
    await page.goto(url, {
      waitUntil: ["domcontentloaded", "networkidle2"]
    })
    // Capture a DOM Snapshot lookup table (for later recomposition via async queue worker).
    dom = await captureSnapshot(page)
  } finally {
    // Always detach: otherwise every call leaves another listener on the page
    // and earlier image maps keep receiving entries.
    page.off("requestfinished", onRequestFinished)
  }
  // Let the image reads that were already started finish.
  await Promise.all(pending)
  // Return a complete object to save in database.
  return {
    url,
    dom,
    images
  }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment