Skip to content

Instantly share code, notes, and snippets.

@WomB0ComB0
Last active July 31, 2024 16:14
Show Gist options
  • Save WomB0ComB0/cd5425416c74f74353e979678d59027a to your computer and use it in GitHub Desktop.
Save WomB0ComB0/cd5425416c74f74353e979678d59027a to your computer and use it in GitHub Desktop.
Simple text content web scraper with puppeteer
import puppeteer from "puppeteer";
import axios from "axios";
/**
 * Scrapes the visible text content of a web page using headless Puppeteer.
 *
 * The page is loaded, given a fixed 5 second grace period for client-side
 * rendering, and then the text of every `p`, `div`, `span`, `a`, `h1`-`h6`
 * and `li` element is collected, normalized to single-spaced ASCII and
 * de-duplicated.
 *
 * @param url - Address of the page to scrape.
 * @returns Unique, non-empty text snippets, grouped by tag in the order listed
 *   above and in document order within each tag.
 * @throws Propagates any error raised by Puppeteer (launch, navigation or
 *   evaluation); the browser is always closed before the error surfaces.
 */
export const scraper = async (url: Readonly<string>): Promise<string[]> => {
  const browser = await puppeteer.launch({ headless: true });
  try {
    const page = await browser.newPage();
    await page.goto(url, {
      // 0 disables the navigation timeout, so slow pages are waited on indefinitely.
      timeout: 0,
      waitUntil: "domcontentloaded"
    });
    // Fixed delay so that scripts running after DOMContentLoaded can populate the DOM.
    await new Promise((resolve) => setTimeout(resolve, 5000));

    // NOTE: this callback is serialized and executed inside the page, so it
    // must be fully self-contained (no references to outer-scope helpers).
    return await page.evaluate((): string[] => {
      const tags = ['p', 'div', 'span', 'a', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li'];
      // A Set keeps first-seen insertion order while dropping duplicates.
      const unique = new Set<string>();
      for (const tag of tags) {
        for (const el of Array.from(document.querySelectorAll(tag))) {
          const text = (el.textContent ?? '')
            // Collapse every whitespace run (incl. newlines and NBSP) first so
            // that non-ASCII spaces still act as word separators.
            .replace(/\s+/g, ' ')
            // Drop everything outside 7-bit ASCII (this also covers emoji).
            .replace(/[^\x00-\x7F]/g, '')
            // Stripping characters can leave adjacent spaces behind.
            .replace(/ {2,}/g, ' ')
            .trim();
          // Filter AFTER cleaning: text that was entirely non-ASCII is now empty.
          if (text.length > 0) {
            unique.add(text);
          }
        }
      }
      return Array.from(unique);
    });
  } finally {
    // Always release the browser process, even when navigation or evaluation fails.
    await browser.close();
  }
};
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment