Last active
July 31, 2024 16:14
-
-
Save WomB0ComB0/cd5425416c74f74353e979678d59027a to your computer and use it in GitHub Desktop.
Simple text content web scraper with puppeteer
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import puppeteer from "puppeteer"; | |
import axios from "axios"; | |
/**
 * Collects the distinct text snippets found on a web page.
 *
 * Launches a headless Puppeteer browser, navigates to `url`, waits a fixed
 * five seconds so client-side rendering can settle, then reads the
 * `textContent` of every `p`, `div`, `span`, `a`, `h1`–`h6` and `li` element.
 * Each text is reduced to ASCII, has runs of whitespace collapsed to a single
 * space and is trimmed; texts that end up empty are dropped and duplicates are
 * removed while keeping first-seen order (grouped by tag in the order listed
 * above, document order within each tag).
 *
 * Container elements such as `div` contribute the concatenated text of all of
 * their descendants, so the result usually contains overlapping snippets.
 *
 * @param url - Address of the page to scrape.
 * @returns The unique, cleaned text snippets found on the page.
 * @throws Propagates any error raised by Puppeteer while launching, navigating
 *   or evaluating; the browser is closed before the error reaches the caller.
 */
export const scraper = async (url: Readonly<string>): Promise<string[]> => {
  const browser = await puppeteer.launch({ headless: true });
  try {
    const page = await browser.newPage();
    await page.goto(url, {
      // 0 disables Puppeteer's navigation timeout, so a page that never
      // reaches DOMContentLoaded will keep this call pending indefinitely.
      timeout: 0,
      waitUntil: "domcontentloaded"
    });
    // Fixed grace period for scripts that populate the DOM after load.
    await new Promise((resolve) => setTimeout(resolve, 5000));

    // This callback is serialized and executed inside the page, so it must be
    // self-contained and cannot reference anything from the Node.js scope.
    return await page.evaluate(() => {
      const tags = ['p', 'div', 'span', 'a', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li'];
      // A Set both deduplicates and preserves first-insertion order.
      const unique = new Set<string>();
      for (const tag of tags) {
        for (const el of Array.from(document.querySelectorAll(tag))) {
          const text = (el.textContent ?? '')
            // Drop every non-ASCII code unit (this also removes emoji).
            .replace(/[^\x00-\x7F]/g, '')
            // Collapse whitespace runs, including line breaks, to one space.
            .replace(/\s+/g, ' ')
            // Trim last: stripping characters can expose edge whitespace.
            .trim();
          if (text.length > 0) {
            unique.add(text);
          }
        }
      }
      return Array.from(unique);
    });
  } finally {
    // Always release the Chromium process, even when scraping fails.
    await browser.close();
  }
};
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment