WomB0ComB0 · July 31, 2024 16:14
diff --git a/scraper.ts b/scraper.ts
 import puppeteer from "puppeteer";
 import axios from "axios";

 /**
  * Loads `url` in a headless browser and returns the de-duplicated, ASCII-only
  * text found in common text-bearing elements (p, div, span, a, h1-h6, li).
  *
  * Texts are collected tag by tag in the order listed below; within a tag they
  * follow document order. Duplicates are dropped, keeping the first occurrence.
  *
  * @param url - Address of the page to scrape.
  * @returns Unique, whitespace-normalised, non-empty text fragments.
  * @throws Propagates any error raised while launching the browser, navigating
  *   or evaluating; the browser is closed before the error reaches the caller.
  */
 export const scraper = async (url: Readonly<string>): Promise<string[]> => {
   const browser = await puppeteer.launch({ headless: true });

   // try/finally guarantees the browser process is shut down even when
   // navigation or evaluation throws; otherwise it would be leaked.
   try {
     const page = await browser.newPage();

     // timeout: 0 disables the navigation timeout, so this waits for
     // DOMContentLoaded for as long as the page takes.
     await page.goto(url, {
       timeout: 0,
       waitUntil: "domcontentloaded"
     });

     // Fixed 5 s pause after DOMContentLoaded before reading the DOM.
     await new Promise((resolve) => setTimeout(resolve, 5000));

     // The callback runs inside the page, so it must be self-contained:
     // it cannot reference helpers or variables from this module.
     const content: string[] = await page.evaluate(() => {
       const tags = ['p', 'div', 'span', 'a', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li'];

       const texts = tags
         .flatMap((tag) =>
           Array.from(document.querySelectorAll(tag), (el) =>
             (el.textContent ?? '')
               // Dropping every non-ASCII code unit also removes emoji
               // (both surrogate halves), so no separate emoji pass is needed.
               .replace(/[^\x00-\x7F]/g, '')
               // \s already covers \n and \r.
               .replace(/\s+/g, ' ')
               .trim()
           )
         )
         // Filter after cleaning so text that was entirely non-ASCII does
         // not survive as an empty string.
         .filter((text) => text.length > 0);

       return Array.from(new Set(texts));
     });

     return content;
   } finally {
     await browser.close();
   }
 };
	import puppeteer from "puppeteer";
	import axios from "axios";

	/**
	 * Loads `url` in a headless browser and returns the de-duplicated, ASCII-only
	 * text found in common text-bearing elements (p, div, span, a, h1-h6, li).
	 *
	 * Texts are collected tag by tag in the order listed below; within a tag they
	 * follow document order. Duplicates are dropped, keeping the first occurrence.
	 *
	 * @param url - Address of the page to scrape.
	 * @returns Unique, whitespace-normalised, non-empty text fragments.
	 * @throws Propagates any error raised while launching the browser, navigating
	 *   or evaluating; the browser is closed before the error reaches the caller.
	 */
	export const scraper = async (url: Readonly<string>): Promise<string[]> => {
		const browser = await puppeteer.launch({ headless: true });

		// try/finally guarantees the browser process is shut down even when
		// navigation or evaluation throws; otherwise it would be leaked.
		try {
			const page = await browser.newPage();

			// timeout: 0 disables the navigation timeout, so this waits for
			// DOMContentLoaded for as long as the page takes.
			await page.goto(url, {
				timeout: 0,
				waitUntil: "domcontentloaded"
			});

			// Fixed 5 s pause after DOMContentLoaded before reading the DOM.
			await new Promise((resolve) => setTimeout(resolve, 5000));

			// The callback runs inside the page, so it must be self-contained:
			// it cannot reference helpers or variables from this module.
			const content: string[] = await page.evaluate(() => {
				const tags = ['p', 'div', 'span', 'a', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li'];

				const texts = tags
					.flatMap((tag) =>
						Array.from(document.querySelectorAll(tag), (el) =>
							(el.textContent ?? '')
								// Dropping every non-ASCII code unit also removes emoji
								// (both surrogate halves), so no separate emoji pass is needed.
								.replace(/[^\x00-\x7F]/g, '')
								// \s already covers \n and \r.
								.replace(/\s+/g, ' ')
								.trim()
						)
					)
					// Filter after cleaning so text that was entirely non-ASCII does
					// not survive as an empty string.
					.filter((text) => text.length > 0);

				return Array.from(new Set(texts));
			});

			return content;
		} finally {
			await browser.close();
		}
	};