Advanced, general-purpose website scraper built on Puppeteer with stealth and ad-blocking plugins; it deduplicates scraped text, censors filtered words, and short-circuits on known NSFW domains.
import puppeteer from 'puppeteer-extra';
import { LaunchOptions } from 'puppeteer';
import fs from 'fs/promises';
import path from 'path';
import { fileURLToPath } from 'url';
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
import AdblockerPlugin from 'puppeteer-extra-plugin-adblocker';
interface Row {
  text: string;
}

const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);

class TrieNode {
  children: { [key: string]: TrieNode } = {};
  isEndOfWord: boolean = false;
}
// Prefix tree for exact and prefix lookups over the filter word lists.
class Trie {
  private root: TrieNode = new TrieNode();

  public insert(word: string): void {
    let node = this.root;
    for (const char of word) {
      if (!node.children[char]) {
        node.children[char] = new TrieNode();
      }
      node = node.children[char];
    }
    node.isEndOfWord = true;
  }

  public search(word: string): boolean {
    let node = this.root;
    for (const char of word) {
      if (!node.children[char]) {
        return false;
      }
      node = node.children[char];
    }
    return node.isEndOfWord;
  }

  public startsWith(prefix: string): boolean {
    let node = this.root;
    for (const char of prefix) {
      if (!node.children[char]) {
        return false;
      }
      node = node.children[char];
    }
    return true;
  }
}
// A Set keeps only the first occurrence of each item, so duplicates are dropped.
const createHashSet = <T>(arr: T[]): Set<T> => {
  return new Set<T>(arr);
};
// Splits text into sentence-like segments (ending in . ! or ?) and keeps only
// the first occurrence of each segment within the string.
const removeInternalDuplicates = (text: string): string => {
  const segments: RegExpMatchArray = text.match(/[^.!?]+[.!?]+/g) || [text];
  const uniqueSegments: Set<string> = new Set<string>();
  const result: string[] = [];
  for (const segment of segments) {
    if (!uniqueSegments.has(segment)) {
      uniqueSegments.add(segment);
      result.push(segment);
    }
  }
  return result.join(' ');
};

// Currently identical to removeInternalDuplicates: both deduplicate at the
// sentence level within a single string.
const removeExternalDuplicates = (text: string): string => {
  const segments: RegExpMatchArray = text.match(/[^.!?]+[.!?]+/g) || [text];
  const uniqueSegments: Set<string> = new Set<string>();
  const result: string[] = [];
  for (const segment of segments) {
    if (!uniqueSegments.has(segment)) {
      uniqueSegments.add(segment);
      result.push(segment);
    }
  }
  return result.join(' ');
};

const removeDuplicates = (text: string): string => {
  return removeInternalDuplicates(removeExternalDuplicates(text));
};

// Pause execution for the given number of seconds.
const wait = (s: number) => new Promise(r => setTimeout(r, s * 1000));
class Scraper {
  // Cached word lists and lookup structures, loaded lazily on first scrape.
  private cachedData: Row[] | null = null;
  private cachedNewWords: Row[] | null = null;
  private filterTrie: Trie | null = null;
  private filterDict: Set<string> | null = null;
  private newWordsTree: Trie | null = null;
  private newWordsDict: Set<string> | null = null;
  private cachedNSFW: string[] | null = null;

  constructor() {
    puppeteer.use(StealthPlugin());
    puppeteer.use(AdblockerPlugin({ blockTrackers: true }));
  }
  // Loads the slur and NSFW-name word lists into a trie and a set for fast lookups.
  private async initializeFilterWords(): Promise<void> {
    const filePath = path.join(__dirname, 'data', 'slurs.txt');
    const nsfwFilePath = path.join(__dirname, 'data', 'nsfw-names.txt');
    if (!this.cachedData?.length) {
      this.cachedData = await this.parseTXT(filePath);
      this.filterTrie = new Trie();
      this.filterDict = new Set<string>();
      for (const row of this.cachedData) {
        this.filterTrie.insert(row.text);
        this.filterDict.add(row.text);
      }
    }
    if (!this.cachedNewWords?.length) {
      this.cachedNewWords = await this.parseTXT(nsfwFilePath);
      this.newWordsTree = new Trie();
      this.newWordsDict = new Set<string>();
      for (const row of this.cachedNewWords) {
        this.newWordsTree.insert(row.text);
        this.newWordsDict.add(row.text);
      }
    }
  }
  // Reads a newline-delimited word list into Row objects, skipping blank lines.
  private async parseTXT(filePath: string): Promise<Row[]> {
    const data = await fs.readFile(filePath, 'utf8');
    return data.split('\n').filter(line => line.trim()).map(line => ({ text: line.trim() }));
  }

  // Replaces any word found in the filter set or trie with the replacement string.
  private async filterText(text: string, replace: string): Promise<string> {
    if (!text) return text;
    const words = text.split(' ');
    for (let i = 0; i < words.length; i++) {
      const word = words[i];
      if (this.filterDict!.has(word) || this.filterTrie!.search(word)) {
        words[i] = replace;
      }
    }
    return words.join(' ');
  }
  public async scrape(url: Readonly<string>): Promise<{ flaggedDomain: boolean, containsCensored: boolean, filteredTexts: string[] }> {
    const filePath = path.join(__dirname, 'data', 'nsfw.txt');
    try {
      if (!this.cachedNSFW) {
        const rows = await this.parseTXT(filePath);
        this.cachedNSFW = rows.map(row => row.text);
      }
      const nsfw = this.cachedNSFW;
      // Extract the hostname: protocol-prefixed URLs ('https://host/...') keep the
      // host in the third '/'-separated segment; bare 'www' URLs are checked as-is.
      if (nsfw.includes(url.split('/')[0] === 'www' ? url : url.split('/')[2])) return {
        flaggedDomain: true,
        containsCensored: false,
        filteredTexts: [],
      };
      await this.initializeFilterWords();
      const browser = await puppeteer.launch(<LaunchOptions>{
        headless: true,
        args: ['--no-sandbox', '--disable-setuid-sandbox'],
        timeout: 0,
      });
      const page = await browser.newPage();
      const response = await page.goto(url, { timeout: 0, waitUntil: "domcontentloaded" });
      await wait(5);
      // Re-check the domain after any redirects.
      const finalUrl = response?.url() || url;
      if (this.cachedNSFW.includes(finalUrl.split('/')[0] === 'www' ? finalUrl : finalUrl.split('/')[2])) {
        await browser.close();
        return {
          flaggedDomain: true,
          containsCensored: false,
          filteredTexts: [],
        };
      }
      // Collect visible text, then strip emoji, non-ASCII characters, and extra whitespace.
      const texts = await page.evaluate(() => {
        const elements = [...document.querySelectorAll('p, div, span, a, h1, h2, h3, h4, h5, h6, li')];
        const filteredElements = elements.map(el => el.textContent?.trim() || '').filter(text => text.length > 0);
        for (let i = 0; i < filteredElements.length; i++) {
          let text = filteredElements[i];
          text = text.replace(/[\u{1F600}-\u{1F64F}]/gu, '');
          text = text.replace(/[^\x00-\x7F]/g, '');
          text = text.replace(/\s+/g, ' ');
          text = text.replace(/[\n\r]+/g, ' ');
          filteredElements[i] = text;
        }
        return filteredElements;
      });
      await browser.close();
      // Deduplicate sentences within and across elements, then censor filtered words.
      const processedTexts = texts.map(removeDuplicates);
      const hashSet = createHashSet(processedTexts);
      const filteredTexts = await Promise.all(
        Array.from(hashSet).map(text => this.filterText(text, '***'))
      );
      const containsCensored = filteredTexts.some(text => text.includes('***'));
      return {
        flaggedDomain: false,
        containsCensored,
        filteredTexts,
      };
    } catch (error) {
      console.error('Error during scraping:', error);
      throw new Error('Scraping failed.');
    }
  }
}
export default Scraper;
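A minimal usage sketch, assuming the class above is saved as scraper.ts next to a data/ directory containing slurs.txt, nsfw-names.txt, and nsfw.txt (the file name and example URL are illustrative assumptions, not part of the gist):

// usage.ts (hypothetical entry point)
import Scraper from './scraper';

const scraper = new Scraper();
scraper
  .scrape('https://example.com')
  .then(({ flaggedDomain, containsCensored, filteredTexts }) => {
    console.log('Flagged domain:', flaggedDomain);
    console.log('Contains censored text:', containsCensored);
    console.log('Sample output:', filteredTexts.slice(0, 5));
  })
  .catch(console.error);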