Advanced, general-purpose website scraper built with Puppeteer (stealth and adblocker plugins)
import puppeteer from 'puppeteer-extra';
import { LaunchOptions } from 'puppeteer';
import fs from 'fs/promises';
import path from 'path';
import { fileURLToPath } from 'url';
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
import AdblockerPlugin from 'puppeteer-extra-plugin-adblocker';
// A single line read from one of the word-list files.
interface Row {
  text: string;
}

// Recreate __filename/__dirname, which are not defined in ES modules.
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);

class TrieNode {
  children: { [key: string]: TrieNode } = {};
  isEndOfWord: boolean = false;
}
// Prefix tree used for exact-word and prefix lookups against the filter lists.
class Trie {
  private root: TrieNode = new TrieNode();

  public insert(word: string): void {
    let node = this.root;
    for (const char of word) {
      if (!node.children[char]) {
        node.children[char] = new TrieNode();
      }
      node = node.children[char];
    }
    node.isEndOfWord = true;
  }

  public search(word: string): boolean {
    let node = this.root;
    for (const char of word) {
      if (!node.children[char]) {
        return false;
      }
      node = node.children[char];
    }
    return node.isEndOfWord;
  }

  public startsWith(prefix: string): boolean {
    let node = this.root;
    for (const char of prefix) {
      if (!node.children[char]) {
        return false;
      }
      node = node.children[char];
    }
    return true;
  }
}
// Deduplicate an array by inserting every item into a Set.
const createHashSet = <T>(arr: T[]): Set<T> => {
  const hashSet: Set<T> = new Set<T>();
  for (const item of arr) {
    hashSet.add(item);
  }
  return hashSet;
};
// Split text into sentence-like segments and drop repeated segments within it.
const removeInternalDuplicates = (text: string): string => {
  const segments: RegExpMatchArray = text.match(/[^.!?]+[.!?]+/g) || [text];
  const uniqueSegments: Set<string> = new Set<string>();
  const result: string[] = [];
  for (const segment of segments) {
    if (!uniqueSegments.has(segment)) {
      uniqueSegments.add(segment);
      result.push(segment);
    }
  }
  return result.join(' ');
};

// Note: currently identical to removeInternalDuplicates; both deduplicate
// sentence-like segments within a single string.
const removeExternalDuplicates = (text: string): string => {
  const segments: RegExpMatchArray = text.match(/[^.!?]+[.!?]+/g) || [text];
  const uniqueSegments: Set<string> = new Set<string>();
  const result: string[] = [];
  for (const segment of segments) {
    if (!uniqueSegments.has(segment)) {
      uniqueSegments.add(segment);
      result.push(segment);
    }
  }
  return result.join(' ');
};

const removeDuplicates = (text: string): string => {
  return removeInternalDuplicates(removeExternalDuplicates(text));
};
// Sleep helper: resolves after the given number of seconds.
const wait = (s: number) => new Promise(r => setTimeout(r, s * 1000));
class Scraper {
  private cachedData: Row[] | null = null;
  private cachedNewWords: Row[] | null = null;
  private filterTrie: Trie | null = null;
  private filterDict: Set<string> | null = null;
  private newWordsTree: Trie | null = null;
  private newWordsDict: Set<string> | null = null;
  private cachedNSFW: string[] | null = null;

  constructor() {
    puppeteer.use(StealthPlugin());
    puppeteer.use(AdblockerPlugin({ blockTrackers: true }));
  }

  // Load the word lists once and index them in both a Trie and a Set.
  // Note: the nsfw-names list is loaded here but not yet consulted by filterText.
  private async initializeFilterWords(): Promise<void> {
    const filePath = path.join(__dirname, 'data', 'slurs.txt');
    const nsfwFilePath = path.join(__dirname, 'data', 'nsfw-names.txt');
    if (!this.cachedData?.length) {
      this.cachedData = await this.parseTXT(filePath);
      this.filterTrie = new Trie();
      this.filterDict = new Set<string>();
      for (const row of this.cachedData) {
        this.filterTrie.insert(row.text);
        this.filterDict.add(row.text);
      }
    }
    if (!this.cachedNewWords?.length) {
      this.cachedNewWords = await this.parseTXT(nsfwFilePath);
      this.newWordsTree = new Trie();
      this.newWordsDict = new Set<string>();
      for (const row of this.cachedNewWords) {
        this.newWordsTree.insert(row.text);
        this.newWordsDict.add(row.text);
      }
    }
  }

  // Read a newline-delimited word list into Row objects, skipping blank lines.
  private async parseTXT(filePath: string): Promise<Row[]> {
    const data = await fs.readFile(filePath, 'utf8');
    return data.split('\n').filter(line => line.trim()).map(line => ({ text: line.trim() }));
  }

  // Replace any word found in the filter list with the given placeholder.
  private async filterText(text: string, replace: string): Promise<string> {
    if (!text) return text;
    const words = text.split(' ');
    for (let i = 0; i < words.length; i++) {
      const word = words[i];
      if (this.filterDict!.has(word) || this.filterTrie!.search(word)) {
        words[i] = replace;
      }
    }
    return words.join(' ');
  }
  // Scrape a page, skipping NSFW-listed domains and censoring filter-list words.
  public async scrape(url: Readonly<string>): Promise<{ flaggedDomain: boolean, containsCensored: boolean, filteredTexts: string[] }> {
    const filePath = path.join(__dirname, 'data', 'nsfw.txt');
    try {
      if (!this.cachedNSFW) {
        const rows = await this.parseTXT(filePath);
        this.cachedNSFW = rows.map(row => row.text);
      }
      const nsfw = this.cachedNSFW;
      // URLs with a protocol keep their hostname at index 2 of the '/' split;
      // bare "www.example.com/..." style URLs keep it at index 0.
      const domain = url.includes('://') ? url.split('/')[2] : url.split('/')[0];
      if (nsfw.includes(domain)) {
        return {
          flaggedDomain: true,
          containsCensored: false,
          filteredTexts: [],
        };
      }
      await this.initializeFilterWords();
      const browser = await puppeteer.launch(<LaunchOptions>{
        headless: true,
        args: ['--no-sandbox', '--disable-setuid-sandbox'],
        timeout: 0,
      });
      const page = await browser.newPage();
      const response = await page.goto(url, { timeout: 0, waitUntil: "domcontentloaded" });
      await wait(5);
      // Re-check the domain after any redirects.
      const finalUrl = response?.url() || url;
      const finalDomain = finalUrl.includes('://') ? finalUrl.split('/')[2] : finalUrl.split('/')[0];
      if (this.cachedNSFW.includes(finalDomain)) {
        await browser.close();
        return {
          flaggedDomain: true,
          containsCensored: false,
          filteredTexts: [],
        };
      }
      // Collect visible text from common content elements, stripping emoji,
      // non-ASCII characters, and extra whitespace.
      const texts = await page.evaluate(() => {
        const elements = [...document.querySelectorAll('p, div, span, a, h1, h2, h3, h4, h5, h6, li')];
        let filteredElements = elements.map(el => el.textContent?.trim() || '').filter(text => text.length > 0);
        for (let i = 0; i < filteredElements.length; i++) {
          let text = filteredElements[i];
          text = text.replace(/[\u{1F600}-\u{1F64F}]/gu, '');
          text = text.replace(/[^\x00-\x7F]/g, '');
          text = text.replace(/\s+/g, ' ');
          text = text.replace(/[\n\r]+/g, ' ');
          filteredElements[i] = text;
        }
        return filteredElements;
      });
      await browser.close();
      // Deduplicate within each block, then across blocks, then censor.
      const processedTexts = texts.map(removeDuplicates);
      const hashSet = createHashSet(processedTexts);
      const filteredTexts = await Promise.all(
        Array.from(hashSet).map(async text => await this.filterText(text, '***'))
      );
      const containsCensored = filteredTexts.some(text => text.includes('***'));
      return {
        flaggedDomain: false,
        containsCensored: containsCensored,
        filteredTexts: filteredTexts,
      };
    } catch (error) {
      console.error('Error during scraping:', error);
      throw new Error('Scraping failed.');
    }
  }
}

export default Scraper;
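A minimal usage sketch. The driver file name and the './scraper' import path are assumptions; the class itself only expects the data/slurs.txt, data/nsfw-names.txt, and data/nsfw.txt word lists to sit in a data/ folder next to the compiled file.

// example.ts — hypothetical driver; adjust the import path to wherever the class above is saved.
import Scraper from './scraper';

const scraper = new Scraper();

scraper
  .scrape('https://example.com/')
  .then(({ flaggedDomain, containsCensored, filteredTexts }) => {
    if (flaggedDomain) {
      console.log('Domain is on the NSFW list; nothing was scraped.');
      return;
    }
    console.log(`Censored words found: ${containsCensored}`);
    console.log(`Unique text blocks extracted: ${filteredTexts.length}`);
  })
  .catch((error) => console.error(error));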