Advanced, general-purpose website scraper built on Puppeteer with stealth and ad-blocking plugins; it deduplicates scraped text, censors filtered words, and short-circuits on known NSFW domains.
import puppeteer from 'puppeteer-extra';
import { LaunchOptions } from 'puppeteer';
import fs from 'fs/promises';
import path from 'path';
import { fileURLToPath } from 'url';
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
import AdblockerPlugin from 'puppeteer-extra-plugin-adblocker';
interface Row {
  text: string;
}

const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);

class TrieNode {
  children: { [key: string]: TrieNode } = {};
  isEndOfWord: boolean = false;
}
// Prefix tree for exact and prefix lookups over the filter word lists.
class Trie {
  private root: TrieNode = new TrieNode();

  public insert(word: string): void {
    let node = this.root;
    for (const char of word) {
      if (!node.children[char]) {
        node.children[char] = new TrieNode();
      }
      node = node.children[char];
    }
    node.isEndOfWord = true;
  }

  public search(word: string): boolean {
    let node = this.root;
    for (const char of word) {
      if (!node.children[char]) {
        return false;
      }
      node = node.children[char];
    }
    return node.isEndOfWord;
  }

  public startsWith(prefix: string): boolean {
    let node = this.root;
    for (const char of prefix) {
      if (!node.children[char]) {
        return false;
      }
      node = node.children[char];
    }
    return true;
  }
}
// A Set keeps only the first occurrence of each item, so duplicates are dropped.
const createHashSet = <T>(arr: T[]): Set<T> => {
  return new Set<T>(arr);
};
// Splits text into sentence-like segments (ending in . ! or ?) and keeps only
// the first occurrence of each segment within the string.
const removeInternalDuplicates = (text: string): string => {
  const segments: RegExpMatchArray = text.match(/[^.!?]+[.!?]+/g) || [text];
  const uniqueSegments: Set<string> = new Set<string>();
  const result: string[] = [];
  for (const segment of segments) {
    if (!uniqueSegments.has(segment)) {
      uniqueSegments.add(segment);
      result.push(segment);
    }
  }
  return result.join(' ');
};

// Currently identical to removeInternalDuplicates: both deduplicate at the
// sentence level within a single string.
const removeExternalDuplicates = (text: string): string => {
  const segments: RegExpMatchArray = text.match(/[^.!?]+[.!?]+/g) || [text];
  const uniqueSegments: Set<string> = new Set<string>();
  const result: string[] = [];
  for (const segment of segments) {
    if (!uniqueSegments.has(segment)) {
      uniqueSegments.add(segment);
      result.push(segment);
    }
  }
  return result.join(' ');
};

const removeDuplicates = (text: string): string => {
  return removeInternalDuplicates(removeExternalDuplicates(text));
};

// Pause execution for the given number of seconds.
const wait = (s: number) => new Promise(r => setTimeout(r, s * 1000));
class Scraper {
  // Cached word lists and lookup structures, loaded lazily on first scrape.
  private cachedData: Row[] | null = null;
  private cachedNewWords: Row[] | null = null;
  private filterTrie: Trie | null = null;
  private filterDict: Set<string> | null = null;
  private newWordsTree: Trie | null = null;
  private newWordsDict: Set<string> | null = null;
  private cachedNSFW: string[] | null = null;

  constructor() {
    puppeteer.use(StealthPlugin());
    puppeteer.use(AdblockerPlugin({ blockTrackers: true }));
  }
  // Loads the slur and NSFW-name word lists into a trie and a set for fast lookups.
  private async initializeFilterWords(): Promise<void> {
    const filePath = path.join(__dirname, 'data', 'slurs.txt');
    const nsfwFilePath = path.join(__dirname, 'data', 'nsfw-names.txt');
    if (!this.cachedData?.length) {
      this.cachedData = await this.parseTXT(filePath);
      this.filterTrie = new Trie();
      this.filterDict = new Set<string>();
      for (const row of this.cachedData) {
        this.filterTrie.insert(row.text);
        this.filterDict.add(row.text);
      }
    }
    if (!this.cachedNewWords?.length) {
      this.cachedNewWords = await this.parseTXT(nsfwFilePath);
      this.newWordsTree = new Trie();
      this.newWordsDict = new Set<string>();
      for (const row of this.cachedNewWords) {
        this.newWordsTree.insert(row.text);
        this.newWordsDict.add(row.text);
      }
    }
  }
  // Reads a newline-delimited word list into Row objects, skipping blank lines.
  private async parseTXT(filePath: string): Promise<Row[]> {
    const data = await fs.readFile(filePath, 'utf8');
    return data.split('\n').filter(line => line.trim()).map(line => ({ text: line.trim() }));
  }

  // Replaces any word found in the filter set or trie with the replacement string.
  private async filterText(text: string, replace: string): Promise<string> {
    if (!text) return text;
    const words = text.split(' ');
    for (let i = 0; i < words.length; i++) {
      const word = words[i];
      if (this.filterDict!.has(word) || this.filterTrie!.search(word)) {
        words[i] = replace;
      }
    }
    return words.join(' ');
  }
  public async scrape(url: Readonly<string>): Promise<{ flaggedDomain: boolean, containsCensored: boolean, filteredTexts: string[] }> {
    const filePath = path.join(__dirname, 'data', 'nsfw.txt');
    try {
      if (!this.cachedNSFW) {
        const rows = await this.parseTXT(filePath);
        this.cachedNSFW = rows.map(row => row.text);
      }
      const nsfw = this.cachedNSFW;
      // Extract the hostname: protocol-prefixed URLs ('https://host/...') keep the
      // host in the third '/'-separated segment; bare 'www' URLs are checked as-is.
      if (nsfw.includes(url.split('/')[0] === 'www' ? url : url.split('/')[2])) return {
        flaggedDomain: true,
        containsCensored: false,
        filteredTexts: [],
      };
      await this.initializeFilterWords();
      const browser = await puppeteer.launch(<LaunchOptions>{
        headless: true,
        args: ['--no-sandbox', '--disable-setuid-sandbox'],
        timeout: 0,
      });
      const page = await browser.newPage();
      const response = await page.goto(url, { timeout: 0, waitUntil: "domcontentloaded" });
      await wait(5);
      // Re-check the domain after any redirects.
      const finalUrl = response?.url() || url;
      if (this.cachedNSFW.includes(finalUrl.split('/')[0] === 'www' ? finalUrl : finalUrl.split('/')[2])) {
        await browser.close();
        return {
          flaggedDomain: true,
          containsCensored: false,
          filteredTexts: [],
        };
      }
      // Collect visible text, then strip emoji, non-ASCII characters, and extra whitespace.
      const texts = await page.evaluate(() => {
        const elements = [...document.querySelectorAll('p, div, span, a, h1, h2, h3, h4, h5, h6, li')];
        const filteredElements = elements.map(el => el.textContent?.trim() || '').filter(text => text.length > 0);
        for (let i = 0; i < filteredElements.length; i++) {
          let text = filteredElements[i];
          text = text.replace(/[\u{1F600}-\u{1F64F}]/gu, '');
          text = text.replace(/[^\x00-\x7F]/g, '');
          text = text.replace(/\s+/g, ' ');
          text = text.replace(/[\n\r]+/g, ' ');
          filteredElements[i] = text;
        }
        return filteredElements;
      });
      await browser.close();
      // Deduplicate sentences within and across elements, then censor filtered words.
      const processedTexts = texts.map(removeDuplicates);
      const hashSet = createHashSet(processedTexts);
      const filteredTexts = await Promise.all(
        Array.from(hashSet).map(text => this.filterText(text, '***'))
      );
      const containsCensored = filteredTexts.some(text => text.includes('***'));
      return {
        flaggedDomain: false,
        containsCensored,
        filteredTexts,
      };
    } catch (error) {
      console.error('Error during scraping:', error);
      throw new Error('Scraping failed.');
    }
  }
}
export default Scraper;
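A minimal usage sketch, assuming the class above is saved as scraper.ts next to a data/ directory containing slurs.txt, nsfw-names.txt, and nsfw.txt (the file name and example URL are illustrative assumptions, not part of the gist):

// usage.ts (hypothetical entry point)
import Scraper from './scraper';

const scraper = new Scraper();
scraper
  .scrape('https://example.com')
  .then(({ flaggedDomain, containsCensored, filteredTexts }) => {
    console.log('Flagged domain:', flaggedDomain);
    console.log('Contains censored text:', containsCensored);
    console.log('Sample output:', filteredTexts.slice(0, 5));
  })
  .catch(console.error);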