A simple web crawler
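A minimal breadth-first crawler in Node.js: it fetches pages with got, parses them with cheerio, follows same-host links through a Set-based queue, and builds an inverted index mapping title words to the URLs they appear on.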
const got = require("got");
const cheerio = require("cheerio");
const url = require("url");
const URL = url.URL;

const INITIAL_URL = "https://bitcoin.org/";
const MAX_QUEUE_SIZE = 1000;
const MAX_RUNS = 10;
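// A root-relative href such as "/about" stays on the crawled host.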
function isRelativePath(url) {
  return url.startsWith("/");
}
class Crawler {
  constructor(initialUrl) {
    this.initialUrl = new URL(initialUrl);
    this.queue = new Set(); // URLs waiting to be crawled
    this.indexed = new Set(); // URLs already crawled
    this.titlesIndex = new Map(); // title word -> Set of URLs containing it
    this.numRuns = 0;
  }
  parse(html) {
    return cheerio.load(html);
  }
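  // Fetch a page, returning null on failure so the crawl can continue.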
  async fetch(url) {
    console.log(`Crawling ${url}...`);
    try {
      return await got(url);
    } catch (e) {
      console.error(`Error crawling ${url}`);
      return null;
    }
  }
  // Find all links and filter out external links
  getInboundLinks(find) {
    return find("a").filter((idx, link) => {
      const href = link.attribs.href;
      if (!href) return false;
      return isRelativePath(href) || href.includes(this.initialUrl.hostname);
    });
  }
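  // Queue a URL for crawling, enforcing the hard queue cap.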
  addLink(url) {
    if (this.queue.size >= MAX_QUEUE_SIZE) {
      throw new Error("Queue reached maximum limit");
    }
    // Skip pages already crawled; the Set deduplicates the queue itself.
    if (this.indexed.has(url)) return;
    this.queue.add(url);
  }
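  // Resolve root-relative hrefs against the initial protocol and host.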
  addLinksToQueue(links) {
    const { protocol, hostname } = this.initialUrl;
    links.each((idx, link) => {
      const href = link.attribs.href;
      if (isRelativePath(href)) {
        const url = `${protocol}//${hostname}${href}`;
        this.addLink(url);
        return;
      }
      this.addLink(href);
    });
  }
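  // Add each word of the page title to the inverted index.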
  indexPage(url, title) {
    if (title) {
      const words = title.split(" ");
      words.forEach(word => {
        if (this.titlesIndex.has(word)) {
          this.titlesIndex.get(word).add(url); // Set.add mutates in place
        } else {
          this.titlesIndex.set(word, new Set([url]));
        }
      });
    }
  }
  markAsIndexed(url) {
    this.queue.delete(url);
    this.indexed.add(url);
  }
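  // Sets iterate in insertion order, so the queue behaves as FIFO.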
  getNextUrl() {
    if (this.queue.size) {
      const it = this.queue.values();
      return it.next().value;
    }
    return null;
  }
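  // Crawl one page: index its title, queue its inbound links, then schedule the next URL.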
  async crawlPage(url) {
    const response = await this.fetch(url);
    if (!response) {
      throw new Error(`Unable to fetch: ${url}`);
    }
    const find = this.parse(response.body);
    const title = find("title").text();
    console.log("Page Title", title);
    this.indexPage(url, title);
    const links = this.getInboundLinks(find);
    this.addLinksToQueue(links);
    console.log("Queue size", this.queue.size);
    this.markAsIndexed(url);
    this.numRuns++;
    const nextUrl = this.getNextUrl();
    if (nextUrl && this.numRuns < MAX_RUNS) {
      // Schedule the next crawl on a fresh tick to keep the call stack shallow.
      process.nextTick(() => this.crawlPage(nextUrl));
    } else {
      console.log("Titles Indexed", this.titlesIndex);
      console.log("All done");
    }
  }
  async start() {
    await this.crawlPage(this.initialUrl.toString());
  }
}
// Run the crawler
const crawler = new Crawler(INITIAL_URL);
crawler.start().catch(err => console.error(err.message));
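As a sketch of how the finished titlesIndex might be queried once the crawl completes (the search helper below is hypothetical, not part of the gist):

// Hypothetical helper: every crawled URL whose <title> contained the word.
function search(crawler, word) {
  return crawler.titlesIndex.get(word) || new Set();
}

// e.g. console.log([...search(crawler, "Bitcoin")]);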