A simple web crawler
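A minimal breadth-first crawler in Node.js: it fetches pages with got, parses them with cheerio, follows same-host links through a Set-based queue, and builds an inverted index mapping title words to the URLs they appear on.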
const got = require("got");
const cheerio = require("cheerio");
const url = require("url");
const URL = url.URL;

const INITIAL_URL = "https://bitcoin.org/";
const MAX_QUEUE_SIZE = 1000;
const MAX_RUNS = 10;
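// A root-relative href such as "/about" stays on the crawled host.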
function isRelativePath(url) {
  return url.startsWith("/");
}
class Crawler {
  constructor(initialUrl) {
    this.initialUrl = new URL(initialUrl);
    this.queue = new Set(); // URLs waiting to be crawled
    this.indexed = new Set(); // URLs already crawled
    this.titlesIndex = new Map(); // title word -> Set of URLs containing it
    this.numRuns = 0;
  }
  parse(html) {
    return cheerio.load(html);
  }
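  // Fetch a page, returning null on failure so the crawl can continue.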
  async fetch(url) {
    console.log(`Crawling ${url}...`);
    try {
      return await got(url);
    } catch (e) {
      console.error(`Error crawling ${url}`);
      return null;
    }
  }
  // Find all links and filter out external links
  getInboundLinks(find) {
    return find("a").filter((idx, link) => {
      const href = link.attribs.href;
      if (!href) return false;
      return isRelativePath(href) || href.includes(this.initialUrl.hostname);
    });
  }
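  // Queue a URL for crawling, enforcing the hard queue cap.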
  addLink(url) {
    if (this.queue.size >= MAX_QUEUE_SIZE) {
      throw new Error("Queue reached maximum limit");
    }
    // Skip pages already crawled; the Set deduplicates the queue itself.
    if (this.indexed.has(url)) return;
    this.queue.add(url);
  }
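  // Resolve root-relative hrefs against the initial protocol and host.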
  addLinksToQueue(links) {
    const { protocol, hostname } = this.initialUrl;
    links.each((idx, link) => {
      const href = link.attribs.href;
      if (isRelativePath(href)) {
        const url = `${protocol}//${hostname}${href}`;
        this.addLink(url);
        return;
      }
      this.addLink(href);
    });
  }
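  // Add each word of the page title to the inverted index.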
  indexPage(url, title) {
    if (title) {
      const words = title.split(" ");
      words.forEach(word => {
        if (this.titlesIndex.has(word)) {
          this.titlesIndex.get(word).add(url); // Set.add mutates in place
        } else {
          this.titlesIndex.set(word, new Set([url]));
        }
      });
    }
  }
  markAsIndexed(url) {
    this.queue.delete(url);
    this.indexed.add(url);
  }
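  // Sets iterate in insertion order, so the queue behaves as FIFO.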
  getNextUrl() {
    if (this.queue.size) {
      const it = this.queue.values();
      return it.next().value;
    }
    return null;
  }
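  // Crawl one page: index its title, queue its inbound links, then schedule the next URL.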
  async crawlPage(url) {
    const response = await this.fetch(url);
    if (!response) {
      throw new Error(`Unable to fetch: ${url}`);
    }
    const find = this.parse(response.body);
    const title = find("title").text();
    console.log("Page Title", title);
    this.indexPage(url, title);
    const links = this.getInboundLinks(find);
    this.addLinksToQueue(links);
    console.log("Queue size", this.queue.size);
    this.markAsIndexed(url);
    this.numRuns++;
    const nextUrl = this.getNextUrl();
    if (nextUrl && this.numRuns < MAX_RUNS) {
      // Schedule the next crawl on a fresh tick to keep the call stack shallow.
      process.nextTick(() => this.crawlPage(nextUrl));
    } else {
      console.log("Titles Indexed", this.titlesIndex);
      console.log("All done");
    }
  }
  async start() {
    await this.crawlPage(this.initialUrl.toString());
  }
}
// Run the crawler
const crawler = new Crawler(INITIAL_URL);
crawler.start().catch(err => console.error(err.message));
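As a sketch of how the finished titlesIndex might be queried once the crawl completes (the search helper below is hypothetical, not part of the gist):

// Hypothetical helper: every crawled URL whose <title> contained the word.
function search(crawler, word) {
  return crawler.titlesIndex.get(word) || new Set();
}

// e.g. console.log([...search(crawler, "Bitcoin")]);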