Created
November 22, 2018 09:30
-
-
Save almost/9ee99b1a3e7fa240c596be3820c0b6b0 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"use strict"; | |
const url = require("url"); | |
const rp = require("request-promise-native"); | |
const getHrefs = require("get-hrefs"); | |
// Upper bound on the number of requests that may be in flight at once.
const MAX_CONCURRENT = 10;
// Hard cap on the total number of requests issued during one run.
const MAX_COUNT = 2000;
// Only links whose hostname exactly matches one of these are followed.
const ALLOW_DOMAINS = new Set(["almostobsolete.net", "tomparslow.co.uk"]);
// Seed URLs the crawl starts from.
const START_URLS = ["http://almostobsolete.net/"];
/**
 * Fetch a page and extract the hrefs found in its body.
 *
 * @param {string} url - Address of the page to download; also passed to
 *   get-hrefs as `baseUrl`.
 * @returns {Promise<string[]>} Whatever `getHrefs` returns for the body
 *   (presumably an array of URL strings — confirm against get-hrefs docs).
 * @throws Rejects with whatever error the `rp` request rejects with.
 */
async function getHrefsFromUrl(url) {
  const body = await rp({ url });
  return getHrefs(body, { baseUrl: url });
}
/**
 * Crawler that follows links on allow-listed hosts using a FIFO queue.
 *
 * Constructing an instance starts the crawl immediately: the start URLs are
 * queued, requests are launched up to MAX_CONCURRENT at a time, and a status
 * report is printed once per second. At most MAX_COUNT requests are issued.
 */
class Crawler {
  /**
   * @param {string[]} startUrls - URLs used to seed both the queue and the
   *   set of already-seen URLs.
   */
  constructor(startUrls) {
    this.totalRequested = 0;
    this.inFlight = new Set();
    this.seen = new Set(startUrls);
    this.queue = [...startUrls];
    this.fillQueue();
    // unref() so the status timer on its own never keeps the process alive.
    setInterval(this.showStatus.bind(this), 1000).unref();
  }

  /**
   * Queue every not-yet-seen href whose hostname is in ALLOW_DOMAINS.
   *
   * @param {Iterable<string>} hrefs - Links extracted from a fetched page.
   */
  processHrefs(hrefs) {
    for (const href of hrefs) {
      let hostname;
      try {
        hostname = url.parse(href).hostname;
      } catch (err) {
        // url.parse throws on input it cannot handle (e.g. a non-string);
        // one bad link must not abort processing of the rest of the page.
        console.log("Skipping unparseable href", href);
        continue;
      }
      if (!ALLOW_DOMAINS.has(hostname) || this.seen.has(href)) {
        continue;
      }
      this.seen.add(href);
      this.queue.push(href);
    }
  }

  /**
   * Fetch one URL, enqueue the links it contains, then release its
   * in-flight slot and top the queue back up.
   *
   * @param {string} url - The URL to fetch; expected to already be in
   *   `this.inFlight`.
   * @returns {Promise<void>}
   */
  async makeRequest(url) {
    this.totalRequested += 1;
    try {
      const hrefs = await getHrefsFromUrl(url);
      if (hrefs) {
        this.processHrefs(hrefs);
      }
    } catch (e) {
      // statusCode may be absent (e.g. when the error is not an HTTP
      // response), so fall back to the message instead of "undefined".
      console.log("Failed to get", e.statusCode || e.message, url);
    } finally {
      // Always free the slot, otherwise a failure would permanently
      // reduce the effective concurrency.
      this.inFlight.delete(url);
      this.fillQueue();
    }
  }

  /**
   * Start queued requests until the concurrency limit, the request budget,
   * or the end of the queue is reached.
   */
  fillQueue() {
    while (
      this.inFlight.size < MAX_CONCURRENT &&
      this.queue.length &&
      this.totalRequested < MAX_COUNT
    ) {
      const nextUrl = this.queue.shift();
      this.inFlight.add(nextUrl);
      // Deliberately not awaited: requests run concurrently. The catch
      // keeps an unexpected failure from becoming an unhandled rejection.
      this.makeRequest(nextUrl).catch((err) => {
        console.log("Unexpected crawler error", nextUrl, err);
      });
    }
  }

  /** Print a snapshot of the crawl's progress to the console. */
  showStatus() {
    console.log(
      "--------------------------------------------------------------------------------"
    );
    console.log("Queue size:", this.queue.length);
    console.log("In flight:", this.inFlight);
    console.log("Total requested", this.totalRequested);
    console.log("Seen", this.seen.size);
  }
}
// Kick off the crawl; the constructor starts issuing requests immediately.
new Crawler(START_URLS);
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment