Skip to content

Instantly share code, notes, and snippets.

@almost
Created November 22, 2018 09:30
Show Gist options
  • Save almost/9ee99b1a3e7fa240c596be3820c0b6b0 to your computer and use it in GitHub Desktop.
Save almost/9ee99b1a3e7fa240c596be3820c0b6b0 to your computer and use it in GitHub Desktop.
"use strict";
const url = require("url");
const rp = require("request-promise-native");
const getHrefs = require("get-hrefs");
// Upper bound on how many requests the Crawler keeps in flight at once.
const MAX_CONCURRENT = 10;
// Upper bound on the total number of requests the Crawler issues in one run.
const MAX_COUNT = 2000;
// Hostnames whose links are followed. Membership is an exact match on the
// parsed hostname, so a subdomain such as "www." is not covered implicitly.
const ALLOW_DOMAINS = new Set(["almostobsolete.net", "tomparslow.co.uk"]);
// Seed URLs passed to the Crawler at the bottom of the file.
const START_URLS = ["http://almostobsolete.net/"];
/**
 * Fetch a page and extract the links it contains.
 *
 * The page is requested through `rp` (request-promise-native) and the
 * resolved value is handed to `getHrefs` (get-hrefs) together with the page's
 * own address as `baseUrl` (presumably so relative links can be resolved).
 *
 * @param {string} url - Address of the page to fetch.
 * @returns {Promise<*>} Whatever `getHrefs` returns for the fetched body.
 *   Rejects if the request itself rejects.
 */
async function getHrefsFromUrl(url) {
  const requestOptions = { url: url };
  const pageBody = await rp(requestOptions);
  const extractOptions = { baseUrl: url };
  return getHrefs(pageBody, extractOptions);
}
/**
 * Crawler that follows links on the hosts listed in ALLOW_DOMAINS.
 *
 * URLs are taken from a FIFO queue. At most MAX_CONCURRENT requests are in
 * flight at any moment and at most MAX_COUNT requests are issued in total.
 * Constructing an instance starts fetching immediately and prints a status
 * report once per second.
 */
class Crawler {
  /**
   * @param {Iterable<string>} startUrls - URLs to seed the crawl with.
   */
  constructor(startUrls) {
    this.totalRequested = 0;
    this.inFlight = new Set();
    this.seen = new Set(startUrls);
    // Build the queue from the de-duplicated set so a start URL listed twice
    // is only fetched once.
    this.queue = [...this.seen];
    this.fillQueue();
    // unref() keeps the status timer from holding the process open by itself.
    setInterval(this.showStatus.bind(this), 1000).unref();
  }

  /**
   * Queue every not-yet-seen href whose hostname is in ALLOW_DOMAINS.
   *
   * Hrefs that cannot be parsed are skipped so that one malformed link does
   * not discard the remaining links found on the same page.
   *
   * @param {Iterable<string>} hrefs - Links extracted from a fetched page.
   */
  processHrefs(hrefs) {
    for (const href of hrefs) {
      let hostname;
      try {
        hostname = url.parse(href).hostname;
      } catch (e) {
        // url.parse throws on non-string input and on undecodable auth.
        continue;
      }
      if (!ALLOW_DOMAINS.has(hostname) || this.seen.has(href)) {
        continue;
      }
      this.seen.add(href);
      this.queue.push(href);
    }
  }

  /**
   * Fetch one URL, queue the links found on it, then top the pipeline up.
   *
   * Never rejects for fetch or link-processing failures: they are logged and
   * the crawl carries on. The caller is expected to have added `url` to
   * `inFlight` already; it is always removed again here.
   *
   * Note: the `url` parameter shadows the `url` module inside this method.
   *
   * @param {string} url - Address to fetch.
   * @returns {Promise<void>}
   */
  async makeRequest(url) {
    this.totalRequested += 1;
    try {
      const hrefs = await getHrefsFromUrl(url);
      if (hrefs) {
        this.processHrefs(hrefs);
      }
    } catch (e) {
      // HTTP failures carry a statusCode; network-level failures do not, so
      // fall back to the error message instead of printing "undefined".
      const hasStatus = e != null && e.statusCode !== undefined;
      const detail = hasStatus ? e.statusCode : (e && e.message) || e;
      console.log("Failed to get", detail, url);
    } finally {
      // Always release the slot, otherwise a failure would permanently
      // reduce the number of requests that can run concurrently.
      this.inFlight.delete(url);
    }
    this.fillQueue();
  }

  /**
   * Start queued requests until the concurrency limit is reached, the queue
   * is empty, or the total request budget is spent.
   */
  fillQueue() {
    while (
      this.inFlight.size < MAX_CONCURRENT &&
      this.queue.length &&
      this.totalRequested < MAX_COUNT
    ) {
      const next = this.queue.shift();
      this.inFlight.add(next);
      // Deliberately not awaited: makeRequest handles its own errors and
      // calls fillQueue again when it finishes.
      this.makeRequest(next);
    }
  }

  /**
   * Print the current queue length, in-flight URLs and counters.
   */
  showStatus() {
    console.log(
      "--------------------------------------------------------------------------------"
    );
    console.log("Queue size:", this.queue.length);
    console.log("In flight:", this.inFlight);
    console.log("Total requested", this.totalRequested);
    console.log("Seen", this.seen.size);
  }
}
// Entry point: constructing the Crawler starts the crawl right away, because
// its constructor fills the request queue and schedules the status timer.
new Crawler(START_URLS);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment