Last active
November 22, 2018 09:30
-
-
Save almost/7f10568539b9a079cb8aca3a13d52dc8 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Solution: https://gist.github.com/almost/9ee99b1a3e7fa240c596be3820c0b6b0
"use strict"; | |
const url = require('url'); | |
const rp = require("request-promise-native"); | |
const getHrefs = require("get-hrefs"); | |
// Upper bound on the number of requests that may be in flight at any one time.
const MAX_CONCURRENT = 3;
// Upper bound on the total number of requests made during the whole crawl.
const MAX_COUNT = 5;
// Hostnames the crawler is allowed to follow links to (matched exactly).
const ALLOW_DOMAINS = new Set(["almostobsolete.net", "tomparslow.co.uk"]);
// Pages the crawl starts from.
const START_URLS = ["http://almostobsolete.net/"];
/**
 * Download a page and extract the links it contains.
 *
 * @param {string} currentUrl - Address of the page to fetch.
 * @returns {Promise<*>} Whatever `getHrefs` returns for the page body
 *   (presumably an array of href strings - confirm against the get-hrefs docs).
 *   The promise rejects if the request made through `rp` rejects; no error
 *   handling is done here, so callers decide how to treat failed fetches.
 */
async function getHrefsFromUrl(currentUrl) {
  const body = await rp({ url: currentUrl });
  // The page's own address is passed as `baseUrl`; presumably get-hrefs uses
  // it to resolve relative links - confirm against the get-hrefs docs.
  return getHrefs(body, { baseUrl: currentUrl });
}
/**
 * Report whether a URL points at one of the hosts the crawler may visit.
 *
 * The hostname is compared exactly against the entries in ALLOW_DOMAINS, so a
 * subdomain such as "www.almostobsolete.net" does not match "almostobsolete.net".
 *
 * @param {string} currentUrl - The URL to check.
 * @returns {boolean} True when the URL's hostname is in ALLOW_DOMAINS; false
 *   otherwise, including for URLs that have no hostname (e.g. relative paths).
 */
function isAllowedDomain(currentUrl) {
  const { hostname } = url.parse(currentUrl);
  return ALLOW_DOMAINS.has(hostname);
}
// TODO
// Starting from START_URLS, find links and crawl them.
// Only follow links to pages in the ALLOW_DOMAINS.
// Do not make more than MAX_CONCURRENT requests at any one time.
// Do not make more than MAX_COUNT requests overall.
// Do not request the same URL twice.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment