Example using Supercrawler
var supercrawler = require("supercrawler");

// 1. Create a new instance of the Crawler object, providing configuration
// details. Note that configuration cannot be changed after the object is
// created.
var crawler = new supercrawler.Crawler({
  // By default, Supercrawler uses a simple FIFO queue, which doesn't support
  // retries or memory of crawl state. For any non-trivial crawl, you should
  // create a database. Provide your database config to the constructor of
  // DbUrlList.
  // urlList: new supercrawler.DbUrlList({
  //   db: {
  //     database: "crawler",
  //     username: "root",
  //     password: secrets.db.password,
  //     sequelizeOpts: {
  //       dialect: "mysql",
  //       host: "localhost"
  //     }
  //   }
  // }),
  // Time (ms) between requests.
  interval: 1000,
  // Maximum number of requests at any one time.
  concurrentRequestsLimit: 5,
  // Time (ms) to cache the results of robots.txt queries.
  robotsCacheTime: 3600000,
  // User agent string to send with each request.
  userAgent: "Mozilla/5.0 (compatible; supercrawler/1.0; +https://github.com/brendonboshell/supercrawler)",
  // Custom options to be passed to request.
  // request: {
  //   headers: {
  //     'x-custom-header': 'example'
  //   }
  // }
});
crawler.on("crawlurl", function (url) { | |
console.log("Crawling " + url); | |
}); | |
// Get "Sitemaps:" directives from robots.txt | |
crawler.addHandler(supercrawler.handlers.robotsParser()); | |
// Crawl sitemap files and extract their URLs. | |
crawler.addHandler(supercrawler.handlers.sitemapsParser()); | |
// Pick up <a href> links from HTML documents | |
crawler.addHandler("text/html", supercrawler.handlers.htmlLinkParser({ | |
// Restrict discovered links to the following hostnames. | |
hostnames: ["sweetpricing.com"] | |
})); | |
// Custom content handler for HTML pages. | |
crawler.addHandler("text/html", function (context) { | |
  var sizeKb = Buffer.byteLength(context.body) / 1024;
  console.log("Processed", context.url, "Size=", sizeKb, "KB");
});
console.log("Starting Crawler..."); | |
crawler.getUrlList() | |
.insertIfNotExists(new supercrawler.Url("https://sweetpricing.com")) | |
.then(function () { | |
return crawler.start(); | |
}); |
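
If you also want the crawl to wind down on its own, Supercrawler emits events you can hook into. The snippet below is a minimal sketch that assumes the crawledurl and urllistempty events and the stop() method behave as documented for your installed version; it reuses the crawler variable defined above.

// Log the outcome of each completed request (errorCode is null on success).
crawler.on("crawledurl", function (url, errorCode, statusCode) {
  if (errorCode) {
    console.log("Failed " + url + ": " + errorCode);
  } else {
    console.log("Completed " + url + " with status " + statusCode);
  }
});

// Stop crawling once the URL list has been drained.
crawler.on("urllistempty", function () {
  console.log("URL list is empty; stopping crawler.");
  crawler.stop();
});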