Created
January 15, 2013 03:23
-
-
Save sirkitree/4535781 to your computer and use it in GitHub Desktop.
Simple indexing script. Run with `node indexer > output.file` to save the output to a file.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Simple site indexer: crawls a single site with simplecrawler, prints the URL
// of each HTML page as it is fetched, and prints the collected paths plus their
// count once the crawl completes.
var Crawler = require("simplecrawler").Crawler;

// Alternative crawl target, kept for reference:
// var myCrawler = new Crawler("jeradbitner.com");
var myCrawler = new Crawler("www.grammy.com");
// myCrawler.domain = "www.grammy.com";

// Only text/* MIME types are listed as supported.
myCrawler.supportedMimeTypes = [
  /^text\//i
];
myCrawler.scanSubdomains = false;
myCrawler.ignoreWWWDomain = true;
// myCrawler.discoverResources = false;

// Paths of every accepted page, in the order their fetches completed.
var items = [];

myCrawler.on("fetchcomplete", function(queueItem, responseBuffer, response) {
  // A response may lack a Content-Type; fall back to "" so the check below
  // rejects the item instead of throwing on `undefined.indexOf`.
  var stateData = queueItem.stateData || {};
  var contentType = stateData.contentType || "";

  // Only want html pages.
  if (contentType.indexOf("text/html") === -1) {
    return;
  }

  // Only want stuff on this domain. (Seems to pull in other things sometimes
  // even though simplecrawler claims it should not.)
  // NOTE(review): this is an exact string match, so a page reported under the
  // bare domain (without "www.") would be skipped even though ignoreWWWDomain
  // is enabled — confirm that is intended.
  if (queueItem.domain !== myCrawler.domain) {
    return;
  }

  items.push(queueItem.path);
  console.log(queueItem.url);
});

// Registered before start() so every listener is attached before crawling begins.
myCrawler.on("complete", function() {
  console.log(items);
  console.log(items.length);
});

myCrawler.start();
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment