Last active
January 31, 2019 10:42
-
-
Save RadNi/82f5bb9c33c80aaee0be52a79f8b6454 to your computer and use it in GitHub Desktop.
It is a simple crawler implemented with the Node `crawler` package.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Simple breadth-limited web crawler built on the `crawler` npm package.
// Crawls outward from a set of seed URLs, collecting every absolute link
// it finds, for a fixed number of "waves" controlled by `depth`.
//require('events').EventEmitter.defaultMaxListeners = 0
const Crawler = require("crawler");

// Seed pages the crawl starts from.
const baseURLS = ['http://www.google.com/', 'http://www.yahoo.com', 'http://www.amazon.com', 'http://www.sharif.ir/home'];
// Remaining crawl waves: each time the queue drains counts as one level.
let depth = 3;
// All absolute links discovered so far (a Set dedupes automatically).
const urls = new Set();

const c = new Crawler({
    maxConnections: 100000,
    retries: 2,
    skipDuplicates: true, // avoid re-fetching URLs already seen by the queue
    // Invoked once per fetched page (or per failed request).
    callback: function (error, res, done) {
        if (error) {
            console.log(error);
            done();
            return;
        }
        const $ = res.$; // cheerio handle; undefined for non-HTML responses
        console.log(res.request.uri.href + " " + urls.size);
        if ($) {
            const tags = $("a");
            for (let i = 0; i < tags.length; i++) {
                const href = tags[i].attribs.href;
                // Keep only absolute-looking links. Note startsWith("http")
                // already matches "https" URLs as well, so a separate
                // "https" test would be redundant.
                if (href && (href.startsWith("www") || href.startsWith("http"))) {
                    urls.add(href);
                }
            }
        } else {
            // Response was not parseable HTML (e.g. image, PDF, binary).
            console.log("hereee");
        }
        done();
    }
});

c.queue(baseURLS);

// Each 'drain' means one crawl level has finished. Re-queue every URL
// discovered so far for the next level until the depth budget is spent;
// skipDuplicates prevents already-fetched pages from being fetched again.
c.on('drain', function () {
    depth--;
    if (depth <= 0) {
        console.log(urls);
        console.log("size: ", urls.size);
    } else {
        console.log("Draiinnnnnnnnnnn, " + depth);
        c.queue(Array.from(urls));
    }
});
Sometimes the JavaScript heap will run out of memory; a naive workaround is to raise the limit with this Node option:
--max_old_space_size=2000000
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
For testing, after initializing a new Node project, run
npm install crawler
and then
node crawler-project.js
The
baseURLS
array should contain the start points you want to crawl, and the
depth
variable sets how many levels deep to crawl from those start points.