Skip to content

Instantly share code, notes, and snippets.

@pyadav
Forked from Devko/gist:1064553
Created December 9, 2016 05:17
Show Gist options
  • Select an option

  • Save pyadav/1f19aaea62fb3b0ccbdb07add6cc9278 to your computer and use it in GitHub Desktop.

Select an option

Save pyadav/1f19aaea62fb3b0ccbdb07add6cc9278 to your computer and use it in GitHub Desktop.
Node.JS + Redis = Link Crawling
/*
* Crawling the Web (http://bitbucket.org/feuervogel/nodejs-crawler)
*
* 1. Start mit einer nicht-leeren Liste von URLs
* 2. Für jede URL:
* 2. a) Lade HTML runter
* 2. b) Extrahiere Hyperlinks
* 2. c) Hänge Hyperlinks an neue Liste an
* 3. Verschmelze beide Listen
* 4. GOTO 1.
*
*
*/
var sys = require("sys");
var redis = require("redis");
/** Shared Redis connection; the crawler stores links in it and samples keys from it. */
var client = redis.createClient();
// Upper bound for concurrently scheduled crawls ...
var max_clients = 50;
// ... and the number of crawl slots currently in use.
var clients_running = 0;
// Catch every uncaught exception, print it, and keep the crawl pool filled.
// A crawl that throws never reaches its own "end" bookkeeping, so its slot is
// released here and replacement crawls are scheduled.
process.addListener("uncaughtException", function (exception) {
  // Errors carry a message; any other thrown value (including null) is dumped.
  if (exception && exception.message) {
    sys.puts(exception.message);
  } else {
    sys.puts(sys.inspect(exception));
  }

  // The failed crawl no longer occupies a slot.
  clients_running--;

  // Refill the pool. The number of replacements is computed up front so the
  // loop stays bounded even if a callback runs before the loop has finished.
  var missing = max_clients - clients_running;
  for (var n = 0; n < missing; n++) {
    clients_running++;
    client.randomkey(function (err, key) {
      if (err || !key) {
        // RANDOMKEY failed or the database holds no keys: there is nothing to
        // crawl, so hand the reserved slot back instead of emitting a value
        // that is not a URL.
        if (err) {
          sys.puts(sys.inspect(err));
        }
        clients_running--;
        return;
      }
      urlemitter.emit("url", key);
    });
  }
});
// Give Array a method that returns its distinct elements.
// Elements are compared by their string form and are returned as strings,
// in the key order of the lookup object (first appearance for non-numeric keys).
Array.prototype.distinctValues = function () {
  // Prototype-less object: every string, even "__proto__", is a plain key.
  var seen = Object.create(null);
  var idx;
  for (idx = 0; idx < this.length; idx++) {
    seen[this[idx]] = true;
  }
  return Object.keys(seen);
};
/**
 * Extract the external hyperlinks from an HTML document.
 *
 * Only absolute http:// links in the href attribute of tags starting with "<a"
 * are considered. A link counts as external when its host differs from the
 * host of `url`.
 *
 * @param {string} url  Address of the page the markup was fetched from.
 * @param {string} html Markup of that page.
 * @returns {string[]} Distinct external URLs, in order of first appearance.
 */
function extractExternalHyperlinks(url, html) {
  var urlparser = require("url");
  var pageHost = urlparser.parse(url).host;
  // The lazy [^>]+? stops at the first href attribute inside the tag, and the
  // capture group yields exactly that attribute's URL (not some other
  // "http://" text that happens to appear earlier in the tag).
  var anchorPattern = /<a[^>]+?href=["'](http:\/\/[^\s"']+)/ig;
  var seen = Object.create(null);
  var external = [];
  var match;
  while ((match = anchorPattern.exec(html)) !== null) {
    var link = match[1];
    if (urlparser.parse(link).host !== pageHost && !seen[link]) {
      seen[link] = true;
      external.push(link);
    }
  }
  return external;
}
// Event plumbing: crawls are triggered by emitting "url" events.
var events = require("events");
// Emitter on which the crawler listens for "url" events.
var urlemitter = new events.EventEmitter();
// Crawl one URL: fetch it over HTTP, store its external links in Redis and
// schedule further crawls while slots are free.
// NOTE(review): http.createClient is a legacy Node API; confirm the targeted
// Node version still provides it.
urlemitter.addListener("url", function (url) {
  var http = require("http");
  var urlparser = require("url");

  // Reserve a crawl slot and start crawling a random key from the database.
  function crawlRandomUrl() {
    clients_running++;
    client.randomkey(function (err, key) {
      if (err || !key) {
        // No key available (error or empty database): release the slot
        // instead of emitting a value that is not a URL.
        clients_running--;
        return;
      }
      urlemitter.emit("url", key);
    });
  }

  // Split the URL into connection parameters, falling back to port 80,
  // path "/" and an empty query string.
  var parsed_url = urlparser.parse(url);
  var crawling_host = parsed_url.hostname;
  var crawling_port = parsed_url.port || 80;
  var crawling_query = urlparser.format({
    pathname: parsed_url.pathname || "/",
    search: parsed_url.search || ""
  });

  // Build the request.
  var urlclient = http.createClient(crawling_port, crawling_host);
  var requestheader = {
    "User-Agent": "Mozilla/5.0 (X11; U; Linux i686; de; rv:1.9.1.8) Gecko/20100214 Ubuntu/9.10 (karmic) Firefox/3.5.8",
    "host": crawling_host
  };
  var request = urlclient.request("GET", crawling_query, requestheader);

  request.addListener("response", function (response) {
    var contenttype = response.headers["content-type"];
    // Only responses declaring a text content type are parsed. An abandoned
    // crawl must still give its slot back, otherwise the pool fills up with
    // slots that are never released.
    if (!contenttype || contenttype.substring(0, 4) !== "text") {
      clients_running--;
      return;
    }

    response.setEncoding("utf8");

    // Collect the whole body, then extract and store the links.
    var body = "";
    response.addListener("data", function (chunk) {
      body += chunk;
    });
    response.addListener("end", function () {
      var urls = extractExternalHyperlinks(url, body);
      for (var i = 0; i < urls.length; i++) {
        // Redis set: key = discovered link, member = page that links to it.
        client.sadd(urls[i], url);
        if (clients_running < max_clients) {
          crawlRandomUrl();
        }
      }
      // This crawl is finished.
      clients_running--;
    });
  });

  // Fill all free slots. The count is computed up front so the loop stays
  // bounded even if a callback runs before the loop has finished.
  var missing = max_clients - clients_running;
  for (var n = 0; n < missing; n++) {
    crawlRandomUrl();
  }

  request.end();
});
// Seed the crawl with a fixed start URL ...
urlemitter.emit("url", "http://news.ycombinator.com/");
// ... plus one random key already present in the database.
client.randomkey(function (err, key) {
  // RANDOMKEY yields no key when the database is empty (e.g. on the very
  // first run); skip it rather than emitting a value that is not a URL.
  if (err || !key) {
    return;
  }
  urlemitter.emit("url", key);
});
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment