Node.js + Redis = Link Crawling
/*
 * Crawling the Web (http://bitbucket.org/feuervogel/nodejs-crawler)
 *
 * 1. Start with a non-empty list of URLs
 * 2. For each URL:
 * 2. a) Download the HTML
 * 2. b) Extract the hyperlinks
 * 2. c) Append the hyperlinks to a new list
 * 3. Merge the two lists
 * 4. GOTO 1.
 *
 */
var sys = require("sys");
var redis = require("redis");
/** create the redis connection */
var client = redis.createClient();
var clients_running = 0;
var max_clients = 50;
// catch uncaught exceptions, log them and refill the free crawl slots
process.addListener("uncaughtException", function (exception) {
    if (exception.message) {
        sys.puts(exception.message);
    } else {
        sys.puts(sys.inspect(exception));
    }
    // the failed request is done, free its slot and top up with random URLs
    clients_running--;
    while (clients_running < max_clients) {
        client.randomkey(function (err, key) {
            urlemitter.emit("url", key);
        });
        clients_running++;
    }
});
// add a helper to Array that removes duplicate elements
Array.prototype.distinctValues = function () {
    var hash = {};
    for (var i = 0; i < this.length; i++) {
        hash[this[i]] = true;
    }
    var array = [];
    for (var key in hash) {
        array.push(key);
    }
    return array;
};
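// Example (illustrative): ["a", "b", "a"].distinctValues() returns ["a", "b"].
// Note that elements are used as object keys, so they come back as strings
// and values like 1 and "1" collapse into a single entry.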
// extract external hyperlinks from a piece of HTML
function extractExternalHyperlinks(url, html) {
    var urlparser = require("url");
    var url_host = urlparser.parse(url).host;
    var results = html.match(/<a[^>]+href=["']http:\/\/[^\s"']+/ig);
    if (results == null) {
        return [];
    }
    var urlarr = [];
    for (var i = 0; i < results.length; i++) {
        var pur_url = results[i].match(/http:\/\/[^\s"']+/i);
        var new_url = urlparser.parse(String(pur_url));
        // only keep links that point to a different host
        if (new_url.host != url_host) {
            urlarr.push(String(pur_url));
        }
    }
    return urlarr.distinctValues();
}
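// Example (illustrative, with made-up URLs):
//   extractExternalHyperlinks("http://example.com/", '<a href="http://example.org/page">x</a>')
// returns ["http://example.org/page"], while links whose host is
// example.com itself are filtered out.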
// we need events
var events = require("events");
// a new event emitter
var urlemitter = new events.EventEmitter();
urlemitter.addListener("url", function (url) {
    // debug output
    //sys.puts("crawling: "+ url);
    var http = require("http");
    var urlparser = require("url");
    var parsed_url = urlparser.parse(url);
    if (!parsed_url.port) {
        parsed_url.port = 80;
    }
    if (!parsed_url.pathname) {
        parsed_url.pathname = "/";
    }
    if (!parsed_url.search) {
        parsed_url.search = "";
    }
    // assemble the request
    var temp_queryobj = {pathname: parsed_url.pathname, search: parsed_url.search};
    var crawling_host = parsed_url.hostname;
    var crawling_port = parsed_url.port;
    var crawling_query = urlparser.format(temp_queryobj);
    // create a new http client for this URL
    var urlclient = http.createClient(crawling_port, crawling_host);
    // headers for the request
    var requestheader = {};
    requestheader["User-Agent"] = "Mozilla/5.0 (X11; U; Linux i686; de; rv:1.9.1.8) Gecko/20100214 Ubuntu/9.10 (karmic) Firefox/3.5.8";
    requestheader["host"] = crawling_host;
    // fire the request
    var request = urlclient.request("GET", crawling_query, requestheader);
    // add the response listener
    request.addListener("response", function (response) {
        // the response has http headers and a status code
        var headers = response.headers;
        var statuscode = response.statusCode;
        // if there is no content-type header, give up and free the slot
        if (!headers["content-type"]) {
            clients_running--;
            return;
        }
        // if the content-type is not text, give up and free the slot
        var contenttype = headers["content-type"];
        if (contenttype.substring(0, 4) != "text") {
            clients_running--;
            return;
        }
        // switch to utf-8 encoding
        response.setEncoding("utf8");
        // collect the response body, then parse it and emit new urls
        var body = "";
        response.addListener("data", function (chunk) {
            body += chunk;
        });
        response.addListener("end", function () {
            var urls = extractExternalHyperlinks(url, body);
            for (var i = 0; i < urls.length; i++) {
                client.sadd(urls[i], url); // store the referring page in the database
                if (clients_running < max_clients) {
                    client.randomkey(function (err, key) { // random URL from the database
                        urlemitter.emit("url", key); // queue it for crawling
                    });
                    clients_running++;
                }
            }
            // this request is done, free its slot
            clients_running--;
        });
    });
    // top up the pool with random URLs from the database
    while (clients_running < max_clients) {
        client.randomkey(function (err, key) { // random URL from the database
            urlemitter.emit("url", key); // queue it for crawling
        });
        clients_running++;
    }
    request.end();
});
// initial url
urlemitter.emit("url", "http://news.ycombinator.com/");
client.randomkey(function (err, key) { // random URL from the database
    urlemitter.emit("url", key); // queue it for crawling
});
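The crawler persists its findings as plain Redis sets: each discovered external URL becomes a key, and the members of that set are the pages that linked to it. Below is a minimal sketch of how those results could be read back afterwards, assuming the same Redis instance and the same callback-style node_redis client used above (the file name inspect.js and the output format are just illustrative):

// inspect.js - read back the crawl results
var sys = require("sys");
var redis = require("redis");
var client = redis.createClient();

// pick an arbitrary crawled URL (a key in the database) ...
client.randomkey(function (err, key) {
    if (err || !key) {
        sys.puts("database is empty or unreachable");
        client.quit();
        return;
    }
    // ... and list the pages that linked to it
    client.smembers(key, function (err, referrers) {
        sys.puts(key + " was linked from:");
        for (var i = 0; i < referrers.length; i++) {
            sys.puts("  " + referrers[i]);
        }
        client.quit();
    });
});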