Created
May 14, 2013 12:09
-
-
Save s3131212/5575433 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{
    "db":{
        "host":"127.0.0.1",
        "port":3306,
        "user":"root",
        "password":"",
        "database":"crawler"
    },
    "baseSite":"http://s3131212.com/links/"
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
var fs = require('fs'); | |
/**
 * Loads a JSON configuration file, caches the parsed object, and keeps the
 * cache up to date by watching the file for changes.
 */
var ConfigLoader = {
    // Most recently parsed configuration object; null until load() has run.
    _cache:null,
    // fs.FSWatcher for the file passed to the latest load(); kept so that a
    // later load() can close it instead of stacking watchers.
    _watcher:null,
    /**
     * Synchronously reads and parses `file`, stores the result in the cache,
     * and starts watching the file so later edits refresh the cache.
     *
     * @param {string} file Path of the JSON file to load.
     * @returns {*} The parsed configuration.
     * @throws If the initial read or JSON parse fails (errors from
     *         fs.readFileSync / JSON.parse propagate to the caller).
     */
    load:function(file){
        var self = this;
        this._cache = JSON.parse(fs.readFileSync(file).toString());
        // Calling load() again must not leave the previous watcher running.
        if(this._watcher){
            this._watcher.close();
        }
        this._watcher = fs.watch(file,function(){
            // A change event can fire while the file is half-written or
            // temporarily invalid; keep the last good config in that case
            // rather than throwing out of the watcher callback.
            try {
                self._cache = JSON.parse(fs.readFileSync(file).toString());
            }
            catch(e){
                console.log('Error reloading config %s',file);
                console.log(e);
            }
        });
        return this._cache;
    },
    /**
     * Returns the cached value stored under `key`.
     *
     * @param {string} key Top-level key of the configuration object.
     * @returns {*} The value, or undefined when the key is absent or no
     *              configuration has been loaded yet.
     */
    get:function(key){
        return this._cache ? this._cache[key] : undefined;
    }
};
module.exports = ConfigLoader; |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
var request = require('request'); | |
var cheerio = require('cheerio'); | |
var async = require('async'); | |
var db = require('mysql'); | |
var util = require('./utils.js'); | |
var config = require('./configLoader.js'); | |
config.load(__dirname+'/config.json'); | |
/**
 * Crawler takes one URL at a time from the `queue` table (or the configured
 * base site when the queue is empty), fetches it, stores its metadata in the
 * `websites` table, and enqueues the links found on the page.
 *
 * Instance state:
 *   conn     - MySQL connection created from the `db` config entry
 *   indexed  - number of pages successfully written to `websites`
 *   baseSite - start URL taken from the `baseSite` config entry
 *   url      - URL of the page currently being crawled
 *   _url     - URL of the page crawled before the current one
 */
var Crawler = function(){
    var self = this;
    // hrefs with these schemes are not pages; without this check they would
    // be treated as relative paths and pollute the queue.
    var nonCrawlable = /^(mailto|javascript|tel|data):/i;

    this.conn = db.createConnection(config.get('db'));
    this.indexed = 0;
    this.baseSite = config.get('baseSite');
    this._url = this.baseSite;
    this.url = this.baseSite;

    /**
     * Crawls a single page: the first row of `queue`, or the base site when
     * the queue is empty. The queue row (if any) is deleted afterwards.
     *
     * @param {Function} cb Called once the step is finished. It receives the
     *                      error as its first argument when the queue could
     *                      not be read, and no arguments otherwise.
     */
    this.crawl = function(cb){
        this.conn.query('SELECT * FROM `queue` LIMIT 0,1',function(e,result){
            if(e){
                // Without this guard `result` is undefined and the next line
                // would throw a TypeError inside the driver callback.
                console.log('Error reading queue.');
                console.log(e);
                cb(e);
                return;
            }
            var queued = result.length > 0 ? result[0] : null;
            self.url = queued ? queued.url : self.baseSite;
            request(self.url,function(e,res,body){
                if(!e && res.statusCode === 200){
                    self.getInfo(body);
                }
                else {
                    console.log('Error requesting page %s',self.url);
                }
                // getInfo() has read `_url` synchronously by now, so the
                // current page can become the "previous" one.
                self._url = self.url;
                if(!queued){
                    cb();
                    return;
                }
                self.conn.query('DELETE FROM `queue` WHERE `id` = ?',[queued.id],function(e){
                    if(e){
                        console.log('Error removing %s from queue.',self.url);
                        console.log(e);
                    }
                    cb();
                });
            });
        });
    };

    /**
     * Extracts title, keywords, description and links from `html`, enqueues
     * the links and records the page in `websites`. Both database writes are
     * fire-and-forget: failures are logged, not reported to the caller.
     *
     * @param {string} html Body of the page at `this.url`.
     */
    this.getInfo = function(html){
        var $ = cheerio.load(html);
        var title = $('head title').text();
        var keywords = $('head meta[name=keywords]').attr('content');
        var desc = $('head meta[name=description]').attr('content');
        // Captured now because `this.url` changes when the next crawl starts,
        // possibly before the asynchronous callbacks below run.
        var pageUrl = this.url;
        console.log('Crawling "%s" | %s',title,pageUrl);

        // toArray() yields a plain array of elements, so the chain below does
        // not depend on whether cheerio's own .map() returns an array or a
        // Cheerio object.
        var targets = $('a').toArray().map(function(el){
            return $(el).attr('href');
        }).filter(function(href){
            return href &&
                href !== self._url &&
                href !== pageUrl &&
                !(/^#(\w)+/.test(href)) &&
                !nonCrawlable.test(href) &&
                !util.imageRegexp.test(href);
        }).map(function(href){
            // Relative links are relative to the page they appear on, i.e.
            // the current page, not the previously crawled one.
            return util.isExternal(href) ? href : util.resolveRelativeURL(href,pageUrl);
        });

        async.map(targets,function(target,done){
            self.conn.query('INSERT INTO `queue` SET ?',{
                id:util.id(),
                url:target
            },done);
        },function(e){
            if(e){
                console.log('Error writing queue.');
                console.log(e);
            }
        });

        this.conn.query('INSERT INTO `websites` SET ?',{
            id:util.id(),
            url:pageUrl,
            from:this._url,
            title:title,
            keywords:keywords || '',
            desc:desc || ''
        },function(e){
            if(e){
                console.log('Error indexing page %s',pageUrl);
                console.log(e);
            }
            else {
                console.log('Successfully indexed page %s',pageUrl);
                self.indexed++;
            }
        });
    };
};
module.exports = Crawler; |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
var Crawler = require('./crawler.js'); | |
var async = require('async'); | |
// Entry point: a single Crawler instance is driven in an endless loop, one
// page per iteration.
var spider = new Crawler();

/**
 * Runs one crawl step and signals async.forever to schedule the next one.
 * `null` is always passed on, so the loop never stops on an error value.
 */
function crawlNext(next){
    spider.crawl(function(){
        next(null);
    });
}

async.forever(crawlNext);
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
var path = require('path'); | |
/**
 * Small helpers shared by the crawler: random ids, URL resolution and link
 * classification.
 */
var Utils = {
    /**
     * Builds a random 75-character identifier from [0-9a-zA-Z_].
     * Uses Math.random(), so the result is not suitable as a secret.
     *
     * @returns {string} The generated identifier.
     */
    id:function(){
        var chars = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_";
        var ret = '';
        for(var i = 0; i < 75; i++){
            ret += chars.charAt(Math.floor(Math.random() * chars.length));
        }
        return ret;
    },
    /**
     * Resolves the link `p` against the absolute page URL `url` and drops any
     * fragment from the result.
     *
     * Resolution follows URL rules rather than file-system path rules, so
     * root-relative links ("/about"), parent references ("../x") and bases
     * that end in a file name ("/dir/page.html") are all handled, and the
     * result does not depend on the platform's path separator.
     *
     * @param {string} p   Link as found in the page (relative or absolute).
     * @param {string} url Absolute URL of the page containing the link.
     * @returns {string} The absolute URL without fragment.
     * @throws {TypeError} If `url` is not an absolute URL.
     */
    resolveRelativeURL:function(p,url){
        var resolved = new URL(p,url);
        resolved.hash = ''; //remove URL fragment
        return resolved.href;
    },
    /**
     * Tells whether `url` is already an absolute http(s) URL.
     * The "://" is required so that relative names that merely start with
     * "http" (e.g. "https-guide.html") are not mistaken for absolute URLs.
     *
     * @param {string} url Link to classify.
     * @returns {boolean} true for absolute http/https URLs.
     */
    isExternal:function(url){
        return /^https?:\/\//i.test(url);
    },
    // Matches links whose text ends in a known image extension (case-insensitive).
    imageRegexp: new RegExp("("+['\\.png','\\.jpg','\\.gif','\\.bmp','\\.psd'].join('|')+")$","i")
};
module.exports = Utils; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment