-
-
Save JT5D/6972674 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| -- phpMyAdmin SQL Dump | |
| -- version 3.5.8.1 | |
| -- http://www.phpmyadmin.net | |
| -- | |
| -- Host: 127.0.0.1 | |
| -- Generation Time: Jun 14, 2013 at 08:39 AM | |
| -- Server version: 5.6.11-log | |
| -- PHP Version: 5.4.14 | |
| SET SQL_MODE="NO_AUTO_VALUE_ON_ZERO"; | |
| SET time_zone = "+00:00"; | |
| /*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */; | |
| /*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */; | |
| /*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */; | |
| /*!40101 SET NAMES utf8 */; | |
| -- | |
| -- Database: `crawler` | |
| -- | |
| -- -------------------------------------------------------- | |
| -- | |
| -- Table structure for table `queue` | |
| -- | |
| CREATE TABLE IF NOT EXISTS `queue` ( | |
| `id` varchar(75) NOT NULL, | |
| `url` varchar(2000) NOT NULL, | |
| `from` varchar(2000) NOT NULL, | |
| PRIMARY KEY (`id`) | |
| ) ENGINE=InnoDB DEFAULT CHARSET=latin1; | |
| -- -------------------------------------------------------- | |
| -- | |
| -- Table structure for table `websites` | |
| -- | |
| CREATE TABLE IF NOT EXISTS `websites` ( | |
| `id` varchar(75) CHARACTER SET utf8 COLLATE utf8_unicode_ci NOT NULL, | |
| `url` varchar(2000) CHARACTER SET utf8 COLLATE utf8_unicode_ci NOT NULL, | |
| `from` varchar(2000) CHARACTER SET utf16 COLLATE utf16_unicode_ci NOT NULL, | |
| `title` varchar(500) CHARACTER SET utf8 COLLATE utf8_unicode_ci NOT NULL, | |
| `keywords` varchar(500) CHARACTER SET utf8 COLLATE utf8_unicode_ci NOT NULL, | |
| `desc` text CHARACTER SET utf32 COLLATE utf32_unicode_ci NOT NULL, | |
| PRIMARY KEY (`id`) | |
| ) ENGINE=InnoDB DEFAULT CHARSET=latin1; | |
| /*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */; | |
| /*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */; | |
| /*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */; |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "db":{ | |
| "host":"127.0.0.1", | |
| "port":3306, | |
| "user":"root", | |
| "password":"", | |
| "database":"crawler" | |
| }, | |
| "baseSite":"http://s3131212.com/links/" | |
| } |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| var fs = require('fs'); | |
| var ConfigLoader = { | |
| _cache:null, | |
| load:function(file){ | |
| var self = this; | |
| this._cache = JSON.parse(fs.readFileSync(file).toString()); | |
| fs.watch(file,function(){ | |
| self._cache = JSON.parse(fs.readFileSync(file).toString()); | |
| }); | |
| return this._cache; | |
| }, | |
| get:function(key){ | |
| return this._cache[key]; | |
| } | |
| }; | |
| module.exports = ConfigLoader; |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| var request = require('request'); | |
| var cheerio = require('cheerio'); | |
| var async = require('async'); | |
| var db = require('mysql'); | |
| var util = require('./utils.js'); | |
| var config = require('./configLoader.js'); | |
| config.load(__dirname+'/config.json'); | |
| var Crawler = function(){ | |
| var self = this; | |
| this.conn = db.createConnection(config.get('db')); | |
| this.indexed = 0; | |
| this.baseSite = config.get('baseSite'); | |
| this._url = this.baseSite; | |
| this.url = this.baseSite; | |
| this.crawl = function(cb){ | |
| this.conn.query('SELECT * FROM `queue` LIMIT 0,1',function(e,result){ | |
| self.url = result.length > 0 ? result[0].url : self.baseSite; | |
| request(self.url,function(e,res,body){ | |
| if(result.length > 0){ | |
| self.conn.query('DELETE FROM `queue` WHERE `id` = ?',[result[0].id],function(){ | |
| cb(); | |
| }); | |
| } | |
| else { | |
| cb(); | |
| } | |
| if(!e && res.statusCode === 200){ | |
| self.getInfo(body,result.length > 0 ? result[0].from : ''); | |
| } | |
| else { | |
| console.log('Error requesting page %s',self.url); | |
| } | |
| self._url = self.url; | |
| }); | |
| }); | |
| }; | |
| this.getInfo = function(html,from){ | |
| var $ = cheerio.load(html); | |
| var title = $('head title').text(); | |
| var keywords = $('head meta[name=keywords]').attr('content'); | |
| var desc = $('head meta[name=description]').attr('content'); | |
| var links = $('a'); | |
| console.log('Crawling "%s" | %s',title,this.url); | |
| async.map(links.map(function(){ | |
| var href = $(this).attr('href'); | |
| if(href && href != self._url && !(/^#(\w)+/.test(href)) && !util.imageRegexp.test(href)){ | |
| if(util.isExternal(href)){ | |
| return 'INSERT INTO `queue` SET `id` = \''+util.id()+'\', `url` = '+self.conn.escape(href)+', `from` = '+self.conn.escape(from); | |
| } | |
| else { | |
| return 'INSERT INTO `queue` SET `id` = \''+util.id()+'\', `url` = '+self.conn.escape(util.resolveRelativeURL(href,self._url))+', `from` = '+self.conn.escape(from); | |
| } | |
| } | |
| return false; | |
| }).filter(function(el){ | |
| return !!el; | |
| }) | |
| ,this.conn.query.bind(this.conn),function(e,result){ | |
| if(e){ | |
| console.log('Error writing queue.'); | |
| console.log(e); | |
| } | |
| }); | |
| this.conn.query('INSERT INTO `websites` SET ?',{ | |
| id:util.id(), | |
| url:this.url, | |
| from:from, | |
| title:title, | |
| keywords:keywords || '', | |
| desc:desc || '' | |
| },function(e){ | |
| if(e){ | |
| console.log('Error indexing page %s',self.url); | |
| console.log(e); | |
| } | |
| else { | |
| console.log('Successfully indexed page %s',self.url); | |
| self.indexed++; | |
| } | |
| }); | |
| }; | |
| }; | |
| module.exports = Crawler; |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| var Crawler = require('./crawler.js'); | |
| var async = require('async'); | |
| var spider = new Crawler(); | |
| async.forever(function(cb){ | |
| spider.crawl(function(){ | |
| process.nextTick(function(){ | |
| cb(null); | |
| }); | |
| }); | |
| }); |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| var path = require('path'); | |
| var Utils = { | |
| id:function(){ | |
| var ret = ''; | |
| for(var chars="0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_".split(''),i=0;i<75;i++){ | |
| ret += chars[~~(Math.random() * chars.length)]; | |
| } | |
| return ret; | |
| }, | |
| resolveRelativeURL:function(p,url){ | |
| var proto = url.match(/^https?/)[0]; //extract the protocol out | |
| url = url.replace(/^https?:\/\//,''); //remove the protocol | |
| return proto+'://'+path.normalize(url+'/'+p) //find out the absolute URL | |
| .replace(path.sep,'/') //replace blackslash with forward slash in Windows | |
| .replace(/#(\w)+/,''); //remove URL fragment | |
| }, | |
| isExternal:function(url){ | |
| return url.match(/^https?/) !== null; | |
| }, | |
| imageRegexp: new RegExp("("+['\\.png','\\.jpg','\\.gif','\\.bmp','\\.psd'].join('|')+")$","i") | |
| }; | |
| module.exports = Utils; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment