Skip to content

Instantly share code, notes, and snippets.

@JT5D
Forked from licson/ sql.sql
Created October 14, 2013 08:30
Show Gist options
  • Select an option

  • Save JT5D/6972674 to your computer and use it in GitHub Desktop.

Select an option

Save JT5D/6972674 to your computer and use it in GitHub Desktop.
-- phpMyAdmin SQL Dump
-- version 3.5.8.1
-- http://www.phpmyadmin.net
--
-- Host: 127.0.0.1
-- Generation Time: Jun 14, 2013 at 08:39 AM
-- Server version: 5.6.11-log
-- PHP Version: 5.4.14
SET SQL_MODE="NO_AUTO_VALUE_ON_ZERO";
SET time_zone = "+00:00";
/*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */;
/*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */;
/*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */;
/*!40101 SET NAMES utf8 */;
--
-- Database: `crawler`
--
-- --------------------------------------------------------
--
-- Table structure for table `queue`
--
CREATE TABLE IF NOT EXISTS `queue` (
`id` varchar(75) NOT NULL,
`url` varchar(2000) NOT NULL,
`from` varchar(2000) NOT NULL,
PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=latin1;
-- --------------------------------------------------------
--
-- Table structure for table `websites`
--
CREATE TABLE IF NOT EXISTS `websites` (
`id` varchar(75) CHARACTER SET utf8 COLLATE utf8_unicode_ci NOT NULL,
`url` varchar(2000) CHARACTER SET utf8 COLLATE utf8_unicode_ci NOT NULL,
`from` varchar(2000) CHARACTER SET utf16 COLLATE utf16_unicode_ci NOT NULL,
`title` varchar(500) CHARACTER SET utf8 COLLATE utf8_unicode_ci NOT NULL,
`keywords` varchar(500) CHARACTER SET utf8 COLLATE utf8_unicode_ci NOT NULL,
`desc` text CHARACTER SET utf32 COLLATE utf32_unicode_ci NOT NULL,
PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=latin1;
/*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */;
/*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */;
/*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */;
{
"db":{
"host":"127.0.0.1",
"port":3306,
"user":"root",
"password":"",
"database":"crawler"
},
"baseSite":"http://s3131212.com/links/"
}
var fs = require('fs');
var ConfigLoader = {
_cache:null,
load:function(file){
var self = this;
this._cache = JSON.parse(fs.readFileSync(file).toString());
fs.watch(file,function(){
self._cache = JSON.parse(fs.readFileSync(file).toString());
});
return this._cache;
},
get:function(key){
return this._cache[key];
}
};
module.exports = ConfigLoader;
var request = require('request');
var cheerio = require('cheerio');
var async = require('async');
var db = require('mysql');
var util = require('./utils.js');
var config = require('./configLoader.js');
config.load(__dirname+'/config.json');
var Crawler = function(){
var self = this;
this.conn = db.createConnection(config.get('db'));
this.indexed = 0;
this.baseSite = config.get('baseSite');
this._url = this.baseSite;
this.url = this.baseSite;
this.crawl = function(cb){
this.conn.query('SELECT * FROM `queue` LIMIT 0,1',function(e,result){
self.url = result.length > 0 ? result[0].url : self.baseSite;
request(self.url,function(e,res,body){
if(result.length > 0){
self.conn.query('DELETE FROM `queue` WHERE `id` = ?',[result[0].id],function(){
cb();
});
}
else {
cb();
}
if(!e && res.statusCode === 200){
self.getInfo(body,result.length > 0 ? result[0].from : '');
}
else {
console.log('Error requesting page %s',self.url);
}
self._url = self.url;
});
});
};
this.getInfo = function(html,from){
var $ = cheerio.load(html);
var title = $('head title').text();
var keywords = $('head meta[name=keywords]').attr('content');
var desc = $('head meta[name=description]').attr('content');
var links = $('a');
console.log('Crawling "%s" | %s',title,this.url);
async.map(links.map(function(){
var href = $(this).attr('href');
if(href && href != self._url && !(/^#(\w)+/.test(href)) && !util.imageRegexp.test(href)){
if(util.isExternal(href)){
return 'INSERT INTO `queue` SET `id` = \''+util.id()+'\', `url` = '+self.conn.escape(href)+', `from` = '+self.conn.escape(from);
}
else {
return 'INSERT INTO `queue` SET `id` = \''+util.id()+'\', `url` = '+self.conn.escape(util.resolveRelativeURL(href,self._url))+', `from` = '+self.conn.escape(from);
}
}
return false;
}).filter(function(el){
return !!el;
})
,this.conn.query.bind(this.conn),function(e,result){
if(e){
console.log('Error writing queue.');
console.log(e);
}
});
this.conn.query('INSERT INTO `websites` SET ?',{
id:util.id(),
url:this.url,
from:from,
title:title,
keywords:keywords || '',
desc:desc || ''
},function(e){
if(e){
console.log('Error indexing page %s',self.url);
console.log(e);
}
else {
console.log('Successfully indexed page %s',self.url);
self.indexed++;
}
});
};
};
module.exports = Crawler;
var Crawler = require('./crawler.js');
var async = require('async');
var spider = new Crawler();
async.forever(function(cb){
spider.crawl(function(){
process.nextTick(function(){
cb(null);
});
});
});
var path = require('path');
var Utils = {
id:function(){
var ret = '';
for(var chars="0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_".split(''),i=0;i<75;i++){
ret += chars[~~(Math.random() * chars.length)];
}
return ret;
},
resolveRelativeURL:function(p,url){
var proto = url.match(/^https?/)[0]; //extract the protocol out
url = url.replace(/^https?:\/\//,''); //remove the protocol
return proto+'://'+path.normalize(url+'/'+p) //find out the absolute URL
.replace(path.sep,'/') //replace blackslash with forward slash in Windows
.replace(/#(\w)+/,''); //remove URL fragment
},
isExternal:function(url){
return url.match(/^https?/) !== null;
},
imageRegexp: new RegExp("("+['\\.png','\\.jpg','\\.gif','\\.bmp','\\.psd'].join('|')+")$","i")
};
module.exports = Utils;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment