Created
November 10, 2015 17:36
-
-
Save LinZap/a08483b7d1cce6033a00 to your computer and use it in GitHub Desktop.
Google Search Crawler (Zap Ver.)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
var g = require("./node-g-search"), | |
db = require("./node-pg-promise"), | |
url = require("url"); | |
// Search keyword; it is also registered as a class via insertclass().
var keyword = '暨南大學';

// Pipeline: connect -> register the keyword (yields cid) -> crawl every
// result page -> store each result with inserturl2 -> close the connection.
var p = db.connect({
    user:'postgres',
    password: 'postgres',
    host: 'localhost',
    dbname: 'gsearch_zap'
})
.then(function(){
    return db.query('select insertclass(?,?) as cid',[1,keyword]);
})
.then(function(res){
    var cid = res.rows[0].cid;
    console.log(cid);
    return cid;
})
.then(function(cid){
    return g.searchAll(keyword).then(function(data){
        // Issue all inserts concurrently and wait for every one of them.
        return Promise.all(data.map(function(item){
            return inserturl2(cid,item.tit,item.des,item.href);
        }));
    });
})
.then(function(){
    db.close();
}, function(err){
    // Any failure above lands here: report it, still release the
    // connection, and signal the failure through the exit code.
    console.error(err);
    db.close();
    process.exitCode = 1;
});
/**
 * Store one search result by calling the database function
 *
 *   inserturl2(_cid int, _tit varchar, _des varchar, _url varchar,
 *              _scheme varchar, _host varchar, _path varchar, _query varchar)
 *
 * The scheme, host, path and query arguments are derived from `_url`
 * with url.parse().
 *
 * @param {number} _cid  class id the result is attached to
 * @param {string} _tit  result title
 * @param {string} _des  result description
 * @param {string} _url  result URL (absolute or relative)
 * @returns {Promise} resolves with the result of db.query; rejects with a
 *                    TypeError when `_url` is not a string
 */
function inserturl2(_cid,_tit,_des,_url){
    // A result anchor may have no href at all; report that through the
    // returned promise instead of throwing synchronously.
    if (typeof _url !== 'string') {
        return Promise.reject(new TypeError('inserturl2: _url must be a string'));
    }
    var u = url.parse(_url),
        // url.parse() yields protocol === null for relative or scheme-less
        // hrefs, so guard before stripping the trailing ':'.
        _scheme = u.protocol ? u.protocol.replace(':','') : null,
        _host = u.host,
        // NOTE(review): u.path includes the query string (pathname + search);
        // kept as-is to preserve what is stored today — confirm whether
        // u.pathname was intended.
        _path = u.path,
        _query = u.query,
        params = [_cid,_tit,_des,_url,_scheme,_host,_path,_query];
    return db.query('select inserturl2(?,?,?,?,?,?,?,?)',params);
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
var request = require("request"), | |
cheerio = require('cheerio'), | |
querystring = require('querystring'); | |
const gurl = "https://www.google.com.tw/search?"; | |
const num = 100; | |
module.exports = { | |
option: { | |
headers: { | |
'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.80 Safari/537.36' | |
} | |
}, | |
setQuery: function(q,page){ | |
query = {q:q,num: num,start: (page-1)*num, filter:0}; | |
this.option.url = gurl + querystring.stringify(query); | |
}, | |
search: function(q,page){ | |
this.setQuery(q,page); | |
return new Promise(function(resolve,reject){ | |
request(this.option , function(error, response, body){ | |
if(error) return reject(error); | |
resolve(this.fetch(cheerio.load(body))); | |
}.bind(this)); | |
}.bind(this)) | |
}, | |
searchAll: function(q){ | |
return this.search(q,1).then(function(res){ | |
var pagenum = res.pagenum, | |
promsPool = [], | |
allData = res.data; | |
for (var i = 2; i <= pagenum; i++) { | |
var prom = this.search(q,i).then(function(res){ | |
allData = allData.concat(res.data); | |
return res; | |
}) | |
promsPool.push(prom); | |
} | |
return Promise.all(promsPool).then(function(){return allData;}); | |
}.bind(this)) | |
}, | |
fetch: function($){ | |
var a = $('.r').children('a'), | |
t = $('.st'), | |
p = $('#nav').find('td').not('.navend').length; | |
d = []; | |
a.each(function(i,el){ | |
var href = $(el).attr('href'), | |
tit = $(el).text(), | |
des = t.eq(i).text(); | |
d.push({ tit: tit, href: href, des: des }); | |
}); | |
return { | |
data: d, | |
pagenum: p | |
}; | |
} | |
}; | |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
'use strict'; | |
var pg = require('pg'), | |
util = require('util'); | |
module.exports = { | |
connect: function(opt){ | |
//postgres://user:password@host:port/database | |
var connStr = "postgres://"+opt.user+":"+opt.password+"@"+opt.host; | |
connStr+= ":"+(opt.port||5432)+"/"+opt.dbname; | |
if(this.client) pg.end(); | |
return new Promise(function(resolve,reject){ | |
pg.connect(connStr,function(err, client, done) { | |
// can not handle err, throw err | |
if(err){ console.error("%s",err); return; } | |
this.client = client; | |
this.done = done; | |
resolve(client); | |
}.bind(this)); | |
}.bind(this)); | |
}, | |
query: function(){ | |
var sql = this.transformSQL(arguments[0]), | |
params = arguments[1] || []; | |
return new Promise(function(resolve,reject){ | |
this.client.query(sql, params, function(err,result){ | |
if(err) return reject(err); | |
resolve(result); | |
}) | |
}.bind(this)) | |
}, | |
close: function(){ | |
if(this.done) this.done(); | |
if(this.client) pg.end(); | |
}, | |
transformSQL: function(sql){ | |
var str = "", | |
number = 0; | |
for (var i = 0; i < sql.length; i++) | |
if(sql[i]=='?') str += "$"+(++number); | |
else str += sql[i]; | |
return str; | |
} | |
}; |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- inserturl2: store a URL (once, keyed by the md5 of the full URL) and link
-- its object to class _cid in table co.
-- Returns the object id of the URL, for both new and already-stored URLs.
CREATE OR REPLACE FUNCTION inserturl2(
    _cid int,
    _tit varchar,
    _des varchar,
    _url varchar,
    _scheme varchar,
    _host varchar,
    _path varchar,
    _query varchar)
RETURNS integer AS
$BODY$
declare _OID int; _EID int; _SID int; _MD5URL varchar(32); _UID int; _co int;
begin
    select md5(_url) into _MD5URL;
    select uid from URL where MD5URL=_MD5URL into _UID;
    if _UID is null then
        -- New URL: create its object, then the URL row keyed by that object id.
        select EID from Entity where EName = 'URL' into _EID;
        select insertobj(_EID,_tit,_des) into _OID;
        select SID from URLScheme where Scheme = _scheme into _SID;
        insert into URL(UID,Scheme,HostName,Path,MD5URL,queryget)
        values(_OID,_SID,_host,_path,_MD5URL,_query);
    else
        -- Existing URL: URL.UID is populated from the object id on insert,
        -- so reuse it. Without this branch _OID stayed NULL, a (cid, NULL)
        -- row was inserted into co, and the function returned NULL.
        _OID := _UID;
    end if;
    -- Link the object to the class only if the link does not exist yet.
    select oid from co where cid=_cid and oid=_OID into _co;
    if _co is null then
        insert into co(cid,oid) values(_cid,_OID);
    end if;
    return _OID;
end;
$BODY$
LANGUAGE plpgsql;
-- Drop two insertclass overloads. DROP FUNCTION identifies a function by its
-- argument types, so the argument names are given only as comments.
-- Overload (_pcid, _type, _cname, _ename):
DROP FUNCTION insertclass(integer, integer, character varying, character varying);
-- Overload (_namepath, _cname):
DROP FUNCTION insertclass(character varying, character varying);
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Install
Run