Skip to content

Instantly share code, notes, and snippets.

@LinZap
Created November 10, 2015 17:36
Show Gist options
  • Save LinZap/a08483b7d1cce6033a00 to your computer and use it in GitHub Desktop.
Save LinZap/a08483b7d1cce6033a00 to your computer and use it in GitHub Desktop.
Google Search Crawler (Zap Ver.)
var g = require("./node-g-search"),
db = require("./node-pg-promise"),
url = require("url");
// Search term to crawl; it is also passed to the insertclass() SQL function.
var keyword = '暨南大學';

/*
 * Crawl pipeline:
 *   1. connect to the database,
 *   2. call insertclass(1, keyword) and read the returned `cid`,
 *   3. fetch every result page for `keyword`,
 *   4. store each result through inserturl2(),
 *   5. close the database connection.
 *
 * The connection is closed on failure as well as on success, and any error
 * is reported with a non-zero exit code instead of being left as an
 * unhandled rejection.
 */
var p = db.connect({
  user: 'postgres',
  password: 'postgres',
  host: 'localhost',
  dbname: 'gsearch_zap'
})
.then(function(){
  return db.query('select insertclass(?,?) as cid',[1,keyword]);
})
.then(function(res){
  var cid = res.rows[0].cid;
  console.log(cid);
  return cid;
})
.then(function(cid){
  return g.searchAll(keyword).then(function(data){
    // One insert per search result; all inserts are issued concurrently.
    return Promise.all(data.map(function(item){
      return inserturl2(cid, item.tit, item.des, item.href);
    }));
  });
})
.then(function(){
  db.close();
}, function(err){
  // Release the connection before propagating the failure.
  db.close();
  throw err;
})
.catch(function(err){
  console.error(err);
  process.exitCode = 1;
});
/**
 * Store one search result by calling the SQL function
 *
 *   inserturl2(_cid int, _tit varchar, _des varchar, _url varchar,
 *              _scheme varchar, _host varchar, _path varchar, _query varchar)
 *
 * The scheme, host, path and query arguments are derived from `_url` with
 * `url.parse`.
 *
 * @param {number} _cid  class id the URL is linked to
 * @param {string} _tit  result title
 * @param {string} _des  result description
 * @param {string} _url  result href
 * @returns {Promise} resolves with the result of `db.query`; rejects with a
 *   TypeError when `_url` is not a string, or with the query error.
 */
function inserturl2(_cid,_tit,_des,_url){
  // A missing href would make url.parse throw synchronously; report it
  // through the returned promise so callers have a single error path.
  if (typeof _url !== 'string') {
    return Promise.reject(
      new TypeError('inserturl2: _url must be a string, got ' + typeof _url)
    );
  }

  var u = url.parse(_url),
      // `protocol` is null for relative or scheme-less hrefs.
      _scheme = u.protocol ? u.protocol.replace(':','') : null,
      _host = u.host,
      // NOTE(review): url.parse's `path` is pathname + search, so the query
      // string is also contained in _path — confirm this is what the URL
      // table expects before switching to `pathname`.
      _path = u.path,
      _query = u.query,
      params = [_cid,_tit,_des,_url,_scheme,_host,_path,_query];

  return db.query('select inserturl2(?,?,?,?,?,?,?,?)',params);
}
var request = require("request"),
cheerio = require('cheerio'),
querystring = require('querystring');
const gurl = "https://www.google.com.tw/search?";
const num = 100;
module.exports = {
option: {
headers: {
'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.80 Safari/537.36'
}
},
setQuery: function(q,page){
query = {q:q,num: num,start: (page-1)*num, filter:0};
this.option.url = gurl + querystring.stringify(query);
},
search: function(q,page){
this.setQuery(q,page);
return new Promise(function(resolve,reject){
request(this.option , function(error, response, body){
if(error) return reject(error);
resolve(this.fetch(cheerio.load(body)));
}.bind(this));
}.bind(this))
},
searchAll: function(q){
return this.search(q,1).then(function(res){
var pagenum = res.pagenum,
promsPool = [],
allData = res.data;
for (var i = 2; i <= pagenum; i++) {
var prom = this.search(q,i).then(function(res){
allData = allData.concat(res.data);
return res;
})
promsPool.push(prom);
}
return Promise.all(promsPool).then(function(){return allData;});
}.bind(this))
},
fetch: function($){
var a = $('.r').children('a'),
t = $('.st'),
p = $('#nav').find('td').not('.navend').length;
d = [];
a.each(function(i,el){
var href = $(el).attr('href'),
tit = $(el).text(),
des = t.eq(i).text();
d.push({ tit: tit, href: href, des: des });
});
return {
data: d,
pagenum: p
};
}
};
'use strict';
var pg = require('pg'),
util = require('util');
module.exports = {
connect: function(opt){
//postgres://user:password@host:port/database
var connStr = "postgres://"+opt.user+":"+opt.password+"@"+opt.host;
connStr+= ":"+(opt.port||5432)+"/"+opt.dbname;
if(this.client) pg.end();
return new Promise(function(resolve,reject){
pg.connect(connStr,function(err, client, done) {
// can not handle err, throw err
if(err){ console.error("%s",err); return; }
this.client = client;
this.done = done;
resolve(client);
}.bind(this));
}.bind(this));
},
query: function(){
var sql = this.transformSQL(arguments[0]),
params = arguments[1] || [];
return new Promise(function(resolve,reject){
this.client.query(sql, params, function(err,result){
if(err) return reject(err);
resolve(result);
})
}.bind(this))
},
close: function(){
if(this.done) this.done();
if(this.client) pg.end();
},
transformSQL: function(sql){
var str = "",
number = 0;
for (var i = 0; i < sql.length; i++)
if(sql[i]=='?') str += "$"+(++number);
else str += sql[i];
return str;
}
};
-- inserturl2: register a URL (if its MD5 is not yet in URL) and link it to
-- class _cid through the co table.
--
--   _cid                     class id to link the URL to
--   _tit, _des               title and description passed to insertobj()
--   _url                     full URL; its md5 is the lookup key in URL.MD5URL
--   _scheme, _host, _path, _query
--                            URL components stored in the URL row
--
-- Returns the object id of the URL, whether it was just created or already
-- existed.
CREATE OR REPLACE FUNCTION inserturl2(
_cid int,
_tit varchar,
_des varchar,
_url varchar,
_scheme varchar,
_host varchar,
_path varchar,
_query varchar)
RETURNS integer AS
$BODY$
declare _OID int; _EID int; _SID int; _MD5URL varchar(32);_UID int;_co int;
begin
    select md5(_url) into _MD5URL;
    select uid from URL where MD5URL=_MD5URL into _UID;
    if _UID is null then
        -- New URL: create its object, then the URL row keyed by that object id.
        select EID from Entity where EName = 'URL' into _EID;
        select insertobj(_EID,_tit,_des) into _OID;
        select SID from URLScheme where Scheme = _scheme into _SID;
        insert into URL(UID,Scheme,HostName,Path,MD5URL,queryget)
        values(_OID,_SID,_host,_path,_MD5URL,_query);
    else
        -- Existing URL: URL.UID is written from _OID above, so the stored UID
        -- is the object id. Without this, _OID stayed NULL, a (cid, NULL) row
        -- was inserted into co, and NULL was returned.
        _OID := _UID;
    end if;
    -- Link the object to the class only once.
    select oid from co where cid=_cid and oid=_OID into _co;
    if _co is null then
        insert into co(cid,oid) values(_cid,_OID);
    end if;
    return _OID;
end;
$BODY$
LANGUAGE plpgsql;
-- Remove two overloads of insertclass. IF EXISTS keeps the script re-runnable:
-- a plain DROP FUNCTION raises an error when the overload is not present.
drop function if exists insertclass(
_pcid integer,
_type integer,
_cname character varying,
_ename character varying);
drop function if exists insertclass(
_namepath character varying,
_cname character varying);
@LinZap
Copy link
Author

LinZap commented Nov 10, 2015

Install

npm install pg --save
npm install request --save
npm install cheerio --save

Run

node index.js

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment