// Node.js crawler for http://www.gaoxiaogif.com
// Gist: huzhifeng/e988a4d8cd5f01bd8c0f (created June 29, 2014 06:52)
// Third-party dependencies: HTTP client, HTML parser and charset decoder.
var request = require('request');
var cheerio = require('cheerio');
// Native `iconv` alternative, kept for reference; iconv-lite is used instead.
//var Iconv = require('iconv').Iconv;
//var iconv = new Iconv('GBK', 'UTF-8//TRANSLIT//IGNORE');
var iconv = require('iconv-lite');
var util = require('util');

// Site root; the category listing URLs are built from it.
var baseUrl = 'http://www.gaoxiaogif.com';
/**
 * Categories to crawl.
 *
 * Each entry has:
 *   tid   - path segment of the category on the site
 *   tname - display name of the category, added to every record's tags
 *
 * `main` adds a `url` property to each entry before crawling starts, so the
 * entries are intentionally left mutable.
 */
var navItems = [
  {
    tid: 'zhenrengif',
    tname: '真人'
  },
  {
    tid: 'meinvgif',
    tname: '美女'
  },
  {
    tid: 'erdonggif',
    tname: '儿童'
  },
  {
    tid: 'dongwugif',
    tname: '动物'
  },
  {
    tid: 'tiyugif',
    tname: '体育'
  },
  {
    tid: 'jiaotongdongtai',
    tname: '交通'
  },
  {
    tid: 'kongbudongtaitupian',
    tname: '恐怖'
  },
  {
    tid: 'ogiftupian',
    tname: '未分类'
  }
];
/**
 * Tell whether a value is an absolute http(s) URL.
 *
 * The scheme is matched case-insensitively, so `HTTP://...` is accepted.
 * Protocol-relative (`//host/path`) and other schemes are not considered full.
 *
 * @param {*} url - value to test; non-strings are never full URLs.
 * @returns {boolean} true when `url` starts with `http://` or `https://`.
 */
function isFullUrl(url) {
  return typeof url === 'string' && /^https?:\/\//i.test(url);
}
/**
 * Turn an href/src value scraped from the site into an absolute URL.
 *
 * - absolute http(s) URLs are returned unchanged (after trimming)
 * - protocol-relative references (`//host/path`) get an `http:` scheme
 * - everything else is appended to the site root (`baseUrl`)
 *
 * @param {*} url - raw attribute value; may be missing.
 * @returns {string} the absolute URL, or '' when there is nothing usable.
 */
function formatUrl(url) {
  if (typeof url !== 'string') {
    return '';
  }
  var trimmed = url.trim();
  if (!trimmed) {
    return '';
  }
  if (isFullUrl(trimmed)) {
    return trimmed;
  }
  // Only the scheme is missing; prefixing the site root would corrupt the host.
  if (trimmed.indexOf('//') === 0) {
    return 'http:' + trimmed;
  }
  if (trimmed.charAt(0) === '/') {
    return baseUrl + trimmed;
  }
  return baseUrl + '/' + trimmed;
}
/**
 * Build one GIF record from a `dl.listitem` element of a listing page.
 *
 * @param {Function} $ - cheerio instance loaded with the page.
 * @param {Object} listItem - the `dl.listitem` element to read.
 * @param {string} tname - category display name, added to the record's tags.
 * @returns {Object|null} the record, or null when the item is malformed
 *   (the reason is logged).
 */
function parseListItem($, listItem, tname) {
  var item = $(listItem);
  var eTimeYM = item.find('span.ym').first();
  var eTimeD = item.find('span.d').first();
  var eTitle = item.find('h3 a').first();
  var eA = item.find('div.cont a').first();
  var eImg = item.find('div.cont img').first();
  if ((eTimeYM.length !== 1) ||
      (eTimeD.length !== 1) ||
      (eTitle.length !== 1) ||
      (eA.length !== 1) ||
      (eImg.length !== 1)) {
    console.log('Invalid listItem');
    console.log(util.inspect(item, {depth: null}));
    return null;
  }

  // The two spans are joined and must form YYYY/MM/DD.
  var date = util.format('%s/%s', eTimeYM.text().trim(), eTimeD.text().trim());
  if (!/^\d{4}(\/\d{2}){2}$/.test(date)) {
    console.log('Invalid date:' + date);
    console.dir({
      'eTimeYM': eTimeYM,
      'eTimeD': eTimeD
    });
    return null;
  }

  // `alt` may be absent, so default to '' before trimming instead of throwing.
  var title = eTitle.text().trim() || (eImg.attr('alt') || '').trim();
  if (!title) {
    console.log('Invalid title:' + title);
    console.log(util.inspect({
      'eTitle': eTitle,
      'eImg': eImg
    }, {depth: null}));
    return null;
  }
  var link = formatUrl(eTitle.attr('href'));

  var src = formatUrl(eA.attr('href'));
  if (!src) {
    console.log('Invalid src:' + src);
    console.dir(eA);
    return null;
  }

  var img = {
    'src': src,
    'alt': eImg.attr('alt') || title
  };
  // NOTE(review): the thumbnail is stored exactly as found in the markup (not
  // passed through formatUrl) — confirm whether relative paths can occur here.
  var thumbnailUrl = eImg.attr('src') || eImg.attr('original');
  if (thumbnailUrl) {
    img.thumbnail = thumbnailUrl;
  }

  return {
    'title': title,
    'date': date,
    'created': new Date(),
    'tags': ['gif', tname],
    'imgs': [img],
    'link': link
  };
}

/**
 * Crawl one listing page of a category, print every record found on it, and
 * schedule the following page one second later.
 *
 * @param {Object} entry - category descriptor with `tid`, `tname` and `url`
 *   (the page to fetch). `entry.url` is overwritten with the next page's URL
 *   before the function re-schedules itself.
 */
function crawlTag(entry) {
  var url = entry.url;
  var options = {
    'url': url,
    'method': 'GET',
    'headers': {
      'User-Agent': 'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36',
      'Connection': 'Keep-Alive',
      'Host': 'www.gaoxiaogif.com'
    },
    // Keep the body as a raw Buffer so it can be decoded from GBK below.
    'encoding': null
    //'proxy': 'http://127.0.0.1:7788'
  };

  request(options, function(err, res, body) {
    console.log(util.format('Current page: %s', url));
    if (err || res.statusCode !== 200) {
      // Log only the status code; dumping the whole response object is huge.
      console.log(util.inspect({
        'err': err,
        'statusCode': res ? res.statusCode : undefined
      }, {depth: null}));
      return;
    }

    var html = iconv.decode(body, 'gbk');
    var $ = cheerio.load(html);

    $('dl.listitem').each(function(index, listItem) {
      var record = parseListItem($, listItem, entry.tname);
      if (record) {
        console.dir(record);
      }
    });

    // The "next page" anchor is expected to be the second-to-last pager link.
    var ePageList = $('.pagelist a');
    var eNext = ePageList.eq(ePageList.length - 2);
    if ((eNext.length !== 1) || (eNext.text() !== '下一页')) {
      console.log(util.format('Last page of %s[%s]', entry.tname, entry.tid));
      return;
    }

    var nextUrl = eNext.attr('href');
    if (!isFullUrl(nextUrl)) {
      // Only absolute links are followed; say so instead of stopping silently.
      console.log(util.format('Stop crawling %s[%s]: next page link is not an absolute URL: %s',
        entry.tname, entry.tid, nextUrl));
      return;
    }
    entry.url = nextUrl;
    setTimeout(crawlTag, 1000, entry);
  });
}
/**
 * Entry point: give every category in `navItems` its first listing URL and
 * start its crawl, staggering the categories 30 seconds apart.
 *
 * Writes a `url` property onto each `navItems` entry.
 */
function main() {
  // Delay between the start of two consecutive categories, in milliseconds.
  var staggerMs = 30 * 1000;
  navItems.forEach(function(entry, index) {
    entry.url = util.format('%s/%s/', baseUrl, entry.tid);
    setTimeout(crawlTag, index * staggerMs, entry);
  });
}

main();
// (GitHub page footer removed; it is not part of the script.)