Skip to content

Instantly share code, notes, and snippets.

@huzhifeng
Created June 30, 2014 14:49
Show Gist options
  • Save huzhifeng/649dea8e92dce9dc7f93 to your computer and use it in GitHub Desktop.
Save huzhifeng/649dea8e92dce9dc7f93 to your computer and use it in GitHub Desktop.
Node.js crawler for http://www.gifbin.com
var request = require('request');
var cheerio = require('cheerio');
var util = require('util');
// Root of the site being crawled (scheme + host, no trailing slash).
var baseUrl = 'http://www.gifbin.com';

// Settings handed to `request` for every fetch. `url` is only a placeholder
// here; the crawl functions supply the real address for each request.
var options = {
  url: '',
  method: 'GET',
  headers: {
    // Present the crawler as a desktop Chrome browser.
    'User-Agent': 'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36',
    Connection: 'Keep-Alive',
    Host: 'www.gifbin.com'
  }
};
/**
 * Report whether `url` is an absolute HTTP or HTTPS URL, i.e. whether it
 * starts with "http://" or "https://".
 *
 * The scheme is matched case-insensitively because URI schemes are
 * case-insensitive ("HTTP://host" is as absolute as "http://host").
 * Non-string input is coerced to a string by RegExp#test, so `undefined`,
 * `null` and '' all yield false.
 *
 * @param {string} url - Candidate URL.
 * @returns {boolean} true when `url` begins with an http(s) scheme.
 */
function isFullUrl(url) {
  return /^https?:\/\//i.test(url);
}
/**
 * Turn an href/src value scraped from a page into an absolute URL.
 *
 * - Falsy input (undefined, null, '') yields ''.
 * - URLs that isFullUrl() already accepts are returned unchanged.
 * - Protocol-relative URLs ("//host/path") keep their own host and borrow
 *   the scheme of `baseUrl`.
 * - Anything else is treated as a path on `baseUrl`, joined with exactly one
 *   "/". Relative paths are resolved against the site root, not against the
 *   page they were found on.
 *
 * @param {string} url - Raw attribute value.
 * @returns {string} Absolute URL, or '' when there was nothing to format.
 */
function formatUrl(url) {
  if (!url) {
    return '';
  }
  if (isFullUrl(url)) {
    return url;
  }
  if (url.indexOf('//') === 0) {
    // "//host/path": prefixing the whole site root would produce
    // "http://www.gifbin.com//host/path", so only the scheme is prepended.
    return baseUrl.slice(0, baseUrl.indexOf('//')) + url;
  }
  if (url.charAt(0) === '/') {
    return baseUrl + url;
  }
  return baseUrl + '/' + url;
}
/**
 * Fetch one GIF detail page, extract its metadata and print the resulting
 * record with console.dir. Nothing is returned.
 *
 * The page is skipped (with a log line) when the request fails, the status
 * is not 200, or any of title, date or image source cannot be extracted.
 *
 * @param {{url: string, thumbnailSrc: string, title: string}} entry
 *   Listing entry describing the detail page; `url` is fetched and
 *   `thumbnailSrc` is copied into the output record.
 */
function crawlDetail(entry) {
  // Use a per-request copy instead of writing the URL into the shared
  // module-level `options`, so overlapping crawls never touch common state.
  var requestOptions = {};
  Object.keys(options).forEach(function (key) {
    requestOptions[key] = options[key];
  });
  requestOptions.url = entry.url;

  request(requestOptions, function (err, res, body) {
    console.log(util.format('Current detail: %s', entry.url));
    if (err || !res || res.statusCode !== 200) {
      // Log only the error and status: inspecting the whole response object
      // at unlimited depth floods the console.
      console.log(util.inspect({
        'err': err,
        'statusCode': res ? res.statusCode : undefined
      }, {depth: null}));
      return;
    }

    var $ = cheerio.load(body);
    var eImg = $('#main img').first();
    var eDate = $('#stats div.floatl').first();
    var eTitle = $('#main h1').first();

    // An empty selection yields '' from text() and undefined from attr(),
    // so checking the extracted values also covers missing elements.
    var title = eTitle.text().trim();
    // Strip the "Added on" label; a label with no date after it becomes ''.
    var date = eDate.text().trim().replace(/^Added on\s*/, '').trim();
    var src = formatUrl(eImg.attr('src'));
    if (!title || !date || !isFullUrl(src)) {
      console.log('Invalid title, date or src');
      console.dir({
        'title': title,
        'date': date,
        'src': src
      });
      return;
    }

    var img = {
      'src': src,
      'alt': eImg.attr('alt') || eImg.attr('title') || title,
      'thumbnail': entry.thumbnailSrc
    };

    // Every record is tagged 'gif'; page tags follow, blank anchors skipped.
    var tags = ['gif'];
    $('#gif-tags a').each(function () {
      var tag = $(this).text().trim();
      if (tag) {
        tags.push(tag);
      }
    });

    var obj = {
      'title': title,
      'date': date,
      'created': new Date(),
      'tags': tags,
      'imgs': [img],
      'link': entry.url
    };
    console.dir(obj);
  });
}
/**
 * Fetch one thumbnail listing page, schedule a detail crawl for every
 * thumbnail on it, then schedule the crawl of the following listing page.
 *
 * Detail crawls are spaced one second apart instead of all firing together,
 * and the next listing page is queued one second after the last of them.
 * When the request fails or the status is not 200 the failure is logged and
 * crawling stops at this page (no retry).
 *
 * @param {string} url - Absolute URL of the listing page to fetch.
 */
function crawlPager(url) {
  // Use a per-request copy instead of writing the URL into the shared
  // module-level `options`, so overlapping crawls never touch common state.
  var requestOptions = {};
  Object.keys(options).forEach(function (key) {
    requestOptions[key] = options[key];
  });
  requestOptions.url = url;

  request(requestOptions, function (err, res, body) {
    console.log(util.format('Current pager: %s', url));
    if (err || !res || res.statusCode !== 200) {
      // Log only the error and status: inspecting the whole response object
      // at unlimited depth floods the console.
      console.log(util.inspect({
        'err': err,
        'statusCode': res ? res.statusCode : undefined
      }, {depth: null}));
      return;
    }

    var $ = cheerio.load(body);

    // Number of detail crawls queued so far; drives the staggered delays.
    var scheduled = 0;
    $('#content div.thumbs li').each(function () {
      var item = $(this);
      var eA = item.find('a').last();
      var eImg = item.find('img').first();
      var link = formatUrl(eA.attr('href'));
      var entry = {
        'url': link,
        'thumbnailSrc': formatUrl(eImg.attr('src')),
        'title': eA.text().trim() || eImg.attr('alt')
      };
      if (isFullUrl(link)) {
        scheduled += 1;
        setTimeout(crawlDetail, 1000 * scheduled, entry);
      } else {
        console.log('Invalid link:' + link);
      }
    });

    // The pagination link labelled "Next >" points at the following page;
    // if several match, the last one wins.
    var nextUrl = '';
    $('#content div.pagination a').each(function () {
      if ($(this).text().trim() === 'Next >') {
        nextUrl = formatUrl($(this).attr('href'));
      }
    });

    if (isFullUrl(nextUrl)) {
      setTimeout(crawlPager, 1000 * (scheduled + 1), nextUrl);
    } else {
      console.log(util.format('Last pager: %s', url));
    }
  });
}
/**
 * Entry point: start crawling at the first thumbnail listing page.
 * crawlPager then follows the pagination links from there.
 */
function main() {
  var startUrl = 'http://www.gifbin.com/thumbs/';
  crawlPager(startUrl);
}

// Kick off the crawl as soon as the script is loaded.
main();
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment