Skip to content

Instantly share code, notes, and snippets.

@wizardforcel
Created November 22, 2016 01:57
Show Gist options
  • Save wizardforcel/6e113a4c4617f56b5cbe0527d7701610 to your computer and use it in GitHub Desktop.
Save wizardforcel/6e113a4c4617f56b5cbe0527d7701610 to your computer and use it in GitHub Desktop.
Yiibai
// $ = cheerio.load(html);
var cheerio = require('cheerio');
// request(method, url, options).getBody().toString()
var request = require('sync-request');
var fs = require('fs');
var process = require('process');
// HTTP request headers passed as `options.headers` to every request() call in this
// script. The values imitate a desktop Chrome browser with a Chinese locale.
// NOTE(review): the Cookie value is a hard-coded capture of one browser session
// (PHP session id plus analytics cookies) — presumably stale by now; confirm whether
// the site still needs it, and avoid committing live session ids.
var headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Language': 'zh-CN,zh;q=0.8',
'Cookie': 'PHPSESSID=82kdct9ad90jh6amd8qk9cj991; Hm_lvt_424e222704e2274e37a8169110e6e00a=1477571676,1478254658,1479737264,1479777622; Hm_lpvt_424e222704e2274e37a8169110e6e00a=1479778096; CNZZDATA4255568=cnzz_eid%3D1047891998-1452514698-%26ntime%3D1479774631; jiathis_rdc=%7B%22http%3A//www.yiibai.com/laravel/laravel_security.html%22%3A-1986076540%2C%22http%3A//www.yiibai.com/laravel/%22%3A-1986069385%2C%22http%3A//www.yiibai.com/laravel/laravel_installation.html%22%3A-1986021903%2C%22http%3A//www.yiibai.com/laravel/laravel_application_structure.html%22%3A-1985969698%2C%22http%3A//www.yiibai.com/laravel/laravel_routing.html%22%3A-1985968650%2C%22http%3A//www.yiibai.com/laravel/laravel_configuration.html%22%3A%220%7C1479778096042%22%7D'
};
// ---- Command-line entry point ----
// argv[2] selects the mode ('single' or 'category'); argv[3] is the tutorial URL
// (single) or the index of the category list on the home page (category).
const type = process.argv[2];
const target = process.argv[3];
const types = ['single', 'category'];

// Reject a missing argument or an unknown mode before touching the filesystem.
if (!type || !target || !types.includes(type)) {
  showUsage();
  process.exit(0);
}

// Parse the category index up front, with an explicit radix, so that a non-numeric
// value is rejected before out.html / error.txt are truncated by openSync('w').
const categoryIndex = type === 'category' ? Number.parseInt(target, 10) : null;
if (type === 'category' && Number.isNaN(categoryIndex)) {
  showUsage();
  process.exit(0);
}

// All downloaded content is appended to out.html; failures are logged to error.txt.
const ofile = fs.openSync('out.html', 'w');
const efile = fs.openSync('error.txt', 'w');
try {
  if (type === 'single') {
    dlSingle(target, ofile, efile);
  } else {
    dlCategory(categoryIndex, ofile, efile);
  }
} finally {
  // Close both descriptors even when a download throws, so buffered output is
  // not left behind on an open handle.
  fs.closeSync(ofile);
  fs.closeSync(efile);
}
console.log('Done..');
/**
 * Print the command-line help to stdout: one line for the `single` mode and one
 * line for the `category` mode, emitted with a single console.log call.
 */
function showUsage()
{
  const usageLines = [
    "用法:node yiibai single url",
    " 或 node yiibai category index",
  ];
  console.log(usageLines.join('\n'));
}
/**
 * Download every tutorial listed in one category of the yiibai.com home page.
 *
 * Fetches the home page, asks getBookList() for the tutorial URLs of the
 * selected list, and passes each URL to dlSingle().
 *
 * @param {number} which - Index forwarded to getBookList() to pick the list.
 * @param {number} ofile - File descriptor the downloaded content is appended to.
 * @param {number} efile - File descriptor that receives error messages.
 * @throws If the home page request itself fails (that request is not caught).
 */
function dlCategory(which, ofile, efile)
{
  const html = request('GET', 'http://www.yiibai.com/', {headers: headers}).getBody().toString();
  const bookList = getBookList(html, which);
  // for...of visits the array values only; for...in would iterate keys (as
  // strings) and also pick up any enumerable properties added to the array.
  for (const url of bookList) {
    console.log(url);
    try {
      dlSingle(url, ofile, efile);
    } catch (ex) {
      // A single unreachable tutorial must not abort the rest of the category:
      // record which URL failed and carry on with the next one.
      fs.writeSync(efile, url + ': ' + ex.toString() + '\r\n', null, 'utf-8');
    }
  }
}
/**
 * Download one tutorial: its landing page plus every chapter linked from the
 * landing page's table of contents (as returned by getToc()).
 *
 * The cleaned content of each page (see getContent()) is appended to `ofile`,
 * followed by a `<!--split-->` marker line. A chapter that fails to download
 * is recorded in `efile` and skipped; a failure on the landing page itself is
 * not caught and propagates to the caller.
 *
 * @param {string} url - URL of the tutorial's landing page.
 * @param {number} ofile - File descriptor the page content is appended to.
 * @param {number} efile - File descriptor that receives error messages.
 */
function dlSingle(url, ofile, efile)
{
  const SPLIT_MARK = '\r\n<!--split-->\r\n';

  // Fetch one page, append its cleaned content and the split marker to the
  // output file, and hand back the raw HTML for further parsing.
  const appendPage = (pageUrl) => {
    const pageHtml = request('GET', pageUrl, {headers: headers}).getBody().toString();
    fs.writeSync(ofile, getContent(pageHtml), null, 'utf-8');
    fs.writeSync(ofile, SPLIT_MARK, null, 'utf-8');
    return pageHtml;
  };

  // Declared locally: the previous bare `html = ...` assignment created an
  // implicit global (and is a ReferenceError in strict mode).
  const html = appendPage(url);

  for (const chapterUrl of getToc(html)) {
    console.log(chapterUrl);
    try {
      appendPage(chapterUrl);
    } catch (ex) {
      // Include the failing URL and a line break so entries in the error log
      // can be told apart and retried individually.
      fs.writeSync(efile, chapterUrl + ': ' + ex.toString() + '\r\n', null, 'utf-8');
    }
  }
}
/**
 * Collect the `href` values of one `.list-group` on the given page.
 *
 * The `.list-group` at position `which` is selected and the `href` attribute of
 * each of its `.list-group-item` descendants is returned, except for the first
 * item, which is skipped. Hrefs are returned exactly as found in the markup
 * (an item without the attribute contributes `undefined`).
 *
 * @param {string} html - HTML source of the page.
 * @param {number} which - Position of the `.list-group` to read.
 * @returns {Array<string|undefined>} The collected href values, in page order.
 */
function getBookList(html, which) {
  const $ = cheerio.load(html);
  const $items = $('.list-group').eq(which).find('.list-group-item');
  // One output slot for every item after the first.
  const wanted = Math.max($items.length - 1, 0);
  return Array.from({ length: wanted }, (_, offset) => $items.eq(offset + 1).attr('href'));
}
/**
 * Extract the chapter URLs from a tutorial page's table of contents.
 *
 * Reads the `href` of every `.list-group-item` on the page except the first
 * one and returns them as absolute URLs.
 *
 * @param {string} html - HTML source of a tutorial page.
 * @returns {string[]} Absolute chapter URLs, in page order.
 */
function getToc(html) {
  const BASE_URL = 'http://www.yiibai.com';
  const $ = cheerio.load(html);
  const $list = $('.list-group-item');
  const res = [];
  // Starts at 1: the first item is skipped, as before — presumably it is not a
  // chapter link; confirm against the page markup.
  for (let i = 1; i < $list.length; i++) {
    const href = $list.eq(i).attr('href');
    // An item without an href used to produce "http://www.yiibai.comundefined".
    if (!href) {
      continue;
    }
    try {
      // URL resolution keeps site-relative paths working and, unlike plain
      // concatenation, leaves hrefs that are already absolute intact.
      // NOTE: the page's own URL is not available here, so document-relative
      // hrefs are resolved against the site root.
      res.push(new URL(href, BASE_URL).href);
    } catch (ex) {
      console.error('Skipping unparsable TOC link: ' + href);
    }
  }
  return res;
}
/**
 * Reduce a tutorial page to the fragment that is written to the output file.
 *
 * Removes the elements listed in `unwanted`, rewrites site-relative image
 * sources to absolute URLs, and returns the page title wrapped in `<h1>`
 * followed by the inner HTML of the first `.content-body` element.
 *
 * @param {string} html - HTML source of a tutorial page.
 * @returns {string} `<h1>title</h1>`, a newline, then the content HTML (empty
 *   when the page has no `.content-body`).
 */
function getContent(html) {
  const BASE_URL = 'http://www.yiibai.com';
  const $ = cheerio.load(html);

  // Elements that must not end up in the exported document.
  const unwanted = [
    '.art_pre_next',
    '.copyright',
    'p[style="color: #A00;font-weight:bold;"]',
    'script',
    'ins',
  ];
  unwanted.forEach(function (selector) {
    $(selector).remove();
  });

  const $imgs = $('img');
  for (let i = 0; i < $imgs.length; i++) {
    const $img = $imgs.eq(i);
    const src = $img.attr('src');
    // An <img> without a src attribute used to crash on `src.startsWith`.
    if (!src || !src.startsWith('/')) {
      continue;
    }
    // "//host/path" is protocol-relative: it only needs a scheme, whereas
    // "/path" is relative to the site root.
    const absolute = src.startsWith('//') ? 'http:' + src : BASE_URL + src;
    $img.attr('src', absolute);
  }

  const title = '<h1>' + $('title').text() + '</h1>';
  // .html() yields null when nothing matches; fall back to an empty string so
  // the literal text "null" is never written to the output.
  const content = $('.content-body').eq(0).html() || '';
  return title + '\n' + content;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment