Created
November 22, 2016 01:57
-
-
Save wizardforcel/6e113a4c4617f56b5cbe0527d7701610 to your computer and use it in GitHub Desktop.
Yiibai
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// $ = cheerio.load(html); | |
var cheerio = require('cheerio'); | |
// request(method, url, options).getBody().toString() | |
var request = require('sync-request'); | |
var fs = require('fs'); | |
var process = require('process'); | |
// HTTP headers sent with every request made by this script.
// NOTE(review): the Cookie below is a captured browser session (PHPSESSID and
// analytics cookies); it has presumably expired — confirm or refresh before use.
const headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh;q=0.8',
    'Cookie': 'PHPSESSID=82kdct9ad90jh6amd8qk9cj991; Hm_lvt_424e222704e2274e37a8169110e6e00a=1477571676,1478254658,1479737264,1479777622; Hm_lpvt_424e222704e2274e37a8169110e6e00a=1479778096; CNZZDATA4255568=cnzz_eid%3D1047891998-1452514698-%26ntime%3D1479774631; jiathis_rdc=%7B%22http%3A//www.yiibai.com/laravel/laravel_security.html%22%3A-1986076540%2C%22http%3A//www.yiibai.com/laravel/%22%3A-1986069385%2C%22http%3A//www.yiibai.com/laravel/laravel_installation.html%22%3A-1986021903%2C%22http%3A//www.yiibai.com/laravel/laravel_application_structure.html%22%3A-1985969698%2C%22http%3A//www.yiibai.com/laravel/laravel_routing.html%22%3A-1985968650%2C%22http%3A//www.yiibai.com/laravel/laravel_configuration.html%22%3A%220%7C1479778096042%22%7D'
};

// Command line: node <script> <single|category> <url|index>
const type = process.argv[2];
const target = process.argv[3];
const types = ['single', 'category'];
if (!type || !target || types.indexOf(type) === -1)
{
    showUsage();
    process.exit(0);
}

// For "category" the target is the index of a list group on the home page.
// Parse it (base 10) and reject non-numeric input BEFORE the output files are
// opened, so a typo does not truncate a previous out.html.
let categoryIndex = null;
if (type === 'category')
{
    categoryIndex = Number.parseInt(target, 10);
    if (Number.isNaN(categoryIndex))
    {
        showUsage();
        process.exit(0);
    }
}

// out.html receives the downloaded content; error.txt receives failure records.
const ofile = fs.openSync('out.html', 'w');
let efile;
try
{
    efile = fs.openSync('error.txt', 'w');
    if (type === 'single')
        dlSingle(target, ofile, efile);
    else
        dlCategory(categoryIndex, ofile, efile);
}
finally
{
    // Always release both descriptors, even when a download throws.
    if (efile !== undefined)
        fs.closeSync(efile);
    fs.closeSync(ofile);
}
console.log('Done..');
/**
 * Prints the two supported command-line forms (single URL, or category index)
 * to standard output.
 */
function showUsage()
{
    var lines = [
        "用法:node yiibai single url",
        " 或 node yiibai category index"
    ];
    console.log(lines.join("\n"));
}
/**
 * Downloads every book linked from one list group on the site's home page.
 *
 * @param {number} which - Index of the home-page `.list-group` to crawl.
 * @param {number} ofile - File descriptor the downloaded content is appended to.
 * @param {number} efile - File descriptor failure records are appended to.
 * @throws If the home page itself cannot be fetched or parsed.
 */
function dlCategory(which, ofile, efile)
{
    const html = request('GET', 'http://www.yiibai.com/', {headers: headers}).getBody().toString();
    const bookList = getBookList(html, which);
    // for...of iterates the array's values; for...in would walk property keys.
    for (const url of bookList) {
        console.log(url);
        try {
            dlSingle(url, ofile, efile);
        } catch (ex) {
            // One unreachable book must not abort the rest of the category:
            // record which URL failed and move on to the next one.
            fs.writeSync(efile, url + ': ' + ex + '\r\n', null, 'utf-8');
        }
    }
}
/**
 * Downloads one book: the page at `url`, then every page listed in that
 * page's table of contents. Each page's cleaned content is appended to
 * `ofile`, followed by a `<!--split-->` marker line.
 *
 * @param {string} url - Address of the book's entry page.
 * @param {number} ofile - File descriptor the content is appended to.
 * @param {number} efile - File descriptor failure records are appended to.
 * @throws If the entry page cannot be fetched; failures of individual
 *         table-of-contents pages are logged to `efile` instead.
 */
function dlSingle(url, ofile, efile)
{
    // Fetches one page, appends its content and the split marker to ofile,
    // and returns the raw HTML so the caller can extract the table of contents.
    function appendPage(pageUrl) {
        const pageHtml = request('GET', pageUrl, {headers: headers}).getBody().toString();
        const content = getContent(pageHtml);
        fs.writeSync(ofile, content, null, 'utf-8');
        fs.writeSync(ofile, '\r\n<!--split-->\r\n', null, 'utf-8');
        return pageHtml;
    }

    // Local binding: the original assigned to an undeclared `html`, creating
    // an implicit global (and a ReferenceError under strict mode).
    const html = appendPage(url);
    const toc = getToc(html);
    for (const pageUrl of toc) {
        try {
            console.log(pageUrl);
            appendPage(pageUrl);
        } catch (ex) {
            // Include the failing URL and a line break so records in the
            // error file can be told apart.
            fs.writeSync(efile, pageUrl + ': ' + ex + '\r\n', null, 'utf-8');
        }
    }
}
/**
 * Collects the `href` values of the items in one `.list-group` of a page.
 * The first `.list-group-item` of the group is skipped.
 *
 * @param {string} html - Markup of the page to inspect.
 * @param {number} which - Index of the `.list-group` to read.
 * @returns {Array} The `href` attribute of every item after the first, in
 *          document order (an item without the attribute yields `undefined`).
 */
function getBookList(html, which) {
    var $ = cheerio.load(html);
    var items = $('.list-group').eq(which).find('.list-group-item').get();
    // Drop the leading item, then read each remaining element's href.
    return items.slice(1).map(function (item) {
        return $(item).attr('href');
    });
}
/**
 * Extracts the table of contents of a book page as absolute URLs.
 * The first `.list-group-item` on the page is skipped.
 *
 * @param {string} html - Markup of a book page.
 * @returns {string[]} Absolute http(s) URLs of the remaining items, in
 *          document order. Items with no usable `href` are omitted.
 */
function getToc(html) {
    const base = 'http://www.yiibai.com/';
    const $ = cheerio.load(html);
    const $list = $('.list-group-item');
    const res = [];
    for (let i = 1; i < $list.length; i++) {
        const href = $list.eq(i).attr('href');
        // An item without an href used to become "...comundefined".
        if (!href) {
            continue;
        }
        let resolved;
        try {
            // Resolve against the site root so root-relative, relative and
            // already-absolute links all produce a valid address.
            resolved = new URL(href, base);
        } catch (ex) {
            continue;
        }
        // Skip links that cannot be downloaded (javascript:, mailto:, ...).
        if (resolved.protocol === 'http:' || resolved.protocol === 'https:') {
            res.push(resolved.href);
        }
    }
    return res;
}
/**
 * Reduces a tutorial page to a title heading plus its main content.
 *
 * Removes navigation, copyright, the red bold notice paragraph, scripts and
 * `<ins>` elements, makes site-relative image sources absolute, and returns
 * `<h1>` + page title followed by the inner HTML of the first `.content-body`.
 *
 * @param {string} html - Markup of the page.
 * @returns {string} The heading, a newline, and the content markup (empty
 *          when the page has no `.content-body`).
 */
function getContent(html) {
    const $ = cheerio.load(html);
    $('.art_pre_next').remove();
    $('.copyright').remove();
    $('p[style="color: #A00;font-weight:bold;"]').remove();
    $('script').remove();
    $('ins').remove();

    const $imgs = $('img');
    for (let i = 0; i < $imgs.length; i++) {
        const $img = $imgs.eq(i);
        const src = $img.attr('src');
        // An <img> without a src attribute used to crash on startsWith().
        if (typeof src !== 'string') {
            continue;
        }
        if (src.startsWith('//')) {
            // Protocol-relative: only the scheme is missing, not the host.
            $img.attr('src', 'http:' + src);
        } else if (src.startsWith('/')) {
            $img.attr('src', 'http://www.yiibai.com' + src);
        }
    }

    const title = '<h1>' + $('title').text() + '</h1>';
    // html() yields null when nothing matched; avoid emitting the text "null".
    const content = $('.content-body').eq(0).html() || '';
    return title + '\n' + content;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment