Created
September 25, 2015 13:50
-
-
Save ishitcno1/d1a000f6a21387082204 to your computer and use it in GitHub Desktop.
guoxue123.cn book downloader
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/** Download books from www.guoxue123.cn
 * usage: node downloader.js <index page URL>
 * example: node downloader.js http://www.guoxue123.cn/xiaosuo/jd/jpshz/index.htm
 */
var http = require('http'); | |
var url = require('url'); | |
var parser = require('cheerio'); | |
var iconv = require('iconv').Iconv('GBK', 'UTF-8'); | |
var fs = require('fs'); | |
// Entry point: validate the command line, fetch the book's index page and
// start one download per article linked from it.
if (process.argv.length < 3) {
  console.log('No url argument.');
  console.log('usage: node downloader.js <index page>');
  // Without an index URL there is nothing to do; stop instead of letting
  // url.parse(undefined) throw further down.
  process.exit(1);
}

var indexUrl = process.argv[2];
// url.parse() only fills in `hostname` when the URL carries a scheme, so a
// bare "www.example.com/path" would otherwise yield a null host and the whole
// string as the path. Default to http:// when no scheme is given.
if (!/^[a-z][a-z0-9+.\-]*:\/\//i.test(indexUrl)) {
  indexUrl = 'http://' + indexUrl;
}

var parsedIndexUrl = url.parse(indexUrl);
var hostName = parsedIndexUrl.hostname;
// Honour an explicit port in the URL; fall back to the HTTP default.
var port = parsedIndexUrl.port ? parseInt(parsedIndexUrl.port, 10) : 80;
var indexPath = parsedIndexUrl.pathname || '/';
// Directory part of the index path (with trailing slash). Article links are
// appended to it. Cutting at the last '/' works for any index file name,
// whereas replacing the literal 'index.htm' left a stray 'l' for 'index.html'.
var basicPath = indexPath.substring(0, indexPath.lastIndexOf('/') + 1);
var articles = [];
var bookName;
var completed = 0;

// get index page
var options = {
  host: hostName,
  port: port,
  path: indexPath
};
var buffer;

console.log('Processing.........');
http.get(options, function(res) {
  // Collect raw chunks and join once at the end; the page is converted by
  // iconv as a whole, so it must not be decoded chunk by chunk.
  var chunks = [];
  res.on('data', function(chunk) {
    chunks.push(chunk);
  });
  res.on('end', function() {
    // Buffer.concat([]) yields an empty buffer, so an empty response no
    // longer passes `undefined` to iconv.
    buffer = Buffer.concat(chunks);
    articles = parseIndex(iconv.convert(buffer));
    if (articles.length === 0) {
      // parseArticle() only triggers the final write after at least one
      // article completes; report the empty case instead of exiting silently.
      console.log('No articles found in index page.');
      return;
    }
    for (var i = 0, l = articles.length; i < l; i++) {
      getArticle(articles[i]);
    }
  });
}).on('error', function(err) {
  console.error('Failed to fetch index page: ' + err.message);
  process.exit(1);
});
/**
 * Extract the book title and the list of articles from the index page.
 *
 * Side effect: stores the text of the third `span.s3` element in the
 * module-level `bookName`.
 *
 * @param {Buffer|string} page - HTML of the index page (the caller passes the
 *   output of the GBK -> UTF-8 conversion).
 * @returns {Array<{title: string, hostname: string, port: number, pathname: string}>}
 *   One entry per link found in the second `table` of the page, in document
 *   order.
 */
function parseIndex(page) {
  var html = parser.load(page);
  bookName = html('span.s3').eq(2).text();
  var links = html('table').eq(1).find('a');
  var found = [];
  var a, href, i, length = links.length;
  for (i = 0; i < length; i++) {
    a = links.eq(i);
    href = a.attr('href');
    if (!href) {
      // Anchors without an href would otherwise produce a path ending in
      // "undefined" and a pointless request.
      continue;
    }
    found.push({
      title: a.text(),
      hostname: hostName,
      // Same port as the index request rather than a second hard-coded 80.
      port: port,
      pathname: basicPath + href
    });
  }
  return found;
}
/**
 * Download one article page and hand the converted HTML to parseArticle().
 *
 * A failed request is logged and reported to parseArticle() as an empty page,
 * so the overall completion counter still reaches the total and the book is
 * written with that article's content left empty.
 *
 * @param {{title: string, hostname: string, port: number, pathname: string}} article
 *   Entry produced by parseIndex().
 */
function getArticle(article) {
  var options = {
    host: article.hostname,
    port: article.port,
    path: article.pathname
  };
  var chunks = [];
  // Guards against reporting the same article twice if an error is emitted
  // after the response has already ended (or vice versa).
  var finished = false;

  function finish(page) {
    if (finished) {
      return;
    }
    finished = true;
    parseArticle(article, page);
  }

  http.get(options, function(res) {
    res.on('data', function(chunk) {
      chunks.push(chunk);
    }).on('end', function() {
      // Buffer.concat([]) is an empty buffer, so an empty body is safe to
      // convert (the previous code passed `undefined` in that case).
      finish(iconv.convert(Buffer.concat(chunks)));
    });
  }).on('error', function(err) {
    // Without this handler a single failed request throws an unhandled
    // 'error' event and aborts the whole download.
    console.error('Failed to download "' + article.title + '": ' + err.message);
    finish('');
  });
}
/**
 * Pull the article text out of a downloaded page and record it on `article`.
 *
 * The text is read from `span.q`, falling back to `span.h` when the former
 * yields nothing. Each call bumps the module-level `completed` counter; once
 * it equals the number of articles, the book is written out.
 *
 * @param {Object} article - Entry from the `articles` list; gains `content`.
 * @param {Buffer|string} page - HTML of the article page.
 */
function parseArticle(article, page) {
  var $ = parser.load(page);
  // Prefer span.q; use span.h only when span.q produced an empty string.
  article.content = $('span.q').text() || $('span.h').text();

  completed++;
  console.log('Parsing........%d/%d', completed, articles.length);

  var allDone = completed === articles.length;
  if (allDone) {
    writeArticles();
  }
}
/**
 * Write every downloaded article to "<bookName>.txt" in the current directory.
 *
 * Each article is emitted as a "## <title>" heading, a blank line, then its
 * content. The text is assembled in memory and written with one
 * fs.writeFile() call, which opens, writes and closes the file itself. The
 * previous version issued several fs.write() calls on one descriptor without
 * waiting for any of them, never closed the descriptor, and printed
 * "Completed" before the data was flushed.
 *
 * @throws Rethrows the error reported by fs.writeFile(), if any.
 */
function writeArticles() {
  console.log('Writing.........');
  var parts = [];
  var article;
  for (var i = 0, l = articles.length; i < l; i++) {
    article = articles[i];
    parts.push('## ' + article.title + '\n\n' + article.content + '\n');
  }
  fs.writeFile(bookName + '.txt', parts.join(''), function(err) {
    if (err) throw err;
    // Only report success once the file has actually been written.
    console.log('Completed');
  });
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"name": "downloader", | |
"version": "1.0.0", | |
"description": "", | |
"main": "app.js", | |
"scripts": { | |
"test": "echo \"Error: no test specified\" && exit 1" | |
}, | |
"author": "", | |
"license": "ISC", | |
"dependencies": { | |
"buffertools": "^2.1.3", | |
"cheerio": "^0.19.0", | |
"iconv": "^2.1.10" | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment