Skip to content

Instantly share code, notes, and snippets.

@ishitcno1
Created September 25, 2015 13:50
Show Gist options
  • Save ishitcno1/d1a000f6a21387082204 to your computer and use it in GitHub Desktop.
Save ishitcno1/d1a000f6a21387082204 to your computer and use it in GitHub Desktop.
guoxue123.cn book downloader
/** Download books from www.guoxue123.cn
* usage: node downloader.js <index page>
* example: node downloader.js www.guoxue123.cn/xiaosuo/jd/jpshz/index.htm
*/
var http = require('http');
var url = require('url');
var parser = require('cheerio');
var iconv = require('iconv').Iconv('GBK', 'UTF-8');
var fs = require('fs');
// Validate the command line: the index page URL is the only (mandatory) argument.
if (process.argv.length < 3) {
  console.log('No url argument.');
  console.log('usage: node downloader.js <index page>');
  // Nothing can be downloaded without a URL, so stop here instead of letting
  // url.parse() throw on an undefined argument a few lines below.
  process.exit(1);
}
var indexUrl = process.argv[2];
// The documented usage passes the address without a scheme. url.parse() only
// recognises the host when a scheme is present; otherwise the whole string
// lands in `pathname` and `hostname` is null. Default to plain HTTP.
if (!/^[a-z][a-z0-9+.\-]*:\/\//i.test(indexUrl)) {
  indexUrl = 'http://' + indexUrl.replace(/^\/\//, '');
}
var parsedIndexUrl = url.parse(indexUrl);
var hostName = parsedIndexUrl.hostname;
var port = 80;
// Directory of the index page, including the trailing slash. Chapter links
// are appended to it. Taking everything up to the last '/' works for any
// index file name, not only 'index.htm'.
var indexPath = parsedIndexUrl.pathname || '/';
var basicPath = indexPath.slice(0, indexPath.lastIndexOf('/') + 1);
// Shared state filled in by the download/parse steps.
var articles = []; // chapter descriptors produced by parseIndex()
var bookName; // book title taken from the index page
var completed = 0; // number of chapters parsed so far
// Download the index page, extract the chapter list from it and start one
// download per chapter.
var options = {
  host: hostName,
  port: port,
  path: parsedIndexUrl.pathname
};
console.log('Processing.........');
http.get(options, function(res) {
  // Collect the raw chunks and join them once at the end; the page is
  // converted to UTF-8 only after the complete body has arrived.
  var chunks = [];
  res.on('data', function(chunk) {
    chunks.push(chunk);
  });
  res.on('end', function() {
    articles = parseIndex(iconv.convert(Buffer.concat(chunks)));
    if (articles.length === 0) {
      // Without chapters the completion counter in parseArticle() would
      // never fire, so report the situation instead of ending silently.
      console.log('No articles found on the index page.');
      return;
    }
    for (var i = 0, l = articles.length; i < l; i++) {
      getArticle(articles[i]);
    }
  });
}).on('error', function(err) {
  // DNS failures, refused connections, etc. are reported on the request.
  console.error('Failed to download the index page: ' + err.message);
  process.exit(1);
});
/**
 * Extract the book title and the chapter list from the index page.
 *
 * Side effect: stores the text of the third `span.s3` element in the
 * module-level `bookName`.
 *
 * @param {Buffer|string} page - markup of the index page, already converted
 *   to UTF-8.
 * @returns {Array<{title: string, hostname: string, port: number, pathname: string}>}
 *   one descriptor per link found in the second `table` of the page, in
 *   document order. Anchors without an `href` are skipped.
 */
function parseIndex(page) {
  var html = parser.load(page);
  bookName = html('span.s3').eq(2).text();
  var links = html('table').eq(1).find('a');
  var found = [];
  for (var i = 0; i < links.length; i++) {
    var link = links.eq(i);
    var href = link.attr('href');
    if (!href) {
      // An anchor without a target cannot be downloaded; concatenating it
      // would request a path ending in "undefined".
      continue;
    }
    found.push({
      title: link.text(),
      hostname: hostName,
      port: port,
      pathname: basicPath + href
    });
  }
  return found;
}
/**
 * Download one chapter page and hand it to parseArticle().
 *
 * parseArticle() is called exactly once per article, also when the download
 * fails (with an empty page), so the completion counter still reaches the
 * total and the remaining chapters are written.
 *
 * @param {{title: string, hostname: string, port: number, pathname: string}} article
 *   chapter descriptor as produced by parseIndex().
 */
function getArticle(article) {
  var options = {
    host: article.hostname,
    port: article.port,
    path: article.pathname
  };
  var settled = false;
  // Guard so that a late error after a finished response (or the reverse)
  // cannot count the same chapter twice.
  function finish(page) {
    if (settled) {
      return;
    }
    settled = true;
    parseArticle(article, page);
  }
  http.get(options, function(res) {
    // Gather the chunks and join them once; the body is converted to UTF-8
    // only when it is complete.
    var chunks = [];
    res.on('data', function(chunk) {
      chunks.push(chunk);
    }).on('end', function() {
      finish(iconv.convert(Buffer.concat(chunks)));
    });
  }).on('error', function(err) {
    // Without this handler a single failed request raises an unhandled
    // 'error' event and aborts the whole run.
    console.error('Failed to download "' + article.title + '": ' + err.message);
    finish('');
  });
}
/**
 * Pull the chapter text out of a downloaded page and record it on the
 * article. Once every entry of the module-level `articles` list has been
 * processed, the book is written via writeArticles().
 *
 * @param {Object} article - chapter descriptor; receives a `content` property.
 * @param {Buffer|string} page - markup of the chapter page.
 */
function parseArticle(article, page) {
  var $ = parser.load(page);
  // 'span.q' is tried first; 'span.h' is the fallback when it yields no text.
  article.content = $('span.q').text() || $('span.h').text();
  completed++;
  console.log('Parsing........%d/%d', completed, articles.length);
  if (completed !== articles.length) {
    return;
  }
  writeArticles();
}
/**
 * Write all downloaded chapters to "<bookName>.txt" in the current directory.
 *
 * Each chapter is emitted as "## <title>", a blank line, the chapter text and
 * a newline, in the order of the module-level `articles` list.
 *
 * The file is written in one synchronous call. Issuing several fs.write()
 * calls on the same descriptor without waiting for each to finish is not
 * safe, and the descriptor was never closed.
 *
 * @throws {Error} whatever fs.writeFileSync() raises when the file cannot be
 *   written.
 */
function writeArticles() {
  console.log('Writing.........');
  var text = articles.map(function(article) {
    return '## ' + article.title + '\n\n' + article.content + '\n';
  }).join('');
  // The title comes from the downloaded page; path separators in it must not
  // be able to redirect the output to another directory.
  var fileName = String(bookName).replace(/[\/\\]/g, '_') + '.txt';
  fs.writeFileSync(fileName, text);
  console.log('Completed');
}
{
"name": "downloader",
"version": "1.0.0",
"description": "",
"main": "app.js",
"scripts": {
"test": "echo \"Error: no test specified\" && exit 1"
},
"author": "",
"license": "ISC",
"dependencies": {
"buffertools": "^2.1.3",
"cheerio": "^0.19.0",
"iconv": "^2.1.10"
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment