Skip to content

Instantly share code, notes, and snippets.

@wizardforcel
Last active July 5, 2017 13:37
Show Gist options
  • Save wizardforcel/9781da292ac9d04e3b3172cf1e5c67da to your computer and use it in GitHub Desktop.
Save wizardforcel/9781da292ac9d04e3b3172cf1e5c67da to your computer and use it in GitHub Desktop.
runoob crawler for html
var cheerio = require('cheerio');
var request = require('sync-request');
var fs = require('fs');
var process = require('process');
// Entry point.
// Usage: node <script> <index-page-url>
//
// Downloads the index page given on the command line, collects the links in
// its table of contents via getToc(), then downloads every linked page and
// appends the HTML returned by getContent() to 'out.html'. Each page is
// followed by a '<!--split-->' marker line.
var url = process.argv[2];
if (!url) {
    console.log('请指定页面。');
    process.exit(0);
}

var ofile = fs.openSync('out.html', 'w');
try {
    var toc = getToc(request('GET', url).getBody().toString());
    // Index loop rather than for...in: for...in walks enumerable property
    // names (as strings), which is not a safe way to iterate an array.
    for (var i = 0; i < toc.length; i++) {
        // Separate name so the start-page URL in `url` is not overwritten.
        var pageUrl = toc[i];
        console.log('page: ' + pageUrl);
        var pageHtml = request('GET', pageUrl).getBody().toString();
        var content = getContent(pageHtml);
        fs.writeSync(ofile, content, null, 'utf-8');
        fs.writeSync(ofile, '\n<!--split-->\n', null, 'utf-8');
    }
} finally {
    // Release the descriptor even when a request or write throws part-way
    // through the crawl; the error itself still propagates.
    fs.closeSync(ofile);
}
console.log('Done..');
/**
 * Collects the table-of-contents links of an index page.
 *
 * Every `<a>` found under the `#leftcolumn` element contributes its `href`,
 * resolved to an absolute URL against `baseUrl`. Relative, root-relative and
 * already-absolute hrefs are all handled; anchors without an `href` are
 * skipped.
 *
 * @param {string} html - Markup of the index page.
 * @param {string} [baseUrl='http://www.runoob.com/'] - URL that relative
 *     hrefs are resolved against.
 * @returns {string[]} Absolute URLs in document order.
 * @throws {TypeError} If an href cannot be parsed as a URL.
 */
function getToc(html, baseUrl) {
    var base = baseUrl || 'http://www.runoob.com/';
    var $ = cheerio.load(html);
    var $links = $('#leftcolumn').find('a');
    var res = [];
    for (var i = 0; i < $links.length; i++) {
        var href = $links.eq(i).attr('href');
        // An anchor with no href would otherwise become '<base>undefined'.
        if (!href) {
            continue;
        }
        // URL resolution instead of string concatenation: avoids '//' for
        // root-relative hrefs and a doubled origin for absolute ones.
        res.push(new URL(href, base).href);
    }
    return res;
}
/**
 * Pulls the main content markup out of a downloaded page.
 *
 * @param {string} html - Markup of the page.
 * @returns {*} Whatever cheerio's `.html()` yields for the `#content`
 *     selection of the parsed page.
 */
function getContent(html) {
    return cheerio.load(html)('#content').html();
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment