Last active
July 5, 2017 13:37
-
-
Save wizardforcel/9781da292ac9d04e3b3172cf1e5c67da to your computer and use it in GitHub Desktop.
runoob crawler for html
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
var cheerio = require('cheerio');
var request = require('sync-request');
var fs = require('fs');
var process = require('process');
var URL = require('url').URL;
// Entry point: fetch the page given as the first command-line argument,
// collect the links from its sidebar, then download each linked page and
// append its main content to out.html, one '<!--split-->' marker per page.
const startUrl = process.argv[2];
if (!startUrl) {
  console.log('请指定页面。');
  // A missing argument is a usage error, so signal failure to the caller.
  process.exit(1);
}

// Build the table of contents before touching out.html so that a failed
// first request does not leave an empty/truncated output file behind.
const tocHtml = request('GET', startUrl).getBody().toString();
const toc = getToc(tocHtml);

const ofile = fs.openSync('out.html', 'w');
try {
  // for...of visits array elements only; for...in would also enumerate any
  // extra enumerable properties present on the array or its prototype.
  for (const pageUrl of toc) {
    console.log('page: ' + pageUrl);
    const pageHtml = request('GET', pageUrl).getBody().toString();
    fs.writeSync(ofile, getContent(pageHtml), null, 'utf-8');
    // Separator between consecutive pages in the combined output.
    fs.writeSync(ofile, '\n<!--split-->\n', null, 'utf-8');
  }
} finally {
  // Release the descriptor even when a request or a write throws.
  fs.closeSync(ofile);
}
console.log('Done..');
/**
 * Collect the absolute URLs of every link inside the page's `#leftcolumn`
 * element, in document order.
 *
 * @param {string} html - Markup of the page whose sidebar is scanned.
 * @param {string} [baseUrl='http://www.runoob.com/'] - URL against which
 *   relative and root-relative hrefs are resolved.
 * @returns {string[]} Absolute http(s) URLs; anchors without a usable href
 *   are left out.
 */
function getToc(html, baseUrl = 'http://www.runoob.com/') {
  const $ = cheerio.load(html);
  const $links = $('#leftcolumn').find('a');
  const res = [];
  for (let i = 0; i < $links.length; i++) {
    const href = $links.eq(i).attr('href');
    // An anchor with no href would otherwise produce ".../undefined".
    if (!href) {
      continue;
    }
    let resolved;
    try {
      // Proper resolution avoids the "//path" produced by naive
      // concatenation and leaves already-absolute links intact.
      resolved = new URL(href, baseUrl);
    } catch (err) {
      // The href is not a parseable URL, so there is nothing to fetch.
      continue;
    }
    // Only http(s) targets can be downloaded (drops javascript:, mailto:, ...).
    if (resolved.protocol === 'http:' || resolved.protocol === 'https:') {
      res.push(resolved.href);
    }
  }
  return res;
}
/**
 * Extract the inner HTML of the page's `#content` element.
 *
 * @param {string} html - Markup of the page to read.
 * @returns {string} Inner HTML of `#content`, or an empty string when the
 *   lookup yields no markup.
 */
function getContent(html) {
  const $ = cheerio.load(html);
  const content = $('#content').html();
  // A null/undefined result (no matching element) must not reach the file
  // writer, which expects a string; fall back to an empty string instead.
  return content == null ? '' : content;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment