Created
December 5, 2018 14:09
-
-
Save yuanliwei/2e7602fe5e1c083ae22a82c6f0067336 to your computer and use it in GitHub Desktop.
小说爬虫.js
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
var request = require('request') | |
var cheerio = require('cheerio') | |
var iconv = require('iconv-lite'); | |
/**
 * Download one chapter page and extract its text plus the link to the next chapter.
 *
 * The body is requested as raw bytes (`encoding: null`) and decoded as GBK with
 * iconv-lite before being parsed with cheerio. A newline is inserted after every
 * closing `</p>` so paragraphs stay on separate lines once the markup is
 * flattened to text.
 *
 * @param {string} url - Absolute URL of the chapter page to fetch.
 * @returns {Promise<[string, (string|undefined)]>} Resolves with a pair:
 *   the formatted chapter text (title line followed by the article body), and the
 *   `href` of the `li>a.next` link, or `undefined` when the page has no such link.
 *   Rejects when the HTTP request fails or the response cannot be decoded/parsed.
 */
async function get(url) {
  return new Promise((resolve, reject) => {
    request.get(url, { encoding: null, gzip: true }, (err, resp, body) => {
      // On a transport error `body` is undefined; surface the failure instead of
      // letting iconv throw inside the callback and leaving the promise pending.
      if (err) {
        reject(err);
        return;
      }
      try {
        const html = iconv.decode(body, 'gbk').replace(/<\/p>/gi, '</p>\n');
        const $ = cheerio.load(html);
        const title = $('#content h1').text();
        console.log('title', title);
        // The heading lives inside #content, so strip its first occurrence to
        // avoid repeating the title at the top of the article body.
        const article = $('#content').text().replace(title, '');
        const result = `\n\t${title}\n\n${article}\n\n\n\n`;
        const next = $('li>a.next').attr('href');
        console.log('next', next);
        resolve([result, next]);
      } catch (parseErr) {
        // Exceptions thrown inside the callback would otherwise escape the promise.
        reject(parseErr);
      }
    });
  });
}
/**
 * Crawl the book chapter by chapter, starting from a fixed first chapter, and
 * append each chapter's text to `books2.txt` in the current working directory.
 *
 * Chapters are fetched strictly one after another because each page supplies
 * the link to the next one. The loop ends when a page yields no next link.
 *
 * @returns {Promise<void>} Resolves once the last chapter has been written;
 *   rejects if fetching a page or writing the file fails.
 */
async function start() {
  const fs = require('fs');
  const { URL } = require('url');
  const origin = 'https://m.boquge.com';
  let pageUrl = new URL('/wapbook/97027_158708817.html', origin).href;
  while (pageUrl) {
    const [text, next] = await get(pageUrl);
    fs.appendFileSync('books2.txt', text, 'utf-8');
    // Resolve the href against the site origin: root-relative links behave as
    // with plain prefixing, while absolute links no longer produce a malformed
    // "https://m.boquge.comhttps://..." URL.
    pageUrl = next ? new URL(next, origin).href : undefined;
  }
}
// Entry point: run the crawler and report any failure instead of leaving the
// returned promise floating with an unhandled rejection.
start().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment