Last active
June 15, 2017 05:50
-
-
Save yuanoook/9fcf292c1ed2d599740eb1ac0150f4f0 to your computer and use it in GitHub Desktop.
scripts download https://pronuncian.com/podcasts/episode221
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env node | |
var fs = require('fs'); | |
var request = require('request'); | |
var cheerio = require('cheerio'); | |
var _ = require('lodash'); | |
// Module-level scratch variables. Nothing in the visible part of this file
// reads them (the `combo` below is a separate callback parameter); they are
// kept in case code outside this view relies on them.
var combo = '';
var res_count = 0;

// Entry point: download every episode, write the combined HTML file, and
// report the outcome. A rejected download is logged and reflected in the
// process exit code instead of surfacing as an unhandled promise rejection.
saveCombo()
  .then(combo => {
    writeComboFile(combo);
    console.log('success');
  })
  .catch(err => {
    console.error(err);
    process.exitCode = 1;
  });
/**
 * Downloads episodes 1..last one after another and concatenates the HTML
 * fragments of every episode that produced content.
 *
 * Episodes are fetched sequentially (each request is awaited before the next
 * one starts). Falsy results, such as the empty string returned for a missing
 * episode page, are dropped before joining.
 *
 * @param {number} [last=221] - Highest episode number to fetch (inclusive).
 * @returns {Promise<string>} The non-empty fragments joined with newlines.
 */
async function saveCombo(last = 221) {
  const episode_indexs = Array.from({ length: last }, (unused, i) => i + 1);
  console.log('episode_indexs start with ', episode_indexs[0]);
  console.log('episode_indexs end with ', episode_indexs[episode_indexs.length - 1]);

  const res_arr = [];
  for (const episode_index of episode_indexs) {
    res_arr.push(await getContentOfEpisode(episode_index));
    console.log(episode_index, 'finished : )');
  }

  console.log('----------------------------\n\nres_arr length before ', res_arr.length);
  const pages = res_arr.filter(Boolean);
  console.log('res_arr length after ', pages.length);

  // Join with an explicit separator: the default Array#join separator is ',',
  // which would leave stray commas between pages in the generated HTML.
  return pages.join('\n');
}
/**
 * Wraps the combined episode markup in a minimal HTML document and writes it
 * synchronously to `./pronuncian.com.podcasts.scripts.html`, relative to the
 * current working directory.
 *
 * @param {string} combo - Markup to place inside the document's <body>.
 */
function writeComboFile(combo) {
  const targetPath = './pronuncian.com.podcasts.scripts.html';
  const documentLines = [
    '<!DOCTYPE html>',
    '<html lang="en">',
    '<head>',
    '<meta charset="UTF-8">',
    '<title>pronuncian.com/podcasts/episodeN</title>',
    '</head>',
    '<body>',
    `${combo}`,
    '</body>',
    '</html>',
    '', // yields the trailing newline at the end of the file
  ];
  fs.writeFileSync(targetPath, documentLines.join('\n'));
}
/**
 * Performs a single GET request for an episode page.
 *
 * @param {string} url - Absolute URL of the episode page.
 * @returns {Promise<string>} Resolves with the response body; rejects with the
 *   error reported by `request`.
 */
function fetchEpisodeBody(url) {
  return new Promise((resolve, reject) => {
    request({
      url,
      headers: {
        'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
      }
    }, (err, res, body) => {
      if (err) {
        reject(err);
        return;
      }
      resolve(body);
    });
  });
}

/**
 * Downloads one episode page and extracts its header and paragraphs as an
 * HTML fragment.
 *
 * A failed request is retried up to `retries` more times. The original nested
 * retry recursed into a function that retried again, so the number of
 * attempts was unbounded, and it fell through to parse an undefined body
 * after the first error.
 *
 * @param {number} index - Episode number, appended to the episode URL.
 * @param {number} [retries=3] - Extra attempts after the first failed request.
 * @returns {Promise<string>} `<div class="page">…</div>` markup, or '' when
 *   `isEmptyPage` reports that the page has no content.
 * @throws The last request error once all attempts have failed.
 */
async function getContentOfEpisode(index, retries = 3) {
  const url = `https://pronuncian.com/podcasts/episode${index}`;
  console.log(index, url);

  let body;
  for (let attempt = 0; ; attempt += 1) {
    try {
      body = await fetchEpisodeBody(url);
      break;
    } catch (err) {
      console.log(err);
      // Give up only after the initial attempt plus `retries` retries.
      if (attempt >= retries) throw err;
    }
  }

  if (isEmptyPage(body, index)) return '';

  const content = cheerio.load(body)('.post');
  const header = cheerio.html(content.find('header'));
  const main = `<div class="paragragh">${cheerio.html(content.find('p'))}</div>`;
  return `<div class="page">${header}${main}</div>`;
}
/**
 * Tells whether a downloaded page has nothing to extract.
 *
 * A page counts as empty when its body contains the site's "moved or deleted"
 * notice. A missing body (`null`/`undefined`) is reported as empty as well,
 * since there is nothing to parse; previously that case threw a TypeError.
 *
 * @param {?string} content - Response body of the episode page.
 * @param {number} index - Episode number, used only in the log line.
 * @returns {boolean} true when the page should be skipped.
 */
function isEmptyPage(content, index) {
  const res =
    content == null ||
    content.includes('The page you are looking for has been moved or deleted.');
  console.log('isEmptyPage ', index, res);
  return res;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment