Skip to content

Instantly share code, notes, and snippets.

@yuanoook
Last active June 15, 2017 05:50
Show Gist options
  • Save yuanoook/9fcf292c1ed2d599740eb1ac0150f4f0 to your computer and use it in GitHub Desktop.
Save yuanoook/9fcf292c1ed2d599740eb1ac0150f4f0 to your computer and use it in GitHub Desktop.
#!/usr/bin/env node
var fs = require('fs');
var request = require('request');
var cheerio = require('cheerio');
var _ = require('lodash');
// NOTE(review): neither of these module-level variables is read by any code
// visible in this file (the `combo` names used below are parameters that
// shadow this one) — candidates for removal.
var combo = '';
var res_count = 0;

// Entry point: scrape every episode, write the merged HTML file, and report
// the outcome. Without the rejection handler a failed scrape would surface
// only as an unhandled promise rejection.
saveCombo()
  .then(combo => {
    writeComboFile(combo);
    console.log('success');
  })
  .catch(err => {
    console.error(err);
    // Signal failure to the calling shell while letting pending output flush.
    process.exitCode = 1;
  });
/**
 * Fetches the transcript fragment of every episode from 1 to `last`, in order,
 * and merges the non-empty ones into a single HTML string.
 *
 * Episodes are requested one at a time (each request is awaited before the
 * next one starts), matching the original behaviour.
 *
 * @param {number} [last=221] - Number of the final episode to fetch.
 * @returns {Promise<string>} The non-empty episode fragments joined by newlines.
 *   Rejects if any episode ultimately fails to download.
 */
async function saveCombo(last = 221) {
  console.log('episode_indexs start with ', 1);
  console.log('episode_indexs end with ', last);

  const pages = [];
  for (let episodeIndex = 1; episodeIndex <= last; episodeIndex++) {
    pages.push(await getContentOfEpisode(episodeIndex));
    console.log(episodeIndex, 'finished : )');
  }

  console.log('----------------------------\n\nres_arr length before ', pages.length);
  // Missing episodes come back as '' and are dropped here.
  const nonEmptyPages = pages.filter(Boolean);
  console.log('res_arr length after ', nonEmptyPages.length);

  // An explicit separator is required: a bare join() inserts "," between the
  // fragments, which would show up as stray commas in the generated page.
  return nonEmptyPages.join('\n');
}
/**
 * Wraps the merged transcript markup in a minimal HTML5 document and writes it
 * synchronously to `./pronuncian.com.podcasts.scripts.html`.
 *
 * @param {string} combo - Markup to place inside the document's <body>.
 */
function writeComboFile(combo) {
  const documentLines = [
    '<!DOCTYPE html>',
    '<html lang="en">',
    '<head>',
    '<meta charset="UTF-8">',
    '<title>pronuncian.com/podcasts/episodeN</title>',
    '</head>',
    '<body>',
    `${combo}`,
    '</body>',
    '</html>',
    '', // yields the trailing newline after </html>
  ];
  const targetPath = './pronuncian.com.podcasts.scripts.html';
  fs.writeFileSync(targetPath, documentLines.join('\n'));
}
/**
 * Performs one GET of an episode page and resolves with the response body.
 * Private helper of getContentOfEpisode: adapts the callback-style `request`
 * call to a promise.
 *
 * @param {string} url - Page to download.
 * @returns {Promise<*>} The body passed to the `request` callback; rejects with
 *   the error `request` reports.
 */
function fetchEpisodeBody(url) {
  return new Promise((resolve, reject) => {
    request({
      url,
      headers: {
        'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
      }
    }, (err, res, body) => {
      if (err) {
        reject(err);
        return;
      }
      resolve(body);
    });
  });
}

/**
 * Downloads one episode page and extracts its transcript as an HTML fragment.
 *
 * The fragment is the `<header>` and the `<p>` elements found inside the
 * page's `.post` element, wrapped in `<div class="page">`.
 *
 * A failed request is retried up to `retries` more times. Previously the error
 * branch did not return, so after scheduling a retry it fell through and called
 * isEmptyPage on an undefined body, throwing inside the request callback; the
 * nested retries were also unbounded because every retry started its own.
 *
 * @param {number} index - Episode number, interpolated into the page URL.
 * @param {number} [retries=3] - Extra attempts allowed after a failed request.
 * @returns {Promise<string>} The extracted fragment, or '' when isEmptyPage
 *   reports the page as missing. Rejects with the last request error once all
 *   attempts are used up, or with any error thrown while parsing.
 */
async function getContentOfEpisode(index, retries = 3) {
  const url = `https://pronuncian.com/podcasts/episode${index}`;

  let body;
  for (let attempt = 0; ; attempt++) {
    console.log(index, url);
    try {
      body = await fetchEpisodeBody(url);
      break;
    } catch (err) {
      console.log(err);
      if (attempt >= retries) {
        throw err;
      }
    }
  }

  if (isEmptyPage(body, index)) {
    return '';
  }

  const post = cheerio.load(body)('.post');
  const header = cheerio.html(post.find('header'));
  const main = `<div class="paragragh">${cheerio.html(post.find('p'))}</div>`;
  return `<div class="page">${header}${main}</div>`;
}
/**
 * Reports whether a downloaded page is the site's "moved or deleted"
 * placeholder, and logs the verdict together with the episode index.
 *
 * @param {string} content - Page body to inspect.
 * @param {number} index - Episode number, used only in the log line.
 * @returns {boolean} true when the placeholder sentence occurs in `content`.
 */
function isEmptyPage(content, index) {
  const missingPageNotice = 'The page you are looking for has been moved or deleted.';
  const isMissing = content.includes(missingPageNotice);
  console.log('isEmptyPage ', index, isMissing);
  return isMissing;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment