Skip to content

Instantly share code, notes, and snippets.

@pyadav
Forked from evgeny-t/crawler.js
Created December 9, 2016 05:15
Show Gist options
  • Select an option

  • Save pyadav/480c2f4f652eb6b907c296b28c84b1c5 to your computer and use it in GitHub Desktop.

Select an option

Save pyadav/480c2f4f652eb6b907c296b28c84b1c5 to your computer and use it in GitHub Desktop.
crawler for stitcher podcasts
'use strict';
/*
npm install cheerio promise request request-promise
*/
var fs = require('fs');
var rp = require('request-promise');
var request = require('request');
var cheerio = require('cheerio');
var Promise = require('promise');
// Site origin; the relative hrefs scraped from listing pages are appended to it.
var stitcher = 'http://www.stitcher.com';
// Listing page of the podcast being crawled; crawl() appends the paging query string.
var p = stitcher + '/podcast/the-web-platform-podcast';
/**
 * Crawls the podcast listing page by page and starts a download for every
 * episode link found.
 *
 * Fetches the listing at offset `index`, calls `crawlAudio` for each
 * `a.title` anchor on the page, waits for all of them, and then recurses with
 * the offset advanced by the number of anchors found. The recursion stops at
 * the first page that contains no anchors.
 *
 * @param {number} index - Offset sent as the `newest` query parameter.
 * @returns {Promise<Array|undefined>} Resolves to an empty array once a page
 *   without entries is reached. Never rejects: any error is logged and the
 *   promise resolves to `undefined`, which also ends the crawl.
 */
function crawl(index) {
  return rp(p + `?episodes=1&newest=${index}`)
    .then(function (body) {
      var $ = cheerio.load(body);
      var entries = $('a.title');
      var inProgress = [];

      entries.each(function () {
        var that = $(this);
        // Titles are scraped text. Besides ':' also drop path separators so a
        // title such as "A/B" cannot point into a missing sub-directory or
        // outside the working directory.
        var filename = that.text()
          .replace(/[:\/\\]/g, '')
          .replace(/ /g, '-');
        inProgress.push(crawlAudio(filename, stitcher + that.attr('href')));
      });

      if (entries.length === 0) {
        // Nothing on this page: the listing is exhausted.
        return Promise.all(inProgress);
      }

      // Finish this page's downloads before requesting the next page.
      return Promise.all(inProgress).then(function () {
        return crawl(index + entries.length);
      });
    })
    .catch(function (error) {
      console.log(error);
    });
}
/**
 * Streams the resource at `url` into the local file `filename`.
 *
 * @param {string} url - Address handed to `request`.
 * @param {string} filename - Path of the file to create or overwrite.
 * @returns {Promise<undefined>} Resolves once the file stream has finished
 *   writing. Rejects with the first error raised by either the HTTP stream or
 *   the file stream.
 */
function download(url, filename) {
  var writer = fs.createWriteStream(filename);
  var reader = request(url);
  console.log(`download ${url} ${filename}`);
  return new Promise(function (resolve, reject) {
    reader.on('error', function (err) {
      // pipe() does not close the destination when the source fails.
      writer.destroy();
      reject(err);
    });
    writer.on('error', function (err) {
      // Without a listener a failed open/write would crash the process.
      // Stop the transfer as well: there is nowhere left to write it.
      if (typeof reader.abort === 'function') {
        reader.abort();
      }
      reject(err);
    });
    // 'finish' fires after the data has been flushed to the file, whereas the
    // reader's 'end' only means the last chunk was handed to the writer.
    writer.on('finish', function () {
      resolve();
    });
    reader.pipe(writer);
  });
}
/**
 * Loads an episode page, looks for its `episodeURL: "..."` entry and, when
 * one is present, downloads that address to `<filename>.mp3`.
 *
 * @param {string} filename - Target file name without extension.
 * @param {string} url - Address of the episode page.
 * @returns {Promise} Resolves with whatever `download` resolves to, or with
 *   `undefined` when the page contains no `episodeURL` entry.
 */
function crawlAudio(filename, url) {
  var episodeUrlPattern = /episodeURL:\s*"([^"]+)"/;

  return rp(url).then(function (html) {
    var found = html.match(episodeUrlPattern);
    if (!found) {
      // No audio address on this page: nothing to download.
      return undefined;
    }
    var audioUrl = found[1];
    return download(audioUrl, filename + '.mp3');
  });
}
// Entry point: start at offset 0 of the listing. The returned promise is not
// awaited; crawl() logs its own errors and does not reject.
crawl(0);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment