-
-
Save pyadav/480c2f4f652eb6b907c296b28c84b1c5 to your computer and use it in GitHub Desktop.
crawler for stitcher podcasts
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| 'use strict'; | |
| /* | |
| npm install cheerio promise request request-promise | |
| */ | |
| var fs = require('fs'); | |
| var rp = require('request-promise'); | |
| var request = require('request'); | |
| var cheerio = require('cheerio'); | |
| var Promise = require('promise'); | |
// Site root; it is prefixed to the hrefs scraped from the listing pages.
const stitcher = 'http://www.stitcher.com';
// Listing page of the podcast whose episodes are crawled.
const p = `${stitcher}/podcast/the-web-platform-podcast`;
/**
 * Converts an episode title into a flat file name.
 *
 * Colons are dropped and spaces become dashes. Path separators are replaced
 * with dashes as well, so a title such as "HTML/CSS" cannot point into a
 * directory that does not exist.
 *
 * @param {string} title - Link text of the episode.
 * @returns {string} File name without extension.
 */
function toEpisodeFilename(title) {
  return title
    .replace(/:/g, '')
    .replace(/ /g, '-')
    .replace(/[\/\\]/g, '-');
}

/**
 * Crawls the podcast listing starting at offset `index`.
 *
 * Fetches one listing page, starts `crawlAudio` for every `a.title` link on
 * it, waits for all of them, and then requests the next page at
 * `index + <number of links found>`. The recursion stops at the first page
 * that contains no episode links.
 *
 * @param {number} index - Value sent as the `newest` query parameter.
 * @returns {Promise<*>} Settles when crawling is finished. Failures are
 *   logged and swallowed, so the returned promise does not reject.
 */
function crawl(index) {
  return rp(p + `?episodes=1&newest=${index}`)
    .then(function (body) {
      const $ = cheerio.load(body);
      const entries = $('a.title');
      const inProgress = [];

      entries.each(function (i, elem) {
        const link = $(elem);
        const filename = toEpisodeFilename(link.text());
        inProgress.push(crawlAudio(filename, stitcher + link.attr('href')));
      });

      return Promise.all(inProgress).then(function (results) {
        // An empty page marks the end of the listing.
        if (entries.length === 0) {
          return results;
        }
        return crawl(index + entries.length);
      });
    })
    .catch(function (error) {
      // Best effort: report the failure and stop paging.
      console.log(error);
    });
}
/**
 * Streams the resource at `url` into the local file `filename`.
 *
 * @param {string} url - Address handed to `request`.
 * @param {string} filename - Path of the file to write.
 * @returns {Promise<void>} Resolves once the write stream has emitted
 *   `finish`, i.e. all data has been handed to the file. Rejects with the
 *   first error raised by either the download or the file stream.
 */
function download(url, filename) {
  const writer = fs.createWriteStream(filename);
  const reader = request(url);
  reader.pipe(writer);
  console.log(`download ${url} ${filename}`);

  return new Promise(function (resolve, reject) {
    // Wait for the writer, not the reader: the reader's `end` only says the
    // last chunk was read, not that it has been written out.
    writer.on('finish', function () {
      resolve();
    });
    writer.on('error', function (err) {
      // Without this listener an unwritable path raises an unhandled
      // `error` event. Stop the transfer when the source supports it.
      if (typeof reader.abort === 'function') {
        reader.abort();
      }
      reject(err);
    });
    reader.on('error', function (err) {
      // Release the file handle; `finish` will never fire for a failed read.
      writer.destroy();
      reject(err);
    });
  });
}
/**
 * Fetches an episode page and downloads the audio file it references.
 *
 * The page body is searched for an `episodeURL: "..."` entry. When one is
 * found, the quoted URL is downloaded to `<filename>.mp3`.
 *
 * @param {string} filename - Target file name without extension.
 * @param {string} url - Address of the episode page.
 * @returns {Promise<*>} Result of `download`, or `undefined` when the page
 *   has no `episodeURL` entry.
 */
function crawlAudio(filename, url) {
  const episodeUrlPattern = /episodeURL:\s*"([^"]+)"/;

  return rp(url).then((body) => {
    const found = body.match(episodeUrlPattern);
    if (!found) {
      return undefined;
    }
    const audioUrl = found[1];
    return download(audioUrl, filename + '.mp3');
  });
}
// Start at offset 0; crawl() advances through the listing on its own until a
// page has no episode links, and logs (rather than throws) any failure.
crawl(0);
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment