Skip to content

Instantly share code, notes, and snippets.

@tyndyll
Last active October 14, 2015 08:49
Show Gist options
  • Select an option

  • Save tyndyll/6c357c450cf0dd7428c4 to your computer and use it in GitHub Desktop.

Select an option

Save tyndyll/6c357c450cf0dd7428c4 to your computer and use it in GitHub Desktop.
Take a Podcast/MP3 RSS feed and download all the media files contained therein
#!/usr/bin/env python
import contextlib
import hashlib
import logging
import optparse
import os
import shutil
import sys
import xml.dom.minidom

try:  # Python 2
    import urllib2
except ImportError:  # Python 3: urlopen/Request live in urllib.request
    import urllib.request as urllib2
logging.basicConfig(level=logging.INFO)

# Name of the directory, created under the user's home, that holds one cache
# file per feed (each line is the MD5 digest of an already-downloaded URL).
POD_DIR_NAME = ".podcast_downloads"

# Browser-like User-Agent sent with every request (presumably because some
# hosts refuse the default urllib agent -- not verifiable from this script).
USER_AGENT = 'Mozilla/5.0 (X11; U; Linux i686) Gecko/20071127 Firefox/2.0.0.11'

# Number of bytes copied per read while streaming a media file to disk.
CHUNK_SIZE = 64 * 1024


def open_url(url):
    """Open *url* with the script's User-Agent and return the response object."""
    request = urllib2.Request(url, None, {'User-Agent': USER_AGENT})
    return urllib2.urlopen(request)


def url_digest(url):
    """Return the hex MD5 digest of *url*, used as its cache key.

    Text is encoded as UTF-8 before hashing so non-ASCII URLs do not raise;
    for ASCII URLs the digest is the same as hashing the raw string, so
    cache files written by earlier versions of the script stay valid.
    """
    if not isinstance(url, bytes):
        url = url.encode("utf-8")
    return hashlib.md5(url).hexdigest()


def extract_urls(xml_data, tag="enclosure", no_attribute=False):
    """Return the media URLs found in the feed document *xml_data*.

    Args:
        xml_data: the raw XML of the feed.
        tag: name of the elements to inspect.
        no_attribute: when false the URL is read from each element's ``url``
            attribute; when true it is read from the element's first child
            node (i.e. its text content).

    Returns:
        A list of URLs in document order, with surrounding whitespace
        removed. Elements that carry no URL are skipped with a warning
        rather than aborting the run.

    Raises:
        xml.parsers.expat.ExpatError: if *xml_data* is not well-formed XML.
    """
    document = xml.dom.minidom.parseString(xml_data)
    urls = []
    for element in document.getElementsByTagName(tag):
        if no_attribute:
            # An empty element has no first child at all.
            child = element.firstChild
            raw = child.nodeValue if child is not None else None
        else:
            # getAttribute() yields "" when the attribute is absent.
            raw = element.getAttribute("url")
        url = (raw or "").strip()
        if not url:
            logging.warning("Skipping <%s> element without a URL", tag)
            continue
        urls.append(url)
    return urls


def download(url, dest):
    """Stream the content at *url* into the file *dest*.

    The request is made before *dest* is opened, so a failed request does
    not truncate an existing file. The body is copied in chunks instead of
    being read into memory in one piece. If the copy is interrupted, the
    partial file is removed.

    Raises:
        IOError: on network or file errors.
        ValueError: if urllib rejects *url* as malformed.
    """
    with contextlib.closing(open_url(url)) as response:
        # Opened outside the try block: if this fails there is nothing of
        # ours on disk to clean up.
        out = open(dest, "wb")
        finished = False
        try:
            with out:
                shutil.copyfileobj(response, out, CHUNK_SIZE)
            finished = True
        finally:
            if not finished:
                try:
                    os.remove(dest)
                except OSError:
                    # Best-effort cleanup; the original error is what matters.
                    pass


def build_option_parser():
    """Return the command-line option parser for the script."""
    parser = optparse.OptionParser()
    parser.add_option("-l", "--limit", type="int", dest="limit",
                      help="stop after this many download attempts")
    parser.add_option("-t", "--tag", type="string", dest="tag",
                      default="enclosure",
                      help="element name that carries the media URL")
    parser.add_option("--no-attribute", action="store_true",
                      dest="no_attribute",
                      help="read the URL from the element's text instead of "
                           "its 'url' attribute")
    return parser


def main(argv=None):
    """Download every not-yet-seen media file referenced by a feed.

    Args:
        argv: command-line arguments without the program name; defaults to
            ``sys.argv[1:]``. Exactly one positional argument, the feed URL,
            is required.

    Files are written to the current working directory under the basename
    of their URL. Digests of successfully downloaded URLs are appended to
    ``~/.podcast_downloads/<md5 of feed URL>`` so later runs skip them.
    Exits with status 2 on a usage error or when the feed cannot be fetched.
    """
    option_parser = build_option_parser()
    opts, args = option_parser.parse_args(argv)
    if len(args) != 1:
        option_parser.error("Usage: name_of_script <URL>")
    feed = args[0]

    try:
        with contextlib.closing(open_url(feed)) as response:
            data = response.read()
    except Exception as err:  # top-level boundary: report and stop
        logging.error("Could not fetch feed %s: %s", feed, err)
        sys.exit(2)

    pod_dir = os.path.join(os.path.expanduser("~"), POD_DIR_NAME)
    if not os.path.isdir(pod_dir):
        logging.info("Creating podcast download directory %s", pod_dir)
        os.makedirs(pod_dir)

    logging.debug("XML Data: %s", data)
    urls = extract_urls(data, opts.tag, opts.no_attribute)

    cache_path = os.path.join(pod_dir, url_digest(feed))
    # "a+" creates the file on first use and makes every write an append.
    with open(cache_path, "a+") as cache_file:
        cache_file.seek(0)
        seen = set(cache_file.read().splitlines())
        # Reposition explicitly before switching from reading to writing.
        cache_file.seek(0, os.SEEK_END)

        downloads = 0
        for url in urls:
            digest = url_digest(url)
            if digest in seen:
                logging.info("URL exists: %s", url)
                continue

            logging.info("Downloading URL: %s", url)
            try:
                download(url, os.path.basename(url))
            except (IOError, ValueError) as err:
                logging.error("Could not download %s: %s", url, err)
            else:
                cache_file.write("%s\n" % digest)
                # Flush per entry so an interrupted run keeps its progress.
                cache_file.flush()
                seen.add(digest)

            # Failed attempts count towards the limit as well.
            downloads += 1
            if opts.limit is not None and opts.limit <= downloads:
                logging.info("Hit maximum downloads. Stopping")
                break


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment