duggan · December 22, 2015 17:47
diff --git a/Podcache.md b/Podcache.md
diff --git a/.ignores b/.ignores
 title: Some Busted Entry No Longer Available
 # Comments start with a '#' and continue to the end of the line
 title: Some Other Broken Entry
diff --git a/podcache.py b/podcache.py
 import os
 import time
 import re
 import argparse
 import signal
 import json
 from threading import Event
 from collections import deque
 from pprint import pprint
 import feedparser
 import requests
 import progressbar

 DEFAULT_CHUNK_SIZE = 1024 * 100
 IGNORES_FILE = '.ignores'

 # Set once a termination signal has been received; the download loops poll
 # this flag so they can stop at a safe point.
 shutdown = Event()


 def shutdown_handler(x, y):
     """Record that a shutdown was requested.

     Used as a signal handler; both positional arguments are ignored.
     """
     shutdown.set()


 # Route the termination signals to the shutdown handler.
 for _signum in (signal.SIGTERM, signal.SIGQUIT, signal.SIGINT):
     signal.signal(_signum, shutdown_handler)


 # Command-line interface. Only the feed URL is mandatory; every other
 # option defaults to None except the file extension.
 parser = argparse.ArgumentParser(description='Podcast downloader.')
 for _flags, _settings in (
     (('-f', '--feed'),
      dict(required=True, help="RSS feed url")),
     (('-n', '--name'),
      dict(default=None, required=False, help="Alternate name for feed")),
     (('-o', '--output'),
      dict(default=None, required=False, help="Location for downloaded items")),
     (('-t', '--type'),
      dict(default='.mp3', required=False,
           help="File extension to look for in feed items")),
     (('-i', '--filter'),
      dict(default=None, required=False,
           help="Apply a regular expression filter on titles")),
 ):
     parser.add_argument(*_flags, **_settings)

 opts = parser.parse_args()

 download_directory = opts.output
 feed_name = opts.name
 chunk_size = DEFAULT_CHUNK_SIZE

 # Compile the optional title filter up front so that a bad pattern aborts
 # the run before any network access happens.
 matcher = None
 if opts.filter:
     try:
         matcher = re.compile('%s' % opts.filter, re.UNICODE)
     except Exception as e:
         print("Problem with filter:")
         print(e)
         exit(1)

 # Parse the feed and reduce every entry to the fields that are kept:
 # "title", optionally "text", and "href" (the audio URL).
 f = feedparser.parse(opts.feed)

 podcasts = []
 for entry in f["entries"]:
     podcast = {"title": entry["title"]}

     # Prefer the detailed subtitle as the description. When neither field
     # exists only the description is left out; the entry is still kept.
     if "subtitle_detail" in entry:
         podcast["text"] = entry["subtitle_detail"]["value"]
     elif "subtitle" in entry:
         podcast["text"] = entry["subtitle"]
     else:
         print("Could not figure out description, skipping")

     # An entry without links cannot provide an audio URL; abort the run.
     if "links" not in entry:
         print("Could not figure out audio URL")
         exit(1)

     # Keep only links whose URL ends with the requested file extension.
     links = [item for item in entry["links"] if item["href"].endswith(opts.type)]
     if not links:
         print("Could not figure out audio URL")
         pprint(entry["links"])
         exit(1)
     podcast["href"] = links[0]["href"]

     # Without a filter every entry is selected; with one, the title must match.
     if not matcher or matcher.search(podcast["title"]):
         podcasts.append(podcast)

 # Choose the download directory: an explicit --output wins, then --name,
 # and otherwise a lower-cased slug derived from the feed title.
 if opts.output:
     download_directory = opts.output
 elif opts.name:
     download_directory = opts.name
 else:
     # Raw string: '\W' inside a plain literal is an invalid escape sequence
     # that newer Python versions warn about.
     pattern = re.compile(r'[\W_]+', re.UNICODE)
     download_directory = "./%s" % pattern.sub('', f["feed"]["title"]).lower()

 # Create directory for podcast
 if not os.path.exists(download_directory):
     os.makedirs(download_directory)

 # Load the optional ignore rules. Each rule is a one-item dict mapping a
 # podcast field (e.g. "title") to the exact value that should be skipped.
 ignores = []
 ignores_file = os.path.join(download_directory, IGNORES_FILE)
 if os.path.exists(ignores_file):
     try:
         with open(ignores_file, 'r') as ignores_handle:
             for line in ignores_handle:
                 line = line.strip()
                 # Blank lines and comments carry no rule.
                 if not line or line.startswith("#"):
                     continue
                 # Split on the first ':' only, so values may contain colons.
                 field, separator, value = line.partition(":")
                 if not separator:
                     # Report the malformed line but keep reading the rest,
                     # instead of discarding every rule that follows it.
                     print('Invalid ignores file.')
                     print('Should be one entry per line, title:foo bar baz')
                     continue
                 ignores.append({field.strip(): value.strip()})
     except (IOError, OSError) as e:
         # Only genuine read failures end up here; nothing else is swallowed.
         print('Invalid ignores file.')
         print(e)

 # Download every selected episode. A partial file is resumed when the
 # server honours range requests; otherwise it is fetched again from scratch.
 for podcast in podcasts:
     if shutdown.is_set():
         print("Stopping...")
         break

     write_properties = 'wb'
     local_filename = os.path.join(download_directory, podcast["href"].split('/')[-1])
     metadata_filename = "%s.metadata.json" % local_filename

     # An episode is ignored as soon as one rule's field matches it exactly.
     ignore = any(k in podcast and podcast[k] == v
                  for rule in ignores
                  for k, v in rule.items())
     if ignore:
         print('Ignoring "%s" from %s file...' % (podcast["title"], IGNORES_FILE))
         continue

     print("Processing episode: %s" % podcast["title"])
     print("URL: %s" % podcast["href"])

     # Write some metadata alongside
     print("Writing metadata to %s" % metadata_filename)
     with open(metadata_filename, 'w') as metadata_file:
         json.dump(podcast, metadata_file, indent=4)

     try:
         r = requests.get(podcast["href"], stream=True)
     except requests.exceptions.RequestException as e:
         print("--- ERROR:")
         print(e)
         print("---")
         continue

     # Every 4xx/5xx status, including 400 itself, means there is no audio.
     if r.status_code >= 400:
         print("--- ERROR:")
         print("--- Could not download this podcast :( (Status %d)" % r.status_code)
         print("---")
         r.close()
         continue

     # A missing or malformed Content-Length is treated as "size unknown".
     try:
         expected_size = int(r.headers.get('content-length', 0))
     except ValueError:
         expected_size = 0

     progress = 0
     # The on-disk comparison is only possible when the size is known.
     if expected_size > 0 and os.path.isfile(local_filename):
         size_on_disk = os.path.getsize(local_filename)
         if size_on_disk == expected_size:
             print("Already downloaded, skipping...")
             r.close()
             continue

         print("%s downloaded, but mismatched file size (%d/%d)" % (local_filename, size_on_disk, expected_size))
         partial = None
         if size_on_disk < expected_size:
             try:
                 # Open-ended range: everything from the first missing byte.
                 partial = requests.get(podcast["href"], stream=True,
                                        headers={'Range': 'bytes=%d-' % size_on_disk})
             except requests.exceptions.RequestException:
                 partial = None

         if partial is not None and partial.status_code == 206:
             print("Resuming...")
             r.close()
             r = partial
             write_properties = 'ab'
             progress = size_on_disk
         else:
             # Keep the original full response, so that an error body from
             # the range request is never written to disk as audio.
             print("Redownloading...")
             if partial is not None:
                 partial.close()
             os.remove(local_filename)

     if expected_size == 0:
         print("Warning: could not determine file size, no progress will be indicated.")

     print("Saving to: %s" % local_filename)
     bar = progressbar.ProgressBar(maxval=expected_size,
                     widgets=[progressbar.Bar('#', '[', ']'), ' ',
                             progressbar.Percentage(), ' ',
                             progressbar.FileTransferSpeed()])

     try:
         with open(local_filename, write_properties) as output:
             for chunk in r.iter_content(chunk_size=chunk_size):
                 # check for shutdown
                 if shutdown.is_set():
                     print("Stopping...")
                     break
                 if chunk:
                     output.write(chunk)
                     # Count the bytes actually received; the final chunk is
                     # usually shorter than chunk_size.
                     progress += len(chunk)
                     if expected_size > 0:
                         bar.update(min(progress, expected_size))
     finally:
         # Release the streamed connection on every path.
         r.close()

     if expected_size > 0 and progress >= expected_size:
         bar.finish()

diff --git a/requirements.txt b/requirements.txt
 feedparser
 requests
 progressbar2
	title: Some Busted Entry No Longer Available
	# Comments start with a '#' and continue to the end of the line
	title: Some Other Broken Entry
	import os
	import time
	import re
	import argparse
	import signal
	import json
	from threading import Event
	from collections import deque
	from pprint import pprint
	import feedparser
	import requests
	import progressbar

	DEFAULT_CHUNK_SIZE = 1024 * 100
	IGNORES_FILE = '.ignores'

	# Set once a termination signal has been received; the download loops poll
	# this flag so they can stop at a safe point.
	shutdown = Event()


	def shutdown_handler(x, y):
	    """Record that a shutdown was requested.

	    Used as a signal handler; both positional arguments are ignored.
	    """
	    shutdown.set()


	# Register some signals with our shutdown handler.
	signal.signal(signal.SIGTERM, shutdown_handler)
	signal.signal(signal.SIGQUIT, shutdown_handler)
	signal.signal(signal.SIGINT, shutdown_handler)


	# Command-line interface. Only the feed URL is mandatory; every other
	# option defaults to None except the file extension.
	parser = argparse.ArgumentParser(description='Podcast downloader.')
	parser.add_argument('-f', '--feed', required=True,
	                    help="RSS feed url")
	parser.add_argument('-n', '--name', default=None, required=False,
	                    help="Alternate name for feed")
	parser.add_argument('-o', '--output', default=None, required=False,
	                    help="Location for downloaded items")
	parser.add_argument('-t', '--type', default='.mp3', required=False,
	                    help="File extension to look for in feed items")
	parser.add_argument('-i', '--filter', default=None, required=False,
	                    help="Apply a regular expression filter on titles")

	opts = parser.parse_args()

	download_directory = opts.output
	feed_name = opts.name
	chunk_size = DEFAULT_CHUNK_SIZE

	# Compile the optional title filter up front so that a bad pattern aborts
	# the run before any network access happens.
	matcher = None
	if opts.filter:
	    try:
	        matcher = re.compile('%s' % opts.filter, re.UNICODE)
	    except Exception as e:
	        print("Problem with filter:")
	        print(e)
	        exit(1)

	# Parse the feed and reduce every entry to the fields that are kept:
	# "title", optionally "text", and "href" (the audio URL).
	f = feedparser.parse(opts.feed)

	podcasts = []
	for entry in f["entries"]:
	    podcast = {}
	    podcast["title"] = entry["title"]

	    # Prefer the detailed subtitle as the description. When neither field
	    # exists only the description is left out; the entry is still kept.
	    if "subtitle_detail" in entry:
	        podcast["text"] = entry["subtitle_detail"]["value"]
	    elif "subtitle" in entry:
	        podcast["text"] = entry["subtitle"]
	    else:
	        print("Could not figure out description, skipping")

	    # The audio URL is the first link ending with the requested extension;
	    # an entry without one aborts the run.
	    if "links" in entry:
	        links = [item for item in entry["links"] if item["href"].endswith(opts.type)]
	        if len(links):
	            podcast["href"] = links[0]["href"]
	        else:
	            print("Could not figure out audio URL")
	            pprint(entry["links"])
	            exit(1)
	    else:
	        print("Could not figure out audio URL")
	        exit(1)

	    # Without a filter every entry is selected; with one, the title must match.
	    if matcher:
	        if matcher.search(podcast["title"]):
	            podcasts.append(podcast)
	    else:
	        podcasts.append(podcast)

	# Choose the download directory: an explicit --output wins, then --name,
	# and otherwise a lower-cased slug derived from the feed title.
	if opts.output:
	    download_directory = opts.output
	elif opts.name:
	    download_directory = opts.name
	else:
	    # Raw string: '\W' inside a plain literal is an invalid escape sequence
	    # that newer Python versions warn about.
	    pattern = re.compile(r'[\W_]+', re.UNICODE)
	    download_directory = "./%s" % pattern.sub('', f["feed"]["title"]).lower()

	# Create directory for podcast
	if not os.path.exists(download_directory):
	    os.makedirs(download_directory)

	# Load the optional ignore rules. Each rule is a one-item dict mapping a
	# podcast field (e.g. "title") to the exact value that should be skipped.
	ignores = []
	ignores_file = os.path.join(download_directory, IGNORES_FILE)
	if os.path.exists(ignores_file):
	    try:
	        with open(ignores_file, 'r') as ignores_handle:
	            for line in ignores_handle:
	                line = line.strip()
	                # Blank lines and comments carry no rule.
	                if not line or line.startswith("#"):
	                    continue
	                # Split on the first ':' only, so values may contain colons.
	                field, separator, value = line.partition(":")
	                if not separator:
	                    # Report the malformed line but keep reading the rest,
	                    # instead of discarding every rule that follows it.
	                    print('Invalid ignores file.')
	                    print('Should be one entry per line, title:foo bar baz')
	                    continue
	                ignores.append({field.strip(): value.strip()})
	    except (IOError, OSError) as e:
	        # Only genuine read failures end up here; nothing else is swallowed.
	        print('Invalid ignores file.')
	        print(e)

	# Download every selected episode. A partial file is resumed when the
	# server honours range requests; otherwise it is fetched again from scratch.
	for podcast in podcasts:
	    if shutdown.is_set():
	        print("Stopping...")
	        break

	    write_properties = 'wb'
	    local_filename = os.path.join(download_directory, podcast["href"].split('/')[-1])
	    metadata_filename = "%s.metadata.json" % local_filename

	    # An episode is ignored as soon as one rule's field matches it exactly.
	    ignore = any(k in podcast and podcast[k] == v
	                 for rule in ignores
	                 for k, v in rule.items())
	    if ignore:
	        print('Ignoring "%s" from %s file...' % (podcast["title"], IGNORES_FILE))
	        continue

	    print("Processing episode: %s" % podcast["title"])
	    print("URL: %s" % podcast["href"])

	    # Write some metadata alongside
	    print("Writing metadata to %s" % metadata_filename)
	    with open(metadata_filename, 'w') as metadata_file:
	        json.dump(podcast, metadata_file, indent=4)

	    try:
	        r = requests.get(podcast["href"], stream=True)
	    except requests.exceptions.RequestException as e:
	        print("--- ERROR:")
	        print(e)
	        print("---")
	        continue

	    # Every 4xx/5xx status, including 400 itself, means there is no audio.
	    if r.status_code >= 400:
	        print("--- ERROR:")
	        print("--- Could not download this podcast :( (Status %d)" % r.status_code)
	        print("---")
	        r.close()
	        continue

	    # A missing or malformed Content-Length is treated as "size unknown".
	    try:
	        expected_size = int(r.headers.get('content-length', 0))
	    except ValueError:
	        expected_size = 0

	    progress = 0
	    # The on-disk comparison is only possible when the size is known.
	    if expected_size > 0 and os.path.isfile(local_filename):
	        size_on_disk = os.path.getsize(local_filename)
	        if size_on_disk == expected_size:
	            print("Already downloaded, skipping...")
	            r.close()
	            continue

	        print("%s downloaded, but mismatched file size (%d/%d)" % (local_filename, size_on_disk, expected_size))
	        partial = None
	        if size_on_disk < expected_size:
	            try:
	                # Open-ended range: everything from the first missing byte.
	                partial = requests.get(podcast["href"], stream=True,
	                                       headers={'Range': 'bytes=%d-' % size_on_disk})
	            except requests.exceptions.RequestException:
	                partial = None

	        if partial is not None and partial.status_code == 206:
	            print("Resuming...")
	            r.close()
	            r = partial
	            write_properties = 'ab'
	            progress = size_on_disk
	        else:
	            # Keep the original full response, so that an error body from
	            # the range request is never written to disk as audio.
	            print("Redownloading...")
	            if partial is not None:
	                partial.close()
	            os.remove(local_filename)

	    if expected_size == 0:
	        print("Warning: could not determine file size, no progress will be indicated.")

	    print("Saving to: %s" % local_filename)
	    bar = progressbar.ProgressBar(maxval=expected_size,
	                    widgets=[progressbar.Bar('#', '[', ']'), ' ',
	                            progressbar.Percentage(), ' ',
	                            progressbar.FileTransferSpeed()])

	    try:
	        with open(local_filename, write_properties) as output:
	            for chunk in r.iter_content(chunk_size=chunk_size):
	                # check for shutdown
	                if shutdown.is_set():
	                    print("Stopping...")
	                    break
	                if chunk:
	                    output.write(chunk)
	                    # Count the bytes actually received; the final chunk is
	                    # usually shorter than chunk_size.
	                    progress += len(chunk)
	                    if expected_size > 0:
	                        bar.update(min(progress, expected_size))
	    finally:
	        # Release the streamed connection on every path.
	        r.close()

	    if expected_size > 0 and progress >= expected_size:
	        bar.finish()