Skip to content

Instantly share code, notes, and snippets.

@mbafford
Last active February 4, 2020 14:17
Show Gist options
  • Save mbafford/5333101 to your computer and use it in GitHub Desktop.
Save mbafford/5333101 to your computer and use it in GitHub Desktop.
Quick and simple program to convert the Diecast podcast (http://www.shamusyoung.com/twentysidedtale/?cat=287) RSS feed into something my podcast reader can handle. It simply pulls the mp3 URL from the description and adds an enclosure element pointing at that URL.
diecast.xml
run.sh
original-rss.xml
.env/
#!/usr/bin/env python3
# Expects the AWS_ environment variables to be set so that boto knows how to connect to AWS/S3 - they are:
# AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY
#
# Change the bucket name from mbafford-static for your own uses.
#
# Obvious enhancements would be to add the necessary tags for displaying artwork for the show in podcast software.
# This is just a hacky way to take the existing blog RSS feed and make it readable by podcast software.
# Validated and found mostly not broken with:
# http://www.feedvalidator.org/check.cgi?url=https%3A%2F%2Fmbafford-static.s3.amazonaws.com%2Fdiecast.xml
# https://podba.se/validate/?url=https://mbafford-static.s3.amazonaws.com/diecast.xml
# http://castfeedvalidator.com/?url=https://mbafford-static.s3.amazonaws.com/diecast.xml
import re
from urllib.request import urlopen, Request
from xml.dom.minidom import parseString
import boto
def findSourceURL(item, type):
    """Return the URL of the first ``<source>`` tag in *item* with the given extension.

    The item is serialized with ``toxml()`` and searched textually, so the
    ``<source>`` tag is found whether it is a real element or markup embedded
    in a CDATA section.

    Args:
        item: A minidom node (anything with a ``toxml()`` method).
        type: File extension to look for, without the leading dot (e.g. ``'mp3'``).
            The name shadows the builtin but is kept for caller compatibility.

    Returns:
        The value of the matching ``src`` attribute, or ``None`` when no
        ``<source>`` tag points at a file with that extension.
    """
    itemxml = item.toxml()
    # Escape the extension so it is matched literally; for 'mp3' this yields
    # exactly the pattern that used to be hard-coded.
    pattern = "<source[^>]+src=[\"']([^\"']*\\." + re.escape(type) + ")[\"']"
    m = re.search(pattern, itemxml)
    return m.group(1) if m else None
def fetch_rss_feed_xml():
    """Download the Diecast category RSS feed and return it as a minidom document.

    Returns:
        The document produced by ``xml.dom.minidom.parseString`` for the
        downloaded (and cleaned-up) feed text.

    Raises:
        Whatever ``urlopen`` raises on network/HTTP failure, and whatever
        ``parseString`` raises if the cleaned-up text is still not well-formed.
    """
    # Circumvent a potential bot blocker, see http://stackoverflow.com/questions/3336549/pythons-urllib2-why-do-i-get-error-403-when-i-urlopen-a-wikipedia-page
    url = 'http://www.shamusyoung.com/twentysidedtale/?feed=rss2&cat=287'
    # NOTE(review): the contact address in the User-Agent looks obfuscated in
    # this copy of the script - confirm the intended value.
    req = Request(url, headers={'User-Agent' : "Diecast feed generator ([email protected])"})
    # Use the response as a context manager so the connection is closed as soon
    # as the body has been read, rather than whenever it gets garbage-collected.
    with urlopen(req) as conn:
        rss = conn.read()
    # a mix of UTF-8 and windows-1252 makes for an XML parsing error
    # in the case of this script, fixing the errors isn't that important
    # so just get rid of them: undecodable bytes become U+FFFD, which is then dropped
    rss = rss.decode('utf-8', errors='replace').replace(u"\uFFFD", "")
    return parseString(rss)
def podcastify_xml(rssxml):
    """Turn the blog RSS document *rssxml* into a podcast feed, modifying it in place.

    For every ``<item>`` in which ``findSourceURL`` finds an mp3 URL, any
    existing ``<enclosure>`` children are replaced by a single enclosure
    pointing at that URL. The channel then gets iTunes metadata tags and a
    self-referencing ``atom:link``.

    Args:
        rssxml: A minidom document for the RSS feed.

    Returns:
        None; the document is mutated.

    Raises:
        IndexError: If the document contains no ``<channel>`` element.
    """
    # Local import: urljoin is not part of the file's import header.
    from urllib.parse import urljoin

    feedNodes = rssxml.getElementsByTagName("rss")
    if feedNodes:
        # For iTunes
        feedNodes[0].setAttribute("xmlns:itunes", "http://www.itunes.com/dtds/podcast-1.0.dtd")
        # An atom:link is added below; make sure its prefix is declared even
        # if the source feed did not declare it.
        if not feedNodes[0].hasAttribute("xmlns:atom"):
            feedNodes[0].setAttribute("xmlns:atom", "http://www.w3.org/2005/Atom")

    # Drop the feed's own atom:link elements; a new self link is added below.
    for link in rssxml.getElementsByTagName("atom:link"):
        link.parentNode.removeChild( link )

    for item in rssxml.getElementsByTagName("item"):
        mp3url = findSourceURL(item, 'mp3')
        if not mp3url:
            continue

        # Resolve site-relative, page-relative and protocol-relative URLs
        # against the blog; absolute URLs pass through unchanged.
        mp3url = urljoin("http://www.shamusyoung.com/twentysidedtale/", mp3url)

        # remove existing enclosures, if any
        for enclosure in item.getElementsByTagName("enclosure"):
            enclosure.parentNode.removeChild( enclosure )

        enclosure = rssxml.createElement("enclosure")
        enclosure.setAttribute("url", mp3url)
        enclosure.setAttribute("type", "audio/mpeg")
        enclosure.setAttribute("length", "75000000") # for iTunes; a fixed placeholder, not the real file size
        item.appendChild(enclosure)

    channel = rssxml.getElementsByTagName("channel")[0]

    # Each tag is inserted before the channel's current first child, so the
    # tags end up in the reverse of the order they are created here.
    image = rssxml.createElement("itunes:image")
    image.setAttribute("href", "http://shamusyoung.com/twentysidedtale/images/diecast2018.jpg")
    channel.insertBefore(image, channel.firstChild)

    author = rssxml.createElement("itunes:author")
    author.appendChild( rssxml.createTextNode("Twenty Sided") )
    channel.insertBefore(author, channel.firstChild)

    category = rssxml.createElement("itunes:category")
    category.setAttribute("text", "Games & Hobbies")
    channel.insertBefore(category, channel.firstChild)

    explicit = rssxml.createElement("itunes:explicit")
    explicit.appendChild( rssxml.createTextNode("no") )
    channel.insertBefore(explicit, channel.firstChild)

    # Self link pointing at the published copy of this feed.
    link = rssxml.createElement("atom:link")
    link.setAttribute("href", "https://mbafford-static.s3.amazonaws.com/diecast.xml")
    link.setAttribute("rel", "self")
    link.setAttribute("type", "application/rss+xml")
    channel.insertBefore( link, channel.firstChild )
def upload_xml( rssxml ):
    """Publish the feed document to S3 as the public-read object ``diecast.xml``.

    The document is serialized as pretty-printed UTF-8 and stored in the
    ``mbafford-static`` bucket with an RSS content type.

    Args:
        rssxml: A minidom document for the feed.
    """
    # NOTE(review): is_secure=False asks boto for a non-SSL connection;
    # confirm this is intentional before changing it.
    connection = boto.connect_s3( is_secure=False )
    target = boto.s3.key.Key( connection.get_bucket('mbafford-static') )
    target.key = 'diecast.xml'

    target.set_contents_from_string(
        rssxml.toprettyxml(encoding='utf-8'),
        headers={'Content-Type' : 'application/rss+xml'},
        policy='public-read',
    )
def write_xml_to_file( rssxml, filename ):
    """Write *rssxml* to *filename* as pretty-printed, UTF-8 encoded XML.

    The serialized bytes are written in binary mode so the file content always
    matches the ``encoding="utf-8"`` XML declaration, regardless of the
    platform's default text encoding.

    Args:
        rssxml: A minidom document.
        filename: Path of the file to create or overwrite.
    """
    with open(filename, 'wb') as f:
        f.write( rssxml.toprettyxml(encoding='utf-8') )
# Script pipeline (runs at import/execution time):
# 1. download and parse the blog's RSS feed
rssxml = fetch_rss_feed_xml()
# 2. keep a copy of the feed as parsed, before any modification
write_xml_to_file( rssxml, 'original-rss.xml' )
# 3. add enclosures and podcast metadata to the document in place
podcastify_xml( rssxml )
# 4. save the converted feed locally, then publish it to S3
write_xml_to_file( rssxml, 'diecast.xml' )
upload_xml( rssxml )
@hezamu
Copy link

hezamu commented May 18, 2013

Thanks for this Matt, great stuff!

I updated it to work with iTunes. I didn't test the S3 part, but the generated XML works with iTunes. You can grab the changes from my fork.

https://gist.github.com/hezamu/5604249

@mbafford
Copy link
Author

Merged and re-run. Updated feed at http://mbafford-static.s3.amazonaws.com/diecast.xml works with iTunes 11.0.2 now. Thank you for the patch!

It's a shame that iTunes requires a length parameter that it's going to then ignore. I remember having to do this a long time ago with another similar project, but I'd forgotten all about that requirement.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment