@blha303
Created August 5, 2013 19:51
RSS feed rewriter
# RSS feed rewriter
# For adding content from the target page that isn't normally included in the feed
# For an example output, compare
# http://www.escapistmagazine.com/rss/videos/list/1.xml
# and
# http://irc.lazle.co/xml/zeropunctuation.xml
# I added the Escapist's embed code, loaded from their site, to the item description.
# Maybe a better example is
# http://www.escapistmagazine.com/rss/articles/columns/extra-punctuation.xml
# and
# http://irc.lazle.co/xml/extrapunctuation.xml
# I wanted the whole article text in my feed reader :D
# Here's an example usage: https://gist.github.com/blha303/36a1c9eef45cf75df904
# (from a much earlier version of the script; it's fully functional, but there
# are differences between the two versions)
# Future plans:
# Need to make this more accessible for others
# Clean up
# Copyright 2013 Steven Smith (blha303). All Rights Reserved.
# New BSD license
# http://www.opensource.org/licenses/BSD-3-Clause
import yql # I like how simple feed importing is with yql
import sys
from time import sleep
from urllib import urlopen
from bs4 import BeautifulSoup as Soup
yql_env = "http://datatables.org/alltables.env"
YQL = yql.Public()
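# A rough sketch of one way to fetch the target page and pull extra content
# into an item's description with urlopen and BeautifulSoup. The selector is
# only a placeholder; check the markup of the page you're actually scraping.
def fetch_full_description(url):
    page = Soup(urlopen(url).read())
    body = page.find("div", {"class": "article-body"})  # placeholder selector
    return unicode(body) if body else ""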
def main(feedurl, title="A feed!", link="http://google.com", desc=":)", debug=False, aprint=False):
    filename = ""  # Path this script's output gets written to; used to check whether the feed has new items
    # Copy in the top of the original XML file if you're not sure.
    base = """<?xml version="1.0" encoding="ISO-8859-1"?>
<rss version="2.0" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:content="http://purl.org/rss/1.0/modules/content/" >
<channel>
<title>{title}</title>
<link>{link}</link>
<description><![CDATA[{description}]]></description>
<language>en-us</language>
<docs>http://blogs.law.harvard.edu/tech/rss</docs>
""".format(title=title, link=link, description=desc)
end = """
</channel>
</rss>"""
if filename != "":
with open(filename, "r") as f:
olddata = f.read()
oldsoup = Soup(olddata)
else:
oldsoup = None
query = "SELECT title,link,description,pubDate,category from rss where url=@feed" # add more fields to retrieve if needed
result = YQL.execute(query, {"feed": feedurl}, env=yql_env)
if not result.rows:
return "No response?"
if oldsoup:
if oldsoup.find('item'):
if oldsoup.find('item').find('title').text == result.rows[0]["title"]:
# No new articles
if debug: print "No new articles"
if aprint: print str(olddata)
return str(olddata)
if debug: print result.rows
if aprint: print base
items = []
for row in result.rows:
if debug: print "Description for " + row["title"]
        description = row['description']  # use BeautifulSoup or something here to pull the extra info you want (e.g. the fetch_full_description sketch above)
        # Add more tags below if needed in the dest feed
        items.append("""
<item>
<title>{title}</title>
<link>{url}</link>
<guid>{url}</guid>
<description><![CDATA[{description}]]></description>
<pubDate>{date}</pubDate>
<category>{category}</category>
</item>""".format(title=row["title"].replace("&", "&amp;"), url=row["link"].split("?")[0],
                  description=description.replace("&", "&amp;"), date=row["pubDate"],
                  category=row["category"]))
        if debug: print "Finished " + row["title"]
        if aprint: print items[-1]
        sleep(1)  # to avoid hammering the site with description lookups
    if aprint: print end
    return base + "".join(items) + end

if __name__ == "__main__":
    if len(sys.argv) > 1:
        if sys.argv[1] == "debug":
            print main("http://irc.lazle.co/xml/zeropunctuation.xml", debug=True)
        elif sys.argv[1] == "aprint":
            main("http://irc.lazle.co/xml/zeropunctuation.xml", aprint=True)
    else:
        print main("http://irc.lazle.co/xml/zeropunctuation.xml")
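# Example of wiring this up (paths and filename here are hypothetical): run it
# from cron and redirect stdout to the XML file your web server serves, e.g.
#   */30 * * * * python feedrewrite.py > /var/www/xml/zeropunctuation.xml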