Created
August 5, 2013 19:51
-
-
Save blha303/6158976 to your computer and use it in GitHub Desktop.
RSS feed rewriter
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# RSS feed rewriter
# For adding more content from the target page that isn't included normally.
# For an example output, compare
# http://www.escapistmagazine.com/rss/videos/list/1.xml
# and
# http://irc.lazle.co/xml/zeropunctuation.xml
# I added Escapist's embed code, loaded from their site, to the item description.
# A possibly better example is
# http://www.escapistmagazine.com/rss/articles/columns/extra-punctuation.xml
# and
# http://irc.lazle.co/xml/extrapunctuation.xml
# I wanted the whole article text in my feed reader :D
# Here's an example usage: https://gist.github.com/blha303/36a1c9eef45cf75df904
# (from a much earlier version of the script; fully functional, but there are
# differences between them)
# Future plans:
#   Need to make this more accessible for others
#   Clean up
# Copyright 2013 Steven Smith (blha303). All Rights Reserved.
# New BSD license
# http://www.opensource.org/licenses/BSD-3-Clause
import sys
from time import sleep
from urllib import urlopen
from xml.sax.saxutils import escape

import yql  # I like how simple feed importing is with yql
from bs4 import BeautifulSoup as Soup
# Environment file URL passed as ``env=`` to every YQL query in this module.
yql_env = "http://datatables.org/alltables.env"
# Shared YQL client created with yql.Public(); main() runs its query through it.
YQL = yql.Public()
def main(feedurl, title="A feed!", link="http://google.com", desc=":)", debug=False, aprint=False, filename=""):
    """Rebuild the RSS feed at ``feedurl`` as a new RSS 2.0 document.

    The source feed is queried through YQL and every returned row is
    re-emitted as an ``<item>``.  The per-item ``description`` variable in the
    loop is the hook for enrichment: replace it with content scraped from the
    item's page when the source feed does not carry enough.

    Args:
        feedurl: URL of the source RSS feed.
        title: Channel title.  Inserted verbatim, so it must already be
            XML-safe.
        link: Channel link.  Inserted verbatim.
        desc: Channel description (wrapped in a CDATA section).
        debug: When true, print progress information to stdout.
        aprint: When true, print each part of the document as it is built.
        filename: Path of a previously generated copy of this feed.  When
            non-empty, the file is read and, if the title of its first item
            equals the title of the newest source row, its contents are
            returned unchanged instead of rebuilding the feed.

    Returns:
        The feed XML as a string, or the string ``"No response?"`` when the
        query returned no rows.

    Raises:
        IOError: If ``filename`` is non-empty but cannot be opened for reading.
    """
    # Copy in the top of the original XML file if you're not sure.
    base = """<?xml version="1.0" encoding="ISO-8859-1"?>
<rss version="2.0" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:content="http://purl.org/rss/1.0/modules/content/" >
<channel>
<title>{title}</title>
<link>{link}</link>
<description><![CDATA[{description}]]></description>
<language>en-us</language>
<docs>http://blogs.law.harvard.edu/tech/rss</docs>
""".format(title=title, link=link, description=desc)
    end = """
</channel>
</rss>"""
    # Add more tags here if they are needed in the destination feed.
    item_template = """
<item>
<title>{title}</title>
<link>{url}</link>
<guid>{url}</guid>
<description><![CDATA[{description}]]></description>
<pubDate>{date}</pubDate>
<category>{category}</category>
</item>"""

    # Load the previously generated feed (if any) so an unchanged source feed
    # can be answered without rebuilding every item.
    olddata = None
    oldsoup = None
    if filename:
        with open(filename, "r") as f:
            olddata = f.read()
        oldsoup = Soup(olddata)

    # Add more fields to retrieve here if needed.
    query = "SELECT title,link,description,pubDate,category from rss where url=@feed"
    result = YQL.execute(query, {"feed": feedurl}, env=yql_env)
    if not result.rows:
        return "No response?"

    if oldsoup is not None:
        newest_item = oldsoup.find("item")
        newest_title = newest_item.find("title") if newest_item is not None else None
        if newest_title is not None and newest_title.text == result.rows[0]["title"]:
            # No new articles: hand back the stored feed untouched.
            if debug:
                print("No new articles")
            if aprint:
                print(olddata)
            return olddata

    if debug:
        print(result.rows)
    if aprint:
        print(base)

    items = []
    for row in result.rows:
        if debug:
            print("Description for " + row["title"])
        # Use BeautifulSoup or similar here to retrieve the content you want.
        description = row["description"]
        items.append(item_template.format(
            # Element text must be XML-escaped or a bare "&" / "<" in the
            # source data yields a malformed document.
            title=escape(row["title"]),
            # Drop the query string so the link/guid stay stable.
            url=escape(row["link"].split("?")[0]),
            # CDATA content needs no entity escaping, but a literal "]]>"
            # would end the section early, so split it across two sections.
            description=description.replace("]]>", "]]]]><![CDATA[>"),
            date=row["pubDate"],
            # NOTE(review): inserted as-is; confirm whether YQL can return a
            # non-string here (e.g. several categories) before escaping it.
            category=row["category"],
        ))
        if debug:
            print("Finished " + row["title"])
        if aprint:
            print(items[-1])
        sleep(1)  # to avoid hammering the site in description lookups

    if aprint:
        print(end)
    return base + "".join(items) + end
if __name__ == "__main__":
    # Command-line entry point.
    #   <script> debug   -> build the feed with progress output, then print it
    #   <script> aprint  -> print the feed piece by piece while it is built
    #   <script>         -> build the feed and print it
    # Any other argument is treated like no argument.
    feed_url = "http://irc.lazle.co/xml/zeropunctuation.xml"
    mode = sys.argv[1] if len(sys.argv) > 1 else None
    if mode == "debug":
        print(main(feed_url, debug=True))
    elif mode == "aprint":
        # main() already prints every part itself in this mode.
        main(feed_url, aprint=True)
    else:
        print(main(feed_url))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment