@blha303
Created August 5, 2013 19:51
RSS feed rewriter
# RSS feed rewriter
# For adding content from the target page that isn't normally included in the feed
# For an example output, compare
# http://www.escapistmagazine.com/rss/videos/list/1.xml
# and
# http://irc.lazle.co/xml/zeropunctuation.xml
# I added the Escapist's embed code, loaded from their site, to the item description.
# Maybe a better example is
# http://www.escapistmagazine.com/rss/articles/columns/extra-punctuation.xml
# and
# http://irc.lazle.co/xml/extrapunctuation.xml
# I wanted the whole article text in my feed reader :D
# Here's an example usage: https://gist.github.com/blha303/36a1c9eef45cf75df904
# (from a much earlier version of the script; it's fully functional, but there
# are differences between the two versions)
# Future plans:
# Need to make this more accessible for others
# Clean up
# Copyright 2013 Steven Smith (blha303). All Rights Reserved.
# New BSD license
# http://www.opensource.org/licenses/BSD-3-Clause
import yql # I like how simple feed importing is with yql
import sys
from time import sleep
from urllib import urlopen
from bs4 import BeautifulSoup as Soup
yql_env = "http://datatables.org/alltables.env"
YQL = yql.Public()
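# A rough sketch of one way to fetch the target page and pull extra content
# into an item's description with urlopen and BeautifulSoup. The selector is
# only a placeholder; check the markup of the page you're actually scraping.
def fetch_full_description(url):
    page = Soup(urlopen(url).read())
    body = page.find("div", {"class": "article-body"})  # placeholder selector
    return unicode(body) if body else ""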
def main(feedurl, title="A feed!", link="http://google.com", desc=":)", debug=False, aprint=False):
    filename = ""  # Path this script's output gets written to; used to check whether the feed has new items
    # Copy in the top of the original XML file if you're not sure.
    base = """<?xml version="1.0" encoding="ISO-8859-1"?>
<rss version="2.0" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:content="http://purl.org/rss/1.0/modules/content/" >
<channel>
<title>{title}</title>
<link>{link}</link>
<description><![CDATA[{description}]]></description>
<language>en-us</language>
<docs>http://blogs.law.harvard.edu/tech/rss</docs>
""".format(title=title, link=link, description=desc)
end = """
</channel>
</rss>"""
if filename != "":
with open(filename, "r") as f:
olddata = f.read()
oldsoup = Soup(olddata)
else:
oldsoup = None
query = "SELECT title,link,description,pubDate,category from rss where url=@feed" # add more fields to retrieve if needed
result = YQL.execute(query, {"feed": feedurl}, env=yql_env)
if not result.rows:
return "No response?"
if oldsoup:
if oldsoup.find('item'):
if oldsoup.find('item').find('title').text == result.rows[0]["title"]:
# No new articles
if debug: print "No new articles"
if aprint: print str(olddata)
return str(olddata)
if debug: print result.rows
if aprint: print base
items = []
for row in result.rows:
if debug: print "Description for " + row["title"]
        description = row['description']  # use BeautifulSoup or something here to pull the extra info you want (e.g. the fetch_full_description sketch above)
        # Add more tags below if needed in the dest feed
        items.append("""
<item>
<title>{title}</title>
<link>{url}</link>
<guid>{url}</guid>
<description><![CDATA[{description}]]></description>
<pubDate>{date}</pubDate>
<category>{category}</category>
</item>""".format(title=row["title"].replace("&", "&amp;"), url=row["link"].split("?")[0],
                  description=description.replace("&", "&amp;"), date=row["pubDate"],
                  category=row["category"]))
        if debug: print "Finished " + row["title"]
        if aprint: print items[-1]
        sleep(1)  # to avoid hammering the site with description lookups
    if aprint: print end
    return base + "".join(items) + end

if __name__ == "__main__":
    if len(sys.argv) > 1:
        if sys.argv[1] == "debug":
            print main("http://irc.lazle.co/xml/zeropunctuation.xml", debug=True)
        elif sys.argv[1] == "aprint":
            main("http://irc.lazle.co/xml/zeropunctuation.xml", aprint=True)
    else:
        print main("http://irc.lazle.co/xml/zeropunctuation.xml")
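# Example of wiring this up (paths and filename here are hypothetical): run it
# from cron and redirect stdout to the XML file your web server serves, e.g.
#   */30 * * * * python feedrewrite.py > /var/www/xml/zeropunctuation.xml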