GitHub's user activity RSS feed is just plain annoying: it gives too little context to really grasp what the tickets, comments and commits are about.
This script is going to fix that. Stay tuned.
/bin | |
/include | |
/lib |
import logging | |
import lxml.etree | |
import lxml.html | |
import sys | |
import urllib2 | |
# Configure the root logger once at import time: DEBUG and above, using the
# standard library's default record layout.
logging.basicConfig(format=logging.BASIC_FORMAT, level=logging.DEBUG)
# Module-wide logger used by the feed-processing functions in this script.
logger = logging.getLogger('ghrss')
def modify_rss(url):
    """Process an XML document of an RSS feed.

    Returns the processed XML document as a pretty-printed XML string.

    Removes entries whose links are duplicates, i.e. point to the same
    webpage once the ``#fragment`` part of the link is ignored.
    Replaces contents of GitHub issue entries with the actual webpage of the
    issue.

    ``url`` is fetched over the network when it starts with ``http://`` or
    ``https://``; anything else is opened as a local file path.
    """
    if url.startswith(('http://', 'https://')):
        feed = urllib2.urlopen(url)
    else:
        feed = open(url)
    try:
        root = lxml.etree.parse(feed).getroot()
    finally:
        # The whole document is in memory after parsing; release the
        # HTTP response / file handle instead of leaking it.
        feed.close()

    # Entries are looked up in the feed's default namespace.
    atom_ns = root.nsmap[None]

    def tag(tagname):
        # Build the '{namespace}tagname' form that ElementTree lookups expect.
        return '{{{0}}}{1}'.format(atom_ns, tagname)

    pages = set()
    # findall() returns a list, so removing entries from root inside the
    # loop does not disturb the iteration.
    for entry in root.findall(tag('entry')):
        link_element = entry.find(tag('link'))
        link = None if link_element is None else link_element.get('href')
        if link is None:
            # Without a link there is nothing to de-duplicate or to fetch.
            logger.warning('skip entry without link')
            continue
        # Links differing only in their fragment point to the same webpage.
        page = link.split('#')[0]
        if page in pages:
            logger.info('remove %s', link)
            root.remove(entry)
        elif page.split('/')[-2:-1] == ['issues']:
            # Slicing (rather than indexing [-2]) keeps links with fewer than
            # two path segments from raising IndexError.
            logger.info('fix %s', link)
            content = entry.find(tag('content'))
            if content is None:
                logger.warning('no content element for %s', link)
            else:
                content.text = modify_ticket(page)
        pages.add(page)
    return lxml.etree.tostring(root, pretty_print=True)
def modify_ticket(url):
    """Fetch an issue webpage and return its comments section as HTML.

    Downloads ``url``, parses it as HTML and serializes the first ``div``
    whose class attribute is exactly ``new-comments``. The returned string is
    that markup prefixed with a ``<link>`` element referencing ``github.css``.

    Raises ValueError when the page contains no such ``div``.
    """
    page = urllib2.urlopen(url)
    try:
        dom = lxml.html.parse(page, parser=lxml.html.HTMLParser())
    finally:
        # The document is fully parsed; do not leak the HTTP response.
        page.close()
    root = dom.getroot()
    comments = None
    if root is not None:
        comments = root.find('.//div[@class="new-comments"]')
    if comments is None:
        # Fail with the offending URL instead of letting lxml reject None
        # with an unrelated-looking TypeError during serialization.
        raise ValueError('no "new-comments" container found at %s' % url)
    style = '<link href="github.css" media="screen" rel="stylesheet" type="text/css"/>'
    return style + lxml.etree.tostring(comments, pretty_print=True)
if __name__ == '__main__':
    # Run only when executed as a script, so importing this module does not
    # trigger network access or output.
    if len(sys.argv) < 2:
        # sys.exit() with a string prints it to stderr and exits non-zero.
        sys.exit('usage: %s <feed-url-or-file>' % sys.argv[0])
    # Write the processed feed followed by a newline to standard output.
    sys.stdout.write(modify_rss(sys.argv[1]) + '\n')