Skip to content

Instantly share code, notes, and snippets.

@rcarmo
Created April 19, 2011 22:01
Show Gist options
  • Save rcarmo/929829 to your computer and use it in GitHub Desktop.
Save rcarmo/929829 to your computer and use it in GitHub Desktop.
A github RSS feed sanitizer
#!/usr/bin/env python
# encoding: utf-8
"""
parser.py
A GitHub RSS feed sanitizer, by which we mean one that makes the feed "sane" -
i.e., it groups updates per project instead of presenting them as a mess of
completely unrelated events.
Created by Rui Carmo on 2011-01-06. Still incomplete.
Distributed under the MIT license
"""
import os, re, sys, feedparser, htmlentitydefs
from xml.etree.cElementTree import XMLParser, tostring
try:
import json
except:
import simplejson as json
# Title patterns used to find the trailing "owner/project" part of an event
# title: group 2 is the owner and group 3 the project name.
# Raw strings keep the regex escapes intact, and the character classes accept
# the '-' and '.' that GitHub repository names may contain (plain \w+ rejected
# those titles, so they could never be attributed to their project).
patterns = [
    re.compile(x) for x in (
        r'.+(the) ([\w.-]+)\/([\w.-]+) wiki$',
        r'.+(on|at) ([\w.-]+)\/([\w.-]+)$',
    )
]
# Rendered entries, grouped by project name.
projects = {}
# Styles are written inline, keyed by CSS class name, so that the markup keeps
# the proper look when shown in RSS.
css = dict(
    thumbnail=(
        "width:30px; height:30px; background-color: white; "
        "border: 1px solid #D0D0D0; float: left; margin-right: 0.7em; "
        "padding: 2px; background: url(%(thumbnail)s) no-repeat;"
    ),
)
# Skeleton for one rendered entry; filled in with 'title' and 'description'.
template = (
    u'<entry>'
    u'<div><div class="thumbnail"> </div>%(title)s</div>'
    u'%(description)s'
    u'</entry>'
)
# method for injection in feedparser
def _start_thumbnail(self, attrsD):
context = self._getContext()
context.setdefault('thumbnail', attrsD)
# Matches named character references such as &nbsp; in one pass.
_NAMED_ENTITY = re.compile(r'&(\w+);')


def _numeric_entities(markup):
    """Rewrite known HTML named entities in *markup* as numeric references.

    This lets the markup be fed to XMLParser without it tripping over entity
    names such as ``&nbsp;``. Names that are not listed in
    ``htmlentitydefs.name2codepoint`` are left untouched.
    """
    def _swap(match):
        codepoint = htmlentitydefs.name2codepoint.get(match.group(1))
        if codepoint is None:
            return match.group(0)
        return "&#%d;" % codepoint

    return _NAMED_ENTITY.sub(_swap, markup)


def collate(entry):
    """Render one feed entry as an XML fragment and file it under its project.

    The project name is group 3 of the first module-level pattern that matches
    the entry title; titles matching no pattern are grouped under 'General'.
    The serialized fragment is appended to ``projects[project]``.

    entry -- feed entry; ``title`` and ``content[0]['value']`` are read, plus
             an optional ``thumbnail`` attribute.

    Any error XMLParser raises for markup that is not well-formed propagates
    to the caller.
    """
    # Start from the catch-all group and only override it on a match, so a
    # matched project name is never clobbered.
    project = 'General'
    for pattern in patterns:
        found = pattern.match(entry.title)
        if found:
            project = found.group(3)
            break
    bucket = projects.setdefault(project, [])

    markup = _numeric_entities(template % {
        'title': entry.title,
        'description': entry.content[0]['value'],
    })
    parser = XMLParser()
    parser.feed(markup.encode('utf-8'))
    root = parser.close()

    # NOTE(review): assumes entry.thumbnail is the attribute dict stored by
    # _start_thumbnail and that it carries a 'url' key -- confirm against the
    # actual feed.
    thumbnail = getattr(entry, 'thumbnail', None)
    if isinstance(thumbnail, dict):
        thumbnail = thumbnail.get('url')
    if thumbnail:
        # The root element is the <entry> itself, so the styled <div>s have to
        # be looked up among its descendants.
        for node in root.findall('.//div'):
            style = css.get(node.get('class'))
            if style is not None:
                node.set('style', style % {'thumbnail': thumbnail})
    bucket.append(tostring(root))
def main():
    """Fetch the feed, group its entries by project and print the groups.

    Each project is printed as an <h2> heading followed by its collected
    fragments, one per line.
    """
    # feedparser._debug = 1
    # Install the custom handler on feedparser's mixin class before parsing.
    feedparser._FeedParserMixin._start_thumbnail = _start_thumbnail
    feed_url = 'https://github.com/rcarmo.private.atom?token=foobar'
    # be annoying
    user_agent = 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; .NET CLR 1.1.4322)'
    feed = feedparser.parse(feed_url, agent=user_agent)
    for item in feed.entries:
        collate(item)
    for name, fragments in projects.items():
        print("<h2>%s</h2>" % name)
        for fragment in fragments:
            print(fragment)
# Run the sanitizer only when this file is executed as a script.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment