Created
April 19, 2011 22:01
-
-
Save rcarmo/929829 to your computer and use it in GitHub Desktop.
A github RSS feed sanitizer
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# encoding: utf-8 | |
""" | |
parser.py

A GitHub RSS feed sanitizer. "Sanitizer" here means making the feed sane,
i.e., grouping updates per project instead of presenting them as a mess of
completely unrelated events.

Created by Rui Carmo on 2011-01-06. Still incomplete.
Distributed under the MIT license.
""" | |
import os, re, sys, feedparser, htmlentitydefs | |
from xml.etree.cElementTree import XMLParser, tostring | |
try: | |
import json | |
except: | |
import simplejson as json | |
# Title patterns used to work out which repository an event belongs to.
# Each pattern captures (keyword, owner, project) from the tail of an entry
# title: "... the owner/project wiki" or "... on|at owner/project".
# Raw strings keep the regex escapes intact, and the name character class
# accepts "." and "-" in addition to word characters so that names such as
# "my-project" or "feed.py" are recognised instead of failing to match.
patterns = [
    re.compile(expr)
    for expr in (
        r'.+(the) ([\w.-]+)\/([\w.-]+) wiki$',
        r'.+(on|at) ([\w.-]+)\/([\w.-]+)$',
    )
]

# Rendered entries grouped by project name; collate() appends to these lists.
projects = {}

# Use inline styles to ensure this has the proper look in RSS.
# Keys are HTML class names; values are style strings that are %-formatted
# with a mapping providing the named placeholders (here: "thumbnail").
css = {
    'thumbnail': "width:30px; height:30px; background-color: white; border: 1px solid #D0D0D0; float: left; margin-right: 0.7em; padding: 2px; background: url(%(thumbnail)s) no-repeat;"
}

# Markup wrapper for a single entry; expects "title" and "description" keys.
template = u"""<entry><div><div class="thumbnail"> </div>%(title)s</div>%(description)s</entry>"""
# method for injection in feedparser
def _start_thumbnail(self, attrsD):
    """Store the attributes of a thumbnail start tag on the parsing context.

    Meant to be attached to feedparser's parser mixin; presumably feedparser
    calls it when it meets a ``thumbnail`` element (verify against the
    feedparser version in use).

    ``setdefault`` is used, so an already recorded "thumbnail" value on the
    context is left untouched.
    """
    self._getContext().setdefault('thumbnail', attrsD)
def collate(entry):
    """Render one feed entry as markup and file it under its project.

    The project name is taken from the third capture group of the first
    entry in ``patterns`` that matches ``entry.title``; entries whose title
    matches no pattern are grouped under 'General'.  The rendered markup is
    appended to ``projects[project]``.

    Parameters:
        entry: a parsed feed entry providing ``title`` and
            ``content[0]['value']``.

    Raises:
        Whatever ``XMLParser`` raises when the entry content is not
        well-formed XML; the error is deliberately not swallowed.
    """
    # for/else: 'General' is used only when no pattern matched at all.
    for pattern in patterns:
        match = pattern.match(entry.title)
        if match:
            project = match.group(3)
            break
    else:
        project = 'General'

    bucket = projects.setdefault(project, [])

    markup = template % {
        'title': entry.title,
        'description': entry.content[0]['value'],
    }
    # Turn named HTML entities into numeric character references before the
    # markup is handed to the XML parser.
    for name, codepoint in htmlentitydefs.name2codepoint.items():
        markup = markup.replace("&%s;" % name, "&#%d;" % codepoint)

    parser = XMLParser()
    parser.feed(markup.encode('utf-8'))
    root = parser.close()

    # NOTE(review): assumes the attributes recorded by _start_thumbnail are
    # exposed as entry.thumbnail and carry the image location under 'url' --
    # confirm against a real feed.  Falls back to an empty URL otherwise.
    info = getattr(entry, 'thumbnail', None) or {}
    try:
        thumbnail = info['url']
    except (KeyError, TypeError):
        thumbnail = ''

    # The root element is the <entry> wrapper itself, so the elements that
    # carry a class attribute are its descendants; inline the matching style
    # on each of them.
    for element in root.findall('.//*'):
        style = css.get(element.get('class'))
        if style is not None:
            element.set('style', style % {'thumbnail': thumbnail})

    bucket.append(tostring(root))
def main(url='https://github.com/rcarmo.private.atom?token=foobar',
         agent='Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; .NET CLR 1.1.4322)'):
    """Fetch a feed, group its entries per project and print the result.

    Parameters:
        url: location of the feed to parse.  The default is the original
            hard-coded address, whose token is a placeholder.
        agent: User-Agent string handed to feedparser (the default poses as
            an old browser, "to be annoying" per the original author).

    Output goes to stdout: an ``<h2>`` heading per project followed by that
    project's rendered entries.
    """
    # Set feedparser._debug = 1 here to trace parsing.
    # Attach the thumbnail handler to feedparser's parser mixin before parsing.
    feedparser._FeedParserMixin._start_thumbnail = _start_thumbnail
    feed = feedparser.parse(url, agent=agent)
    for entry in feed.entries:
        collate(entry)
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    for name in projects:
        print("<h2>%s</h2>" % name)
        for rendered in projects[name]:
            print(rendered)
# Run main() only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment