Created
March 19, 2013 03:01
-
-
Save jepler/5193404 to your computer and use it in GitHub Desktop.
Python code to get the title, link, and my comment from a NewsBlur blurblog feed
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import functools
import os
import re
import sys
import xml.etree.ElementTree

import BeautifulSoup
def exc_wrap(f):
    """Return a wrapper around *f* that lets every exception propagate.

    The wrapper forwards all positional and keyword arguments to *f* and
    returns its result unchanged. The original implementation caught
    ``Exception`` only to re-raise it immediately, so calling *f* directly
    is equivalent; the decorator is kept so decorated functions keep working.
    """
    @functools.wraps(f)  # keep f's __name__ / __doc__ on the wrapper
    def inner(*args, **kw):
        return f(*args, **kw)
    return inner
@exc_wrap
def get_title(e):
    """Return the text of the Atom ``title`` child of element *e*.

    The lookup uses ``findtext`` with the Atom-namespaced tag, so the result
    is whatever ``findtext`` reports for that tag on *e*.
    """
    title_tag = '{http://www.w3.org/2005/Atom}title'
    return e.findtext(title_tag)
@exc_wrap
def get_link(e):
    """Return the ``href`` attribute of the Atom ``link`` child of *e*.

    Returns ``None`` when *e* has no ``link`` child (previously this raised
    ``AttributeError`` on the ``None`` returned by ``find``) or when the
    link element carries no ``href`` attribute.
    """
    link = e.find('{http://www.w3.org/2005/Atom}link')
    if link is None:
        # No link element: report "no link" instead of crashing.
        return None
    return link.get('href')
def normtext(el):
    """Return the whitespace-normalised text of *el* and its descendants.

    The ``text`` of *el* and of every descendant element (document order) is
    joined with single spaces, stripped, and every run of whitespace is
    collapsed to one space. Elements without text contribute an empty string.

    A debug line ``normtext <el>`` is written to ``sys.stderr`` on each call.

    NOTE(review): only ``.text`` is collected, not ``.tail``, so text that
    follows a child element is not included -- confirm this is intended.
    """
    # Same debug output as the original print statement, but valid on both
    # Python 2 and Python 3.
    sys.stderr.write("normtext %s\n" % (el,))
    # Element.iter() replaces getiterator(), which was removed in Python 3.9.
    pieces = [node.text or '' for node in el.iter()]
    # Raw string: "\s" in a plain literal is an invalid escape sequence.
    return re.sub(r"\s+", " ", " ".join(pieces).strip())
def bsparse(text):
    """Parse *text* with BeautifulSoup and rebuild it as an ElementTree tree.

    The soup is walked recursively and replayed into an
    ``xml.etree.ElementTree.TreeBuilder``: every non-string node becomes an
    element (named after the node, with the node's attributes as a dict),
    comment nodes are dropped, and every other string node becomes character
    data. The soup object itself is emitted as the outermost element.

    Returns whatever ``TreeBuilder.close()`` returns for the replayed tree.
    """
    bs = BeautifulSoup
    builder = xml.etree.ElementTree.TreeBuilder()

    def emit(node):
        # Tags (and the soup root): open, recurse into children, close.
        if not isinstance(node, bs.NavigableString):
            builder.start(node.name, dict(node.attrs))
            for child in node:
                emit(child)
            builder.end(node.name)
        # Strings: skip comments, pass everything else through as data.
        elif not isinstance(node, bs.Comment):
            builder.data(node)

    document = bs.BeautifulSoup(
        text, convertEntities=bs.BeautifulSoup.HTML_ENTITIES)
    emit(document)
    return builder.close()
@exc_wrap
def get_comment(e):
    """Return the sharer's comment embedded in the Atom ``summary`` of *e*.

    The summary text is parsed with ``bsparse``; the first child of the
    resulting root must be a ``table`` with ``cellpadding="0"``, the exact
    expected ``style`` attribute, and exactly two children. The normalised
    text of the first child must equal ``'jepler shared this story'``; the
    normalised text of the second child is then returned.

    Returns ``None`` when any of those checks fails, and also when *e* has no
    ``summary`` child or the summary has no text (previously a missing
    summary raised ``AttributeError``).

    Side effects: writes debug lines to ``sys.stderr`` and stores the parsed
    tree in the module-level global ``gb`` (apparently a debugging aid).
    """
    global gb

    summary = e.find('{http://www.w3.org/2005/Atom}summary')
    if summary is None or summary.text is None:
        return None

    b = summary.text
    # sys.stderr.write keeps the original debug output while remaining valid
    # syntax on both Python 2 and Python 3.
    sys.stderr.write("<<< %s\n" % (repr(b),))
    b = bsparse(b)
    sys.stderr.write(">>> %s\n" % (b,))

    # Explicit emptiness test: truth-testing an Element is deprecated.
    if b is None or len(b) == 0:
        return None
    gb = b

    table = b[0]
    if table.tag != 'table':
        return None
    if table.get('cellpadding') != "0":
        return None
    if table.get('style') != 'border: 1px solid #E0E0E0; margin: 0; padding: 0; background-color: #F0F0F0':
        return None

    # list(element) replaces getchildren(), which was removed in Python 3.9.
    cells = list(table)
    if len(cells) != 2:
        return None

    heading = normtext(cells[0])
    comment = normtext(cells[1])
    if heading != 'jepler shared this story':
        return None
    return comment
def parse_rss(u):
    """Yield ``(title, link, comment)`` for the Atom entries in document *u*.

    *u* is parsed with ``xml.etree.ElementTree.fromstring``; each
    Atom-namespaced ``entry`` child of the root is passed to ``get_title``,
    ``get_link`` and ``get_comment``. An entry is yielded only when at least
    one of the three values is truthy.

    The ``repr`` of each entry's comment is written to ``sys.stderr`` as a
    debug line.
    """
    feed = xml.etree.ElementTree.fromstring(u)
    for entry in feed.findall('{http://www.w3.org/2005/Atom}entry'):
        title = get_title(entry)
        link = get_link(entry)
        comment = get_comment(entry)
        # Same debug output as the original print statement, but valid on
        # both Python 2 and Python 3.
        sys.stderr.write("%s\n" % (repr(comment),))
        if title or link or comment:
            yield title, link, comment
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment