Created
July 26, 2010 04:21
-
-
Save clehner/490174 to your computer and use it in GitHub Desktop.
A better blip.append_content() for Python Wave robots.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import htmlentitydefs
import re

from BeautifulSoup import BeautifulSoup, NavigableString
from waveapi import element
# Marker text substituted for every <img> tag before the markup is appended
# to a blip; the marker is then located in the blip and replaced by an
# Image element.
IMAGE_PLACEHOLDER = '***{{((IMAGE_ELEMENT))}}***'
def append_content_to_blip(blip, content, type=None):
    """Append plain-text or HTML content to a Wave blip.

    Args:
        blip: Object to append to. It must provide ``append`` (plain-text
            path) or ``append_markup`` and ``first`` (HTML path).
        content: The text or HTML string to append.
        type: MIME type of ``content``. ``'text/plain'`` is appended as text
            after whitespace normalisation; any other value, including the
            default ``None``, is handled as HTML. The name shadows the
            builtin but is kept because it is part of the signature.

    Returns:
        None.
    """
    if type == 'text/plain':
        # Replace characters that Wave breaks on. CRLF pairs are collapsed
        # first so that they do not turn into two consecutive newlines.
        text = content.replace('\t', ' ')
        text = text.replace('\r\n', '\n').replace('\r', '\n')
        blip.append(text)
        return

    imgs = []

    # originally by Pamela Fox (Google)
    # http://google-wave-resources.googlecode.com/svn/trunk/samples/extensions/robots/python/maildigester/handler.py
    def cleanup(soup):
        """Walk the tree, swapping <img> tags for placeholders and fixing hrefs."""
        # Iterate over a snapshot: replaceWith() edits the parent's children.
        for tag in list(soup):
            if isinstance(tag, NavigableString):
                continue
            if tag.name == 'img':
                imgs.append({'url': tag.get('src'),
                             'width': tag.get('width'),
                             'height': tag.get('height')})
                # replace it with an image element later
                tag.replaceWith(IMAGE_PLACEHOLDER)
                # The detached <img> needs no further processing.
                continue
            if tag.name == 'a':
                # Anchors without an href (e.g. <a name="...">) are left
                # untouched instead of raising KeyError.
                href = tag.get('href')
                if href is not None:
                    # unescape() decoded every entity, so ampersands in the
                    # link are escaped again for the markup.
                    # NOTE(review): the published snippet showed
                    # replace('&', '&'), a no-op; '&amp;' is assumed to be
                    # the intended replacement -- confirm.
                    tag['href'] = href.replace('&', '&amp;')
            cleanup(tag)

    html = unescape(content)
    soup = BeautifulSoup(html.strip())
    cleanup(soup)
    html = unicode(soup)
    html = html.replace('\t', ' ')
    # Since it's HTML, it should use <br>s instead of line breaks.
    html = html.replace('\r', '').replace('\n', '')
    blip.append_markup(html)

    # Because append_markup doesn't accept images, we replace img tags in the
    # html with placeholders and then replace them with image elements.
    for img in imgs:
        image = element.Image(url=img['url'],
                              width=img['width'],
                              height=img['height'])
        # Placeholders are consumed in document order, one per recorded image.
        placeholder = blip.first(IMAGE_PLACEHOLDER)
        # Image elements don't allow links on them.
        # So insert an extra space after images so that a link can still
        # be clicked if it would normally be on the image.
        placeholder.insert_after(' ')
        placeholder.replace(image)
def unescape(text):
    """Replace HTML entities in ``text`` with the characters they denote.

    Based on the recipe by Fredrik Lundh:
    http://effbot.org/zone/re-sub.htm#unescape-html

    Decimal (``&#65;``) and hexadecimal (``&#x41;`` / ``&#X41;``) character
    references and named entities (``&amp;``) are decoded. Anything that
    cannot be decoded (unknown name, malformed or out-of-range number) is
    left in the text unchanged.

    Args:
        text: String that may contain HTML entities.

    Returns:
        The string with every decodable entity replaced.
    """
    # Resolve interpreter-specific helpers locally so the function runs
    # unchanged on Python 2 (htmlentitydefs/unichr) and Python 3.
    try:
        from htmlentitydefs import name2codepoint
    except ImportError:
        from html.entities import name2codepoint
    try:
        to_char = unichr
    except NameError:
        to_char = chr

    def fixup(m):
        entity = m.group(0)
        if entity[:2] == "&#":
            # character reference
            digits = entity[2:-1]
            try:
                if digits[:1] in ("x", "X"):
                    return to_char(int(digits[1:], 16))
                return to_char(int(digits))
            except (ValueError, OverflowError):
                # Malformed or out-of-range number (OverflowError is raised
                # for values too large for the character constructor).
                pass
        else:
            # named entity
            try:
                return to_char(name2codepoint[entity[1:-1]])
            except KeyError:
                pass
        return entity  # leave as is

    return re.sub(r"&#?\w+;", fixup, text)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment