Created
May 14, 2011 05:36
-
-
Save amundo/971949 to your computer and use it in GitHub Desktop.
unescape hex entities in html
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
"""html2text: Turn HTML into equivalent Markdown-structured text."""
__version__ = "2.35"
__author__ = "Aaron Swartz ([email protected])"
__copyright__ = "(C) 2004-2008 Aaron Swartz. GNU GPL 3."
__contributors__ = ["Martin 'Joey' Schulze", "Ricardo Reyes"]

# TODO:
#   Support decoded entities with unifiable.
#   Relative URL resolution

# Python < 2.2.1 compatibility: define True/False when the builtins are missing.
if not hasattr(__builtins__, 'True'): True, False = 1, 0

import re, sys, urllib, htmlentitydefs, codecs, StringIO, types
import sgmllib
# Widen sgmllib's charref pattern so hexadecimal references (&#x41;) are
# recognized as character references too, not just decimal ones.
sgmllib.charref = re.compile('&#([xX]?[0-9a-fA-F]+)[^0-9a-fA-F]')

# textwrap is only available from Python 2.3 on; optwrap() asserts on it.
try: from textwrap import wrap
except: pass

# Use Unicode characters instead of their ascii pseudo-replacements
UNICODE_SNOB = 0

# Put the links after each paragraph instead of at the end.
LINKS_EACH_PARAGRAPH = 0

# Wrap long lines at position. 0 for no wrapping. (Requires Python 2.3.)
BODY_WIDTH = 78

# Don't show internal links (href="#local-anchor") -- corresponding link targets
# won't be visible in the plain text file anyway.
SKIP_INTERNAL_LINKS = False
### Entity Nonsense ### | |
def name2cp(k):
    """Map an HTML entity name (e.g. 'amp') to its Unicode codepoint.

    Raises KeyError for unknown entity names.
    """
    # 'apos' is XML-only and missing from htmlentitydefs.
    if k == 'apos': return ord("'")
    if hasattr(htmlentitydefs, "name2codepoint"): # requires Python 2.3
        return htmlentitydefs.name2codepoint[k]
    else:
        # Pre-2.3 fallback: entitydefs maps names either to a latin-1 byte
        # or to a "&#NNN;" numeric reference for non-latin-1 characters.
        k = htmlentitydefs.entitydefs[k]
        if k.startswith("&#") and k.endswith(";"): return int(k[2:-1]) # not in latin-1
        return ord(codecs.latin_1_decode(k)[0])
# ASCII stand-ins substituted for common named entities when UNICODE_SNOB is
# off (quotes, dashes, accented vowels, etc.).
unifiable = {'rsquo':"'", 'lsquo':"'", 'rdquo':'"', 'ldquo':'"',
'copy':'(C)', 'mdash':'--', 'nbsp':' ', 'rarr':'->', 'larr':'<-', 'middot':'*',
'ndash':'-', 'oelig':'oe', 'aelig':'ae',
'agrave':'a', 'aacute':'a', 'acirc':'a', 'atilde':'a', 'auml':'a', 'aring':'a',
'egrave':'e', 'eacute':'e', 'ecirc':'e', 'euml':'e',
'igrave':'i', 'iacute':'i', 'icirc':'i', 'iuml':'i',
'ograve':'o', 'oacute':'o', 'ocirc':'o', 'otilde':'o', 'ouml':'o',
'ugrave':'u', 'uacute':'u', 'ucirc':'u', 'uuml':'u'}

# The same table keyed by Unicode codepoint instead of entity name,
# for use by charref().
unifiable_n = {}

for k in unifiable.keys():
    unifiable_n[name2cp(k)] = unifiable[k]
def charref(name):
    """Decode the body of a numeric character reference.

    `name` is the text after '&#' (e.g. "65" or "x41").  Returns the
    ASCII stand-in from unifiable_n (unless UNICODE_SNOB), otherwise the
    corresponding Unicode character.
    """
    if name[0] in ['x', 'X']:
        c = int(name[1:], 16)
    else:
        c = int(name)

    # Membership test on the dict itself; `c in unifiable_n.keys()` built
    # a list and scanned it linearly on every reference (Python 2 .keys()
    # returns a list).
    if not UNICODE_SNOB and c in unifiable_n:
        return unifiable_n[c]
    else:
        return unichr(c)
def entityref(c):
    """Decode a named entity reference.

    Returns the ASCII stand-in from `unifiable` (unless UNICODE_SNOB),
    the corresponding Unicode character, or the literal "&name" text
    when the name is unknown.
    """
    # Test membership on the dict directly instead of scanning the list
    # returned by .keys() (O(n) per entity in Python 2).
    if not UNICODE_SNOB and c in unifiable:
        return unifiable[c]
    try:
        # Look the codepoint up once; the original called name2cp() twice
        # on the success path.
        cp = name2cp(c)
    except KeyError:
        # Unknown entity: pass it through untouched (minus the ';').
        return "&" + c
    return unichr(cp)
def replaceEntities(s):
    """re.sub callback: decode the entity reference captured in group 1."""
    ref = s.group(1)
    if ref.startswith("#"):
        # Numeric character reference; drop the leading '#'.
        return charref(ref[1:])
    return entityref(ref)
# Matches one entity reference: numeric (&#65; / &#x41;) or named (&amp;).
r_unescape = re.compile(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));")

def unescape(s):
    """Replace every entity reference in s with its decoded text."""
    return r_unescape.sub(replaceEntities, s)
def fixattrs(attrs):
    """Entity-decode every attribute value in an sgmllib attribute list."""
    # Fix bug in sgmllib.py: attribute values arrive with entity
    # references still encoded.
    if not attrs:
        return attrs
    return [(attr[0], unescape(attr[1])) for attr in attrs]
### End Entity Nonsense ### | |
def onlywhite(line):
    """Return a true value if the line consists only of whitespace.

    Returns False at the first printable character; for an all-whitespace
    line the line itself is returned (truthy when non-empty, falsy for ""),
    which is all the boolean-context callers need.
    """
    for c in line:
        # The original compared with `c is not ' '` -- identity against a
        # string literal, which only worked via CPython interning -- and
        # its second operand was a duplicated ' ' (a '\t' lost somewhere),
        # making the tab check dead.  Compare by value instead.
        if c != ' ' and c != '\t':
            return False
    return line
def optwrap(text):
    """Wrap all paragraphs in the provided text.

    Returns text unchanged when BODY_WIDTH is 0.  Paragraphs that start
    with a space, '-' or '*' look like preformatted/list markup and are
    passed through unwrapped.
    """
    if not BODY_WIDTH:
        return text

    # `wrap` only exists if the textwrap import at the top succeeded.
    assert wrap, "Requires Python 2.3."
    result = ''
    newlines = 0
    for para in text.split("\n"):
        if len(para) > 0:
            # Value comparison replaces the original's `para[0] is not ' '`
            # identity tests, which depended on small-string interning.
            if para[0] not in (' ', '-', '*'):
                for line in wrap(para, BODY_WIDTH):
                    result += line + "\n"
                result += "\n"
                newlines = 2
            else:
                # Markup-looking paragraph: keep as-is unless pure whitespace.
                if not onlywhite(para):
                    result += para + "\n"
                    newlines = 1
        else:
            # Collapse runs of blank lines down to at most one blank line.
            if newlines < 2:
                result += "\n"
                newlines += 1
    return result
def hn(tag):
    """Return the heading level of an h1..h9 tag name; falsy otherwise."""
    if tag[0] == 'h' and len(tag) == 2:
        try:
            level = int(tag[1])
        except ValueError:
            return 0
        if 1 <= level <= 9:
            return level
class _html2text(sgmllib.SGMLParser):
    """SGML parser that renders the tags it sees as Markdown text.

    Output goes through self.out (a write callable, or an internal buffer
    when constructed with out=None); retrieve buffered text via close().
    """

    def __init__(self, out=sys.stdout.write):
        sgmllib.SGMLParser.__init__(self)

        if out is None: self.out = self.outtextf
        else: self.out = out
        self.outtext = u''      # accumulated output when buffering via outtextf
        self.quiet = 0          # >0 while inside <head>/<style>/<script>: suppress output
        self.p_p = 0            # number of newlines to emit before the next data
        self.outcount = 0       # count of out() calls; used to order link footnotes
        self.start = 1          # true until the first visible output is written
        self.space = 0          # pending collapsed-whitespace separator
        self.a = []             # links awaiting footnote emission
        self.astack = []        # open <a> tags (None for skipped anchors)
        self.acount = 0         # running link/image footnote number
        self.list = []          # stack of open list contexts ({'name', 'num'})
        self.blockquote = 0     # current blockquote nesting depth
        self.pre = 0            # true while inside <pre>
        self.startpre = 0       # true right after <pre> opens
        self.lastWasNL = 0      # last emitted character was a newline
        self.abbr_title = None # current abbreviation definition
        self.abbr_data = None # last inner HTML (for abbr being defined)
        self.abbr_list = {} # stack of abbreviations to write later

    def outtextf(self, s):
        # Buffer output instead of writing it (used when out=None).
        self.outtext += s

    def close(self):
        """Finish parsing, flush pending output, and return the text."""
        sgmllib.SGMLParser.close(self)

        self.pbr()
        self.o('', 0, 'end')

        return self.outtext

    def handle_charref(self, c):
        # Numeric character reference (&#NN; / &#xNN;).
        self.o(charref(c))

    def handle_entityref(self, c):
        # Named entity reference (&amp; etc.).
        self.o(entityref(c))

    def unknown_starttag(self, tag, attrs):
        self.handle_tag(tag, attrs, 1)

    def unknown_endtag(self, tag):
        self.handle_tag(tag, None, 0)

    def previousIndex(self, attrs):
        """ returns the index of certain set of attributes (of a link) in the
            self.a list

            If the set of attributes is not found, returns None
        """
        if not attrs.has_key('href'): return None

        i = -1
        for a in self.a:
            i += 1
            match = 0

            # Same href; titles (when present on either side) must agree too.
            if a.has_key('href') and a['href'] == attrs['href']:
                if a.has_key('title') or attrs.has_key('title'):
                    if (a.has_key('title') and attrs.has_key('title') and
                        a['title'] == attrs['title']):
                        match = True
                else:
                    match = True

            if match: return i

    def handle_tag(self, tag, attrs, start):
        """Translate one start (start=1) or end (start=0) tag to Markdown.

        For end tags attrs is None.  Each tag family below is handled by
        an independent `if`, so a tag can trigger several effects.
        """
        attrs = fixattrs(attrs)

        if hn(tag):
            self.p()
            if start: self.o(hn(tag)*"#" + ' ')

        if tag in ['p', 'div']: self.p()

        if tag == "br" and start: self.o(" \n")

        if tag == "hr" and start:
            self.p()
            self.o("* * *")
            self.p()

        # Suppress output inside non-content elements.
        if tag in ["head", "style", 'script']:
            if start: self.quiet += 1
            else: self.quiet -= 1

        if tag in ["body"]:
            self.quiet = 0 # sites like 9rules.com never close <head>

        if tag == "blockquote":
            if start:
                self.p(); self.o('> ', 0, 1); self.start = 1
                self.blockquote += 1
            else:
                self.blockquote -= 1
                self.p()

        if tag in ['em', 'i', 'u']: self.o("_")
        if tag in ['strong', 'b']: self.o("**")
        if tag == "code" and not self.pre: self.o('`') #TODO: `` `this` ``
        if tag == "abbr":
            if start:
                attrsD = {}
                for (x, y) in attrs: attrsD[x] = y
                attrs = attrsD

                self.abbr_title = None
                self.abbr_data = ''
                if attrs.has_key('title'):
                    self.abbr_title = attrs['title']
            else:
                # Closing </abbr>: remember the collected text -> title
                # mapping; o() emits them all at the very end.
                if self.abbr_title != None:
                    self.abbr_list[self.abbr_data] = self.abbr_title
                    self.abbr_title = None
                self.abbr_data = ''

        if tag == "a":
            if start:
                attrsD = {}
                for (x, y) in attrs: attrsD[x] = y
                attrs = attrsD
                if attrs.has_key('href') and not (SKIP_INTERNAL_LINKS and attrs['href'].startswith('#')):
                    self.astack.append(attrs)
                    self.o("[")
                else:
                    # Anchor without href (or skipped internal link):
                    # push a placeholder so the end tag stays balanced.
                    self.astack.append(None)
            else:
                if self.astack:
                    a = self.astack.pop()
                    if a:
                        # Reuse an existing footnote number for a repeated
                        # link, otherwise assign the next one.
                        i = self.previousIndex(a)
                        if i is not None:
                            a = self.a[i]
                        else:
                            self.acount += 1
                            a['count'] = self.acount
                            a['outcount'] = self.outcount
                            self.a.append(a)
                        self.o("][" + `a['count']` + "]")

        if tag == "img" and start:
            attrsD = {}
            for (x, y) in attrs: attrsD[x] = y
            attrs = attrsD
            if attrs.has_key('src'):
                # Images share the link-footnote machinery via 'href'.
                attrs['href'] = attrs['src']
                alt = attrs.get('alt', '')
                i = self.previousIndex(attrs)
                if i is not None:
                    attrs = self.a[i]
                else:
                    self.acount += 1
                    attrs['count'] = self.acount
                    attrs['outcount'] = self.outcount
                    self.a.append(attrs)
                self.o("![")
                self.o(alt)
                self.o("]["+`attrs['count']`+"]")

        if tag == 'dl' and start: self.p()
        if tag == 'dt' and not start: self.pbr()
        if tag == 'dd' and start: self.o(' ')
        if tag == 'dd' and not start: self.pbr()

        if tag in ["ol", "ul"]:
            if start:
                self.list.append({'name':tag, 'num':0})
            else:
                if self.list: self.list.pop()

            self.p()

        if tag == 'li':
            if start:
                self.pbr()
                if self.list: li = self.list[-1]
                else: li = {'name':'ul', 'num':0}
                self.o(" "*len(self.list)) #TODO: line up <ol><li>s > 9 correctly.
                if li['name'] == "ul": self.o("* ")
                elif li['name'] == "ol":
                    li['num'] += 1
                    self.o(`li['num']`+". ")
                self.start = 1
            else:
                self.pbr()

        if tag in ["table", "tr"] and start: self.p()
        if tag == 'td': self.pbr()

        if tag == "pre":
            if start:
                self.startpre = 1
                self.pre = 1
            else:
                self.pre = 0
            self.p()

    def pbr(self):
        # Request a single line break (unless a paragraph break is pending).
        if self.p_p == 0: self.p_p = 1

    def p(self): self.p_p = 2

    def o(self, data, puredata=0, force=0):
        """Emit data, applying pending breaks, blockquote prefixes, etc.

        puredata=1 marks document text whose whitespace may be collapsed;
        force=1 emits even empty data; force='end' flushes final footnotes.
        """
        # Collect inner text of an <abbr> being defined.
        if self.abbr_data is not None: self.abbr_data += data

        if not self.quiet:
            if puredata and not self.pre:
                # Collapse runs of whitespace; remember a stripped leading
                # space so it can be re-inserted between words.
                data = re.sub('\s+', ' ', data)
                if data and data[0] == ' ':
                    self.space = 1
                    data = data[1:]
            if not data and not force: return

            if self.startpre:
                #self.out(" :") #TODO: not output when already one there
                self.startpre = 0

            # Prefix for the current blockquote nesting level.
            bq = (">" * self.blockquote)
            if not (force and data and data[0] == ">") and self.blockquote: bq += " "

            if self.pre:
                bq += " "
                data = data.replace("\n", "\n"+bq)

            if self.start:
                # Nothing emitted yet: drop any pending space/breaks.
                self.space = 0
                self.p_p = 0
                self.start = 0

            if force == 'end':
                # It's the end.
                self.p_p = 0
                self.out("\n")
                self.space = 0

            if self.p_p:
                # Emit the requested number of newlines, each with the
                # blockquote prefix.
                self.out(('\n'+bq)*self.p_p)
                self.space = 0

            if self.space:
                if not self.lastWasNL: self.out(' ')
                self.space = 0

            # Flush accumulated link footnotes at paragraph breaks (if
            # configured) or at the very end of the document.
            if self.a and ((self.p_p == 2 and LINKS_EACH_PARAGRAPH) or force == "end"):
                if force == "end": self.out("\n")

                newa = []
                for link in self.a:
                    if self.outcount > link['outcount']:
                        self.out(" ["+`link['count']`+"]: " + link['href']) #TODO: base href
                        if link.has_key('title'): self.out(" ("+link['title']+")")
                        self.out("\n")
                    else:
                        # Link text hasn't been emitted yet; keep for later.
                        newa.append(link)

                if self.a != newa: self.out("\n") # Don't need an extra line when nothing was done.

                self.a = newa

            if self.abbr_list and force == "end":
                for abbr, definition in self.abbr_list.items():
                    self.out(" *[" + abbr + "]: " + definition + "\n")

            self.p_p = 0
            self.out(data)
            self.lastWasNL = data and data[-1] == '\n'
            self.outcount += 1

    def handle_data(self, data):
        # Heuristic: an escaped </script> inside data means sgmllib missed
        # the real close tag; undo the quiet increment.
        if r'\/script>' in data: self.quiet -= 1
        self.o(data, 1)

    def unknown_decl(self, data): pass
def wrapwrite(text): sys.stdout.write(text.encode('utf8')) | |
def html2text_file(html, out=wrapwrite):
    """Run html through the parser; return buffered text when out is None."""
    parser = _html2text(out)
    parser.feed(html)
    # A final empty feed nudges sgmllib to process any trailing buffer.
    parser.feed("")
    return parser.close()
def html2text(html):
    """Convert an HTML document to Markdown text wrapped at BODY_WIDTH."""
    markdown = html2text_file(html, None)
    return optwrap(markdown)
if __name__ == "__main__":
    # Usage: html2text.py [url-or-file [encoding]]; reads stdin otherwise.
    if sys.argv[1:]:
        arg = sys.argv[1]
        if arg.startswith('http://'):
            j = urllib.urlopen(arg)
            # Borrow feedparser's charset sniffing when it is installed.
            try:
                from feedparser import _getCharacterEncoding as enc
            except ImportError:
                enc = lambda x, y: ('utf-8', 1)
            text = j.read()
            encoding = enc(j.headers, text)[0]
            # Servers that claim us-ascii usually mean utf-8 in practice.
            if encoding == 'us-ascii': encoding = 'utf-8'
            data = text.decode(encoding)
        else:
            # Local file; optional second argument overrides the encoding.
            encoding = 'utf8'
            if len(sys.argv) > 2:
                encoding = sys.argv[2]
            data = open(arg, 'r').read().decode(encoding)
    else:
        data = sys.stdin.read().decode('utf8')
    wrapwrite(html2text(data))
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
'''OpenAnything: a kind and thoughtful library for HTTP web services

This program is part of 'Dive Into Python', a free Python book for
experienced programmers. Visit http://diveintopython.org/ for the
latest version.
'''
__author__ = 'Mark Pilgrim ([email protected])'
__version__ = '$Revision: 1.6 $'[11:-2]
__date__ = '$Date: 2004/04/16 21:16:24 $'
__copyright__ = 'Copyright (c) 2004 Mark Pilgrim'
__license__ = 'Python'

import urllib2, urlparse, gzip
from StringIO import StringIO

# User-Agent sent with every request; the '+URL' suffix is the conventional
# way for a bot to advertise a contact page.
USER_AGENT = 'OpenAnything/%s +http://diveintopython.org/http_web_services/' % __version__
class SmartRedirectHandler(urllib2.HTTPRedirectHandler):
    """Redirect handler that records the redirect's HTTP status code.

    urllib2 follows redirects silently; stashing the code on the result
    lets callers (e.g. fetch()) see that a 301/302 happened.
    """
    def http_error_301(self, req, fp, code, msg, headers):
        # Let the base class follow the permanent redirect, then tag the result.
        result = urllib2.HTTPRedirectHandler.http_error_301(
            self, req, fp, code, msg, headers)
        result.status = code
        return result

    def http_error_302(self, req, fp, code, msg, headers):
        # Same treatment for temporary redirects.
        result = urllib2.HTTPRedirectHandler.http_error_302(
            self, req, fp, code, msg, headers)
        result.status = code
        return result
class DefaultErrorHandler(urllib2.HTTPDefaultErrorHandler):
    """Error handler that returns the HTTPError (with .status) instead of raising.

    This lets callers treat 4xx/5xx responses as readable results carrying
    their status code, rather than as exceptions.
    """
    def http_error_default(self, req, fp, code, msg, headers):
        result = urllib2.HTTPError(
            req.get_full_url(), code, msg, headers, fp)
        result.status = code
        return result
def openAnything(source, etag=None, lastmodified=None, agent=USER_AGENT):
    """URL, filename, or string --> stream

    This function lets you define parsers that take any input source
    (URL, pathname to local or network file, or actual data as a string)
    and deal with it in a uniform manner.  Returned object is guaranteed
    to have all the basic stdio read methods (read, readline, readlines).
    Just .close() the object when you're done with it.

    If the etag argument is supplied, it will be used as the value of an
    If-None-Match request header.

    If the lastmodified argument is supplied, it must be a formatted
    date/time string in GMT (as returned in the Last-Modified header of
    a previous request).  The formatted date/time will be used
    as the value of an If-Modified-Since request header.

    If the agent argument is supplied, it will be used as the value of a
    User-Agent request header.
    """
    # Already a file-like object: hand it back untouched.
    if hasattr(source, 'read'):
        return source

    if source == '-':
        # '-' conventionally means standard input.  Bug fix: this module
        # never imported sys, so this branch previously raised NameError.
        import sys
        return sys.stdin

    if urlparse.urlparse(source)[0] == 'http':
        # open URL with urllib2
        request = urllib2.Request(source)
        request.add_header('User-Agent', agent)
        if lastmodified:
            request.add_header('If-Modified-Since', lastmodified)
        if etag:
            request.add_header('If-None-Match', etag)
        # Ask for gzip; fetch() transparently decompresses the payload.
        request.add_header('Accept-encoding', 'gzip')
        opener = urllib2.build_opener(SmartRedirectHandler(), DefaultErrorHandler())
        return opener.open(request)

    # try to open with native open function (if source is a filename)
    try:
        return open(source)
    except (IOError, OSError):
        pass

    # treat source as string
    return StringIO(str(source))
def fetch(source, etag=None, lastmodified=None, agent=USER_AGENT):
    '''Fetch data and metadata from a URL, file, stream, or string'''
    info = {}
    stream = openAnything(source, etag, lastmodified, agent)
    info['data'] = stream.read()
    if hasattr(stream, 'headers'):
        # Remember the validators the server sent, for conditional re-fetches.
        info['etag'] = stream.headers.get('ETag')
        info['lastmodified'] = stream.headers.get('Last-Modified')
        if stream.headers.get('content-encoding') == 'gzip':
            # Payload arrived gzip-compressed; inflate it transparently.
            info['data'] = gzip.GzipFile(fileobj=StringIO(info['data'])).read()
    if hasattr(stream, 'url'):
        info['url'] = stream.url
        # Default to 200; overridden below when a handler recorded a status.
        info['status'] = 200
    if hasattr(stream, 'status'):
        info['status'] = stream.status
    stream.close()
    return info
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import html2text
# NOTE(review): openAnything is imported but never used below -- confirm
# whether it is still needed.
from openanything import openAnything
from fileinput import input
import codecs
import sys

# Re-wrap stdout so Unicode strings are encoded to UTF-8 on write.
sys.stdout = codecs.getwriter('utf-8')(sys.stdout)

""" replace icky character entities with Unicode goodness """
if __name__ == "__main__": | |
html2text.UNICODESNOB = 1 | |
for line in input(): | |
line = line.decode('utf-8') | |
print html2text.unescape(line), |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment