#!/usr/bin/env python | |
"""html2text: Turn HTML into equivalent Markdown-structured text.""" | |
__version__ = "3.1" | |
__author__ = "Aaron Swartz ([email protected])" | |
__copyright__ = "(C) 2004-2008 Aaron Swartz. GNU GPL 3." | |
__contributors__ = ["Martin 'Joey' Schulze", "Ricardo Reyes", "Kevin Jay North"] | |
# TODO: | |
# Support decoded entities with unifiable. | |
try: | |
True | |
except NameError: | |
setattr(__builtins__, 'True', 1) | |
setattr(__builtins__, 'False', 0) | |
def has_key(x, y): | |
if hasattr(x, 'has_key'): return x.has_key(y) | |
else: return y in x | |
try: | |
import htmlentitydefs | |
import urlparse | |
import HTMLParser | |
except ImportError: #Python3 | |
import html.entities as htmlentitydefs | |
import urllib.parse as urlparse | |
import html.parser as HTMLParser | |
try: #Python3 | |
import urllib.request as urllib | |
except: | |
import urllib | |
import optparse, re, sys, codecs, types | |
try: from textwrap import wrap | |
except: pass | |
# Use Unicode characters instead of their ASCII pseudo-replacements
UNICODE_SNOB = 0 | |
# Put the links after each paragraph instead of at the end. | |
LINKS_EACH_PARAGRAPH = 0 | |
# Wrap long lines at position. 0 for no wrapping. (Requires Python 2.3.) | |
BODY_WIDTH = False | |
# Don't show internal links (href="#local-anchor") -- corresponding link targets | |
# won't be visible in the plain text file anyway. | |
SKIP_INTERNAL_LINKS = True | |
# Use inline, rather than reference, formatting for images and links | |
INLINE_LINKS = True | |
# Number of pixels Google indents nested lists | |
GOOGLE_LIST_INDENT = 36 | |
IGNORE_ANCHORS = False | |
IGNORE_IMAGES = False | |
### Entity Nonsense ### | |
def name2cp(k): | |
if k == 'apos': return ord("'") | |
if hasattr(htmlentitydefs, "name2codepoint"): # requires Python 2.3 | |
return htmlentitydefs.name2codepoint[k] | |
else: | |
k = htmlentitydefs.entitydefs[k] | |
if k.startswith("&#") and k.endswith(";"): return int(k[2:-1]) # not in latin-1 | |
return ord(codecs.latin_1_decode(k)[0]) | |
unifiable = {'rsquo':"'", 'lsquo':"'", 'rdquo':'"', 'ldquo':'"', | |
'copy':'(C)', 'mdash':'--', 'nbsp':' ', 'rarr':'->', 'larr':'<-', 'middot':'*', | |
'ndash':'-', 'oelig':'oe', 'aelig':'ae', | |
'agrave':'a', 'aacute':'a', 'acirc':'a', 'atilde':'a', 'auml':'a', 'aring':'a', | |
'egrave':'e', 'eacute':'e', 'ecirc':'e', 'euml':'e', | |
'igrave':'i', 'iacute':'i', 'icirc':'i', 'iuml':'i', | |
'ograve':'o', 'oacute':'o', 'ocirc':'o', 'otilde':'o', 'ouml':'o', | |
'ugrave':'u', 'uacute':'u', 'ucirc':'u', 'uuml':'u', | |
'lrm':'', 'rlm':''} | |
unifiable_n = {} | |
for k in unifiable.keys(): | |
unifiable_n[name2cp(k)] = unifiable[k] | |
def charref(name): | |
if name[0] in ['x','X']: | |
c = int(name[1:], 16) | |
else: | |
c = int(name) | |
if not UNICODE_SNOB and c in unifiable_n.keys(): | |
return unifiable_n[c] | |
else: | |
try: | |
return unichr(c) | |
except NameError: #Python3 | |
return chr(c) | |
def entityref(c): | |
if not UNICODE_SNOB and c in unifiable.keys(): | |
return unifiable[c] | |
else: | |
try: name2cp(c) | |
except KeyError: return "&" + c + ';' | |
else: | |
try: | |
return unichr(name2cp(c)) | |
except NameError: #Python3 | |
return chr(name2cp(c)) | |
def replaceEntities(s): | |
s = s.group(1) | |
if s[0] == "#": | |
return charref(s[1:]) | |
else: return entityref(s) | |
r_unescape = re.compile(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));") | |
def unescape(s): | |
return r_unescape.sub(replaceEntities, s) | |
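# Illustrative example (not part of the original source), with UNICODE_SNOB = 0:
#   unescape("fish &amp; chips &mdash; half price")
#   => u'fish & chips -- half price'
# Named entities listed in `unifiable` are mapped to ASCII stand-ins; other known
# names are resolved through htmlentitydefs into Unicode characters, and unknown
# names are left untouched as "&name;".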
### End Entity Nonsense ### | |
def onlywhite(line):
    """Return True if the line consists only of whitespace characters."""
    for c in line:
        if c != ' ' and c != '\t':
            return False
    return True
def optwrap(text):
    """Wrap all paragraphs in the provided text."""
    # Wrapping is disabled in this modified copy; the original wrapping code
    # below is kept but never reached.
    return text
if not BODY_WIDTH: | |
return text | |
assert wrap, "Requires Python 2.3." | |
result = '' | |
newlines = 0 | |
for para in text.split("\n"): | |
if len(para) > 0: | |
if para[0] != ' ' and para[0] != '-' and para[0] != '*': | |
for line in wrap(para, BODY_WIDTH): | |
result += line + "\n" | |
result += "\n" | |
newlines = 2 | |
else: | |
if not onlywhite(para): | |
result += para + "\n" | |
newlines = 1 | |
else: | |
if newlines < 2: | |
result += "\n" | |
newlines += 1 | |
return result | |
def hn(tag): | |
if tag[0] == 'h' and len(tag) == 2: | |
try: | |
n = int(tag[1]) | |
if n in range(1, 10): return n | |
except ValueError: return 0 | |
def dumb_property_dict(style): | |
"""returns a hash of css attributes""" | |
return dict([(x.strip(), y.strip()) for x, y in [z.split(':', 1) for z in style.split(';') if ':' in z]]); | |
def dumb_css_parser(data): | |
"""returns a hash of css selectors, each of which contains a hash of css attributes""" | |
# remove @import sentences | |
importIndex = data.find('@import') | |
while importIndex != -1: | |
data = data[0:importIndex] + data[data.find(';', importIndex) + 1:] | |
importIndex = data.find('@import') | |
# parse the css. reverted from dictionary comprehension in order to support older Pythons
elements = [x.split('{') for x in data.split('}') if '{' in x.strip()] | |
elements = dict([(a.strip(), dumb_property_dict(b)) for a, b in elements]) | |
return elements | |
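# Illustrative example (not part of the original source):
#   dumb_css_parser(".title {color: red; font-weight: bold}")
#   => {'.title': {'color': 'red', 'font-weight': 'bold'}}
# element_style() below layers the matching '.class' entries from this dict on top
# of the parent style, with the element's inline style attribute taking final precedence.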
def element_style(attrs, style_def, parent_style): | |
"""returns a hash of the 'final' style attributes of the element""" | |
style = parent_style.copy() | |
if 'class' in attrs: | |
for css_class in attrs['class'].split(): | |
css_style = style_def['.' + css_class] | |
style.update(css_style) | |
if 'style' in attrs: | |
immediate_style = dumb_property_dict(attrs['style']) | |
style.update(immediate_style) | |
return style | |
def google_list_style(style): | |
"""finds out whether this is an ordered or unordered list""" | |
if 'list-style-type' in style: | |
list_style = style['list-style-type'] | |
if list_style in ['disc', 'circle', 'square', 'none']: | |
return 'ul' | |
return 'ol' | |
def google_nest_count(style): | |
"""calculate the nesting count of google doc lists""" | |
nest_count = 0 | |
if 'margin-left' in style: | |
nest_count = int(style['margin-left'][:-2]) / GOOGLE_LIST_INDENT | |
return nest_count | |
def google_has_height(style): | |
"""check if the style of the element has the 'height' attribute explicitly defined""" | |
if 'height' in style: | |
return True | |
return False | |
def google_text_emphasis(style): | |
"""return a list of all emphasis modifiers of the element""" | |
emphasis = [] | |
if 'text-decoration' in style: | |
emphasis.append(style['text-decoration']) | |
if 'font-style' in style: | |
emphasis.append(style['font-style']) | |
if 'font-weight' in style: | |
emphasis.append(style['font-weight']) | |
return emphasis | |
def google_fixed_width_font(style): | |
"""check if the css of the current element defines a fixed width font""" | |
font_family = '' | |
if 'font-family' in style: | |
font_family = style['font-family'] | |
if 'Courier New' == font_family or 'Consolas' == font_family: | |
return True | |
return False | |
def list_numbering_start(attrs): | |
"""extract numbering from list element attributes""" | |
if 'start' in attrs: | |
return int(attrs['start']) - 1 | |
else: | |
return 0 | |
class _html2text(HTMLParser.HTMLParser): | |
def __init__(self, out=None, baseurl=''): | |
HTMLParser.HTMLParser.__init__(self) | |
if out is None: self.out = self.outtextf | |
else: self.out = out | |
self.outtextlist = [] # empty list to store output characters before they are "joined" | |
try: | |
self.outtext = unicode() | |
except NameError: # Python3 | |
self.outtext = str() | |
self.quiet = 0 | |
self.p_p = 0 # number of newline character to print before next output | |
self.outcount = 0 | |
self.start = 1 | |
self.space = 0 | |
self.a = [] | |
self.astack = [] | |
self.acount = 0 | |
self.list = [] | |
self.blockquote = 0 | |
self.pre = 0 | |
self.startpre = 0 | |
self.code = False | |
self.br_toggle = '' | |
self.lastWasNL = 0 | |
self.lastWasList = False | |
self.style = 0 | |
self.style_def = {} | |
self.tag_stack = [] | |
self.emphasis = 0 | |
self.drop_white_space = 0 | |
self.inheader = False | |
self.abbr_title = None # current abbreviation definition | |
self.abbr_data = None # last inner HTML (for abbr being defined) | |
self.abbr_list = {} # stack of abbreviations to write later | |
self.baseurl = baseurl | |
if options.google_doc: | |
del unifiable_n[name2cp('nbsp')] | |
unifiable['nbsp'] = ' _place_holder;' | |
def feed(self, data): | |
data = data.replace("</' + 'script>", "</ignore>") | |
HTMLParser.HTMLParser.feed(self, data) | |
def outtextf(self, s): | |
self.outtextlist.append(s) | |
if s: self.lastWasNL = s[-1] == '\n' | |
def close(self): | |
HTMLParser.HTMLParser.close(self) | |
self.pbr() | |
self.o('', 0, 'end') | |
self.outtext = self.outtext.join(self.outtextlist) | |
if options.google_doc: | |
self.outtext = self.outtext.replace(' _place_holder;', ' '); | |
return self.outtext | |
def handle_charref(self, c): | |
self.o(charref(c), 1) | |
def handle_entityref(self, c): | |
self.o(entityref(c), 1) | |
def handle_starttag(self, tag, attrs): | |
self.handle_tag(tag, attrs, 1) | |
def handle_endtag(self, tag): | |
self.handle_tag(tag, None, 0) | |
def previousIndex(self, attrs): | |
""" returns the index of certain set of attributes (of a link) in the | |
self.a list | |
If the set of attributes is not found, returns None | |
""" | |
if not has_key(attrs, 'href'): return None | |
i = -1 | |
for a in self.a: | |
i += 1 | |
match = 0 | |
if has_key(a, 'href') and a['href'] == attrs['href']: | |
if has_key(a, 'title') or has_key(attrs, 'title'): | |
if (has_key(a, 'title') and has_key(attrs, 'title') and | |
a['title'] == attrs['title']): | |
match = True | |
else: | |
match = True | |
if match: return i | |
def drop_last(self, nLetters): | |
if not self.quiet: | |
self.outtext = self.outtext[:-nLetters] | |
def handle_emphasis(self, start, tag_style, parent_style): | |
"""handles various text emphases""" | |
tag_emphasis = google_text_emphasis(tag_style) | |
parent_emphasis = google_text_emphasis(parent_style) | |
# handle Google's text emphasis | |
strikethrough = 'line-through' in tag_emphasis and options.hide_strikethrough | |
bold = 'bold' in tag_emphasis and not 'bold' in parent_emphasis | |
italic = 'italic' in tag_emphasis and not 'italic' in parent_emphasis | |
fixed = google_fixed_width_font(tag_style) and not \ | |
google_fixed_width_font(parent_style) and not self.pre | |
if start: | |
# crossed-out text must be handled before other attributes | |
# in order not to output qualifiers unnecessarily | |
if bold or italic or fixed: | |
self.emphasis += 1 | |
if strikethrough: | |
self.quiet += 1 | |
if italic: | |
self.o("_") | |
self.drop_white_space += 1 | |
if bold: | |
self.o("**") | |
self.drop_white_space += 1 | |
if fixed: | |
self.o('`') | |
self.drop_white_space += 1 | |
self.code = True | |
else: | |
if bold or italic or fixed: | |
# there must not be whitespace before closing emphasis mark | |
self.emphasis -= 1 | |
self.space = 0 | |
self.outtext = self.outtext.rstrip() | |
if fixed: | |
if self.drop_white_space: | |
# empty emphasis, drop it | |
self.drop_last(1) | |
self.drop_white_space -= 1 | |
else: | |
self.o('`') | |
self.code = False | |
if bold: | |
if self.drop_white_space: | |
# empty emphasis, drop it | |
self.drop_last(2) | |
self.drop_white_space -= 1 | |
else: | |
self.o("**") | |
if italic: | |
if self.drop_white_space: | |
# empty emphasis, drop it | |
self.drop_last(1) | |
self.drop_white_space -= 1 | |
else: | |
self.o("_") | |
# space is only allowed after *all* emphasis marks | |
if (bold or italic) and not self.emphasis: | |
self.o(" ") | |
if strikethrough: | |
self.quiet -= 1 | |
def handle_tag(self, tag, attrs, start): | |
#attrs = fixattrs(attrs) | |
if attrs is None: | |
attrs = {} | |
else: | |
attrs = dict(attrs) | |
if options.google_doc: | |
# the attrs parameter is empty for a closing tag. in addition, we | |
# need the attributes of the parent nodes in order to get a | |
# complete style description for the current element. we assume | |
# that google docs export well formed html. | |
parent_style = {} | |
if start: | |
if self.tag_stack: | |
parent_style = self.tag_stack[-1][2] | |
tag_style = element_style(attrs, self.style_def, parent_style) | |
self.tag_stack.append((tag, attrs, tag_style)) | |
else: | |
dummy, attrs, tag_style = self.tag_stack.pop() | |
if self.tag_stack: | |
parent_style = self.tag_stack[-1][2] | |
if hn(tag): | |
self.p() | |
if start: | |
self.inheader = True | |
self.o(hn(tag)*"#" + ' ') | |
else: | |
self.inheader = False | |
return # prevent redundant emphasis marks on headers | |
if tag in ['p', 'div']: | |
if options.google_doc: | |
if start and google_has_height(tag_style): | |
self.p() | |
else: | |
self.soft_br() | |
else: | |
self.p() | |
if tag == "br" and start: self.o(" \n") | |
if tag == "hr" and start: | |
self.p() | |
self.o("* * *") | |
self.p() | |
if tag in ["head", "style"]: | |
if start: self.quiet += 1 | |
else: self.quiet -= 1 | |
if tag == "style": | |
if start: self.style += 1 | |
else: self.style -= 1 | |
if tag in ["body"]: | |
self.quiet = 0 # sites like 9rules.com never close <head> | |
if tag == "blockquote": | |
if start: | |
self.p(); self.o('> ', 0, 1); self.start = 1 | |
self.blockquote += 1 | |
else: | |
self.blockquote -= 1 | |
self.p() | |
if tag in ['em', 'i', 'u']: self.o("_") | |
if tag in ['strong', 'b']: self.o("**") | |
if tag in ['del', 'strike', "script"]: | |
if start: | |
self.o("<"+tag+">") | |
else: | |
self.o("</"+tag+">") | |
if options.google_doc: | |
if not self.inheader: | |
# handle some font attributes, but leave headers clean | |
self.handle_emphasis(start, tag_style, parent_style) | |
if tag == "code" and not self.pre: self.o('`') #TODO: `` `this` `` | |
if tag == "abbr": | |
if start: | |
self.abbr_title = None | |
self.abbr_data = '' | |
if has_key(attrs, 'title'): | |
self.abbr_title = attrs['title'] | |
else: | |
if self.abbr_title != None: | |
self.abbr_list[self.abbr_data] = self.abbr_title | |
self.abbr_title = None | |
self.abbr_data = '' | |
if tag == "a" and not IGNORE_ANCHORS: | |
if start: | |
if has_key(attrs, 'href') and not (SKIP_INTERNAL_LINKS and attrs['href'].startswith('#')): | |
self.astack.append(attrs) | |
self.o("[") | |
else: | |
self.astack.append(None) | |
else: | |
if self.astack: | |
a = self.astack.pop() | |
if a: | |
if INLINE_LINKS: | |
self.o("](" + a['href'] + ")") | |
else: | |
i = self.previousIndex(a) | |
if i is not None: | |
a = self.a[i] | |
else: | |
self.acount += 1 | |
a['count'] = self.acount | |
a['outcount'] = self.outcount | |
self.a.append(a) | |
self.o("][" + str(a['count']) + "]") | |
if tag == "img" and start and not IGNORE_IMAGES: | |
if has_key(attrs, 'src'): | |
attrs['href'] = attrs['src'] | |
alt = attrs.get('alt', '') | |
if INLINE_LINKS: | |
self.o("![" + alt + "](" + attrs['href'] + ")")
else: | |
i = self.previousIndex(attrs) | |
if i is not None: | |
attrs = self.a[i] | |
else: | |
self.acount += 1 | |
attrs['count'] = self.acount | |
attrs['outcount'] = self.outcount | |
self.a.append(attrs) | |
self.o("![") | |
self.o(alt) | |
self.o("]["+ str(attrs['count']) +"]") | |
if tag == 'dl' and start: self.p() | |
if tag == 'dt' and not start: self.pbr() | |
if tag == 'dd' and start: self.o('    ')
if tag == 'dd' and not start: self.pbr() | |
if tag in ["ol", "ul"]: | |
# Google Docs create sub lists as top level lists | |
if (not self.list) and (not self.lastWasList): | |
self.p() | |
if start: | |
if options.google_doc: | |
list_style = google_list_style(tag_style) | |
else: | |
list_style = tag | |
numbering_start = list_numbering_start(attrs) | |
self.list.append({'name':list_style, 'num':numbering_start}) | |
else: | |
if self.list: self.list.pop() | |
self.lastWasList = True | |
else: | |
self.lastWasList = False | |
if tag == 'li': | |
self.pbr() | |
if start: | |
if self.list: li = self.list[-1] | |
else: li = {'name':'ul', 'num':0} | |
if options.google_doc: | |
nest_count = google_nest_count(tag_style) | |
else: | |
nest_count = len(self.list) | |
self.o(" " * nest_count) #TODO: line up <ol><li>s > 9 correctly. | |
if li['name'] == "ul": self.o(options.ul_item_mark + " ") | |
elif li['name'] == "ol": | |
li['num'] += 1 | |
self.o(str(li['num'])+". ") | |
self.start = 1 | |
if tag in ["table", "tr"] and start: self.p() | |
if tag == 'td': self.pbr() | |
if tag == "pre": | |
if start: | |
self.startpre = 1 | |
self.pre = 1 | |
else: | |
self.pre = 0 | |
self.p() | |
if tag in ["iframe"]: | |
if start: | |
_tag = "<"+tag | |
for k, v in attrs.items(): # items() works on both Python 2 and 3; iteritems() is Python 2 only
_tag += " "+str(k)+"='"+str(v)+"'" | |
_tag += ">" | |
self.o(_tag) | |
else: | |
self.o("</"+tag+">") | |
def pbr(self): | |
if self.p_p == 0: self.p_p = 1 | |
def p(self): self.p_p = 2 | |
def soft_br(self): | |
self.pbr() | |
self.br_toggle = ' ' | |
def o(self, data, puredata=0, force=0): | |
if self.abbr_data is not None: self.abbr_data += data | |
if not self.quiet: | |
if options.google_doc: | |
# prevent white space immediately after 'begin emphasis' marks ('**' and '_') | |
lstripped_data = data.lstrip() | |
if self.drop_white_space and not (self.pre or self.code): | |
data = lstripped_data | |
if lstripped_data != '': | |
self.drop_white_space = 0 | |
if puredata and not self.pre: | |
data = re.sub('\s+', ' ', data) | |
if data and data[0] == ' ': | |
self.space = 1 | |
data = data[1:] | |
if not data and not force: return | |
if self.startpre: | |
#self.out(" :") #TODO: not output when already one there | |
self.startpre = 0 | |
bq = (">" * self.blockquote) | |
if not (force and data and data[0] == ">") and self.blockquote: bq += " " | |
if self.pre: | |
bq += "    "
data = data.replace("\n", "\n"+bq) | |
if self.start: | |
self.space = 0 | |
self.p_p = 0 | |
self.start = 0 | |
if force == 'end': | |
# It's the end. | |
self.p_p = 0 | |
self.out("\n") | |
self.space = 0 | |
if self.p_p: | |
self.out((self.br_toggle+'\n'+bq)*self.p_p) | |
self.space = 0 | |
self.br_toggle = '' | |
if self.space: | |
if not self.lastWasNL: self.out(' ') | |
self.space = 0 | |
if self.a and ((self.p_p == 2 and LINKS_EACH_PARAGRAPH) or force == "end"): | |
if force == "end": self.out("\n") | |
newa = [] | |
for link in self.a: | |
if self.outcount > link['outcount']: | |
self.out(" ["+ str(link['count']) +"]: " + urlparse.urljoin(self.baseurl, link['href'])) | |
if has_key(link, 'title'): self.out(" ("+link['title']+")") | |
self.out("\n") | |
else: | |
newa.append(link) | |
if self.a != newa: self.out("\n") # Don't need an extra line when nothing was done. | |
self.a = newa | |
if self.abbr_list and force == "end": | |
for abbr, definition in self.abbr_list.items(): | |
self.out(" *[" + abbr + "]: " + definition + "\n") | |
self.p_p = 0 | |
self.out(data) | |
self.outcount += 1 | |
def handle_data(self, data): | |
if r'\/script>' in data: self.quiet -= 1 | |
if self.style: | |
self.style_def.update(dumb_css_parser(data)) | |
self.o(data, 1) | |
def unknown_decl(self, data): pass | |
def wrapwrite(text): | |
text = text.encode('utf-8') | |
try: #Python3 | |
sys.stdout.buffer.write(text) | |
except AttributeError: | |
sys.stdout.write(text) | |
def html2text_file(html, out=wrapwrite, baseurl=''): | |
h = _html2text(out, baseurl) | |
h.feed(html) | |
h.feed("") | |
return h.close() | |
def html2text(html, baseurl=''): | |
return optwrap(html2text_file(html, None, baseurl)) | |
class Storage: pass | |
options = Storage() | |
options.google_doc = False | |
options.ul_item_mark = '*' | |
if __name__ == "__main__": | |
baseurl = '' | |
p = optparse.OptionParser('%prog [(filename|url) [encoding]]', | |
version='%prog ' + __version__) | |
p.add_option("-g", "--google-doc", action="store_true", dest="google_doc", | |
default=False, help="convert an html-exported Google Document") | |
p.add_option("-d", "--dash-unordered-list", action="store_true", dest="ul_style_dash", | |
default=False, help="use a dash rather than a star for unordered list items") | |
p.add_option("-b", "--body-width", dest="body_width", action="store", type="int", | |
default=78, help="number of characters per output line, 0 for no wrap") | |
p.add_option("-i", "--google-list-indent", dest="list_indent", action="store", type="int", | |
default=GOOGLE_LIST_INDENT, help="number of pixels Google indents nested lists") | |
p.add_option("-s", "--hide-strikethrough", action="store_true", dest="hide_strikethrough", | |
default=False, help="hide strike-through text. only relevant when -g is specified as well")
(options, args) = p.parse_args() | |
# handle options | |
if options.ul_style_dash: | |
options.ul_item_mark = '-' | |
else: | |
options.ul_item_mark = '*' | |
BODY_WIDTH = options.body_width | |
GOOGLE_LIST_INDENT = options.list_indent | |
# process input | |
if len(args) > 0: | |
file_ = args[0] | |
encoding = None | |
if len(args) == 2: | |
encoding = args[1] | |
if len(args) > 2: | |
p.error('Too many arguments') | |
if file_.startswith('http://') or file_.startswith('https://'): | |
baseurl = file_ | |
j = urllib.urlopen(baseurl) | |
text = j.read() | |
if encoding is None: | |
try: | |
from feedparser import _getCharacterEncoding as enc | |
except ImportError: | |
enc = lambda x, y: ('utf-8', 1) | |
encoding = enc(j.headers, text)[0] | |
if encoding == 'us-ascii': | |
encoding = 'utf-8' | |
data = text.decode(encoding) | |
else: | |
data = open(file_, 'rb').read() | |
if encoding is None: | |
try: | |
from chardet import detect | |
except ImportError: | |
detect = lambda x: {'encoding': 'utf-8'} | |
encoding = detect(data)['encoding'] | |
data = data.decode(encoding) | |
else: | |
data = sys.stdin.read() | |
wrapwrite(html2text(data, baseurl)) |
#!/usr/bin/env python
# encoding: utf-8
"""
tumblr2calepin.py
Created by jet tsang <[email protected]> on 2013-04-15T12:50:22.889342+08:00
"""
import sys
from datetime import datetime

file1 = sys.argv[1]
f1 = open(file1, "r")
data = f1.readlines()
f1.close()

y = 1
Slug = None
Tags = None
Title = None
# read the !$$$! header lines produced by tumblr_backup.py and html2text.py
for x in data:
    x = unicode(x, "utf8")
    xsp = x.split("!$$$!")
    if "date" == xsp[0]:
        d = datetime.strptime(xsp[1].strip(), '%m/%d/%Y %H:%M:%S')
        day_string = d.strftime('%Y-%m-%d %H:%M:%S')
        day_string_corto = d.strftime('%Y')
        theDate = day_string
    if "slug" == xsp[0]:
        Slug = xsp[1].strip()
    if "tag" == xsp[0]:
        Tags = xsp[1].strip().replace("|", ",")
    if "title" == xsp[0]:
        Title = xsp[1].strip()
        if "](ht" in Title:
            # keep only the link text of a Markdown-style "[text](http...)" title
            Title = Title[1:Title.lstrip("[").find("]")]
    if "content" == x[0:7]:
        break
    y += 1

post = 'Date: ' + theDate + '\n'
if Slug:
    # both branches produce the same timestamp-based slug
    if Slug == "view-on-path":
        Slug = d.strftime('%Y%m%d%H%M')
    else:
        Slug = d.strftime('%Y%m%d%H%M')
    post += 'Slug: ' + Slug + '\n'
if Tags:
    post += 'Tags: ' + Tags + '\n'
if Title:
    # likewise, the title is always replaced by the timestamp
    if Title == "view-on-path":
        Title = d.strftime('%Y%m%d%H%M')
    else:
        Title = d.strftime('%Y%m%d%H%M')
    post += 'Title: ' + Title + '\n'
post += "\n"

markdownfile = (day_string_corto + "-" + Title + ".md").replace(" ", "-")
print " ->", Title
out = open(markdownfile, "w")
out.write(post.encode("utf-8"))
# copy the post body: everything after the "content" marker, dropping the
# final line unless it is the only body line
if data[-1] == data[y]:
    out.write(data[y])
else:
    for x in data[y:-1]:
        out.write(x)
out.close()
#!/bin/sh
# Created by jet tsang <[email protected]>
case "$1" in
name)
    python tumblr_backup.py $2   # modified from https://github.com/bdoms/tumblr_backup
    mkdir -p calepin
    cd $2/posts/
    for n in *.html
    do
        rm -f temp.txt
        echo "==> processing" $n
        ../../html2text.py $n > temp.txt   # modified from http://www.aaronsw.com/2002/html2text/
        ../../tumblr2calepin.py temp.txt
    done
    mv -f *.md ../../calepin/
    ;;
*)
    echo "E.g: $0 name YourBlogName"
    exit 3
esac
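For reference, a minimal end-to-end run of this gist's pipeline might look like the following. This is a sketch only: the gist does not name the shell script, so run.sh is a placeholder, as is YourBlogName.

sh run.sh name YourBlogName
# which, per post, amounts to roughly:
#   python tumblr_backup.py YourBlogName       # dump each post as HTML carrying !$$$! field markers
#   ../../html2text.py <post>.html > temp.txt  # strip the HTML, keeping the markers
#   ../../tumblr2calepin.py temp.txt           # write a .md file with Date/Slug/Tags/Title headers
# the resulting Markdown files are collected in ./calepin/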
#!/usr/bin/env python | |
# encoding: utf-8 | |
# standard Python library imports | |
from __future__ import with_statement | |
import os | |
import sys | |
import urllib | |
import urllib2 | |
from xml.sax.saxutils import escape | |
from xml.sax import SAXException | |
import codecs | |
import imghdr | |
from collections import defaultdict | |
import time | |
import locale | |
from glob import glob | |
import re | |
# extra required packages | |
import xmltramp | |
join = os.path.join | |
# add another JPEG recognizer | |
# see http://www.garykessler.net/library/file_sigs.html | |
def test_jpg(h, f): | |
if h[:3] == '\xFF\xD8\xFF' and h[3] in "\xDB\xE0\xE1\xE2\xE3": | |
return 'jpg' | |
imghdr.tests.append(test_jpg) | |
# variable directory names, will be set in TumblrBackup.backup() | |
save_folder = '' | |
image_folder = '' | |
# constant names | |
root_folder = os.getcwdu() | |
post_dir = 'posts' | |
xml_dir = 'xml' | |
image_dir = 'images' | |
archive_dir = 'archive' | |
theme_dir = 'theme' | |
backup_css = 'backup.css' | |
custom_css = 'custom.css' | |
avatar_base = 'avatar' | |
blog_name = '' | |
post_header = '' | |
post_ext = '.html' | |
have_custom_css = False | |
# ensure the right date/time format | |
try: | |
locale.setlocale(locale.LC_TIME, '') | |
except locale.Error: | |
pass | |
encoding = 'utf-8' | |
time_encoding = locale.getlocale(locale.LC_TIME)[1] or encoding | |
def log(account, s): | |
if not options.quiet: | |
if account: | |
sys.stdout.write('%s: ' % account) | |
sys.stdout.write(s[:-1] + ' ' * 20 + s[-1:]) | |
sys.stdout.flush() | |
def mkdir(dir, recursive=False): | |
if not os.path.exists(dir): | |
if recursive: | |
os.makedirs(dir) | |
else: | |
os.mkdir(dir) | |
def path_to(*parts): | |
return join(save_folder, *parts) | |
def open_file(open_fn, parts): | |
if len(parts) > 1: | |
mkdir(path_to(*parts[:-1])) | |
return open_fn(path_to(*parts)) | |
def open_text(*parts): | |
return open_file( | |
lambda f: codecs.open(f, 'w', encoding, 'xmlcharrefreplace'), parts | |
) | |
def open_image(*parts): | |
return open_file(lambda f: open(f, 'wb'), parts) | |
def strftime(format, t=None): | |
if t is None: | |
t = time.localtime() | |
return time.strftime(format, t).decode(time_encoding) | |
def get_api_url(account): | |
"""construct the tumblr API URL""" | |
global blog_name | |
blog_name = account | |
if '.' not in account: | |
blog_name += '.tumblr.com' | |
base = 'http://' + blog_name + '/api/read' | |
if options.private: | |
password_manager = urllib2.HTTPPasswordMgrWithDefaultRealm() | |
password_manager.add_password(None, base, '', options.private) | |
auth_manager = urllib2.HTTPBasicAuthHandler(password_manager) | |
opener = urllib2.build_opener(auth_manager) | |
urllib2.install_opener(opener) | |
return base | |
def xmlparse(url, data=None): | |
for _ in range(10): | |
try: | |
resp = urllib2.urlopen(url, data) | |
except (urllib2.URLError, urllib2.HTTPError) as e: | |
sys.stderr.write('%s getting %s\n' % (e, url)) | |
continue | |
if resp.info().gettype() == 'text/xml': | |
break | |
else: | |
return None | |
xml = resp.read() | |
try: | |
doc = xmltramp.parse(xml) | |
except SAXException as e: | |
sys.stderr.write('%s %r\n\n%r\n\n%s\n' % (resp.info().gettype(), resp.msg, e, xml)) | |
return None | |
return doc if doc._name == 'tumblr' else None | |
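# Illustrative note (not part of the original source): xmlparse() is fed URLs of
# the v1 read API built in backup() below, e.g.
#   http://someblog.tumblr.com/api/read?num=50&start=0
# It retries up to 10 times on fetch errors or non-XML responses; the for/else
# returns None if no 'text/xml' response is obtained, and the parsed document
# is returned only when its root element is <tumblr>.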
def save_image(image_url): | |
"""saves an image if not saved yet, returns the local file name""" | |
image_filename = image_url.split('/')[-1] | |
glob_filter = '' if '.' in image_filename else '.*' | |
# check if a file with this name already exists | |
image_glob = glob(join(image_folder, image_filename + glob_filter)) | |
if image_glob: | |
return os.path.split(image_glob[0])[1] | |
# download the image data | |
try: | |
image_response = urllib2.urlopen(image_url) | |
except urllib2.HTTPError: | |
# return the original URL | |
return image_url | |
image_data = image_response.read() | |
image_response.close() | |
# determine the file type if it's unknown | |
if '.' not in image_filename: | |
image_type = imghdr.what(None, image_data[:32]) | |
if image_type: | |
image_filename += '.' + image_type.replace('jpeg', 'jpg') | |
# save the image | |
with open_image(image_dir, image_filename) as image_file: | |
image_file.write(image_data) | |
return image_filename | |
def save_style(): | |
with open_text(backup_css) as css: | |
css.write('''\ | |
body { width: 720px; margin: 0 auto; } | |
img { max-width: 720px; } | |
blockquote { margin-left: 0; border-left: 8px #999 solid; padding: 0 24px; } | |
.archive h1, .subtitle, article { padding-bottom: 0.75em; border-bottom: 1px #ccc dotted; } | |
.post a.llink { display: none; } | |
.meta a { text-decoration: none; } | |
.avatar { float: right; } | |
''') | |
def header(heading, title='', body_class='', subtitle='', avatar=''): | |
root_rel = '' if body_class == 'index' else '../' | |
css_rel = root_rel + (custom_css if have_custom_css else backup_css) | |
if body_class: | |
body_class = ' class=' + body_class | |
h = u'''<!DOCTYPE html> | |
<meta charset=%s> | |
<title>%s</title> | |
<link rel=stylesheet href=%s> | |
<body%s> | |
''' % (encoding, heading, css_rel, body_class) | |
if avatar: | |
h += '<img src=%s%s/%s alt=Avatar class=avatar>\n' % (root_rel, theme_dir, avatar) | |
if title: | |
h += u'<h1>%s</h1>\n' % title | |
if subtitle: | |
h += u'<p class=subtitle>%s</p>\n' % subtitle | |
return h | |
def get_avatar(): | |
try: | |
resp = urllib2.urlopen('http://api.tumblr.com/v2/blog/%s/avatar' % blog_name) | |
avatar_data = resp.read() | |
except: | |
return | |
avatar_file = avatar_base + '.' + imghdr.what(None, avatar_data[:32]) | |
with open_image(theme_dir, avatar_file) as f: | |
f.write(avatar_data) | |
class TumblrBackup: | |
def __init__(self): | |
self.total_count = 0 | |
def build_index(self): | |
for f in glob(path_to(post_dir, '*.html')): | |
post = LocalPost(f) | |
self.index[post.tm.tm_year][post.tm.tm_mon].append(post) | |
def save_index(self): | |
f = glob(path_to(theme_dir, avatar_base + '.*')) | |
avatar = os.path.split(f[0])[1] if f else None | |
with open_text('index.html') as idx: | |
idx.write(header(self.title, self.title, body_class='index', | |
subtitle=self.subtitle, avatar=avatar | |
)) | |
for year in sorted(self.index.keys(), reverse=options.reverse_index): | |
self.save_year(idx, year) | |
idx.write('<p>Generated on %s.</p>\n' % strftime('%x %X')) | |
def save_year(self, idx, year): | |
idx.write('<h3>%s</h3>\n<ul>\n' % year) | |
for month in sorted(self.index[year].keys(), reverse=options.reverse_index): | |
tm = time.localtime(time.mktime([year, month, 3, 0, 0, 0, 0, 0, -1])) | |
month_name = self.save_month(year, month, tm) | |
idx.write(' <li><a href=%s/%s title="%d post(s)">%s</a></li>\n' % ( | |
archive_dir, month_name, len(self.index[year][month]), | |
strftime('%B', tm) | |
)) | |
idx.write('</ul>\n\n') | |
def save_month(self, year, month, tm): | |
file_name = '%d-%02d.html' % (year, month) | |
with open_text(archive_dir, file_name) as arch: | |
arch.write('\n\n'.join([ | |
header(self.title, strftime('%B %Y', tm), body_class='archive'), | |
'\n'.join(p.get_post() for p in sorted( | |
self.index[year][month], key=lambda x: x.date, reverse=options.reverse_month | |
)), | |
'<p><a href=../ rel=contents>Index</a></p>\n' | |
])) | |
return file_name | |
def backup(self, account): | |
"""makes single files and an index for every post on a public Tumblr blog account""" | |
base = get_api_url(account) | |
# make sure there are folders to save in | |
global save_folder, image_folder, post_ext, post_dir, have_custom_css | |
if options.blosxom: | |
save_folder = root_folder | |
post_ext = '.txt' | |
post_dir = os.curdir | |
post_class = BlosxomPost | |
else: | |
save_folder = join(root_folder, account) | |
image_folder = path_to(image_dir) | |
post_class = TumblrPost | |
have_custom_css = os.access(path_to(custom_css), os.R_OK) | |
mkdir(save_folder, True) | |
self.post_count = 0 | |
# prepare the period start and end timestamps | |
if options.period: | |
i = 0; tm = [int(options.period[:4]), 1, 1, 0, 0, 0, 0, 0, -1] | |
if len(options.period) >= 6: | |
i = 1; tm[1] = int(options.period[4:6]) | |
if len(options.period) == 8: | |
i = 2; tm[2] = int(options.period[6:8]) | |
p_start = time.mktime(tm) | |
tm[i] += 1 | |
p_stop = time.mktime(tm) | |
# get the highest post id already saved | |
ident_max = None | |
if options.incremental: | |
try: | |
ident_max = max( | |
long(os.path.splitext(os.path.split(f)[1])[0]) | |
for f in glob(path_to(post_dir, '*' + post_ext)) | |
) | |
log(account, "Backing up posts after %d\r" % ident_max) | |
except ValueError: # max() arg is an empty sequence | |
pass | |
else: | |
log(account, "Getting basic information\r") | |
# start by calling the API with just a single post | |
soup = xmlparse(base + '?num=1') | |
if not soup: | |
return | |
# collect all the meta information | |
tumblelog = soup.tumblelog | |
try: | |
self.title = escape(tumblelog('title')) | |
except KeyError: | |
self.title = account | |
self.subtitle = unicode(tumblelog) | |
# use the meta information to create a HTML header | |
global post_header | |
post_header = header(self.title, body_class='post') | |
# find the total number of posts | |
total_posts = options.count or int(soup.posts('total')) | |
last_post = options.skip + total_posts | |
def _backup(posts): | |
for p in sorted(posts, key=lambda x: long(x('id')), reverse=True): | |
post = post_class(p) | |
if ident_max and long(post.ident) <= ident_max: | |
return False | |
if options.period: | |
if post.date >= p_stop: | |
continue | |
if post.date < p_start: | |
return False | |
post.generate_content() | |
if post.error: | |
sys.stderr.write('%s%s\n' % (post.error, 50 * ' ')) | |
post.save_post() | |
self.post_count += 1 | |
return True | |
# Get the XML entries from the API, which we can only do for max 50 posts at once. | |
# Posts "arrive" in reverse chronological order. Post #0 is the most recent one. | |
MAX = 50 | |
for i in range(options.skip, last_post, MAX): | |
# find the upper bound | |
j = min(i + MAX, last_post) | |
log(account, "Getting posts %d to %d of %d\r" % (i, j - 1, total_posts)) | |
soup = xmlparse('%s?num=%d&start=%d' % (base, j - i, i)) | |
if soup is None: | |
return | |
if not _backup(soup.posts['post':]): | |
break | |
if not options.blosxom and self.post_count: | |
get_avatar() | |
if not have_custom_css: | |
save_style() | |
self.index = defaultdict(lambda: defaultdict(list)) | |
self.build_index() | |
self.save_index() | |
log(account, "%d posts backed up\n" % self.post_count) | |
self.total_count += self.post_count | |
class TumblrPost: | |
def __init__(self, post): | |
self.content = '' | |
self.post = post | |
self.xml_content = post.__repr__(1, 1) | |
self.ident = post('id') | |
self.url = post('url') | |
self.slug = post('slug') | |
self.typ = post('type') | |
self.date = int(post('unix-timestamp')) | |
self.tm = time.localtime(self.date) | |
self.title = '' | |
self.tags = [] | |
self.file_name = self.ident + post_ext | |
self.error = None | |
def generate_content(self): | |
"""generates the content for this post""" | |
post = self.post | |
content = [] | |
def append(s, fmt=u'%s'): | |
# the %s conversion calls unicode() on the xmltramp element | |
content.append(fmt % s) | |
def get_try(elt): | |
try: | |
return unicode(post[elt]) | |
except KeyError: | |
return '' | |
def append_try(elt, fmt=u'%s'): | |
elt = get_try(elt) | |
if elt: | |
append(elt, fmt) | |
if self.typ == 'regular': | |
self.title = get_try('regular-title') | |
append_try('regular-body') | |
elif self.typ == 'photo': | |
url = escape(get_try('photo-link-url')) | |
for p in post.photoset['photo':] if hasattr(post, 'photoset') else [post]: | |
src = unicode(p['photo-url']) | |
append(escape(self.get_image_url(src)), u'<img alt="" src="%s">') | |
if url: | |
content[-1] = '<a href="%s">%s</a>' % (url, content[-1]) | |
content[-1] = '<p>' + content[-1] + '</p>' | |
if p._name == 'photo' and p('caption'): | |
append(p('caption'), u'<p>%s</p>') | |
append_try('photo-caption') | |
elif self.typ == 'link': | |
url = unicode(post['link-url']) | |
self.title = u'<a href="%s">%s</a>' % (escape(url), | |
post['link-text'] if 'link-text' in post else url | |
) | |
append_try('link-description') | |
elif self.typ == 'quote': | |
append(post['quote-text'], u'<blockquote><p>%s</p></blockquote>') | |
append_try('quote-source', u'<p>%s</p>') | |
elif self.typ == 'video': | |
source = unicode(post['video-source']).strip() | |
if source.startswith('<iframe') or source.startswith('<object'): | |
append(source, u'<p>%s</p>') | |
append_try('video-caption') | |
else: | |
append(post['video-player'], u'<p>%s</p>') | |
append_try('video-caption') | |
append(escape(source), u'<p><a href="%s">Original</a></p>') | |
elif self.typ == 'audio': | |
append(post['audio-player']) | |
append_try('audio-caption') | |
elif self.typ == 'answer': | |
self.title = post.question | |
append(post.answer) | |
elif self.typ == 'conversation': | |
self.title = get_try('conversation-title') | |
append( | |
'<br>\n'.join(escape(unicode(l)) for l in post.conversation['line':]), | |
u'<p>%s</p>' | |
) | |
else: | |
self.error = u"Unknown post type '%s' in post #%s" % (self.typ, self.ident) | |
append(escape(self.xml_content), u'<pre>%s</pre>') | |
self.tags = [u'%s' % t for t in post['tag':]] | |
self.content = '\n'.join(content) | |
# fix wrongly nested HTML tags | |
for p in ('<p>(<(%s)>)', '(</(%s)>)</p>'): | |
self.content = re.sub(p % 'p|ol|iframe[^>]*', r'\1', self.content) | |
def get_image_url(self, url): | |
return url | |
#url = save_image(url) | |
#if '://' in url: # in case of download errors | |
# return url | |
#return u'../%s/%s' % (image_dir, url) | |
def get_post(self): | |
"""returns this post in HTML""" | |
post = 'date!$$$!%s<br>\n' % strftime('%x %X', self.tm) | |
if self.slug: | |
post += u'slug!$$$!%s<br>\n' % self.slug | |
if self.tags: | |
post += u'tag!$$$!%s<br>\n' % u'|'.join(t for t in self.tags) | |
if self.title: | |
post += 'title!$$$!%s<br>\n' % self.title | |
post += 'content!$$$!<br>\n' | |
post += self.content | |
return post | |
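# Illustrative sketch (not part of the original source) of the intermediate
# format produced by get_post() above, with made-up values:
#
#   date!$$$!04/15/2013 12:50:22<br>
#   slug!$$$!some-post-slug<br>
#   tag!$$$!foo|bar<br>
#   title!$$$!Some post title<br>
#   content!$$$!<br>
#   <p>post body HTML ...</p>
#
# The date layout actually depends on the locale ('%x %X'), while
# tumblr2calepin.py parses it with '%m/%d/%Y %H:%M:%S'. html2text.py reduces
# the surrounding HTML to text but keeps these !$$$! header lines, which is
# what tumblr2calepin.py splits on.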
def save_post(self): | |
"""saves this post locally""" | |
with open_text(post_dir, self.file_name) as f: | |
f.write(self.get_post()) | |
os.utime(path_to(post_dir, self.file_name), | |
(self.date, self.date) | |
) | |
if options.xml: | |
with open_text(xml_dir, self.ident + '.xml') as f: | |
f.write(self.xml_content) | |
class BlosxomPost(TumblrPost): | |
def get_image_url(self, url): | |
return url | |
def get_post(self): | |
"""returns this post as a Blosxom post""" | |
post = self.title + '\nmeta-id: _' + self.ident + '\nmeta-url: ' + self.url | |
if self.tags: | |
post += '\nmeta-tags: ' + ' '.join(t.replace(' ', '+') for t in self.tags) | |
post += '\n\n' + self.content | |
return post | |
class LocalPost: | |
def __init__(self, post_file): | |
with codecs.open(post_file, 'r', encoding) as f: | |
self.lines = f.readlines() | |
# remove header and footer | |
while self.lines and '<article ' not in self.lines[0]: | |
del self.lines[0] | |
while self.lines and '</article>' not in self.lines[-1]: | |
del self.lines[-1] | |
self.file_name = os.path.split(post_file)[1] | |
self.ident = os.path.splitext(self.file_name)[0] | |
self.date = os.stat(post_file).st_mtime | |
self.tm = time.localtime(self.date) | |
def get_post(self): | |
return u''.join(self.lines) | |
if __name__ == '__main__': | |
import optparse | |
parser = optparse.OptionParser("Usage: %prog [options] blog-name ...", | |
description="Makes a local backup of Tumblr blogs." | |
) | |
parser.add_option('-q', '--quiet', action='store_true', | |
help="suppress progress messages" | |
) | |
parser.add_option('-i', '--incremental', action='store_true', | |
help="incremental backup mode" | |
) | |
parser.add_option('-x', '--xml', action='store_true', | |
help="save the original XML source" | |
) | |
parser.add_option('-b', '--blosxom', action='store_true', | |
help="save the posts in blosxom format" | |
) | |
parser.add_option('-r', '--reverse-month', action='store_false', default=True, | |
help="reverse the post order in the monthly archives" | |
) | |
parser.add_option('-R', '--reverse-index', action='store_false', default=True, | |
help="reverse the index file order" | |
) | |
parser.add_option('-a', '--auto', type='int', metavar="HOUR", | |
help="do a full backup at HOUR hours, otherwise do an incremental backup" | |
" (useful for cron jobs)" | |
) | |
parser.add_option('-n', '--count', type='int', help="save only COUNT posts") | |
parser.add_option('-s', '--skip', type='int', default=0, | |
help="skip the first SKIP posts" | |
) | |
parser.add_option('-p', '--period', help="limit the backup to PERIOD" | |
" ('y', 'm', 'd' or YYYY[MM[DD]])" | |
) | |
parser.add_option('-P', '--private', help="password for a private tumblr", | |
metavar='PASSWORD' | |
) | |
options, args = parser.parse_args() | |
if options.auto is not None: | |
if options.auto == time.localtime().tm_hour: | |
options.incremental = False | |
else: | |
options.incremental = True | |
if options.period: | |
try: | |
options.period = time.strftime( | |
{'y': '%Y', 'm': '%Y%m', 'd': '%Y%m%d'}[options.period] | |
) | |
except KeyError: | |
options.period = options.period.replace('-', '') | |
if len(options.period) not in (4, 6, 8): | |
parser.error("Period must be 'y', 'm', 'd' or YYYY[MM[DD]]") | |
if not args: | |
args = ['bbolli'] | |
tb = TumblrBackup() | |
for account in args: | |
tb.backup(account) | |
sys.exit(0 if tb.total_count else 1) |