Created
August 28, 2016 18:39
-
-
Save userid/8c698351ca079cdc777bfed3a98acbbd to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python | |
| # encoding: utf-8 | |
| # standard Python library imports | |
| from __future__ import with_statement | |
| import codecs | |
| from collections import defaultdict | |
| from datetime import datetime | |
| import errno | |
| from glob import glob | |
| from httplib import HTTPException | |
| import imghdr | |
| try: | |
| import json | |
| except ImportError: | |
| import simplejson as json | |
| import locale | |
| import os | |
| from os.path import join, split, splitext | |
| import Queue | |
| import re | |
| import ssl | |
| import sys | |
| import threading | |
| import time | |
| import urllib | |
| import urllib2 | |
| import urlparse | |
| from xml.sax.saxutils import escape | |
| try: | |
| from settings import DEFAULT_BLOGS | |
| except ImportError: | |
| DEFAULT_BLOGS = [] | |
| # extra optional packages | |
| try: | |
| import pyexiv2 | |
| except ImportError: | |
| pyexiv2 = None | |
| try: | |
| import youtube_dl | |
| from youtube_dl.utils import sanitize_filename | |
| except ImportError: | |
| youtube_dl = None | |
# How a single tag is displayed
TAG_FMT = '#%s'

# URL pattern for tag links; assign None to leave tags unlinked.
# Named placeholders that get substituted: domain, tag
TAGLINK_FMT = 'http://%(domain)s/tagged/%(tag)s'

# process exit codes
# (2 is deliberately unused here: module optparse returns it itself)
EXIT_SUCCESS, EXIT_NOPOSTS = 0, 1
EXIT_INTERRUPT, EXIT_ERRORS = 3, 4
| # add another JPEG recognizer | |
| # see http://www.garykessler.net/library/file_sigs.html | |
| def test_jpg(h, f): | |
| if h[:3] == '\xFF\xD8\xFF' and h[3] in "\xDB\xE0\xE1\xE2\xE3": | |
| return 'jpg' | |
| imghdr.tests.append(test_jpg) | |
# Blog-dependent directories; TumblrBackup.backup() assigns the real values
save_folder = media_folder = ''

# Fixed names used to lay out a backup
root_folder = os.getcwdu()
post_dir, json_dir, media_dir = 'posts', 'json', 'media'
archive_dir, theme_dir = 'archive', 'theme'
save_dir = '../'
backup_css, custom_css = 'backup.css', 'custom.css'
avatar_base = 'avatar'
dir_index = 'index.html'

# Mutable state shared across the module
blog_name = ''              # set by get_api_url()
post_ext = '.html'          # changed by TumblrBackup.backup() depending on the options
have_custom_css = False     # set by TumblrBackup.backup()
# Post types with dedicated rendering in TumblrPost.save_content()
POST_TYPES = tuple('text quote link answer video audio photo chat'.split())
POST_TYPES_SET = frozenset(POST_TYPES)

# Wildcards used by the --request / --tags option handling
TYPE_ANY = 'any'
TAG_ANY = '__all__'

MAX_POSTS = 50              # batch size for one API request
HTTP_TIMEOUT = 90           # handed to urllib2.urlopen() as timeout
HTTP_CHUNK_SIZE = 1 << 20   # read size used when streaming media to disk

# bb-tumblr-backup API key
API_KEY = '8YUsKJvcJxo2MDwmWMDiXZGuMuIbeCwuQGP5ZHSEA4jBJPMnJT'
# Pick up the user's locale so dates and times are formatted accordingly;
# an unsupported locale is not fatal.
try:
    locale.setlocale(locale.LC_TIME, '')
except locale.Error:
    pass

encoding = 'utf-8'
# encoding of time.strftime() output; falls back to `encoding`
time_encoding = locale.getlocale(locale.LC_TIME)[1] or encoding

# an explicit SSL context is only used from Python 2.7.9 on
have_ssl_ctx = sys.version_info >= (2, 7, 9)
if have_ssl_ctx:
    ssl_ctx = ssl.create_default_context()


def urlopen(url):
    """Open `url` with the module-wide timeout (and SSL context, if any)."""
    if have_ssl_ctx:
        return urllib2.urlopen(url, timeout=HTTP_TIMEOUT, context=ssl_ctx)
    return urllib2.urlopen(url, timeout=HTTP_TIMEOUT)
def log(account, s):
    """Print the progress message `s` unless options.quiet is set.

    A non-empty `account` is printed first as "<account>: ".  Twenty blanks
    are inserted in front of the last character of `s` (normally '\\r' or
    '\\n'), which wipes leftovers of a longer message printed before on the
    same line.
    """
    if options.quiet:
        return
    out = sys.stdout
    if account:
        out.write('%s: ' % account)
    text, terminator = s[:-1], s[-1:]
    out.write(text + ' ' * 20 + terminator)
    out.flush()
def mkdir(dir, recursive=False):
    """Create the directory `dir` if it does not exist yet.

    With `recursive` set, missing parent directories are created as well.
    An EEXIST error is ignored (the directory may appear between the
    existence check and the creation); any other OSError propagates.
    """
    if os.path.exists(dir):
        return
    create = os.makedirs if recursive else os.mkdir
    try:
        create(dir)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise
def path_to(*parts):
    """Return the path made of `parts`, anchored at the global save_folder."""
    return os.path.join(save_folder, *parts)
def open_file(open_fn, parts):
    """Open the file named by `parts` below save_folder using `open_fn`.

    `parts` is a sequence of path components; the last one is the file name.
    The containing directory is created first when there is one, recursively
    if it is itself nested.  Returns whatever `open_fn(path)` returns.
    """
    parent = parts[:-1]
    if parent:
        mkdir(path_to(*parent), len(parent) > 1)
    return open_fn(path_to(*parts))
def open_text(*parts):
    """Open a text file below save_folder for writing.

    The file is written in the module `encoding`; characters that cannot be
    encoded are emitted as XML character references.
    """
    def _open(path):
        return codecs.open(path, 'w', encoding, 'xmlcharrefreplace')

    return open_file(_open, parts)
def open_media(*parts):
    """Open a file below save_folder for binary writing."""
    def _open(path):
        return open(path, 'wb')

    return open_file(_open, parts)
def strftime(format, t=None):
    """Format the time `t` (default: the current local time) as unicode.

    The byte string produced by time.strftime() is decoded using the
    module-level time_encoding.
    """
    when = time.localtime() if t is None else t
    formatted = time.strftime(format, when)
    return formatted.decode(time_encoding)
def get_api_url(account):
    """Return the Tumblr v2 API posts URL for `account`.

    An account name without a dot gets '.tumblr.com' appended.  As a side
    effect the resulting host name is stored in the global blog_name.
    """
    global blog_name
    host = account if '.' in account else account + '.tumblr.com'
    blog_name = host
    return 'https://api.tumblr.com/v2/blog/%s/posts' % host
def set_period():
    """Turn options.period into the timestamps options.p_start / p_stop.

    options.period is read as YYYY, YYYYMM or YYYYMMDD.  p_start is the
    beginning of that year, month or day; p_stop is the beginning of the
    following one.
    """
    period = options.period
    # fields: year, month, day, hour, minute, second, wday, yday, isdst
    tm = [int(period[:4]), 1, 1, 0, 0, 0, 0, 0, -1]
    unit = 0  # index of the field that spans the period
    if len(period) >= 6:
        unit = 1
        tm[1] = int(period[4:6])
    if len(period) == 8:
        unit = 2
        tm[2] = int(period[6:8])
    options.p_start = time.mktime(tm)
    # go one unit further; mktime() receives the out-of-range value as is
    tm[unit] += 1
    options.p_stop = time.mktime(tm)
def apiparse(base, count, start=0):
    """Fetch `count` posts from the API URL `base`, beginning at offset `start`.

    The request is tried up to ten times when opening or reading the URL
    fails.  Returns the decoded JSON document if its meta status is 200.
    Returns None when all attempts failed, when the response is not of type
    application/json, when the body is not valid JSON or not a JSON object,
    or when the meta status differs from 200.
    """
    params = {'api_key': API_KEY, 'limit': count, 'reblog_info': 'true'}
    if start > 0:
        params['offset'] = start
    url = base + '?' + urllib.urlencode(params)
    for _ in range(10):
        try:
            resp = urlopen(url)
            try:
                data = resp.read()
            finally:
                # release the connection even if reading fails
                resp.close()
        except (EnvironmentError, HTTPException) as e:
            sys.stderr.write("%s getting %s\n" % (e, url))
            continue
        content_type = resp.info().gettype()
        if content_type == 'application/json':
            break
        sys.stderr.write("Unexpected Content-Type: '%s'\n" % content_type)
        return None
    else:
        # every attempt raised
        return None
    try:
        doc = json.loads(data)
    except ValueError as e:
        sys.stderr.write('%s: %s\n%d %s %s\n%r\n' % (
            e.__class__.__name__, e, resp.getcode(), resp.msg, resp.info().gettype(), data
        ))
        return None
    # valid JSON that is not an object (e.g. a list) has no meta status
    if not isinstance(doc, dict):
        return None
    return doc if doc.get('meta', {}).get('status', 0) == 200 else None
def add_exif(image_name, tags):
    """Update the IPTC keywords of the image file `image_name` via pyexiv2.

    If options.exif contains '-', existing keywords are removed.  Otherwise
    the keywords become the union of `tags`, the keywords already stored in
    the image and options.exif, stripped and lower-cased.  Failures to read
    or write the metadata are reported on stderr.
    """
    try:
        metadata = pyexiv2.ImageMetadata(image_name)
        metadata.read()
    except EnvironmentError:
        sys.stderr.write("Error reading metadata for image %s\n" % image_name)
        return

    KW_KEY = 'Iptc.Application2.Keywords'
    remove_all = '-' in options.exif
    has_keywords = KW_KEY in metadata.iptc_keys
    if remove_all:
        if has_keywords:
            del metadata[KW_KEY]
    else:
        if has_keywords:
            tags |= set(metadata[KW_KEY].value)
        tags = [tag.strip().lower() for tag in tags | options.exif if tag]
        metadata[KW_KEY] = pyexiv2.IptcTag(KW_KEY, tags)

    try:
        metadata.write()
    except EnvironmentError:
        sys.stderr.write("Writing metadata failed for tags: %s in: %s\n" % (tags, image_name))
def save_style():
    """Write the default stylesheet of a backup to the file backup_css."""
    stylesheet = '''\
@import url("override.css");
body { width: 720px; margin: 0 auto; }
body > footer { padding: 1em 0; }
header > img { float: right; }
img { max-width: 720px; }
blockquote { margin-left: 0; border-left: 8px #999 solid; padding: 0 24px; }
.archive h1, .subtitle, article { padding-bottom: 0.75em; border-bottom: 1px #ccc dotted; }
.post a.llink { display: none; }
header a, footer a { text-decoration: none; }
footer, article footer a { font-size: small; color: #999; }
'''
    with open_text(backup_css) as out:
        out.write(stylesheet)
def get_avatar():
    """Download the avatar of the blog blog_name into the theme directory.

    The file is named avatar_base plus an extension derived from the image
    type.  This is best effort: nothing is saved when the download fails or
    when the image type cannot be determined.
    """
    try:
        resp = urlopen('http://api.tumblr.com/v2/blog/%s/avatar' % blog_name)
        try:
            avatar_data = resp.read()
        finally:
            resp.close()
    except (EnvironmentError, HTTPException):
        return
    # imghdr.what() yields None for unrecognized data; concatenating that
    # to the file name would raise TypeError
    image_type = imghdr.what(None, avatar_data[:32])
    if not image_type:
        sys.stderr.write("Unknown avatar image type for %s\n" % blog_name)
        return
    avatar_file = avatar_base + '.' + image_type
    with open_media(theme_dir, avatar_file) as f:
        f.write(avatar_data)
def get_style():
    """Save the blog's CSS, scraped from its home page, as theme/style.css.

    The v2 API has no method for getting the style directly, see
    https://groups.google.com/d/msg/tumblr-api/f-rRH6gOb6w/sAXZIeYx5AUJ

    The first <style type="text/css"> element spanning more than one line
    is taken.  Nothing is saved if the page cannot be fetched or contains
    no such element.
    """
    try:
        page_data = urlopen('http://%s/' % blog_name).read()
    except (EnvironmentError, HTTPException):
        return

    style_blocks = re.findall(r'(?s)<style type=.text/css.>(.*?)</style>', page_data)
    for block in style_blocks:
        css = block.strip().decode(encoding, 'replace')
        if '\n' in css:
            css = css.replace('\r', '').replace('\n ', '\n')
            with open_text(theme_dir, 'style.css') as f:
                f.write(css + '\n')
            return
class TumblrBackup:
    """Backs up blogs and builds the HTML index and monthly archive pages.

    Attributes:
        errors       True once an API request failed
        total_count  number of posts saved over all backup() calls
        index        index[year][month] -> list of LocalPost
        archives     sorted list of the (year, month) pairs present in index
    """

    def __init__(self):
        self.errors = False
        self.total_count = 0
        self.index = defaultdict(lambda: defaultdict(list))
        self.archives = []

    def exit_code(self):
        """Return the process exit code for the work done so far."""
        if self.errors:
            return EXIT_ERRORS
        if self.total_count == 0:
            return EXIT_NOPOSTS
        return EXIT_SUCCESS

    def build_index(self):
        """Collect all locally saved posts into self.index and self.archives."""
        filter = join('*', dir_index) if options.dirs else '*' + post_ext
        for f in glob(path_to(post_dir, filter)):
            post = LocalPost(f)
            self.index[post.tm.tm_year][post.tm.tm_mon].append(post)

        self.archives = sorted(((y, m) for y in self.index for m in self.index[y]),
            reverse=options.reverse_month
        )

    def save_index(self):
        """Write the main index page, which links to the monthly archives."""
        f = glob(path_to(theme_dir, avatar_base + '.*'))
        avatar = split(f[0])[1] if f else None
        with open_text(dir_index) as idx:
            idx.write(self.header(self.title, body_class='index',
                subtitle=self.subtitle, avatar=avatar
            ))
            for year in sorted(self.index.keys(), reverse=options.reverse_index):
                self.save_year(idx, year)
            idx.write(u'<footer><p>Generated on %s by <a href=https://github.com/'
                'bbolli/tumblr-utils>tumblr-utils</a>.</p></footer>\n' % strftime('%x %X')
            )

    def save_year(self, idx, year):
        """Write the archive pages of `year` and list its months in `idx`."""
        idx.write('<h3>%s</h3>\n<ul>\n' % year)
        for month in sorted(self.index[year].keys(), reverse=options.reverse_index):
            # any day within the month will do, it only serves to format the month
            tm = time.localtime(time.mktime([year, month, 3, 0, 0, 0, 0, 0, -1]))
            month_name = self.save_month(year, month, tm)
            idx.write(u' <li><a href=%s/%s title="%d post(s)">%s</a></li>\n' % (
                archive_dir, month_name, len(self.index[year][month]),
                strftime('%B', tm)
            ))
        idx.write('</ul>\n\n')

    def save_month(self, year, month, tm):
        """Write the archive page(s) of one month.

        The month's posts are split into pages of options.posts_per_page
        posts (a value below 1 puts all of them on a single page).  Each
        page links to the previous and next page, crossing over into the
        neighbouring months of self.archives.

        Returns the file name of the month's first page, relative to the
        archive directory.
        """
        posts = sorted(self.index[year][month], key=lambda x: x.date, reverse=options.reverse_month)
        posts_month = len(posts)
        posts_page = options.posts_per_page if options.posts_per_page >= 1 else posts_month

        def pages_per_month(y, m):
            posts = len(self.index[y][m])
            # explicit floor division, rounded up for a partial last page
            return posts // posts_page + bool(posts % posts_page)

        def next_month(previous):
            i = self.archives.index((year, month))
            i += -1 if previous else 1
            if i < 0 or i >= len(self.archives):
                return 0, 0
            return self.archives[i]

        FILE_FMT = '%d-%02d-p%s'
        pages_month = pages_per_month(year, month)
        for page, start in enumerate(range(0, posts_month, posts_page), start=1):

            archive = [self.header(strftime('%B %Y', tm), body_class='archive')]
            archive.extend(p.get_post() for p in posts[start:start + posts_page])

            file_name = FILE_FMT % (year, month, page)
            if options.dirs:
                base = save_dir + archive_dir + '/'
                suffix = '/'
                arch_parts = (archive_dir, file_name, dir_index)
                file_name += suffix
            else:
                base = ''
                suffix = post_ext
                file_name += suffix
                arch_parts = (archive_dir, file_name)

            if page > 1:
                pp = FILE_FMT % (year, month, page - 1)
            else:
                py, pm = next_month(True)
                pp = FILE_FMT % (py, pm, pages_per_month(py, pm)) if py else ''
                first_file = file_name

            if page < pages_month:
                np = FILE_FMT % (year, month, page + 1)
            else:
                ny, nm = next_month(False)
                np = FILE_FMT % (ny, nm, 1) if ny else ''

            archive.append(self.footer(base, pp, np, suffix))

            # the context manager closes the page file right away instead of
            # leaving one open handle per page to the garbage collector
            with open_text(*arch_parts) as arch:
                arch.write('\n'.join(archive))

        return first_file

    def header(self, title='', body_class='', subtitle='', avatar=''):
        """Return the HTML page head up to and including the <header> element.

        The <title> is always the blog title (self.title); `title` becomes
        the <h1> heading.  Pages other than the index (body_class 'index')
        reference the stylesheet and avatar relative to save_dir.
        """
        root_rel = '' if body_class == 'index' else save_dir
        css_rel = root_rel + (custom_css if have_custom_css else backup_css)
        if body_class:
            body_class = ' class=' + body_class
        h = u'''<!DOCTYPE html>
<meta charset=%s>
<title>%s</title>
<link rel=stylesheet href=%s>
<body%s>
<header>
''' % (encoding, self.title, css_rel, body_class)
        if avatar:
            h += '<img src=%s%s/%s alt=Avatar>\n' % (root_rel, theme_dir, avatar)
        if title:
            h += u'<h1>%s</h1>\n' % title
        if subtitle:
            h += u'<p class=subtitle>%s</p>\n' % subtitle
        h += '</header>\n'
        return h

    def footer(self, base, previous_page, next_page, suffix):
        """Return the navigation footer of an archive page.

        The previous/next links are built as base + page + suffix; an empty
        page name omits the respective link.
        """
        f = '<footer><nav>'
        f += '<a href=%s rel=index>Index</a>\n' % save_dir
        if previous_page:
            f += '| <a href=%s%s%s rel=prev>Previous</a>\n' % (base, previous_page, suffix)
        if next_page:
            f += '| <a href=%s%s%s rel=next>Next</a>\n' % (base, next_page, suffix)
        f += '</nav></footer>\n'
        return f

    def backup(self, account):
        """makes single files and an index for every post on a public Tumblr blog account"""

        base = get_api_url(account)

        # make sure there are folders to save in
        global save_folder, media_folder, post_ext, post_dir, save_dir, have_custom_css
        if options.blosxom:
            save_folder = root_folder
            post_ext = '.txt'
            post_dir = os.curdir
            post_class = BlosxomPost
        else:
            save_folder = join(root_folder, options.outdir or account)
            media_folder = path_to(media_dir)
            if options.dirs:
                post_ext = ''
                save_dir = '../../'
                mkdir(path_to(post_dir), True)
            else:
                mkdir(save_folder, True)
            post_class = TumblrPost
            have_custom_css = os.access(path_to(custom_css), os.R_OK)

        self.post_count = 0

        # get the highest post id already saved
        ident_max = None
        if options.incremental:
            try:
                ident_max = max(
                    long(splitext(split(f)[1])[0])
                    for f in glob(path_to(post_dir, '*' + post_ext))
                )
                log(account, "Backing up posts after %d\r" % ident_max)
            except ValueError:  # max() arg is an empty sequence
                pass
        else:
            log(account, "Getting basic information\r")

        # start by calling the API with just a single post
        soup = apiparse(base, 1)
        if not soup:
            self.errors = True
            return

        # collect all the meta information
        resp = soup['response']
        blog = resp['blog']
        try:
            self.title = escape(blog['title'])
        except KeyError:
            self.title = account
        self.subtitle = blog['description']

        # use the meta information to create a HTML header
        TumblrPost.post_header = self.header(body_class='post')

        # find the post number limit to back up
        last_post = blog['posts']
        if options.count:
            last_post = min(last_post, options.count + options.skip)

        def _backup(posts):
            """Queue one batch of posts; False means no further batch is needed."""
            for p in sorted(posts, key=lambda x: x['id'], reverse=True):
                post = post_class(p)
                if ident_max and long(post.ident) <= ident_max:
                    return False
                if options.period:
                    if post.date >= options.p_stop:
                        continue
                    if post.date < options.p_start:
                        return False
                if options.request:
                    if post.typ not in options.request:
                        continue
                    tags = options.request[post.typ]
                    if not (TAG_ANY in tags or tags & post.tags_lower):
                        continue
                if options.no_reblog:
                    if 'reblogged_from_name' in p or 'reblogged_root_name' in p:
                        if 'trail' in p and not p['trail']:
                            continue
                        elif 'trail' in p and 'is_current_item' not in p['trail'][-1]:
                            continue
                    elif 'trail' in p and p['trail'] and 'is_current_item' not in p['trail'][-1]:
                        continue
                backup_pool.add_work(post.save_content)
                self.post_count += 1
            return True

        # start the thread pool
        backup_pool = ThreadPool()
        try:
            # Get the JSON entries from the API, which we can only do for max 50 posts at once.
            # Posts "arrive" in reverse chronological order. Post #0 is the most recent one.
            last_batch = MAX_POSTS
            i = options.skip
            while i < last_post:
                # find the upper bound
                j = min(i + MAX_POSTS, last_post)
                log(account, "Getting posts %d to %d of %d\r" % (i, j - 1, last_post))

                soup = apiparse(base, j - i, i)
                if soup is None:
                    i += last_batch     # try the next batch
                    self.errors = True
                    continue

                posts = soup['response']['posts']
                if not _backup(posts):
                    break

                last_batch = len(posts)
                i += last_batch
        except:
            # ensure proper thread pool termination; the bare except is
            # intentional so that KeyboardInterrupt stops the workers too,
            # and the exception is re-raised unchanged
            backup_pool.cancel()
            raise

        # wait until all posts have been saved
        backup_pool.wait()

        # postprocessing
        if not options.blosxom and self.post_count:
            get_avatar()
            get_style()
            if not have_custom_css:
                save_style()
            self.build_index()
            self.save_index()

        log(account, "%d posts backed up\n" % self.post_count)
        self.total_count += self.post_count
class TumblrPost:
    """A single post as returned by the API, rendered and saved as HTML.

    The constructor only extracts the metadata; save_content() builds the
    HTML body (downloading media as configured by the options) and writes
    the post to disk.
    """

    post_header = ''    # set by TumblrBackup.backup()

    def __init__(self, post):
        """`post` is the decoded JSON object of one post."""
        self.content = ''
        self.post = post
        self.json_content = json.dumps(post, sort_keys=True, indent=4, separators=(',', ': '))
        self.ident = str(post['id'])
        self.url = post['post_url']
        self.shorturl = post['short_url']
        self.typ = post['type']
        self.date = post['timestamp']
        self.isodate = datetime.utcfromtimestamp(self.date).isoformat() + 'Z'
        self.tm = time.localtime(self.date)
        self.title = ''
        self.tags = post['tags']
        self.note_count = post.get('note_count', 0)
        self.source_title = post.get('source_title', '')
        self.source_url = post.get('source_url', '')
        if options.request:
            # only needed for the tag filter in TumblrBackup.backup()
            self.tags_lower = set(t.lower() for t in self.tags)
        self.file_name = join(self.ident, dir_index) if options.dirs else self.ident + post_ext
        self.llink = self.ident if options.dirs else self.file_name

    def save_content(self):
        """generates the content for this post"""
        post = self.post
        content = []

        def append(s, fmt=u'%s'):
            content.append(fmt % s)

        def get_try(elt):
            return post.get(elt) or ''

        def append_try(elt, fmt=u'%s'):
            elt = get_try(elt)
            if elt:
                if options.save_images:
                    # download inline images and point the <img> tags at the copies
                    elt = re.sub(r'''(?i)(<img [^>]*\bsrc\s*=\s*["'])(.*?)(["'][^>]*>)''',
                        self.get_inline_image, elt
                    )
                append(elt, fmt)

        self.media_dir = join(post_dir, self.ident) if options.dirs else media_dir
        self.media_url = save_dir + self.media_dir
        self.media_folder = path_to(self.media_dir)

        if self.typ == 'text':
            self.title = get_try('title')
            append_try('body')

        elif self.typ == 'photo':
            url = get_try('link_url')
            is_photoset = len(post['photos']) > 1
            for offset, p in enumerate(post['photos'], start=1):
                o = p['original_size']
                src = o['url']
                if options.save_images:
                    src = self.get_image_url(src, offset if is_photoset else 0)
                append(escape(src), u'<img alt="" src="%s">')
                if url:
                    content[-1] = u'<a href="%s">%s</a>' % (escape(url), content[-1])
                content[-1] = '<p>' + content[-1] + '</p>'
                if p['caption']:
                    append(p['caption'], u'<p>%s</p>')
            append_try('caption')

        elif self.typ == 'link':
            url = post['url']
            self.title = u'<a href="%s">%s</a>' % (escape(url), post['title'] or url)
            append_try('description')

        elif self.typ == 'quote':
            append(post['text'], u'<blockquote><p>%s</p></blockquote>')
            append_try('source', u'<p>%s</p>')

        elif self.typ == 'video':
            src = ''
            if options.save_video:
                if post['video_type'] == 'tumblr':
                    src = self.get_media_url(post['video_url'], '.mp4')
                elif youtube_dl:
                    src = self.get_youtube_url(self.url)
                    if not src:
                        sys.stdout.write(u'Unable to download video in post #%s%-50s\n' %
                            (self.ident, ' ')
                        )
            if src:
                append(u'<p><video controls><source src="%s" type=video/mp4>%s<br>\n<a href="%s">%s</a></video></p>' % (
                    src, "Your browser does not support the video element.", src, "Video file"
                ))
            else:
                append(post['player'][-1]['embed_code'])
            append_try('caption')

        elif self.typ == 'audio':
            src = ''
            if options.save_audio:
                if post['audio_type'] == 'tumblr':
                    audio_url = post['audio_url']
                    if audio_url.startswith('http://a.tumblr.com/'):
                        src = self.get_media_url(audio_url, '.mp3')
                    elif audio_url.startswith('https://www.tumblr.com/audio_file/'):
                        audio_url = u'http://a.tumblr.com/%so1.mp3' % audio_url.split('/')[-1]
                        src = self.get_media_url(audio_url, '.mp3')
                elif post['audio_type'] == 'soundcloud':
                    src = self.get_media_url(post['audio_url'], '.mp3')
            if src:
                append(u'<p><audio controls><source src="%s" type=audio/mpeg>%s<br>\n<a href="%s">%s</a></audio></p>' % (
                    src, "Your browser does not support the audio element.", src, "Audio file"
                ))
            else:
                append(post['player'])
            append_try('caption')

        elif self.typ == 'answer':
            self.title = post['question']
            append_try('answer')

        elif self.typ == 'chat':
            self.title = get_try('title')
            append(
                u'<br>\n'.join('%(label)s %(phrase)s' % d for d in post['dialogue']),
                u'<p>%s</p>'
            )

        else:
            sys.stderr.write(
                u"Unknown post type '%s' in post #%s%-50s\n" % (self.typ, self.ident, ' ')
            )
            append(escape(self.json_content), u'<pre>%s</pre>')

        self.content = '\n'.join(content)

        # fix wrongly nested HTML elements
        for p in ('<p>(<(%s)>)', '(</(%s)>)</p>'):
            self.content = re.sub(p % 'p|ol|iframe[^>]*', r'\1', self.content)

        self.save_post()

    def get_youtube_url(self, youtube_url):
        """Download a video with youtube_dl unless it is already saved.

        Returns the URL of the local file relative to the post, or '' if
        the video information or the video itself could not be retrieved.
        """
        # determine the media file name
        filetmpl = u'%(id)s_%(uploader_id)s_%(title)s.%(ext)s'
        ydl = youtube_dl.YoutubeDL({
            'outtmpl': join(self.media_folder, filetmpl),
            'quiet': True, 'restrictfilenames': True, 'noplaylist': True
        })
        ydl.add_default_info_extractors()
        # `except Exception` rather than a bare except, so that
        # KeyboardInterrupt and SystemExit are not silently swallowed
        try:
            result = ydl.extract_info(youtube_url, download=False)
            media_filename = sanitize_filename(filetmpl % result['entries'][0], restricted=True)
        except Exception:
            return ''

        # check if a file with this name already exists; media_filename is a
        # bare file name, so it must be looked up in the download folder
        # rather than in the current working directory
        if not os.path.isfile(join(self.media_folder, media_filename)):
            try:
                ydl.extract_info(youtube_url, download=True)
            except Exception:
                return ''
        return u'%s/%s' % (self.media_url, split(media_filename)[1])

    def get_media_url(self, media_url, extension):
        """Save an audio/video file; return the URL of the local copy.

        NOTE(review): on a failed download the bare local file name is
        returned, not '' or the original URL -- confirm that this is what
        the callers in save_content() expect.
        """
        media_filename = self.get_filename(media_url)
        media_filename = os.path.splitext(media_filename)[0] + extension
        saved_name = self.download_media(media_url, media_filename)
        if saved_name is not None:
            media_filename = u'%s/%s' % (self.media_url, saved_name)
        return media_filename

    def get_image_url(self, image_url, offset):
        """Saves an image if not saved yet. Returns the new URL or
        the original URL in case of download errors."""

        def _addexif(fn):
            if options.exif and fn.endswith('.jpg'):
                add_exif(fn, set(self.tags))

        image_filename = self.get_filename(image_url, '_o%s' % offset if offset else '')
        saved_name = self.download_media(image_url, image_filename)
        if saved_name is not None:
            _addexif(join(self.media_folder, saved_name))
            image_url = u'%s/%s' % (self.media_url, saved_name)
        return image_url

    @staticmethod
    def maxsize_image_url(image_url):
        """Return the URL of the 1280 variant of an image hosted at tumblr.com.

        GIFs and images from other hosts are returned unchanged.
        """
        if ".tumblr.com/" not in image_url or image_url.endswith('.gif'):
            return image_url
        # change the image resolution to 1280
        return re.sub(r'_\d{2,4}(\.\w+)$', r'_1280\1', image_url)

    def get_inline_image(self, match):
        """Saves an inline image if not saved yet. Returns the new <img> tag or
        the original one in case of download errors."""
        image_url = match.group(2)
        if image_url.startswith('//'):
            image_url = 'http:' + image_url
        image_url = self.maxsize_image_url(image_url)
        path = urlparse.urlparse(image_url).path
        image_filename = path.split('/')[-1]
        if not image_filename or not image_url.startswith('http'):
            return match.group(0)
        saved_name = self.download_media(image_url, image_filename)
        if saved_name is None:
            return match.group(0)
        return u'%s%s/%s%s' % (match.group(1), self.media_url,
            saved_name, match.group(3)
        )

    def get_filename(self, url, offset=''):
        """Determine the image file name depending on options.image_names"""
        if options.image_names == 'i':
            return self.ident + offset
        elif options.image_names == 'bi':
            return account + '_' + self.ident + offset
        else:
            return url.split('/')[-1]

    def download_media(self, url, filename):
        """Download `url` into this post's media directory as `filename`.

        Nothing is downloaded if a matching file exists already.  A file
        name without an extension gets one appended if imghdr recognizes
        the data.  Returns the name of the saved file, or None (after
        removing a partial file) if the download failed.
        """
        # check if a file with this name already exists
        known_extension = '.' in filename[-5:]
        image_glob = glob(path_to(self.media_dir,
            filename + ('' if known_extension else '.*')
        ))
        if image_glob:
            return split(image_glob[0])[1]
        # download the media data
        try:
            resp = urlopen(url)
            try:
                with open_media(self.media_dir, filename) as dest:
                    data = resp.read(HTTP_CHUNK_SIZE)
                    hdr = data[:32]     # save the first few bytes
                    while data:
                        dest.write(data)
                        data = resp.read(HTTP_CHUNK_SIZE)
            finally:
                # don't keep one connection open per downloaded file
                resp.close()
        except (EnvironmentError, ValueError, HTTPException) as e:
            sys.stderr.write('%s downloading %s\n' % (e, url))
            try:
                os.unlink(path_to(self.media_dir, filename))
            except OSError as unlink_err:
                if unlink_err.errno != errno.ENOENT:
                    raise
            return None
        # determine the file type if it's unknown
        if not known_extension:
            image_type = imghdr.what(None, hdr)
            if image_type:
                oldname = path_to(self.media_dir, filename)
                filename += '.' + image_type.replace('jpeg', 'jpg')
                os.rename(oldname, path_to(self.media_dir, filename))
        return filename

    def get_post(self):
        """returns this post in HTML"""
        post = self.post_header + u'<article class=%s id=p-%s>\n' % (self.typ, self.ident)
        post += u'<header>\n<p><time datetime=%s>%s</time>\n' % (self.isodate, strftime('%x %X', self.tm))
        post += u'<a class=llink href=%s%s/%s>¶</a>\n' % (save_dir, post_dir, self.llink)
        post += u'<a href=%s>●</a></header>\n' % self.shorturl
        if self.title:
            post += u'<h2>%s</h2>\n' % self.title
        post += self.content
        foot = []
        if self.tags:
            foot.append(u''.join(self.tag_link(t) for t in self.tags))
        if self.note_count:
            # 's'[True:] is '', so exactly one note gets no plural s
            foot.append(u'%d note%s' % (self.note_count, 's'[self.note_count == 1:]))
        if self.source_title and self.source_url:
            foot.append(u'<a title=Source href=%s>%s</a>' %
                (self.source_url, self.source_title)
            )
        if foot:
            post += u'\n<footer>%s</footer>' % u' — '.join(foot)
        post += '\n</article>\n'
        return post

    @staticmethod
    def tag_link(tag):
        """Return the HTML for one tag, linked unless TAGLINK_FMT is unset."""
        tag_disp = escape(TAG_FMT % tag)
        if not TAGLINK_FMT:
            return tag_disp + ' '
        url = TAGLINK_FMT % {'domain': blog_name, 'tag': urllib.quote(tag.encode('utf-8'))}
        return u'<a href=%s>%s</a>\n' % (url, tag_disp)

    def save_post(self):
        """saves this post locally"""
        if options.dirs:
            f = open_text(post_dir, self.ident, dir_index)
        else:
            f = open_text(post_dir, self.file_name)
        with f:
            f.write(self.get_post())
        # set the file time to the post's timestamp once the file is closed;
        # LocalPost reads the post date back from it
        os.utime(f.stream.name, (self.date, self.date))  # XXX: is f.stream.name portable?
        if options.json:
            with open_text(json_dir, self.ident + '.json') as f:
                f.write(self.json_content)
class BlosxomPost(TumblrPost):
    """A post that is saved as a Blosxom entry instead of an HTML page."""

    def get_image_url(self, image_url, offset):
        """Return `image_url` as is; images are not downloaded in this format."""
        return image_url

    def get_post(self):
        """returns this post as a Blosxom post"""
        # title line and meta lines, then a blank line, then the content
        head = [
            self.title,
            'meta-id: p-' + self.ident,
            'meta-url: ' + self.url,
        ]
        if self.tags:
            head.append('meta-tags: ' + ' '.join(t.replace(' ', '+') for t in self.tags))
        return '\n'.join(head) + '\n\n' + self.content
class LocalPost:
    """A post read back from a previously saved HTML file.

    Attributes:
        lines      the lines of the file from the <article ...> line up to
                   and including the </article> line
        file_name  the file name, including the post folder if the post is
                   saved as <post_id>/index.html
        ident      the post id taken from the file or folder name
        date, tm   the file's modification time, as timestamp and as local
                   struct_time
    """

    def __init__(self, post_file):
        with codecs.open(post_file, 'r', encoding) as f:
            lines = f.readlines()

        # remove header and footer
        first = next((n for n, line in enumerate(lines) if '<article ' in line), len(lines))
        lines = lines[first:]
        last = len(lines)
        while last and '</article>' not in lines[last - 1]:
            last -= 1
        self.lines = lines[:last]

        parts = post_file.split(os.sep)
        in_own_folder = parts[-1] == dir_index  # .../<post_id>/index.html
        if in_own_folder:
            self.ident = parts[-2]
            self.file_name = os.sep.join(parts[-2:])
        else:
            self.file_name = parts[-1]
            self.ident = splitext(self.file_name)[0]

        self.date = os.stat(post_file).st_mtime
        self.tm = time.localtime(self.date)

    def get_post(self):
        """Return the article's HTML as one string."""
        return u''.join(self.lines)
class ThreadPool:
    """A fixed set of worker threads fed from a bounded queue.

    Work items are callables without arguments.  wait() lets the workers
    drain the queue and then exit; cancel() makes them stop after their
    current item.
    """

    def __init__(self, thread_count=20, max_queue=1000):
        self.queue = Queue.Queue(max_queue)
        self.quit = threading.Event()    # set by wait(): exit once the queue is empty
        self.abort = threading.Event()   # set by cancel(): exit as soon as possible
        self.threads = []
        for _ in range(thread_count):
            self.threads.append(threading.Thread(target=self.handler))
        for worker in self.threads:
            worker.start()

    def add_work(self, work):
        """Queue the callable `work`; blocks while the queue is full."""
        self.queue.put(work)

    def wait(self):
        """Block until every queued work item has been processed."""
        self.quit.set()
        self.queue.join()

    def cancel(self):
        """Tell the workers to stop and join them one after the other."""
        self.abort.set()
        total = len(self.threads)
        for done, worker in enumerate(self.threads, start=1):
            log('', "\rStopping threads %s%s\r" %
                (' ' * done, '.' * (total - done))
            )
            worker.join()

    def handler(self):
        """Worker loop: run queued work until told to quit or abort."""
        while not self.abort.is_set():
            try:
                job = self.queue.get(True, 0.1)
            except Queue.Empty:
                if self.quit.is_set():
                    return
                continue
            # while draining, report the backlog every MAX_POSTS items
            if self.quit.is_set() and self.queue.qsize() % MAX_POSTS == 0:
                log(account, "%d remaining posts to save\r" % self.queue.qsize())
            try:
                job()
            finally:
                self.queue.task_done()
if __name__ == '__main__':
    # Command-line entry point: build the option parser, validate the parsed
    # options, then back up each requested blog and exit with a status code.
    import optparse

    def csv_callback(option, opt, value, parser):
        """Store a comma-separated option value as a set under option.dest."""
        setattr(parser.values, option.dest, set(value.split(',')))

    def tags_callback(option, opt, value, parser):
        """Handle -t/--tags by turning TAG,TAG,... into a TYPE_ANY request."""
        request_callback(option, opt, TYPE_ANY + ':' + value.replace(',', ':'), parser)

    def request_callback(option, opt, value, parser):
        """Merge a TYPE:TAG:...,TYPE:... request into parser.values.request.

        The value is lower-cased and split on commas; each item is split on
        colons into a post type followed by optional tags.  The result is a
        dict mapping post type to a set of tags.  TYPE_ANY expands to every
        entry of POST_TYPES, and an item without tags maps its type(s) to
        set([TAG_ANY]).  An unknown post type is reported via parser.error().
        """
        request = parser.values.request or {}
        for req in value.lower().split(','):
            parts = req.strip().split(':')
            typ = parts.pop(0)
            if typ != TYPE_ANY and typ not in POST_TYPES:
                parser.error("%s: invalid post type '%s'" % (opt, typ))
            for typ in POST_TYPES if typ == TYPE_ANY else (typ,):
                if parts:
                    # Accumulate tags across repeated requests for a type.
                    request[typ] = request.get(typ, set()).union(parts)
                else:
                    request[typ] = set([TAG_ANY])
        parser.values.request = request

    parser = optparse.OptionParser("Usage: %prog [options] blog-name ...",
        description="Makes a local backup of Tumblr blogs."
    )
    parser.add_option('-O', '--outdir', help="set the output directory"
        " (default: blog-name)"
    )
    parser.add_option('-D', '--dirs', action='store_true',
        help="save each post in its own folder"
    )
    parser.add_option('-q', '--quiet', action='store_true',
        help="suppress progress messages"
    )
    parser.add_option('-i', '--incremental', action='store_true',
        help="incremental backup mode"
    )
    parser.add_option('-k', '--skip-images', action='store_false', default=True,
        dest='save_images', help="do not save images; link to Tumblr instead"
    )
    parser.add_option('--save-video', action='store_true', help="save video files")
    parser.add_option('--save-audio', action='store_true', help="save audio files")
    parser.add_option('-j', '--json', action='store_true',
        help="save the original JSON source"
    )
    parser.add_option('-b', '--blosxom', action='store_true',
        help="save the posts in blosxom format"
    )
    # -r and -R default to True and are switched to False when given.
    parser.add_option('-r', '--reverse-month', action='store_false', default=True,
        help="reverse the post order in the monthly archives"
    )
    parser.add_option('-R', '--reverse-index', action='store_false', default=True,
        help="reverse the index file order"
    )
    parser.add_option('-a', '--auto', type='int', metavar="HOUR",
        help="do a full backup at HOUR hours, otherwise do an incremental backup"
        " (useful for cron jobs)"
    )
    parser.add_option('-n', '--count', type='int', default=0,
        help="save only COUNT posts"
    )
    parser.add_option('-s', '--skip', type='int', default=0,
        help="skip the first SKIP posts"
    )
    parser.add_option('-p', '--period', help="limit the backup to PERIOD"
        " ('y', 'm', 'd' or YYYY[MM[DD]])"
    )
    parser.add_option('-N', '--posts-per-page', type='int', default=50,
        metavar='COUNT', help="set the number of posts per monthly page"
    )
    parser.add_option('-Q', '--request', type='string', action='callback',
        callback=request_callback, help="save posts matching the request"
        u" TYPE:TAG:TAG:…,TYPE:TAG:…,…. TYPE can be %s or %s; TAGs can be"
        " omitted or a colon-separated list. Example: -Q %s:personal,quote"
        ",photo:me:self" % (', '.join(POST_TYPES), TYPE_ANY, TYPE_ANY)
    )
    parser.add_option('-t', '--tags', type='string', action='callback',
        callback=tags_callback, help="save only posts tagged TAGS (comma-separated values;"
        " case-insensitive)"
    )
    parser.add_option('-T', '--type', type='string', action='callback',
        callback=request_callback, help="save only posts of type TYPE"
        " (comma-separated values from %s)" % ', '.join(POST_TYPES)
    )
    parser.add_option('--no-reblog', action='store_true', help="don't save reblogged posts")
    parser.add_option('-I', '--image-names', type='choice', choices=('o', 'i', 'bi'),
        default='o', metavar='FMT',
        help="image filename format ('o'=original, 'i'=<post-id>, 'bi'=<blog-name>_<post-id>)"
    )
    parser.add_option('-e', '--exif', type='string', action='callback',
        callback=csv_callback, default=set(), metavar='KW',
        help="add EXIF keyword tags to each picture (comma-separated values;"
        " '-' to remove all tags, '' to add no extra tags)"
    )
    parser.add_option('-S', '--no-ssl-verify', action='store_true',
        help="ignore SSL verification errors"
    )
    options, args = parser.parse_args()

    # --auto HOUR: any hour other than HOUR forces an incremental backup.
    if options.auto is not None and options.auto != time.localtime().tm_hour:
        options.incremental = True

    if options.period:
        try:
            # 'y', 'm' and 'd' are shortcuts for the current year/month/day.
            pformat = {'y': '%Y', 'm': '%Y%m', 'd': '%Y%m%d'}[options.period]
        except KeyError:
            # Otherwise expect an explicit date; dashes are optional.
            options.period = options.period.replace('-', '')
            if not re.match(r'^\d{4}(\d\d)?(\d\d)?$', options.period):
                parser.error("Period must be 'y', 'm', 'd' or YYYY[MM[DD]]")
        else:
            options.period = time.strftime(pformat)
        set_period()

    if have_ssl_ctx and options.no_ssl_verify:
        ssl_ctx = ssl._create_unverified_context()
        # Otherwise, it's an old Python version without SSL verification,
        # so this is the default.

    # Fall back to the blogs configured in settings when none are given.
    args = args or DEFAULT_BLOGS
    if not args:
        parser.error("Missing blog-name")
    if options.outdir and len(args) > 1:
        parser.error("-O can only be used for a single blog-name")
    if options.exif and not pyexiv2:
        # Name the module exactly as it is imported so users can find it.
        parser.error("--exif: module 'pyexiv2' is not installed")
    if (options.save_video or options.save_audio) and not youtube_dl:
        parser.error("--save-video/-audio: module 'youtube_dl' is not installed")

    tb = TumblrBackup()
    try:
        # `account` is deliberately a module-level name: other code in this
        # file (e.g. the progress log in the worker loop) reads it.
        for account in args:
            tb.backup(account)
    except KeyboardInterrupt:
        sys.exit(EXIT_INTERRUPT)

    sys.exit(tb.exit_code())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment