Last active
August 17, 2016 11:41
-
-
Save Koasing/d2701169d6bf70b86aef271f06c58e16 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from datetime import datetime | |
from urllib.parse import quote | |
from xml.etree import ElementTree | |
import json | |
import base64 | |
import os | |
import os.path | |
import re | |
# Site configuration.
# `site` is the public root of the target server and `base_url` is the
# WordPress installation below it. Both must end with a trailing "/".
site = 'http://your_domain/'
base_url = site + 'wordpress/'

# GUID pattern: formatted with the Tistory post id to give each post a unique GUID.
guid_pattern = 'http://your.tistory.com/{}'

# Maps Tistory visibility values to WordPress post statuses.
post_status = dict(
    public='publish',
    private='private',
    draft='draft',
    trash='trash',
)

# Load the Tistory backup once; everything below reads from `root`.
tree = ElementTree.parse(source='Tistory-Backup.xml')
root = tree.getroot()

# Accumulators filled by the conversion code and written out at the end.
authors = dict()
posts = list()
categories = list()
tags = list()
terms = list()  # currently unused; kept for future extension
# Build the WordPress category list from the two-level Tistory category tree.
for raw_category in root.findall('category'):
    # Level-1 ids are the Tistory priority scaled by 100 so that each parent
    # leaves room for its children's ids (parent id + child priority).
    # Increase the weight if a parent can have 100 or more subcategories.
    parent_id = int(raw_category.find('priority').text) * 100
    category = {
        'term_id': parent_id,
        'cat_name': raw_category.find('name').text,
    }
    parent_slug = quote(category['cat_name'].replace(' ', '-')).lower()
    category.update({
        'category_nicename': parent_slug,
        'category_parent': '',
        'category_description': '',  # tistory has no category description
    })
    categories.append(category)

    for raw_subcategory in raw_category.findall('category'):
        # Level-2 category: id is offset from its parent, parent is referenced by slug.
        subcategory = {
            'term_id': int(raw_subcategory.find('priority').text) + parent_id,
            'cat_name': raw_subcategory.find('name').text,
        }
        subcategory.update({
            'category_nicename': quote(subcategory['cat_name'].replace(' ', '-')).lower(),
            'category_parent': parent_slug,
            'category_description': '',
        })
        categories.append(subcategory)
def process_body(content, attachment, useLessMargin = False):
    """Convert a Tistory post body to WordPress-flavoured markup.

    Parameters:
        content: HTML body of the post as a string.
        attachment: dict mapping an attachment file name (as it appears in the
            Tistory tags) to a dict of parameters. The templates below read
            'mime', 'id', 'url', 'label' and, for captioned images, 'width'.
            The table is only read; entries are never modified.
        useLessMargin: when exactly ``True``, each <p> becomes one line break
            instead of two.

    Returns:
        The converted body string.

    Raises:
        KeyError: a tag references a file name missing from ``attachment`` or
            an entry lacks a key required by the template.
        IndexError: an image/gallery tag does not carry complete field groups.
    """
    caption_template = ('[caption id="attachment_{id}" align="{align}" width="{width}"]'
                        '<img class="wp-image-{id} size-full" src="{url}" {attr} /> {caption}[/caption]')
    image_template = '<img class="wp-image-{id} size-full {align}" src="{url}" {attr} />'
    link_template = '<a href="{url}">{label}</a>'

    def render_attachment(name, align, attr, caption):
        # Render one attachment reference. Work on a copy so the caller's
        # attachment table is left untouched.
        params = dict(attachment[name.strip()])
        params['align'] = align
        params['attr'] = attr.strip()
        params['caption'] = caption.strip()
        if not params['mime'].startswith('image'):
            # non-image attachments become plain download links
            return link_template.format_map(params)
        if params['caption']:
            return caption_template.format_map(params)
        return image_template.format_map(params)

    def process_moreless(m):
        # more/less tags, "[#M_(more)|(less)|(content)_M#]"
        # depends on "WP show more plugin" ( https://ko.wordpress.org/plugins/wp-show-more/ ).
        return '[show_more more="{}" less="{}"]{}[/show_more]'.format(*m.groups())

    def process_image(m):
        # image tags, "[##_(count)(align)(|(filename)|(attribute)|(caption))(...)(...)_##]"
        align = {'C': 'aligncenter', 'L': 'alignleft', 'R': 'alignright'}.get(m.group(2), 'alignnone')
        fields = m.group(3).split('|')
        # assumes len(fields) == 3 * count: (filename, attribute, caption) triples
        return ''.join(render_attachment(fields[n], align, fields[n + 1], fields[n + 2])
                       for n in range(0, len(fields), 3))

    def process_gallery(m):
        # gallery tags, "[##_Gallery(|(filename)|(caption))(...)(...)|(attribute)_##]"
        fields = m.group(1).split('|')
        # assumes an odd field count: (filename, caption) pairs plus one
        # trailing attribute string shared by every image
        attr = fields[-1]
        fields = fields[:-1]
        return ''.join(render_attachment(fields[n], 'alignnone', attr, fields[n + 1])
                       for n in range(0, len(fields), 2))

    # Remove line breaks from the HTML source, protecting those inside <pre>
    # by turning them into <br /> first (converted back to "\n" further down).
    content = re.sub(r'<pre([\s\S]*?)\/pre>',
                     lambda m: '<pre' + m.group(1).replace('\n', '<br />') + '/pre>',
                     content)
    content = content.replace('\n', '')

    # process tistory-specific tags
    content = re.sub(r'\[#M_(.+?)\|(.+?)\|([\s\S]+?)_M#\]', process_moreless, content)
    content = re.sub(r'\[##_(\d)([CLR])\|(.+?)_##\]', process_image, content)
    content = re.sub(r'\[##_Gallery\|(.+?)_##\]', process_gallery, content)

    # Strip P tags: one line break when <p> has no bottom margin, two otherwise.
    paragraph_end = '\n' if useLessMargin is True else '\n\n'
    content = re.sub(r'<[pP]>\s*?<\/[pP]>', paragraph_end, content)
    content = re.sub(r'<[pP]><br ?\/?><\/[pP]>', paragraph_end, content)
    content = re.sub(r'<[pP]>(.*?)<\/[pP]>', '\\1' + paragraph_end, content)

    # Process simple tags; keys must not overlap.
    # The entity keys are spelled out explicitly: mapping a literal space or
    # quote character onto itself would be a no-op.
    replace_map = {'<br>': '\n',
                   '<br/>': '\n',
                   '<br />': '\n',
                   '&nbsp;': ' ',
                   '&quot;': '"'}
    for old, new in replace_map.items():
        content = content.replace(old, new)

    # Drop editor-default declarations from span styles, then unwrap spans
    # whose style attribute ended up empty.
    for declaration in (r'font-size: 9pt;', r'line-height: 1\.5;', r'background-color: transparent;'):
        content = re.sub(r'<span style="(.*?)' + declaration + r'(.*?)">', r'<span style="\1\2">', content)
    content = re.sub(r'<span style="\s*?">([\s\S]*?)<\/span>', r'\1', content)
    return content
def process_comment(raw_comment, next_id, parent_id):
    """Flatten one Tistory comment and its nested replies into WordPress comment dicts.

    Parameters:
        raw_comment: <comment> element; its <commenter> child supplies name,
            ip and homepage, and nested <comment> children are replies.
        next_id: comment id to assign to this comment; replies receive the
            following ids in depth-first order.
        parent_id: id of the parent comment (the top-level call in this file passes 0).

    Returns:
        A list of comment dicts: this comment first, then all of its descendants.
    """
    # Local import: the file-level import only brings in `datetime`.
    from datetime import timezone

    stamp_format = '%Y-%m-%d %H:%M:%S'

    raw_author = raw_comment.find('commenter')
    # Link the comment to a known blog author when the commenter id matches
    # one collected in the module-level `authors` table; 0 otherwise.
    known_author = authors.get(raw_author.get('id'))
    user_id = known_author['author_id'] if known_author is not None else 0

    written = int(raw_comment.find('written').text)

    comment = {
        'comment_id': next_id,
        'comment_author': raw_author.find('name').text,
        'comment_author_email': '',  # tistory does not support comment email
        'comment_author_IP': raw_author.find('ip').text,
        'comment_author_url': raw_author.find('homepage').text,
        # local time of the machine running the conversion
        'comment_date': datetime.fromtimestamp(written).strftime(stamp_format),
        # the *_gmt field carries the same instant expressed in UTC
        'comment_date_gmt': datetime.fromtimestamp(written, tz=timezone.utc).strftime(stamp_format),
        'comment_content': raw_comment.find('content').text,
        'comment_approved': '1',  # kept as a string, as in the original output
        'comment_type': '',
        'comment_parent': str(parent_id),
        'comment_user_id': user_id,
        'commentmeta': [],
    }
    comments = [comment]

    # Replies consume the ids that follow this comment's id.
    next_id += 1
    for raw_subcomment in raw_comment.findall('comment'):
        subcomments = process_comment(raw_subcomment, next_id, comment['comment_id'])
        comments.extend(subcomments)
        next_id += len(subcomments)
    return comments
# author and tag are included in post
def process_post(key_tag):
    """Convert every <key_tag> element under the backup root into WordPress records.

    For each entry this appends the post/page dict and its attachment dicts to
    the module-level ``posts`` list, registers the author in ``authors``,
    registers new tags in ``tags``, and writes each attachment's decoded
    payload to ``attach/<post id>/<label>`` on disk.

    Parameters:
        key_tag: element name to process. 'post' yields post_type 'post';
            any other value (this file uses 'notice') yields 'page' with
            comments and pings closed.
    """
    # Local import: the file-level import only brings in `datetime`.
    from datetime import timezone

    stamp_format = '%Y-%m-%d %H:%M:%S'

    def utc_stamp(moment_seconds):
        # the *_gmt fields carry the instant expressed in UTC
        return datetime.fromtimestamp(moment_seconds, tz=timezone.utc).strftime(stamp_format)

    def slugify(text):
        return quote(text.replace(' ', '-')).lower()

    for raw_post in root.findall(key_tag):
        print('Processing id {}'.format(raw_post.find('id').text))
        is_post = key_tag == 'post'

        # author. tistory manages author information internally and only
        # provides the author ID in the backup, so a placeholder record is
        # built; the user decides how to merge author information.
        author_id = int(raw_post.find('author').text)
        author = {
            'author_id': author_id,
            'author_login': str(author_id),
            'author_email': '[email protected]',
            'author_display_name': str(author_id),
            'author_first_name': '',
            'author_last_name': '',
        }
        if authors.get(author['author_login']) is None:
            authors[author['author_login']] = author

        # post
        post_id = int(raw_post.find('id').text)
        created = int(raw_post.find('created').text)
        # fallback to private if unknown status
        status = post_status.get(raw_post.find('visibility').text, 'private')
        raw_password = raw_post.find('password')
        post = {
            'post_title': raw_post.find('title').text,
            'guid': guid_pattern.format(raw_post.find('id').text),
            'post_author': raw_post.find('author').text,
            'post_content': '',
            'post_excerpt': '',  # tistory doesn't support excerpt text
            'post_id': post_id,
            'post_date': datetime.fromtimestamp(created).strftime(stamp_format),
            'post_date_gmt': utc_stamp(created),
            'comment_status': "open" if is_post and raw_post.find('acceptComment').text == "1" else "closed",
            'ping_status': "open" if is_post and raw_post.find('acceptTrackback').text == "1" else "closed",
            'post_name': quote(raw_post.attrib['slogan'] if is_post
                               else raw_post.find('title').text.replace(' ', '-')).lower(),
            'status': status,
            'post_parent': 0,  # tistory doesn't support post chaining; used for attachments
            'menu_order': 0,
            'post_type': 'post' if is_post else 'page',
            'post_password': raw_password.text if status == 'private' and raw_password is not None else '',
            'is_sticky': 0,
            'attachment_url': '',
            'terms': [],  # category and tags
            'postmeta': [],  # post metadata
            'comments': [],
        }

        # Comment ids start at post_id * 100 and continue across threads.
        for raw_comment in raw_post.findall('comment'):
            post['comments'].extend(
                process_comment(raw_comment, post_id * 100 + len(post['comments']), 0))

        attachments = []
        attachments_table = {}

        # process attachments
        for raw_attachment in raw_post.findall('attachment'):
            label = raw_attachment.find('label').text
            # tistory bug handling (empty attachment)
            if label is None:
                continue
            label = label.replace(' ', '_')
            attached = int(raw_attachment.find('attached').text)
            dt = datetime.fromtimestamp(attached)
            fn = 'attach/{}/{}'.format(post_id, label)

            title = os.path.splitext(label)[0]
            attachment = {
                'post_title': title,
                'guid': '',
                'post_author': post['post_author'],
                'post_content': '',
                'post_excerpt': '',
                # attachment ids continue after this post's comment ids
                'post_id': post_id * 100 + len(post['comments']) + len(attachments),
                'post_date': dt.strftime(stamp_format),
                'post_date_gmt': utc_stamp(attached),
                'comment_status': 'closed',
                'ping_status': 'closed',
                'post_name': slugify(title),
                'status': 'inherit',
                'post_parent': post_id,
                'menu_order': 0,
                'post_type': 'attachment',
                'post_password': '',
                'is_sticky': 0,
                'attachment_url': site + fn,
            }
            attachments.append(attachment)

            # Export the base64 payload; an empty <content/> yields an empty file.
            os.makedirs(os.path.dirname(fn), exist_ok=True)
            with open(fn, 'wb') as outfile:
                outfile.write(base64.b64decode(raw_attachment.find('content').text or ''))

            # Reduce the label to a URL-safe form: percent-encode, drop the
            # special characters (which include "%" and "+"), collapse
            # whitespace/dash runs.
            # NOTE(review): the "%20"/"+" replacement can never match because
            # both characters are stripped just before; presumably this
            # mirrors WordPress sanitize_file_name -- confirm before changing.
            sanitized = quote(label).lower()
            for s in ["?", "[", "]", "/", "\\", "=", "<", ">", ":", ";", ",", "'", "\"", "&", "$", "#", "*", "(", ")", "|", "~", "`", "!", "{", "}", "%", "+"]:
                sanitized = sanitized.replace(s, '')
            for s in ["%20", "+"]:
                sanitized = sanitized.replace(s, '-')
            sanitized = re.sub('[\r\n\t -]+', '-', sanitized)

            # Lookup table used by process_body: XML attributes of the
            # attachment element plus the derived values.
            entry = raw_attachment.attrib.copy()
            entry.update({'label': sanitized,
                          'year': dt.year,
                          'month': dt.month,
                          'id': attachment['post_id'],
                          'url': attachment['attachment_url']})
            attachments_table[raw_attachment.find('name').text] = entry

        # process categories and tags of this post; elements without text are skipped
        for category in raw_post.findall('category'):
            if not category.text:
                continue
            post['terms'].append({'name': category.text,
                                  'slug': slugify(category.text),
                                  'domain': 'category'})
        for tag in raw_post.findall('tag'):
            if not tag.text:
                continue
            post['terms'].append({'name': tag.text,
                                  'slug': slugify(tag.text),
                                  'domain': 'post_tag'})

        # empty collections are omitted from the exported record
        if len(post['terms']) == 0:
            del post['terms']
        if len(post['postmeta']) == 0:
            del post['postmeta']

        post['post_content'] = process_body(raw_post.find('content').text or '',
                                            attachments_table,
                                            raw_post.find('uselessMargin').text == '1')

        # the post is followed directly by its attachments
        posts.append(post)
        posts.extend(attachments)

        # tag. a post may have multiple tags; register each name once globally.
        for raw_tag in raw_post.findall('tag'):
            if not raw_tag.text:
                continue
            if any(item['tag_name'] == raw_tag.text for item in tags):
                continue
            tags.append({'term_id': len(tags),  # unique (sequential) value
                         'tag_name': raw_tag.text,
                         'tag_slug': slugify(raw_tag.text),
                         'tag_description': ''})
# Convert regular posts first, then notices (exported as pages).
process_post('post')
process_post('notice')

# Merge everything into one export document and write it as UTF-8 JSON.
backup = dict(
    authors=authors,
    posts=posts,
    categories=categories,
    tags=tags,
    terms=[],
    base_url=base_url,
    version='1.2',
)
with open('export.json', 'w', encoding='utf8') as outfile:
    outfile.write(json.dumps(backup, ensure_ascii=False, indent=2))
한글 파일명을 Import할 때 문제점
WordPress의 Sanitize 함수는 CJK 문자를 제대로 처리하지 못한다. PHP, 웹서버, 클라이언트 설정이 복합적으로 맞물리면서 대부분의 경우 한글 파일명은 깨져서 제대로 작동하지 않는다.
여러 플러그인이 있지만, 본 프로젝트에서는 그냥 파일명을 URLENCODE 하여 처리한다. 이 때 UTF-8을 사용하므로, 서버도 유니코드 지원하도록 설정되어야 한다. 현재 사용하는 대부분의 서버는 UTF-8 설정으로 작동하므로 별 문제 없을 것이다.
패치 : formatting.php
`wordpress/wp-includes/formatting.php` 파일을 열고, `sanitize_file_name` 함수를 찾는다.
함수의 첫 줄을 다음과 같이 수정한다. 이후 파일 업로드할 때 non-latin 파일명은 자동으로 safe url로 변경된다.
function sanitize_file_name( $filename ) {
$filename = urlencode($filename);
// original function body
패치 : wordpress importer plugin
기본적으로 위와 같이 수행하면 파일명 깨지는 문제는 수정된다.
다만, PHP `basename` 함수의 버그 때문에, 파일명의 첫 바이트가 날아가면서 유니코드 파일명이 손상되는 문제가 생긴다.
워드프레스는 이에 대한 해결책으로 `wp_basename`이라는 함수를 제공하지만, importer 플러그인은 이를 사용하지 않는다.
설치한 플러그인의 `wordpress-importer.php` 파일을 열고, 모든 `basename` 함수를 `wp_basename`으로 수정한다.
아마 세 군데 수정해야 할 것이다.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
TTXML to WordPress JSON converter
What is this?
Converts TTXML (Tistory) backup data to WordPress-compatible JSON backup format (+exported attachments).
Progress
Proof of Concept. Some bugs may exist.
License
MIT(X11) license.
How to use
Edit the `site` and `guid_pattern` values to reflect your configuration.
How to mod WordPress Importer plugin
I cannot provide modded plugin itself because of license issue.
That's all!