Last active
November 5, 2018 16:25
-
-
Save suminb/0c9f9961727639261009b4a5f078501c to your computer and use it in GitHub Desktop.
WordPress 게시물을 .rst 형식으로 변환하는 스크립트
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Converts old blog posts (from WordPress) to .rst files.""" | |
import os | |
import re | |
import warnings | |
import pypandoc | |
import yaml | |
# Directory that main() scans for the WordPress posts (*.html) to convert.
SOURCE_PATH = 'posts.bak'
# Directory that receives the generated .rst files and the per-post image
# folders (one folder per yyyy-mm-dd date).
TARGET_PATH = 'posts.rst'
def process_file(filename):
    """Converts a single WordPress post file.

    :param filename: Path of the post; its base name must contain a
        yyyy-mm-dd date
    :return: The value returned by ``convert`` for the file's content, i.e.
        a ``(rst_text, target_filename)`` tuple
    :raises ValueError: If no date can be found in the file name
    """
    datetime = extract_datetime(filename)
    # Decode explicitly instead of relying on the locale's default encoding:
    # the posts are assumed to be UTF-8 and may contain non-ASCII text, which
    # a non-UTF-8 default (e.g. on Windows) would fail to decode or garble.
    with open(filename, encoding='utf-8') as fin:
        return convert(fin.read(), datetime)
def convert(source, datetime):
    """Converts a document.

    The HTML body is converted to reStructuredText, images referenced from
    ``wp-content/uploads`` are migrated, and the original metadata is
    appended as an RST comment at the end of the document.

    :param source: A WordPress document (YAML front matter delimited by
        ``---`` followed by the HTML body)
    :param datetime: String representation of datetime (yyyy-mm-dd)
    :return: A ``(rst_text, target_filename)`` tuple where
        ``target_filename`` is ``'<datetime>.rst'``
    :raises KeyError: If the metadata has no ``title`` entry
    """
    metadata_raw, html = split_document(source)
    # NOTE: this used to be ``yaml.load(metadata_raw)``. Calling ``load``
    # without a ``Loader`` is deprecated since PyYAML 5.1 and raises a
    # TypeError on PyYAML 6+. Front matter is plain data, so ``safe_load``
    # parses it the same way without allowing arbitrary object construction.
    metadata = yaml.safe_load(metadata_raw)

    rst = pypandoc.convert_text(html, 'rst', format='html')
    rst = migrate_all_images(rst, datetime)

    metadata_text = yaml.dump(
        metadata, default_flow_style=False, allow_unicode=True)
    # Indent every metadata line (including the first one) so that the whole
    # dump belongs to the RST comment that precedes it.
    indent = ' '
    metadata_text = indent + f'\n{indent}'.join(metadata_text.split('\n'))

    title = metadata['title']
    # The underline must be at least as wide as the title line; non-ASCII
    # characters are counted as two columns by ``width``.
    headerline = '=' * (len(datetime) + 1 + width(title))

    document = (
        f'{datetime} {title}\n'
        f'{headerline}\n'
        f'{rst}\n'
        '.. (Metadata from the original post)\n'
        f'{metadata_text}'
    ).strip()
    return document, f'{datetime}.rst'
def split_document(source):
    """Splits a post into its front matter and its body.

    The first three characters are skipped as the opening ``---`` delimiter
    and the next ``---`` is taken as the closing one.

    :param source: Full text of the post
    :return: A ``(front_matter, body)`` tuple, both stripped of surrounding
        whitespace
    :raises ValueError: If no closing ``---`` delimiter is found
    """
    delimiter = '---'
    start = len(delimiter)
    end = source.index(delimiter, start)
    front_matter = source[start:end]
    body = source[end + len(delimiter):]
    return front_matter.strip(), body.strip()
def width(string):
    """Returns the display width of ``string`` in columns.

    ASCII characters count as one column, every other character as two.
    """
    columns = 0
    for char in string:
        columns += 2 if not char.isascii() else 1
    return columns
def extract_datetime(path):
    """Extracts datetime (yyyy-mm-dd) from a filename (or path).

    Only the last path component is searched, so a date that appears in a
    directory name is ignored.

    :raises ValueError: If the file name contains no yyyy-mm-dd date
    """
    filename = os.path.basename(path)
    found = re.search(r'\d{4}-\d{2}-\d{2}', filename)
    if found is None:
        raise ValueError(f'No date was found in {path}')
    return found.group(0)
def migrate_image(src_path, dest_dir):
    """Moves an image into ``dest_dir`` and returns its new path.

    :param src_path: e.g., wp-content/uploads/2006/02/image.jpg
    :param dest_dir: e.g., posts.rst/2006-02-18
    :return: ``dest_dir`` joined with the base name of ``src_path``. The
        path is returned even when the source file is missing; in that case
        a warning is issued and nothing is moved.
    """
    filename = os.path.basename(src_path)
    dest_path = os.path.join(dest_dir, filename)

    # ``makedirs`` also creates missing parents (``os.mkdir`` failed when the
    # parent of ``dest_dir`` did not exist yet) and ``exist_ok`` avoids the
    # check-then-create race.
    os.makedirs(dest_dir, exist_ok=True)

    if os.path.exists(src_path):
        os.rename(src_path, dest_path)
    else:
        warnings.warn(f'{src_path} does not exist')
    return dest_path
def migrate_all_images(source, datetime):
    """Migrates every uploaded image referenced by a post and fixes its links.

    :param source: reStructuredText of the post
    :param datetime: String representation of datetime (yyyy-mm-dd); names
        the image directory created under ``TARGET_PATH``
    :return: ``source`` with the old blog domains removed and every
        ``wp-content/uploads/...`` reference (with or without a leading
        ``/``) replaced by the image's new path relative to ``TARGET_PATH``
    """
    # Pre-processing: drop the old blog domains so that absolute image URLs
    # become site-relative paths.
    for old_domain in (
        'http://blog...(old domain 1)',
        'http://blog...(old domain 2)',
        'http://blog...(old domain 3)',
    ):
        source = source.replace(old_domain, '')

    # The path stops at whitespace, quotes, backticks and angle brackets so a
    # match cannot run past the end of the reference into following text (a
    # greedy ``.+`` swallowed everything up to the last ``.ext`` on the line).
    # The optional leading '/' is consumed so it is replaced with the path.
    pattern = re.compile(r'/?(wp-content/uploads/[^\s<>"\'`]+\.\w+)')
    dest_dir = os.path.join(TARGET_PATH, datetime)
    # Each file can only be moved once; remember where it went so repeated
    # references reuse the result instead of warning about a missing source.
    migrated = {}

    def relocate(match):
        path = match.group(1)
        if path not in migrated:
            dest_path = migrate_image(path, dest_dir)
            # Resolve the path difference between a post and an image
            migrated[path] = os.path.relpath(dest_path, TARGET_PATH)
        return migrated[path]

    # A single substitution pass keeps match positions consistent; searching
    # from an index taken before the text was replaced could skip references
    # once the replacement changed the length of the text.
    return pattern.sub(relocate, source)
def main():
    """Converts every ``.html`` post in SOURCE_PATH to an ``.rst`` file.

    Files with any other extension are skipped. Each result is written to
    TARGET_PATH under the file name returned by ``process_file``.
    """
    # The output directory is not guaranteed to exist on a fresh run.
    os.makedirs(TARGET_PATH, exist_ok=True)

    for filename in os.listdir(SOURCE_PATH):
        if not filename.endswith('.html'):
            continue
        print(f'Converting {filename}...')
        rst, target_file = process_file(os.path.join(SOURCE_PATH, filename))
        # Write UTF-8 explicitly: the converted text may contain non-ASCII
        # characters that the locale's default encoding cannot represent.
        target_path = os.path.join(TARGET_PATH, target_file)
        with open(target_path, 'w', encoding='utf-8') as fout:
            fout.write(rst)
# Run the conversion only when the file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment