Created
May 4, 2018 23:07
-
-
Save adrianhall/5e3531e316356405e74d68c56c336de0 to your computer and use it in GitHub Desktop.
Convert an exported wordpress.com post to Jekyll
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
#
# Convert an exported wordpress.com post to Jekyll.
#
# Usage: <this-script> <post>.html
#
# Input:  <post>.html -- YAML front matter, a '---' separator line, then the
#         HTML body of the post.
# Output: <post>.md   -- cleaned-up front matter followed by the body
#         converted to Markdown, with [code] shortcodes turned into fenced
#         code blocks.
#
# Requires csplit and a `python` interpreter that can import PyYAML (yaml)
# and html2text. Intermediate files are written under ./tmp.

set -e

if [ "$#" -ne 1 ]; then
    echo "Usage: $0 <post>.html" >&2
    exit 1
fi

echo "Processing $1"

# Strip only the trailing ".html" (not every occurrence inside the name).
base=${1%.html}

# The intermediate files live under tmp/; make sure the directory exists.
mkdir -p "$(dirname "tmp/$base.yaml")"

# Split the export at the line containing '---': xx00 receives the front
# matter, xx01 the remainder, which starts with the '---' separator line
# (the converter below skips that first line).
# NOTE(review): when the export itself starts with '---' this presumably
# relies on csplit not treating line 1 as a match (BSD/macOS behaviour);
# GNU csplit may produce an empty xx00 instead -- verify on your platform.
csplit "$base.html" '/---/'
mv xx00 "tmp/$base.yaml"
mv xx01 "tmp/$base.html"

# Set metadata: force the author and drop the WordPress-only keys.
# The file name is passed as an argument (not spliced into the Python source)
# so names containing quotes or '$' cannot break the program.
python - "tmp/$base.yaml" <<'PY'
import sys

import yaml

path = sys.argv[1]

with open(path, 'r') as stream:
    # safe_load: the front matter is plain data, so there is no reason to
    # allow arbitrary object construction. An empty document loads as None.
    data = yaml.safe_load(stream) or {}

data['author'] = 'adrianhall'

# pop() instead of del: not every export carries all of these keys.
for key in ('meta', 'parent_id', 'password', 'published', 'status'):
    data.pop(key, None)

with open(path, 'w') as stream:
    stream.write(yaml.dump(data, explicit_start=True, explicit_end=False))
    # Closing marker of the Jekyll front matter.
    stream.write('---\n')
PY

# Convert the HTML body to Markdown, preserving [code] shortcodes as fenced
# code blocks.
python - "tmp/$base.html" "tmp/$base.md" <<'PY'
import re
import string
import sys

import html2text

try:
    from html import unescape  # Python 3.4+
except ImportError:  # Python 2
    from HTMLParser import HTMLParser
    unescape = HTMLParser().unescape

src_path, dst_path = sys.argv[1:3]

# Read the file
with open(src_path, 'r') as content_file:
    # Skip first line (the '---' separator left at the top by the split)
    content_file.readline()
    content = content_file.read()

# Every [code ...]...[/code] shortcode, in document order.
shortcodes = re.findall(r'(\[code.*?\].*?\[\/code\])', content, re.S)

# Build the fenced replacement for each shortcode.
fences = []
for shortcode in shortcodes:
    # Undo the paragraph/line-break markup and HTML entities that ended up
    # inside the snippet.
    snippet = unescape(
        shortcode.replace('<br />', '').replace('</p>', '').replace('<p>', '\n'))
    # Use the declared language if there is one, otherwise plain text.
    lang_match = re.search(r'\[code language="(.*?)"', snippet, re.S)
    lang = lang_match.group(1) if lang_match else 'text'
    body = re.findall(r'\[code.*?\](.*?)\[\/code\]', snippet, re.S)[0]
    fences.append('```' + lang + body + '```\n')

# Swap each shortcode for a marker so html2text never sees the code itself.
markers = ['###SHORTCODE-SNIPPET-' + str(i) + '###'
           for i in range(len(shortcodes))]
text = content
for shortcode, marker in zip(shortcodes, markers):
    text = text.replace(shortcode, marker)

# Strip out non-ascii characters which sometimes appear in Wordpress output.
# ''.join(...) yields a string on both Python 2 and Python 3.
printable = set(string.printable)
text = ''.join(ch for ch in text if ch in printable)

# Convert to Markdown
text = html2text.html2text(text)

# Now put the snippets back in
for marker, fence in zip(markers, fences):
    text = text.replace(marker, fence)

# Collapse any run of blank lines down to a single blank line.
text = re.sub(r'\n{3,}', '\n\n', text)

# Write the Markdown body next to the other intermediate files
with open(dst_path, 'w') as stream:
    stream.write(text)
PY

# Concatenate the front matter and the Markdown body into the final post.
cat "tmp/$base.yaml" "tmp/$base.md" > "$base.md"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment