Skip to content

Instantly share code, notes, and snippets.

@adrianhall
Created May 4, 2018 23:07
Show Gist options
  • Save adrianhall/5e3531e316356405e74d68c56c336de0 to your computer and use it in GitHub Desktop.
Save adrianhall/5e3531e316356405e74d68c56c336de0 to your computer and use it in GitHub Desktop.
Convert an exported wordpress.com post to Jekyll
#!/usr/bin/env bash
# Convert an exported WordPress post (<name>.html) into a Jekyll Markdown post.
# Usage: <this script> <name>.html
#
# Stage 1: split the export into its front matter and its HTML body under tmp/.
if [ "$#" -lt 1 ]; then
  echo "Usage: $0 <post>.html" >&2
  exit 1
fi
echo "Processing $1"
# Strip only a trailing ".html"; a global substitution would also eat any
# ".html" that happens to appear in the middle of the name.
base=${1%.html}
# The intermediate files live in tmp/, so make sure it exists.
mkdir -p tmp
# csplit cuts the export in front of a line matching '---' and writes the pieces
# to xx00 and xx01: xx00 is used as the YAML front matter, xx01 (which starts
# with the matched '---' line) as the HTML body.
# NOTE(review): whether a '---' on the very first line counts as the match
# appears to differ between csplit implementations - confirm on the target OS.
csplit "$base.html" '/---/' || exit 1
mv xx00 "tmp/$base.yaml"
mv xx01 "tmp/$base.html"
# Set metadata: force the author and drop the WordPress-only keys from the
# front matter, then rewrite tmp/<name>.yaml in place.
# The path is handed over as an argument and the program is fed through a
# quoted here-document, so the shell never interpolates into the Python source
# (a quote character in the file name can no longer break the program).
python - "tmp/$base.yaml" <<'PYTHON'
import sys

import yaml

path = sys.argv[1]

with open(path, 'r') as stream:
    # safe_load instead of yaml.load: the front matter is plain data, and
    # yaml.load without an explicit Loader can construct arbitrary objects
    # (and is rejected outright by recent PyYAML releases).
    data = yaml.safe_load(stream)

if not isinstance(data, dict):
    sys.exit('%s: expected a YAML mapping as front matter' % path)

data['author'] = 'adrianhall'
# pop() with a default: not every export is guaranteed to carry every key.
for key in ('meta', 'parent_id', 'password', 'published', 'status'):
    data.pop(key, None)

with open(path, 'w') as stream:
    # explicit_start emits the leading '---'; the closing '---' that ends the
    # front matter is written by hand.
    stream.write(yaml.dump(data, explicit_start=True, explicit_end=False))
    stream.write('---\n')
PYTHON
# Convert tmp/<name>.html to Markdown in tmp/<name>.md, turning WordPress
# [code]...[/code] shortcodes into fenced code blocks.
# Paths are passed as arguments and the program is fed through a quoted
# here-document, so no shell escaping is needed inside the Python source.
python - "tmp/$base.html" "tmp/$base.md" <<'PYTHON'
from __future__ import unicode_literals

import io
import re
import string
import sys

import html2text

try:
    from html import unescape  # Python 3.4+
except ImportError:
    from HTMLParser import HTMLParser  # Python 2
    unescape = HTMLParser().unescape

source_path, target_path = sys.argv[1:3]

# Regular expressions for a short code, its language attribute and its body
SHORTCODE = re.compile(r'(\[code.*?\].*?\[\/code\])', re.S)
LANGUAGE = re.compile(r'\[code language="(.*?)"', re.S)
BODY = re.compile(r'\[code.*?\](.*?)\[\/code\]', re.S)


def placeholder(index):
    """Return the marker that stands in for shortcode number *index*."""
    return '###SHORTCODE-SNIPPET-' + str(index) + '###'


# Read the file
with io.open(source_path, 'r', encoding='utf-8', errors='ignore') as content_file:
    # Skip first line
    content_file.readline()
    content = content_file.read()

# Build one fenced Markdown block per shortcode
shortcodes = SHORTCODE.findall(content)
fenced = []
for shortcode in shortcodes:
    # Undo the paragraph/line-break markup and HTML entities inside the snippet
    # before looking at it.
    snippet = unescape(
        shortcode.replace('<br />', '').replace('</p>', '').replace('<p>', '\n'))
    # See if there is a language bit; plain [code] falls back to 'text'
    languages = LANGUAGE.findall(snippet)
    lang = languages[0] if languages else 'text'
    body = BODY.findall(snippet)[0]
    # Keep the fences on their own lines even when the shortcode body does not
    # begin or end with a newline.
    if not body.startswith('\n'):
        body = '\n' + body
    if not body.endswith('\n'):
        body += '\n'
    fenced.append('```' + lang + body + '```\n')

# Replace all the instances of the code with the snippet marker so that the
# Markdown conversion below never sees the code itself
markdown = content
for index, shortcode in enumerate(shortcodes):
    markdown = markdown.replace(shortcode, placeholder(index))

# Strip out non-ascii characters which sometimes appear in Wordpress output
printable = set(string.printable)
markdown = ''.join(ch for ch in markdown if ch in printable)

# Convert to Markdown
markdown = html2text.html2text(markdown)

# Now put the snippets back in
for index, block in enumerate(fenced):
    markdown = markdown.replace(placeholder(index), block)

# Remove duplicate blank lines (any run of them, not just the first three)
markdown = re.sub(r'\n{3,}', '\n\n', markdown)

# Write the Markdown body
with io.open(target_path, 'w', encoding='utf-8') as stream:
    stream.write(markdown)
PYTHON
# Concatenate the YAML front matter and the Markdown body into <name>.md.
# Quoted so that file names containing spaces or glob characters survive.
cat "tmp/$base.yaml" "tmp/$base.md" >"$base.md"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment