adrianhall · May 4, 2018 23:07
diff --git a/wpconvert.sh b/wpconvert.sh
 #!/usr/bin/env bash
 #
 # Usage: wpconvert.sh <post>.html
 #
 # Converts one exported WordPress post into <post>.md: the export is split
 # into YAML front matter and an HTML body, each part is rewritten, and the
 # results are concatenated. Intermediate files are kept under ./tmp.
 set -euo pipefail

 if [[ $# -lt 1 || -z "$1" ]]; then
   printf 'Usage: %s <post>.html\n' "${0##*/}" >&2
   exit 2
 fi

 printf 'Processing %s\n' "$1"
 # Strip only a trailing .html. The previous ${1//.html/} removed every
 # occurrence, mangling names such as "a.html5-notes.html".
 base=${1%.html}

 # Intermediate files live in ./tmp; make sure the directory exists.
 mkdir -p tmp

 # Split the export at the '---' delimiter line. csplit leaves its pieces in
 # xx00 and xx01 in the current directory; the first is kept as the YAML
 # front matter and the second as the HTML body.
 csplit -- "$base.html" '/---/'
 mv xx00 "tmp/$base.yaml"
 mv xx01 "tmp/$base.html"

 # Rewrite the front matter: force the author and drop WordPress-only keys.
 # The YAML path is passed as an argument instead of being spliced into the
 # program text, so quotes or backslashes in the name cannot break the code.
 # The leading "if True:" lets the embedded program share this script's
 # indentation.
 python -c 'if True:
     import sys
     import yaml

     path = sys.argv[1]
     with open(path, "r") as stream:
         # safe_load builds plain Python types only; a bare yaml.load(stream)
         # is rejected by PyYAML 6 and later. Empty front matter becomes {}.
         data = yaml.safe_load(stream) or {}

     data["author"] = "adrianhall"
     # pop() with a default tolerates posts that lack one of these keys.
     for key in ("meta", "parent_id", "password", "published", "status"):
         data.pop(key, None)

     with open(path, "w") as stream:
         stream.write(yaml.dump(data, explicit_start=True, explicit_end=False))
         # Closing delimiter of the front matter block.
         stream.write("---\n")
 ' "tmp/$base.yaml"

 # Convert the HTML body to Markdown, turning WordPress [code] shortcodes
 # into fenced code blocks. Input and output paths are passed as arguments
 # rather than spliced into the program text. The leading "if True:" lets
 # the embedded program share this script's indentation.
 python -c 'if True:
     import re
     import string
     import sys

     import html2text

     try:
         # Python 2
         from HTMLParser import HTMLParser
         unescape = HTMLParser().unescape
     except ImportError:
         # Python 3
         from html import unescape

     src_path, dst_path = sys.argv[1], sys.argv[2]

     def placeholder(index):
         # Marker that stands in for shortcode number index during conversion.
         return "###SHORTCODE-SNIPPET-" + str(index) + "###"

     # Read the body, skipping its first line.
     with open(src_path, "r") as content_file:
         content_file.readline()
         content = content_file.read()

     # Every [code ...]...[/code] shortcode, in document order.
     shortcodes = re.findall(r"(\[code.*?\].*?\[\/code\])", content, re.S)

     fences = []
     for shortcode in shortcodes:
         snippet = unescape(
             shortcode.replace("<br />", "").replace("</p>", "").replace("<p>", "\n"))
         # Use the language attribute when present, otherwise plain text.
         # (The old tuple-indexing form evaluated the missing match eagerly
         # and crashed on shortcodes without a language.)
         languages = re.findall(r"\[code language=\"(.*?)\"", snippet, re.S)
         lang = languages[0] if languages else "text"
         body = re.findall(r"\[code.*?\](.*?)\[\/code\]", snippet, re.S)
         fences.append("```" + lang + body[0] + "```\n")

     # Park each shortcode behind a marker so the Markdown conversion
     # does not touch it.
     markdown = content
     for index, shortcode in enumerate(shortcodes):
         markdown = markdown.replace(shortcode, placeholder(index))

     # Strip characters outside string.printable, which sometimes appear in
     # WordPress output. join() yields a string on Python 2 and 3 alike.
     printable = set(string.printable)
     markdown = "".join(ch for ch in markdown if ch in printable)

     # Convert to Markdown.
     markdown = html2text.html2text(markdown)

     # Put the fenced snippets back in place of their markers.
     for index, fence in enumerate(fences):
         markdown = markdown.replace(placeholder(index), fence)

     # Collapse any run of blank lines down to a single blank line.
     markdown = re.sub(r"\n{3,}", "\n\n", markdown)

     # Write the Markdown body next to the other intermediate files.
     with open(dst_path, "w") as stream:
         stream.write(markdown)
 ' "tmp/$base.html" "tmp/$base.md"

 # Concatenate the rewritten front matter and the Markdown body into the
 # final post. Paths are quoted so names containing spaces or glob
 # characters are handled as a single file each.
 cat "tmp/$base.yaml" "tmp/$base.md" >"$base.md"
	#!/usr/bin/env bash
	#
	# Usage: wpconvert.sh <post>.html
	#
	# Converts one exported WordPress post into <post>.md: the export is split
	# into YAML front matter and an HTML body, each part is rewritten, and the
	# results are concatenated. Intermediate files are kept under ./tmp.
	set -euo pipefail

	if [[ $# -lt 1 || -z "$1" ]]; then
	  printf 'Usage: %s <post>.html\n' "${0##*/}" >&2
	  exit 2
	fi

	printf 'Processing %s\n' "$1"
	# Strip only a trailing .html. The previous ${1//.html/} removed every
	# occurrence, mangling names such as "a.html5-notes.html".
	base=${1%.html}

	# Intermediate files live in ./tmp; make sure the directory exists.
	mkdir -p tmp

	# Split the export at the '---' delimiter line. csplit leaves its pieces in
	# xx00 and xx01 in the current directory; the first is kept as the YAML
	# front matter and the second as the HTML body.
	csplit -- "$base.html" '/---/'
	mv xx00 "tmp/$base.yaml"
	mv xx01 "tmp/$base.html"

	# Rewrite the front matter: force the author and drop WordPress-only keys.
	# The YAML path is passed as an argument instead of being spliced into the
	# program text, so quotes or backslashes in the name cannot break the code.
	# The leading "if True:" lets the embedded program share this script's
	# indentation; the nesting of the with-blocks is restored below.
	python -c 'if True:
	    import sys
	    import yaml

	    path = sys.argv[1]
	    with open(path, "r") as stream:
	        # safe_load builds plain Python types only; a bare yaml.load(stream)
	        # is rejected by PyYAML 6 and later. Empty front matter becomes {}.
	        data = yaml.safe_load(stream) or {}

	    data["author"] = "adrianhall"
	    # pop() with a default tolerates posts that lack one of these keys.
	    for key in ("meta", "parent_id", "password", "published", "status"):
	        data.pop(key, None)

	    with open(path, "w") as stream:
	        stream.write(yaml.dump(data, explicit_start=True, explicit_end=False))
	        # Closing delimiter of the front matter block.
	        stream.write("---\n")
	' "tmp/$base.yaml"

	# Convert the HTML body to Markdown, turning WordPress [code] shortcodes
	# into fenced code blocks. Input and output paths are passed as arguments
	# rather than spliced into the program text. The leading "if True:" lets
	# the embedded program share this script's indentation.
	python -c 'if True:
	    import re
	    import string
	    import sys

	    import html2text

	    try:
	        # Python 2
	        from HTMLParser import HTMLParser
	        unescape = HTMLParser().unescape
	    except ImportError:
	        # Python 3
	        from html import unescape

	    src_path, dst_path = sys.argv[1], sys.argv[2]

	    def placeholder(index):
	        # Marker that stands in for shortcode number index during conversion.
	        return "###SHORTCODE-SNIPPET-" + str(index) + "###"

	    # Read the body, skipping its first line.
	    with open(src_path, "r") as content_file:
	        content_file.readline()
	        content = content_file.read()

	    # Every [code ...]...[/code] shortcode, in document order. The lazy
	    # ".*?" quantifiers are required: with ".?" only shortcodes holding at
	    # most one character would ever match.
	    shortcodes = re.findall(r"(\[code.*?\].*?\[\/code\])", content, re.S)

	    fences = []
	    for shortcode in shortcodes:
	        snippet = unescape(
	            shortcode.replace("<br />", "").replace("</p>", "").replace("<p>", "\n"))
	        # Use the language attribute when present, otherwise plain text.
	        # (The old tuple-indexing form evaluated the missing match eagerly
	        # and crashed on shortcodes without a language.)
	        languages = re.findall(r"\[code language=\"(.*?)\"", snippet, re.S)
	        lang = languages[0] if languages else "text"
	        body = re.findall(r"\[code.*?\](.*?)\[\/code\]", snippet, re.S)
	        fences.append("```" + lang + body[0] + "```\n")

	    # Park each shortcode behind a marker so the Markdown conversion
	    # does not touch it.
	    markdown = content
	    for index, shortcode in enumerate(shortcodes):
	        markdown = markdown.replace(shortcode, placeholder(index))

	    # Strip characters outside string.printable, which sometimes appear in
	    # WordPress output. join() yields a string on Python 2 and 3 alike.
	    printable = set(string.printable)
	    markdown = "".join(ch for ch in markdown if ch in printable)

	    # Convert to Markdown.
	    markdown = html2text.html2text(markdown)

	    # Put the fenced snippets back in place of their markers.
	    for index, fence in enumerate(fences):
	        markdown = markdown.replace(placeholder(index), fence)

	    # Collapse any run of blank lines down to a single blank line.
	    markdown = re.sub(r"\n{3,}", "\n\n", markdown)

	    # Write the Markdown body next to the other intermediate files.
	    with open(dst_path, "w") as stream:
	        stream.write(markdown)
	' "tmp/$base.html" "tmp/$base.md"

	# Concatenate the rewritten front matter and the Markdown body into the
	# final post. Paths are quoted so names containing spaces or glob
	# characters are handled as a single file each.
	cat "tmp/$base.yaml" "tmp/$base.md" >"$base.md"