Skip to content

Instantly share code, notes, and snippets.

@cou929
Created May 3, 2011 12:47
Show Gist options
  • Save cou929/953267 to your computer and use it in GitHub Desktop.
Save cou929/953267 to your computer and use it in GitHub Desktop.
Convert a file exported from wordpress.com so that it can be imported into Posterous.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import sys
import re
from xml.dom.minidom import parse
def markdown_nize(html):
    """Convert one post body to Markdown wrapped in ``<markdown>`` tags.

    The text is encoded to UTF-8 and then passed through each converter in
    turn: source-code blocks, links, ``<h2>``/``<h3>`` headings, unordered
    lists and ordered lists.

    Args:
        html: Post body; it must provide ``encode('utf-8')``.

    Returns:
        The converted text preceded by a ``<markdown>`` line and followed
        by a ``</markdown>`` line.
    """
    text = html.encode('utf-8')
    # Order matters: code blocks are rewritten before any other markup.
    for convert in (codeblock2md, link2md, h22md, h32md, ul2md, ol2md):
        text = convert(text)
    return '<markdown>\n' + text + '</markdown>\n'
def codeblock2md(html):
    """Convert ``[sourcecode]`` shortcode blocks into Markdown code blocks.

    The text is processed line by line.  A line containing an opening
    ``[sourcecode language='...']`` tag starts a code block and a line
    containing ``[/sourcecode]`` ends it; both marker lines are dropped
    entirely.  Lines in between are emitted with a four-space indent, and
    when the declared language is ``cpp`` a ``#!cpp`` line is emitted first.

    Args:
        html: Text to convert.

    Returns:
        The converted text, with every emitted line terminated by a
        newline character.
    """
    # Raw strings keep the regex escapes from being read as string escapes.
    opening = re.compile(r'\[sourcecode language=[\'"](\w+)[\'"].*?\]')
    closing = re.compile(r'\[/sourcecode\]')
    # Markdown only treats a line as code when it is indented by at least
    # four spaces (or a tab).
    indent = '    '

    out = []
    in_codeblock = False
    for line in html.split("\n"):
        if not in_codeblock:
            started = opening.search(line)
            if started is None:
                out.append(line)
                continue
            in_codeblock = True
            if started.group(1) == 'cpp':
                out.append(indent + '#!cpp')
        elif closing.search(line):
            in_codeblock = False
        else:
            out.append(indent + line)
    return ''.join(piece + "\n" for piece in out)
def link2md(html):
    """Rewrite HTML anchors as Markdown links, one line at a time.

    Every ``<a ... href="URL" ...>TEXT</a>`` on a line becomes
    ``[TEXT](URL)``.  Anchors without a double-quoted ``href`` attribute
    (for example ``<a name="top"></a>``) and anchors with empty text are
    left untouched.

    Args:
        html: Text to convert.

    Returns:
        The converted text, with every line terminated by a newline
        character.
    """
    # A single pattern both locates the anchor and captures its parts, so
    # the replacement can never be applied to a different anchor than the
    # one that was parsed.  ``[^>]`` keeps the match inside one tag.
    anchor = re.compile(r'<a\b[^>]*?\shref="([^"]+)"[^>]*>(.*?)</a>')

    def to_markdown(match):
        text = match.group(2)
        if not text:
            return match.group(0)
        return "[{0}]({1})".format(text, match.group(1))

    # A function replacement is used so that backslashes in the link text or
    # URL are copied literally instead of being read as template escapes.
    return ''.join(anchor.sub(to_markdown, line) + "\n"
                   for line in html.split("\n"))
def h22md(html):
    """Rewrite ``<h2>`` headings as ``# title #``, one line at a time.

    Each ``<h2 ...>title</h2>`` element on a line is converted on its own,
    so several headings on one line do not bleed into each other.  Headings
    with empty content are left untouched.

    Args:
        html: Text to convert.

    Returns:
        The converted text, with every line terminated by a newline
        character.
    """
    heading = re.compile(r'<h2\b[^>]*>(.*?)</h2>')

    def to_markdown(match):
        title = match.group(1)
        if not title:
            return match.group(0)
        return "# {0} #".format(title)

    # A function replacement copies the title literally; passing it as a
    # replacement string would interpret backslashes in the title.
    return ''.join(heading.sub(to_markdown, line) + "\n"
                   for line in html.split("\n"))
def h32md(html):
    """Rewrite ``<h3>`` headings as ``## title ##``, one line at a time.

    Each ``<h3 ...>title</h3>`` element on a line is converted on its own,
    so several headings on one line do not bleed into each other.  Headings
    with empty content are left untouched.

    Args:
        html: Text to convert.

    Returns:
        The converted text, with every line terminated by a newline
        character.
    """
    heading = re.compile(r'<h3\b[^>]*>(.*?)</h3>')

    def to_markdown(match):
        title = match.group(1)
        if not title:
            return match.group(0)
        return "## {0} ##".format(title)

    # A function replacement copies the title literally; passing it as a
    # replacement string would interpret backslashes in the title.
    return ''.join(heading.sub(to_markdown, line) + "\n"
                   for line in html.split("\n"))
def ul2md(html):
    """Rewrite ``<ul>`` lists as Markdown bullet lists.

    The text is processed line by line.  A line containing ``<ul>`` opens a
    list and a line containing ``</ul>`` closes it; both marker lines are
    dropped entirely.  Inside a list every ``<li ...>text</li>`` element
    becomes ``* text``; other lines inside the list, and ``<li>`` elements
    with empty content, are passed through unchanged.  Lines outside a list
    are never modified.

    Args:
        html: Text to convert.

    Returns:
        The converted text, with every emitted line terminated by a newline
        character.
    """
    opening = re.compile('<ul>')
    closing = re.compile('</ul>')
    # ``\b`` stops the pattern from matching other tags that merely start
    # with "li" (such as ``<link>``); the lazy group keeps items separate.
    item = re.compile(r'<li\b[^>]*>(.*?)</li>')
    adjacent_items = re.compile(r'</li>\s*(?=<li\b)')

    def to_bullet(match):
        content = match.group(1)
        if not content:
            return match.group(0)
        return "* {0}".format(content)

    out = []
    in_list = False
    for line in html.split("\n"):
        if not in_list:
            if opening.search(line):
                in_list = True
            else:
                out.append(line)
        elif closing.search(line):
            in_list = False
        else:
            # Items written back to back on one source line each need their
            # own output line to form separate bullets.
            separated = adjacent_items.sub('</li>\n', line)
            # Function replacement: item text is copied literally, so
            # backslashes in it are not treated as template escapes.
            out.append(item.sub(to_bullet, separated))
    return ''.join(piece + "\n" for piece in out)
def ol2md(html):
    """Rewrite ``<ol>`` lists as Markdown numbered lists.

    The text is processed line by line.  A line containing ``<ol>`` opens a
    list and restarts the numbering at 1; a line containing ``</ol>`` closes
    it; both marker lines are dropped entirely.  Inside a list every
    ``<li ...>text</li>`` element becomes ``N. text`` where N counts the
    items converted so far in that list.  Other lines inside the list, and
    ``<li>`` elements with empty content, are passed through unchanged and
    do not consume a number.  Lines outside a list are never modified.

    Args:
        html: Text to convert.

    Returns:
        The converted text, with every emitted line terminated by a newline
        character.
    """
    opening = re.compile('<ol>')
    closing = re.compile('</ol>')
    # ``\b`` stops the pattern from matching other tags that merely start
    # with "li" (such as ``<link>``); the lazy group keeps items separate.
    item = re.compile(r'<li\b[^>]*>(.*?)</li>')
    adjacent_items = re.compile(r'</li>\s*(?=<li\b)')
    # Held in a one-element list so the nested function can update it
    # without ``nonlocal``, which the Python 2 syntax of this file lacks.
    next_number = [1]

    def to_numbered(match):
        content = match.group(1)
        if not content:
            return match.group(0)
        number = next_number[0]
        next_number[0] = number + 1
        return "{0}. {1}".format(number, content)

    out = []
    in_list = False
    for line in html.split("\n"):
        if not in_list:
            if opening.search(line):
                in_list = True
                next_number[0] = 1
            else:
                out.append(line)
        elif closing.search(line):
            in_list = False
        else:
            # Items written back to back on one source line each need their
            # own output line to form separate entries.
            separated = adjacent_items.sub('</li>\n', line)
            # Function replacement: item text is copied literally, so
            # backslashes in it are not treated as template escapes.
            out.append(item.sub(to_numbered, separated))
    return ''.join(piece + "\n" for piece in out)
def main():
    """Convert the export file named by the first command-line argument.

    The file is parsed as XML, the first child node of every
    ``content:encoded`` element that has children is replaced by its
    Markdown conversion, and the resulting document is written to standard
    output as UTF-8 XML followed by a newline.
    """
    document = parse(sys.argv[1])
    for encoded in document.getElementsByTagName('content:encoded'):
        children = encoded.childNodes
        if not children:
            continue
        body = children[0]
        converted = markdown_nize(body.nodeValue)
        # markdown_nize returns UTF-8 encoded text; decode it before storing
        # it back into the DOM node.
        body.nodeValue = converted.decode('utf-8')
    sys.stdout.write(document.toxml('utf-8') + "\n")


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment