Created
May 3, 2011 12:47
-
-
Save cou929/953267 to your computer and use it in GitHub Desktop.
Convert a file exported from wordpress.com so that it can be imported into Posterous.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
# -*- coding: utf-8 -*- | |
import sys | |
import re | |
from xml.dom.minidom import parse | |
def markdown_nize(html):
    """Convert a WordPress post body to Markdown framed by ``<markdown>`` tags.

    The text is UTF-8 encoded first and then passed through each converter
    in turn: source-code blocks, links, ``<h2>``/``<h3>`` headings, and
    unordered/ordered lists.

    html -- the post body; it must support ``.encode('utf-8')``
            (presumably a unicode string, as handed over by ``main``).

    Returns the converted, still UTF-8 encoded text between a leading
    ``<markdown>`` line and a trailing ``</markdown>`` line.
    """
    body = html.encode('utf-8')
    # Order matters: every converter consumes the previous one's output.
    for convert in (codeblock2md, link2md, h22md, h32md, ul2md, ol2md):
        body = convert(body)
    return ''.join(('<markdown>\n', body, '</markdown>\n'))
def codeblock2md(html):
    """Turn WordPress ``[sourcecode]`` shortcodes into indented code blocks.

    Lines between ``[sourcecode language="..."]`` and ``[/sourcecode]`` are
    indented by four spaces (the indentation Markdown requires for a code
    block) and the shortcode tags themselves are removed.  For the ``cpp``
    language an indented ``#!cpp`` marker line is emitted first; no marker
    is written for any other language.

    Code that shares a line with the opening or closing tag is kept, and a
    block that opens and closes on a single line no longer leaves the
    converter stuck in "inside code" mode.

    html -- text to convert, processed line by line.

    Returns the converted text; every output line, including the last one,
    is terminated by a newline.
    """
    header = re.compile(r'\[sourcecode language=[\'"](\w+)[\'"].*?\]')
    footer = re.compile(r'\[/sourcecode\]')
    indent = '    '

    out = []
    in_codeblock = False
    for line in html.split("\n"):
        if not in_codeblock:
            opening = header.search(line)
            if opening is None:
                out.append(line)
                continue
            if opening.group(1) == 'cpp':
                out.append(indent + '#!cpp')
            # Whatever follows the opening tag on this line is already code.
            rest = line[opening.end():]
            closing = footer.search(rest)
            if closing is None:
                in_codeblock = True
                code = rest
            else:
                # Block opened and closed on one line: stay outside.
                code = rest[:closing.start()]
            if code.strip():
                out.append(indent + code)
        else:
            closing = footer.search(line)
            if closing is None:
                out.append(indent + line)
            else:
                in_codeblock = False
                # Keep code written in front of the closing tag.
                code = line[:closing.start()]
                if code.strip():
                    out.append(indent + code)
    return "".join(text + "\n" for text in out)
def link2md(html):
    """Rewrite HTML anchors that carry an ``href`` as Markdown links.

    ``<a href="URL" ...>TEXT</a>`` becomes ``[TEXT](URL)``.  Matching is
    done per line and stays inside a single ``<a ...>`` tag, so anchors
    without an ``href`` (and other tags starting with "a", such as
    ``<abbr>``) are left untouched and never shift the replacement onto the
    wrong element.

    html -- text to convert, processed line by line.

    Returns the converted text; every output line, including the last one,
    is terminated by a newline.
    """
    anchor = re.compile(r'<a\s+(?:[^>]*?\s)?href="([^"]+)"[^>]*>(.+?)</a>')

    def to_markdown(match):
        # A callback keeps backslashes in the URL/text literal; a template
        # string would have them interpreted as escapes by re.sub.
        return "[{0}]({1})".format(match.group(2), match.group(1))

    return "".join(
        anchor.sub(to_markdown, line) + "\n" for line in html.split("\n")
    )
def h22md(html):
    """Rewrite ``<h2>`` headings as first-level Markdown headings.

    ``<h2 ...>TEXT</h2>`` becomes ``# TEXT #``.  Each heading on a line is
    converted on its own, and the heading text is inserted literally
    (backslashes in it are not treated as regex replacement escapes).

    html -- text to convert, processed line by line.

    Returns the converted text; every output line, including the last one,
    is terminated by a newline.
    """
    heading = re.compile(r'<h2\b[^>]*>(.+?)</h2>')

    def to_markdown(match):
        return "# {0} #".format(match.group(1))

    return "".join(
        heading.sub(to_markdown, line) + "\n" for line in html.split("\n")
    )
def h32md(html):
    """Rewrite ``<h3>`` headings as second-level Markdown headings.

    ``<h3 ...>TEXT</h3>`` becomes ``## TEXT ##``.  Each heading on a line
    is converted on its own, and the heading text is inserted literally
    (backslashes in it are not treated as regex replacement escapes).

    html -- text to convert, processed line by line.

    Returns the converted text; every output line, including the last one,
    is terminated by a newline.
    """
    heading = re.compile(r'<h3\b[^>]*>(.+?)</h3>')

    def to_markdown(match):
        return "## {0} ##".format(match.group(1))

    return "".join(
        heading.sub(to_markdown, line) + "\n" for line in html.split("\n")
    )
def ul2md(html):
    """Rewrite the items of ``<ul>`` lists as Markdown bullets.

    The line holding the opening ``<ul>`` tag and the line holding the
    closing ``</ul>`` tag are dropped.  On the lines in between, every
    ``<li ...>TEXT</li>`` becomes ``* TEXT``; lines without an item are
    passed through unchanged.  ``<li>`` elements outside a ``<ul>`` are not
    touched.  Item text is inserted literally (backslashes in it are not
    treated as regex replacement escapes).

    html -- text to convert, processed line by line.

    Returns the converted text; every output line, including the last one,
    is terminated by a newline.
    """
    opener = re.compile(r'<ul\b[^>]*>')  # also accepts <ul class="...">
    closer = re.compile(r'</ul>')
    item = re.compile(r'<li\b[^>]*>(.+?)</li>')

    def to_bullet(match):
        return "* {0}".format(match.group(1))

    out = []
    in_list = False
    for line in html.split("\n"):
        if not in_list:
            if opener.search(line):
                in_list = True
            else:
                out.append(line)
        elif closer.search(line):
            in_list = False
        else:
            out.append(item.sub(to_bullet, line))
    return "".join(text + "\n" for text in out)
def ol2md(html):
    """Rewrite the items of ``<ol>`` lists as numbered Markdown items.

    The line holding the opening ``<ol>`` tag and the line holding the
    closing ``</ol>`` tag are dropped.  On the lines in between, every
    ``<li ...>TEXT</li>`` becomes ``N. TEXT``, where N starts at 1 for each
    list and advances only when an item is actually converted, so blank or
    item-less lines inside the list do not cause gaps in the numbering.
    Lines without an item are passed through unchanged.  Item text is
    inserted literally (backslashes in it are not treated as regex
    replacement escapes).

    html -- text to convert, processed line by line.

    Returns the converted text; every output line, including the last one,
    is terminated by a newline.
    """
    opener = re.compile(r'<ol\b[^>]*>')  # also accepts <ol class="...">
    closer = re.compile(r'</ol>')
    item = re.compile(r'<li\b[^>]*>(.+?)</li>')

    # One-element list so the callback can update the count without
    # ``nonlocal`` (keeps the function usable on Python 2).
    emitted = [0]

    def to_numbered(match):
        emitted[0] += 1
        return "{0}. {1}".format(emitted[0], match.group(1))

    out = []
    in_list = False
    for line in html.split("\n"):
        if not in_list:
            if opener.search(line):
                in_list = True
                emitted[0] = 0  # numbering restarts for every list
            else:
                out.append(line)
        elif closer.search(line):
            in_list = False
        else:
            out.append(item.sub(to_numbered, line))
    return "".join(text + "\n" for text in out)
def main():
    """Convert every ``content:encoded`` element of a WordPress export.

    The XML file named by the first command-line argument is parsed, the
    value of the first child node of each non-empty ``content:encoded``
    element is replaced by its ``markdown_nize`` conversion, and the whole
    document is written to standard output as UTF-8 XML.

    Exits with status 1 and a usage message on standard error when no
    input file is given.
    """
    if len(sys.argv) < 2:
        sys.stderr.write(
            'usage: {0} <wordpress-export.xml>\n'.format(sys.argv[0]))
        sys.exit(1)

    dom = parse(sys.argv[1])
    for element in dom.getElementsByTagName('content:encoded'):
        if element.childNodes:
            # Only the first child node is rewritten (presumably the CDATA
            # section that holds the post body).
            node = element.childNodes[0]
            # markdown_nize returns UTF-8 encoded text; decode it before
            # storing it back in the DOM.
            node.nodeValue = markdown_nize(node.nodeValue).decode('utf-8')
    # Parenthesised so the line parses on Python 2 and 3 alike; on
    # Python 2 the output is the same as the bare print statement.
    print(dom.toxml('utf-8'))


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment