Created
January 1, 2011 17:41
-
-
Save pelletier/761875 to your computer and use it in GitHub Desktop.
Convert a WordPress blog export into multiple reStructuredText (.rst) files that can be used with tools such as Pelican.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: UTF-8 -*-
import datetime
import os
import subprocess
import sys
from email.utils import parsedate_to_datetime
from xml.dom.minidom import parse, parseString
def html2rst(html):
    """Convert an HTML fragment to reStructuredText by piping it through pandoc.

    ``html`` is coerced with ``str()``, encoded as UTF-8 and fed to the
    standard input of ``pandoc --from=html --to=rst``.

    Returns the raw bytes pandoc wrote to its standard output; decoding
    them is left to the caller.
    """
    markup = str(html)
    command = ['pandoc', '--from=html', '--to=rst']
    pandoc = subprocess.Popen(
        command,
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
    )
    # stderr is not piped, so the second element of the pair is always None.
    converted, _ = pandoc.communicate(markup.encode('utf-8'))
    return converted
def getChildText(node, name):
    """Return the character data of the first ``name`` element under ``node``.

    The first descendant element of ``node`` whose tag is ``name`` is looked
    up, and the data of its direct text children is concatenated. Both plain
    text nodes and CDATA sections are collected, so values wrapped in
    ``<![CDATA[...]]>`` are returned rather than silently dropped. Nested
    child elements are skipped.

    Raises IndexError when ``node`` has no descendant element named ``name``.
    """
    element = node.getElementsByTagName(name)[0]
    # minidom gives CDATA sections their own node type, distinct from
    # TEXT_NODE, so both have to be accepted explicitly.
    text_types = (element.TEXT_NODE, element.CDATA_SECTION_NODE)
    return ''.join(
        child.data
        for child in element.childNodes
        if child.nodeType in text_types
    )
# Directory (relative to the current working directory) that receives the
# generated .rst files.
OUTPUT_DIR = 'data/blog'
# Author name written into the metadata block of every generated post.
AUTHOR = 'Thomas Pelletier'
# WordPress statuses of unpublished posts; these are not exported.
DRAFT_STATUSES = ('draft', 'auto-draft')


def _element_data(item, name):
    """Return all text and CDATA content of the first ``name`` element in ``item``."""
    element = item.getElementsByTagName(name)[0]
    # An empty element yields ''. A body split over several CDATA sections
    # is joined back together instead of being cut at the first section.
    return ''.join(
        child.data
        for child in element.childNodes
        if child.nodeType in (child.TEXT_NODE, child.CDATA_SECTION_NODE)
    )


def main():
    """Convert every published post of a WordPress XML export to an .rst file.

    The export path is read from ``sys.argv[1]``. For each ``<item>`` whose
    ``wp:post_type`` is ``post`` and whose status is not a draft, the HTML
    body is converted with ``html2rst`` and written to
    ``data/blog/<slug>.rst`` together with a title heading and ``:date:``,
    ``:tags:`` and ``:author:`` fields. Title, date, slug and tags are
    printed to stdout as progress output.

    Exits with status 1 when no export path is given.
    """
    if len(sys.argv) < 2:
        print("Use: wp2rst <wp_xml_file_path>")
        sys.exit(1)

    dom = parse(sys.argv[1])

    # Create the target directory up front so the first write cannot fail
    # merely because it is missing.
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    for item in dom.getElementsByTagName('item'):
        if getChildText(item, 'wp:post_type') != 'post':
            continue

        # Grab the title
        title = getChildText(item, 'title')
        print(title)

        # We do not need drafts
        if getChildText(item, 'wp:status') in DRAFT_STATUSES:
            continue

        # Grab the content and convert it to RST (html2rst returns bytes)
        rst_content = html2rst(_element_data(item, 'content:encoded'))

        # Grab the date. pubDate is an RFC 2822 date; parsing it properly
        # accepts negative and named timezone offsets too. The timezone is
        # dropped so the wall-clock time is kept exactly as written.
        raw_date = getChildText(item, 'pubDate')
        date = parsedate_to_datetime(raw_date.strip()).replace(tzinfo=None)
        print(date)

        # Grab the slug: the last path segment of the permalink, with or
        # without a trailing slash.
        link = getChildText(item, 'link')
        slug = link.rstrip('/').rsplit('/', 1)[-1]
        print(slug)

        # Look for tags
        tags = [
            categ.getAttribute('nicename')
            for categ in item.getElementsByTagName('category')
            if categ.getAttribute('domain') == 'tag'
            and categ.hasAttribute('nicename')
        ]
        print(tags)

        # Write the result in a file. newline='\n' disables newline
        # translation so the bytes written are the same on every platform.
        path = os.path.join(OUTPUT_DIR, '%s.rst' % slug)
        with open(path, 'w', encoding='utf-8-sig', newline='\n') as rst_file:
            rst_file.write(title)
            rst_file.write('\n%s\n\n' % (len(title) * "#"))
            rst_file.write(':date: %s\n' % date.strftime('%Y-%m-%d %H:%M'))
            rst_file.write(':tags: %s\n' % ", ".join(tags))
            rst_file.write(':author: %s\n\n' % AUTHOR)
            rst_file.write(rst_content.decode('utf-8'))


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment