Skip to content

Instantly share code, notes, and snippets.

@pelletier
Created January 1, 2011 17:41
Show Gist options
  • Save pelletier/761875 to your computer and use it in GitHub Desktop.
Save pelletier/761875 to your computer and use it in GitHub Desktop.
Convert a WordPress weblog export into many reStructuredText (.rst) files, one per post, that you can use with a static site generator such as Pelican.
# -*- coding: UTF-8 -*-
import codecs
import datetime
import os
import subprocess
import sys
from xml.dom.minidom import parse, parseString
def html2rst(html):
    """Convert an HTML fragment to reStructuredText using pandoc.

    The argument is coerced with ``str()``, encoded as UTF-8 and written to
    the standard input of a ``pandoc --from=html --to=rst`` child process.

    Returns the data pandoc wrote to its standard output, exactly as handed
    back by ``Popen.communicate()`` (it is not decoded here).
    """
    text = str(html)
    pandoc = subprocess.Popen(
        ['pandoc', '--from=html', '--to=rst'],
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
    )
    # stderr is not piped, so the second element of the result is unused.
    rst_output, _unused_stderr = pandoc.communicate(text.encode('utf-8'))
    return rst_output
def getChildText(node, name):
    """Return the character data of the first ``name`` element under ``node``.

    The first element returned by ``node.getElementsByTagName(name)`` is
    selected and the data of its direct children is concatenated.  Both
    plain text nodes and CDATA sections are included, so content written as
    ``<![CDATA[...]]>`` is returned rather than silently dropped.  Child
    elements are not descended into.

    Returns an empty string when the element has no text or CDATA children.

    Raises:
        IndexError: if ``node`` contains no element called ``name``.
    """
    element = node.getElementsByTagName(name)[0]
    # minidom gives CDATA sections their own node type, distinct from
    # TEXT_NODE, so both kinds must be accepted explicitly.
    text_types = (element.TEXT_NODE, element.CDATA_SECTION_NODE)
    return ''.join(
        child.data
        for child in element.childNodes
        if child.nodeType in text_types
    )
# Directory the generated .rst files are written to (relative to the cwd).
OUTPUT_DIR = 'data/blog'
# Layout of the date/time part of a pubDate value, without the UTC offset.
PUB_DATE_FORMAT = "%a, %d %b %Y %H:%M:%S"
# ``domain`` values that mark a <category> element as a tag.  'tag' is what
# the original script matched; newer WordPress exports use 'post_tag'.
TAG_DOMAINS = ('tag', 'post_tag')
# Post statuses that are never exported.
DRAFT_STATUSES = ('draft', 'auto-draft')


def _parse_pub_date(raw_date):
    """Parse a pubDate string into a naive ``datetime``.

    Only the first five whitespace-separated fields (weekday, day, month,
    year, time) are parsed; a trailing UTC offset such as ``+0000`` or
    ``-0500`` is discarded, so the wall-clock time is kept as written.
    """
    date_part = ' '.join(raw_date.split()[:5])
    return datetime.datetime.strptime(date_part, PUB_DATE_FORMAT)


def _slug_from_link(link):
    """Return the last path segment of ``link``, ignoring a trailing slash."""
    return link.rstrip('/').rsplit('/', 1)[-1]


def _post_tags(item):
    """Return the ``nicename`` of every tag <category> attached to ``item``."""
    return [
        categ.getAttribute('nicename')
        for categ in item.getElementsByTagName('category')
        if categ.getAttribute('domain') in TAG_DOMAINS
        and categ.hasAttribute('nicename')
    ]


def _write_post(slug, title, date, tags, rst_content):
    """Write one post to ``data/blog/<slug>.rst``.

    The file starts with the title underlined with ``#``, followed by the
    ``:date:``, ``:tags:`` and ``:author:`` fields and then the converted
    body.  ``rst_content`` is the undecoded output of ``html2rst``.
    """
    # The context manager guarantees the file is closed even if a write fails.
    with codecs.open('data/blog/%s.rst' % slug, 'w+', "utf-8-sig") as output:
        output.write(title)
        output.write('\n%s\n\n' % (len(title) * "#"))
        output.write(':date: %s\n' % date.strftime('%Y-%m-%d %H:%M'))
        output.write(':tags: %s\n' % ", ".join(tags))
        output.write(':author: Thomas Pelletier\n\n')
        output.write(rst_content.decode('utf-8'))


def main(argv=None):
    """Convert every published post of a WordPress XML export to an .rst file.

    ``argv`` defaults to ``sys.argv``; ``argv[1]`` is the path of the export
    file.  Items whose ``wp:post_type`` is not ``post`` and posts whose
    ``wp:status`` is a draft are skipped.  The title, date, slug and tags of
    each converted post are printed as progress output.

    Exits with status 1 after printing a usage line when no path is given.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) < 2:
        print("Use: wp2rst <wp_xml_file_path>")
        sys.exit(1)

    # Make sure the target directory exists before writing the first post.
    if not os.path.isdir(OUTPUT_DIR):
        os.makedirs(OUTPUT_DIR)

    dom = parse(argv[1])
    for item in dom.getElementsByTagName('item'):
        if getChildText(item, 'wp:post_type') != 'post':
            continue

        # Grab the title
        title = getChildText(item, 'title')
        print(title)

        # We do not need drafts
        if getChildText(item, 'wp:status') in DRAFT_STATUSES:
            continue

        # Grab the content and convert it to RST.  A post with an empty body
        # has no child node at all, so fall back to an empty string.
        content_node = item.getElementsByTagName('content:encoded')[0].firstChild
        html_content = content_node.data if content_node is not None else ''
        rst_content = html2rst(html_content)

        # Grab the date and convert it to a correct date / time format
        date = _parse_pub_date(getChildText(item, 'pubDate'))
        print(date)

        # Grab the slug
        slug = _slug_from_link(getChildText(item, 'link'))
        print(slug)

        # Look for tags
        tags = _post_tags(item)
        print(tags)

        # Write the result in a file
        _write_post(slug, title, date, tags, rst_content)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment