Skip to content

Instantly share code, notes, and snippets.

@brianly
Last active December 11, 2015 01:18
Show Gist options
  • Save brianly/4522057 to your computer and use it in GitHub Desktop.
Save brianly/4522057 to your computer and use it in GitHub Desktop.
The guts of a script to convert posts from a WordPress blog export into markdown for use with Acrylamid.
import os
import codecs
import xml.etree.ElementTree as etree
from dateutil import parser as du
from urlparse import urlparse
def parse_export(path):
    """Parse a WordPress export file into a list of plain post dicts.

    Every ``<item>`` element found beneath the document root yields one dict.

    Args:
        path: Filename or file object; handed straight to ``etree.parse``.

    Returns:
        A list with one dict per ``<item>``. Each dict always contains:

        * ``_raw``: list of the item's child elements, stashed in case it
          is useful later.
        * ``tags``: list of ``nicename`` values taken from categories whose
          ``domain`` is ``post_tag``; empty when the post has no tags.

        and, only when the corresponding child element is present:

        * ``url`` (text of ``<link>``)
        * ``published`` (``<pubDate>`` parsed by dateutil, rendered as text)
        * ``title``
        * ``content`` (text of ``<content:encoded>``)
        * ``status`` (text of ``<wp:status>``, export namespace 1.2)
    """
    content_tag = '{http://purl.org/rss/1.0/modules/content/}encoded'
    status_tag = '{http://wordpress.org/export/1.2/}status'

    posts = []
    root = etree.parse(path).getroot()
    for item_elem in root.iter('item'):
        # list(elem) replaces the deprecated Element.getchildren().
        children = list(item_elem)
        # 'tags' is always present so consumers can join it without first
        # checking whether the post had any <category> element at all.
        post = {'_raw': children, 'tags': []}

        # Include only the useful stuff
        for child in children:
            tag = child.tag
            if tag == 'category':
                # Only capture tags - not categories. A <category> without
                # a domain or nicename attribute is ignored, not fatal.
                if child.get('domain') == 'post_tag':
                    nicename = child.get('nicename')
                    if nicename is not None:
                        post['tags'].append(nicename)
            elif tag == 'link':
                post['url'] = child.text
            elif tag == 'pubDate':
                post['published'] = unicode(du.parse(child.text))
            elif tag == 'title':
                post['title'] = child.text
            elif tag == content_tag:
                post['content'] = child.text
            elif tag == status_tag:
                post['status'] = child.text
        posts.append(post)
    return posts
# Directory, under the current working directory, that receives the output.
OUTPUT_DIR = os.path.join(os.getcwd(), 'out')
# Name of the file written inside each post's own directory.
INDEX_NAME = 'index.txt'

posts = parse_export('data/brianlyttle.wordpress.2013-01-12.xml')
for p in posts:
    # Only published posts are converted. Items with no status element are
    # skipped instead of aborting the whole run with a KeyError.
    if p.get('status') != 'publish':
        continue

    # Header block: title, date and tags between two '---' lines.
    # NOTE(review): the title is interpolated verbatim, so a title that
    # contains a double quote yields an unbalanced quoted value - confirm
    # whether the consumer of these files needs escaping.
    title = 'title: "%s"\n' % (p.get('title') or '')

    # Keep the first 16 characters of the stored timestamp text; the date
    # line is omitted entirely when the post carried no pubDate.
    published = p.get('published')
    date = ('date: %s\n' % published[0:16]) if published else ''

    # Posts without any <category> element may have no 'tags' key.
    tags = 'tags: [%s]\n' % ', '.join(p.get('tags', []))

    # An empty <content:encoded> element parses to None, which would make
    # the join below raise TypeError.
    body = p.get('content') or ''
    text = ''.join(['---\n', title, date, tags, '---\n', body])

    # Mirror the post's URL path (minus the leading '/') under OUTPUT_DIR.
    post_path = os.path.join(OUTPUT_DIR, urlparse(p['url']).path[1:])
    if not os.path.exists(post_path):
        os.makedirs(post_path)

    with codecs.open(os.path.join(post_path, INDEX_NAME), 'w', 'utf-8') as f:
        f.write(text)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment