Skip to content

Instantly share code, notes, and snippets.

@major
Created May 14, 2012 01:43
Show Gist options
  • Save major/2691197 to your computer and use it in GitHub Desktop.
Save major/2691197 to your computer and use it in GitHub Desktop.
Exporting rackerhacker.com from WordPress to Markdown
#!/usr/bin/env python
import codecs
import feedparser
import html2text
from pprint import pprint
import sys
import time
def slugmaker(linktext):
    """Turn a post link into a slug.

    Splits ``linktext`` on "/" and keeps the pieces at indexes 3 through 6
    (for an absolute URL these are up to four path segments after the
    scheme and host), then joins them back together behind a leading "/".
    Links with fewer pieces simply yield a shorter slug.

    >>> slugmaker('http://example.com/2012/05/13/my-post/')
    '/2012/05/13/my-post'
    """
    linkpieces = linktext.split('/')
    # For "http://host/a/b/c/d/", indexes 0-2 are "http:", "" and "host".
    slugpieces = linkpieces[3:7]
    return "/%s" % '/'.join(slugpieces)
# Forget line wrapping
html2text.BODY_WIDTH = 0

# Pick up the XML export
d = feedparser.parse('rackerhacker.wordpress.2012-05-13.xml')

# Write one Markdown file per published post found in the export.
for entry in d.entries:
    # Skip if it's an unpublished entry
    if entry.wp_status != 'publish':
        continue

    # Skip if it's not a post
    if entry.wp_post_type != 'post':
        continue

    # Empty dict to hold this post's data
    elements = {}

    # The easy stuff
    elements['title'] = entry.title
    elements['slug'] = slugmaker(entry.link)
    elements['date'] = time.strftime('%Y-%m-%d %H:%M', entry.published_parsed)
    elements['author'] = "Major Hayden"

    # Get the tags (if they exist); only terms whose scheme is 'post_tag'
    # are kept, and an entry without tags yields an empty string.
    taglist = [
        tag.term
        for tag in getattr(entry, 'tags', [])
        if tag.scheme == 'post_tag'
    ]
    elements['tags'] = ', '.join(taglist)

    # Snag the actual content of the post
    elements['content'] = html2text.html2text(entry.content[0].value)

    # Generate a filename to hold the markdown text.
    # Piece 6 of the "/"-split link is the last piece slugmaker() keeps.
    # NOTE(review): assumes links look like http://host/YYYY/MM/DD/name/ so
    # that piece 6 is the post name; a shorter link raises IndexError here.
    elements['filename'] = "%s-%s.md" % (
        time.strftime('%Y-%m-%d', entry.published_parsed),
        entry.link.split('/')[6])

    # Basic markdown template
    file_contents = """Title: %(title)s
Date: %(date)s
Slug: %(slug)s
Tags: %(tags)s
Author: %(author)s
%(content)s
""" % elements

    # Write the data (I hate unicode); codecs.open encodes the text as UTF-8.
    # NOTE: the src/ directory must already exist -- nothing here creates it.
    with codecs.open("src/%s" % elements['filename'], 'wb', "utf-8") as f:
        f.write(file_contents)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment