Skip to content

Instantly share code, notes, and snippets.

@mutaku
Created May 16, 2013 18:55
Show Gist options
  • Select an option

  • Save mutaku/5594129 to your computer and use it in GitHub Desktop.

Select an option

Save mutaku/5594129 to your computer and use it in GitHub Desktop.
Extract old posts from a WordPress XML export, convert them to Markdown, and store them in a new database structure.
from django.core.files.uploadedfile import InMemoryUploadedFile
import cStringIO
import StringIO
from urllib2 import urlopen
from PIL import Image as PILImage
import re
import os
from xml.dom import minidom
from mutaku.blog.models import Post, Image
from mutaku.tools.extras import findext
class Poster(object):
    '''
    Convert one WordPress export item into data for a new blog post.

    On construction the item's title, raw content and old slug are read,
    the content is rewritten by subit() (code blocks, links, images,
    paragraph tags, a few HTML entities), and every image that was
    successfully stored is collected into self.images.
    '''

    _FLAGS = re.DOTALL | re.IGNORECASE | re.MULTILINE

    # [style]...[/endstyle] shortcode blocks (also matches [caption]).
    _CODE_RE = re.compile(
        r"\[(?P<style>.*?)\](?P<code>.*?)\[\/(?P<endstyle>.*?)\]", _FLAGS)
    # <a ... href="...">text</a>.  The \b keeps <abbr>, <article>, ... from
    # matching, and [^>] keeps the attribute scan inside a single tag so an
    # anchor without href cannot swallow text up to the next link.
    _LINK_RE = re.compile(
        r'<a\b[^>]*?href="(?P<src>[^"]*)"[^>]*>(?P<text>.*?)</a>', _FLAGS)
    _IMG_RE = re.compile(r'(?P<img>\<img.*?\>)', _FLAGS)
    # <p ...>content</p>.  The \b stops <pre>, <param>, ... from matching.
    _P_RE = re.compile(r'<p\b[^>]*>(?P<content>.*?)</p>', _FLAGS)
    # Entity pattern -> literal character, applied in this order.
    _ENTITIES = (
        (re.compile(r'\&lt;', _FLAGS), '<'),
        (re.compile(r'\&\#39;', _FLAGS), "'"),
        (re.compile(r'\&gt;', _FLAGS), '>'),
        (re.compile(r'\&quot\;', _FLAGS), '"'),
    )

    _IMG_SRC_RE = re.compile(r'\<img.*? src\=\"(?P<img_src>.*?)\"', _FLAGS)
    # name-WIDTHxHEIGHT.ext; at least one digit is required on each side of
    # the "x" so that a plain "-x." inside a host name is not treated as a
    # size suffix.
    _RESIZED_RE = re.compile(
        r"(?P<main>.*?)-(?P<dim>\d+x\d+)(?P<ext>\.\w+)", _FLAGS)
    _IMG_TITLE_RE = re.compile(r'title=\"(?P<img_title>.*?)\"', _FLAGS)
    _CAPTION_RE = re.compile(
        r"\[caption.*?caption\=(?P<caption>.*?)\](?P<content>.*?)\[\/caption\]",
        _FLAGS)

    def __init__(self, item):
        '''
        Read and convert the data of one export item.

        item -- DOM element handed to getData() to read the "title",
                "content:encoded" and "wp:post_name" children.

        Errors while reading the title or content propagate to the caller;
        a missing or empty post name falls back to the slug "DRAFT".
        '''
        self.item = item
        # Slugs of images stored while converting the content.
        self.image_list = []
        self.title = getData(self.item, "title")
        self.pre_content = getData(self.item, "content:encoded")
        self.content = self.subit(self.pre_content)
        try:
            self.old_slug = getData(self.item, "wp:post_name")
        except (IndexError, AttributeError):
            # getData raises IndexError when the tag is absent and
            # AttributeError when the tag has no text child.
            self.old_slug = "DRAFT"
        self.images = self.collect_images()

    def subit(self, data):
        '''
        Apply all substitutions to the raw post content and return it.

        Order matters: shortcode blocks first, then links, then images,
        then paragraph tags, and finally the HTML entity replacements.
        '''
        # Old code blocks to markdown
        data = self._CODE_RE.sub(self.push_code, data)
        # HTML links to markdown
        data = self._LINK_RE.sub(self.push_link, data)
        # Suck out images, upload, and insert as custom markdown
        data = self._IMG_RE.sub(self.push_img, data)
        # Take out p tags (raw string: "\g" is not a valid str escape)
        data = self._P_RE.sub(r"\g<content>", data)
        # Miscellaneous HTML conversions
        for pattern, char in self._ENTITIES:
            data = pattern.sub(char, data)
        return data

    def push_code(self, matchobj):
        '''
        Replacement callback for shortcode blocks.

        A block whose opening tag contains "caption" is unwrapped (only its
        inner text is kept).  Any other block becomes an indented code
        block headed by a ":::<style>" line.
        '''
        style = matchobj.group('style')
        code = matchobj.group('code')
        if "caption" in style:
            return code
        indent = " " * 4
        lines = [indent + line for line in code.split("\n")]
        lines.insert(0, "\n" + indent + ":::" + style)
        lines.append("\n")
        return "\n".join(lines)

    def push_img(self, matchobj):
        '''
        Replacement callback for <img> tags.

        Extracts the source URL (with any -WxH size suffix removed), an
        optional title attribute and an optional caption taken from a
        [caption] shortcode in the raw content that contains the same
        source URL, then delegates to handle_image().

        Returns the text produced by handle_image(), or an empty string
        (dropping the tag) when the tag has no usable src or the image
        could not be stored.
        '''
        tag = matchobj.group("img")
        src_match = self._IMG_SRC_RE.search(tag)
        if not src_match:
            return ""
        img_src = src_match.group("img_src")

        # If resized, get original image link
        resized = self._RESIZED_RE.search(img_src)
        if resized:
            img_url = resized.group("main") + resized.group("ext")
        else:
            img_url = img_src

        # Try to get a title from the image
        title_match = self._IMG_TITLE_RE.search(tag)
        img_title = title_match.group("img_title") if title_match else None

        # Try to find an associated (parent) caption.  The URL is compared
        # as plain text: it is not a regular expression, and characters
        # such as "(", "?" or "+" in it must not be interpreted as one.
        # The last matching shortcode wins; the captured value is kept
        # verbatim, including any quote characters around it.
        caption = None
        for result in self._CAPTION_RE.finditer(self.pre_content):
            if result.group("caption") and img_src in result.group("content"):
                caption = result.group("caption")

        if not img_url:
            return ""
        markup = self.handle_image(img_url, img_title, caption)
        # Always hand re.sub a string rather than None.
        return markup if markup is not None else ""

    def handle_image(self, img_url, img_title, caption):
        '''
        Grab image from URL, store it from memory, and return its text form.

        img_url   -- URL to download.
        img_title -- used as the Image slug; defaults to the file name.
        caption   -- defaults to "Holder caption.".

        Returns the saved Image's __unicode__() value, or None when any
        step fails.  Failures are logged instead of raised so that one bad
        image does not abort the conversion of the whole post.
        '''
        import logging
        try:
            img_file = urlopen(img_url)
            try:
                target_name = os.path.basename(img_file.url)
                raw_bytes = img_file.read()
            finally:
                # Release the connection even if reading fails.
                img_file.close()
            ext = os.path.splitext(target_name)[1]
            if not img_title:
                img_title = target_name
            if not caption:
                caption = "Holder caption."
            img = PILImage.open(cStringIO.StringIO(raw_bytes))
            if img.mode not in ("L", "RGB"):
                img = img.convert("RGB")
            image_format = findext(ext)
            data_output = StringIO.StringIO()
            img.save(data_output, format=image_format.upper())
            img_obj = InMemoryUploadedFile(data_output,
                                           None,
                                           target_name,
                                           'image/' + image_format.lower(),
                                           data_output.len,
                                           None)
            i = Image(image=img_obj,
                      caption=caption,
                      slug=img_title)
            i.save()
            self.image_list.append(i.slug)
            return i.__unicode__()
        except Exception:
            logging.getLogger(__name__).exception(
                "Could not import image %s", img_url)
            return None

    def push_link(self, matchobj):
        '''
        Replacement callback for HTML links.

        Plain links become "[text](src)".  A link whose text contains an
        <img> tag is returned unchanged so the image substitution can
        still process the tag inside it.
        '''
        text = matchobj.group("text")
        # Case-insensitive, in line with the image pattern used by subit().
        if "<img" in text.lower():
            return matchobj.group(0)
        return "[%s](%s)" % (text, matchobj.group("src"))

    def collect_images(self):
        '''
        Convert list of slugs into queryset for insertion into post object
        '''
        return Image.objects.filter(slug__in=self.image_list)
def getData(item, name):
    '''
    Get the text data of the first element with the given tag name.

    item -- DOM node to search below.
    name -- tag name, e.g. "title" or "content:encoded".

    Returns the concatenated data of the element's leading run of text
    and CDATA child nodes.  Content can be split over several adjacent
    CDATA sections (for example when it contains "]]>"), so reading only
    the first child would truncate it.

    Raises IndexError when no such element exists and AttributeError when
    the element is empty or its first child carries no data; callers use
    these to detect missing values.
    '''
    element = item.getElementsByTagName(name)[0]
    node = element.firstChild
    # Deliberately unguarded: an empty element has firstChild None and
    # must keep raising AttributeError here.
    pieces = [node.data]
    node = node.nextSibling
    while node is not None and node.nodeType in (node.TEXT_NODE,
                                                 node.CDATA_SECTION_NODE):
        pieces.append(node.data)
        node = node.nextSibling
    return "".join(pieces)
def isPost(item):
    '''
    Tell whether an export item should be imported as a post.

    Every item whose "wp:post_type" is anything other than "attachment"
    counts as a post.
    '''
    post_type = getData(item, "wp:post_type")
    return post_type != "attachment"
def parse_and_run():
    '''
    Parse and iterate over XML post data and insert into alfred.

    Prompts for the path of the XML export, converts every item accepted
    by isPost() with Poster and saves it as a Post (saved once before the
    images are attached and once after).

    Returns a list of (index, title, item) tuples for the items that could
    not be imported.  title is None when the item's title cannot be read.
    '''
    xml_file = raw_input("XML file: ")
    xml = minidom.parse(xml_file)
    problems = []
    for k, item in enumerate(xml.getElementsByTagName("item")):
        if not isPost(item):
            continue
        try:
            obj = Poster(item)
            p = Post(title=obj.title,
                     body=obj.content,
                     old_mutaku_slug=obj.old_slug)
            p.save()
            p.images = obj.images
            p.save()
        except Exception:
            # Exception (not a bare except) so Ctrl-C still stops the run.
            # The title may be the very thing that could not be read, so
            # fetching it for the report must not raise again and abort
            # the remaining items.
            try:
                title = getData(item, "title")
            except (IndexError, AttributeError):
                title = None
            problems.append((k, title, item))
    return problems
# Run the import when executed as a script.  The list of problem items
# returned by parse_and_run() is not used here.
if __name__ == "__main__":
    parse_and_run()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment