Created
May 16, 2013 18:55
-
-
Save mutaku/5594129 to your computer and use it in GitHub Desktop.
Extract old posts from a WordPress XML export, convert them to Markdown, and insert them into a new database structure.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import cStringIO
import logging
import os
import re
import StringIO
from urllib2 import urlopen
from xml.dom import minidom

from django.core.files.uploadedfile import InMemoryUploadedFile
from PIL import Image as PILImage

from mutaku.blog.models import Post, Image
from mutaku.tools.extras import findext
class Poster(object):
    '''
    Convert one WordPress XML export item into the pieces of a new post.

    On construction the item's title, raw content and old slug are read,
    the content is rewritten by subit() (code blocks, links, images,
    paragraphs and a few HTML entities), and every image that was
    successfully re-uploaded is collected into ``self.images``.
    '''

    _FLAGS = re.DOTALL | re.IGNORECASE | re.MULTILINE

    # [style]code[/anything] shortcodes (also matches [caption ...]...[/caption])
    _CODE_RE = re.compile(
        r"\[(?P<style>.*?)\](?P<code>.*?)\[\/(?P<endstyle>.*?)\]", _FLAGS)
    # <a ... href="src" ...>text</a>
    _LINK_RE = re.compile(
        r'\<a.*?href="(?P<src>.*?)".*?\>(?P<text>.*?)\<\/a\>', _FLAGS)
    # Any <img ...> tag
    _IMG_RE = re.compile(r'(?P<img>\<img.*?\>)', _FLAGS)
    # <p ...>content</p>
    _P_RE = re.compile(r'\<p.*?\>(?P<content>.*?)\<\/p\>', _FLAGS)
    # Pieces of an <img> tag
    _IMG_SRC_RE = re.compile(r"\<img.*? src\=\"(?P<img_src>.*?)\"", _FLAGS)
    _IMG_TITLE_RE = re.compile(r"title=\"(?P<img_title>.*?)\"", _FLAGS)
    # [caption ... caption=...]content[/caption]
    _CAPTION_RE = re.compile(
        r"\[caption.*?caption\=(?P<caption>.*?)\](?P<content>.*?)\[\/caption\]",
        _FLAGS)
    # "<name>-<W>x<H>.<ext>" at the very end of a URL. Anchored, and both
    # dimensions need at least one digit, so "-x." inside a host name such
    # as "foo-x.com" is not mistaken for a resize suffix.
    _RESIZED_RE = re.compile(r"^(?P<main>.*)-\d+x\d+(?P<ext>\.\w+)$")
    # HTML entities decoded at the end of subit(), in this order.
    _ENTITIES = (
        (re.compile(r"&lt;", _FLAGS), "<"),
        (re.compile(r"&#39;", _FLAGS), "'"),
        (re.compile(r"&gt;", _FLAGS), ">"),
        (re.compile(r"&quot;", _FLAGS), '"'),
    )

    def __init__(self, item):
        '''
        Read and convert the fields of ``item`` (an XML DOM node).

        Raises whatever getData() raises when the title or the
        "content:encoded" element is missing or empty.
        '''
        self.item = item
        self.image_list = []
        self.title = getData(item, "title")
        self.pre_content = getData(item, "content:encoded")
        self.content = self.subit(self.pre_content)
        try:
            self.old_slug = getData(item, "wp:post_name")
        except (IndexError, AttributeError):
            # getData() raises IndexError when the element is absent and
            # AttributeError when it has no text child.
            self.old_slug = "DRAFT"
        self.images = self.collect_images()

    def subit(self, data):
        '''
        Run every substitution over ``data`` and return the new text.

        Order matters: shortcodes first, then links, images, paragraph
        tags, and finally HTML entity decoding.
        '''
        # Old code blocks to markdown
        data = self._CODE_RE.sub(self.push_code, data)
        # HTML links to markdown
        data = self._LINK_RE.sub(self.push_link, data)
        # Suck out images, upload, and insert as custom markdown
        data = self._IMG_RE.sub(self.push_img, data)
        # Take out p tags (raw string so \g is a group reference, not an
        # invalid string escape)
        data = self._P_RE.sub(r"\g<content>", data)
        # Miscellaneous HTML conversions
        for pattern, plain in self._ENTITIES:
            data = pattern.sub(plain, data)
        return data

    def push_code(self, matchobj):
        '''
        Replacement callback for bracketed shortcodes.

        Caption shortcodes are unwrapped (only their inner content is
        kept). Anything else becomes a 4-space indented block headed by a
        ":::<style>" line.
        '''
        style = matchobj.group('style')
        code = matchobj.group('code')
        if "caption" in style:
            return code
        indent = " " * 4
        lines = [indent + line for line in code.split("\n")]
        lines.insert(0, "\n" + indent + ":::" + style)
        lines.append("\n")
        return "\n".join(lines)

    @classmethod
    def _full_size_url(cls, src):
        '''
        Return ``src`` with a trailing "-<W>x<H>" resize suffix removed.
        '''
        resized = cls._RESIZED_RE.match(src)
        if resized:
            return resized.group("main") + resized.group("ext")
        return src

    def push_img(self, matchobj):
        '''
        Replacement callback for <img> tags.

        Extracts the source URL, an optional title and an optional caption
        (taken from a [caption] shortcode in the raw content whose body
        contains the same source URL), then hands them to handle_image().
        Returns handle_image()'s result, or None when the tag has no
        usable src.
        '''
        tag = matchobj.group("img")
        src_match = self._IMG_SRC_RE.search(tag)
        if not src_match:
            return None
        src = src_match.group("img_src")

        # If resized, get original image link
        img_url = self._full_size_url(src)

        # Try to get a title from the image
        title_match = self._IMG_TITLE_RE.search(tag)
        img_title = title_match.group("img_title") if title_match else None

        # Try to find an associated (parent) caption. Plain substring test:
        # the URL must not be interpreted as a regular expression.
        caption = None
        for result in self._CAPTION_RE.finditer(self.pre_content):
            if result.group("caption") and src in result.group("content"):
                caption = result.group("caption")

        if img_url:
            return self.handle_image(img_url, img_title, caption)
        return None

    def handle_image(self, img_url, img_title, caption):
        '''
        Grab image from URL, upload from memory, and return its text form.

        The image is downloaded, converted to RGB unless its mode is
        already "L" or "RGB", re-encoded in memory and saved as an Image
        model whose slug is appended to ``self.image_list``.

        Returns ``i.__unicode__()`` of the saved Image, or None when any
        step fails (the failure is logged, not raised).
        '''
        try:
            img_file = urlopen(img_url)
            try:
                target_name = os.path.basename(img_file.url)
                raw = img_file.read()
            finally:
                # Always release the connection, even if the read fails.
                img_file.close()
            ext = os.path.splitext(target_name)[1]
            if not img_title:
                img_title = target_name
            if not caption:
                caption = "Holder caption."
            img = PILImage.open(cStringIO.StringIO(raw))
            if img.mode not in ("L", "RGB"):
                img = img.convert("RGB")
            data_output = StringIO.StringIO()
            kind = findext(ext)
            img.save(data_output, format=kind.upper())
            img_obj = InMemoryUploadedFile(data_output,
                                           None,
                                           target_name,
                                           'image/' + kind.lower(),
                                           data_output.len,
                                           None)
            i = Image(image=img_obj,
                      caption=caption,
                      slug=img_title)
            i.save()
            self.image_list.append(i.slug)
            return i.__unicode__()
        except Exception:
            # Best effort: one broken image must not abort the whole post,
            # but the failure should be visible to whoever runs the import.
            logging.getLogger(__name__).warning(
                "Could not import image %s", img_url, exc_info=True)
        return None

    def push_link(self, matchobj):
        '''
        Replacement callback for <a> tags: push link markdown.

        Links whose text contains an <img tag are returned unchanged.
        '''
        text = matchobj.group("text")
        if "<img" in text:
            return matchobj.group(0)
        return "[%s](%s)" % (text, matchobj.group("src"))

    def collect_images(self):
        '''
        Convert list of slugs into queryset for insertion into post object
        '''
        return Image.objects.filter(slug__in=self.image_list)
def getData(item, name):
    '''
    Return the text of the first ``name`` element found under ``item``.

    Raises IndexError when no such element exists and AttributeError
    when the first match has no child node to read ``data`` from.
    '''
    matches = item.getElementsByTagName(name)
    first_match = matches[0]
    return first_match.firstChild.data
def isPost(item):
    '''
    Tell whether ``item`` should be imported as a post.

    Every item whose "wp:post_type" is anything other than "attachment"
    counts as a post.
    '''
    post_type = getData(item, "wp:post_type")
    return post_type != "attachment"
def parse_and_run():
    '''
    Parse and iterate over XML post data and insert into alfred

    Prompts for the path of the XML export, converts every non-attachment
    ``<item>`` with Poster and saves it as a Post (saved once before its
    images are attached, then saved again).

    Returns a list of ``(index, title, item)`` tuples, one per item that
    could not be imported; ``title`` is None when it could not be read.
    '''
    xml_file = raw_input("XML file: ")
    items = minidom.parse(xml_file).getElementsByTagName("item")
    problems = []
    for k, item in enumerate(items):
        if not isPost(item):
            continue
        try:
            obj = Poster(item)
            p = Post(title=obj.title,
                     body=obj.content,
                     old_mutaku_slug=obj.old_slug)
            p.save()
            p.images = obj.images
            p.save()
        except Exception:
            # Exception rather than a bare except, so Ctrl-C still stops
            # the run. The title lookup is guarded because a missing or
            # empty title may be the very reason the item failed.
            try:
                title = getData(item, "title")
            except (IndexError, AttributeError):
                title = None
            problems.append((k, title, item))
    return problems
# Script entry point: run the interactive import when executed directly.
# NOTE(review): the list of failed items returned by parse_and_run() is
# discarded here, so failures are not reported to the user.
if __name__ == "__main__":
    parse_and_run()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment