Created
November 2, 2018 12:13
-
-
Save trentrichardson/719177f60204e482c385eb273d3bc6cf to your computer and use it in GitHub Desktop.
Wordpress xml export to Pelican
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys | |
import os | |
import io | |
import time | |
import datetime | |
import dateutil.parser | |
import re | |
# for xml parsing | |
from bs4 import BeautifulSoup | |
# for converting to markdown | |
import html2text | |
# for downloading attachments | |
import wget | |
# WordPress shortcode patterns for embedded source code; group 1 captures the
# code between the opening and the closing tag (DOTALL lets it span lines).
# The language value accepts any characters up to the closing quote/bracket so
# that aliases such as "c-sharp", "obj-c" or "c++" are matched as well, not
# only purely alphanumeric names.
coderegex1 = re.compile(r'\[sourcecode language="[^"\]]*"\](.*?)\[/sourcecode\]', re.DOTALL)
coderegex2 = re.compile(r'\[code language="[^"\]]*"\](.*?)\[/code\]', re.DOTALL)
coderegex3 = re.compile(r'\[code lang=[^\]]*\](.*?)\[/code\]', re.DOTALL)
# Matches the absolute http(s)://<host>/wp-content/uploads/ prefix of uploaded
# media so that it can be swapped for a site-relative location.
uploadregex = re.compile(r'(https?://[a-zA-Z0-9\-.]+/wp-content/uploads/)')
class Post:
    """Plain record describing one post taken from the WordPress export.

    ``date`` is passed in as a string and stored as the value returned by
    ``dateutil.parser.parse``; every other argument is stored unchanged under
    the attribute of the same name.
    """

    def __init__(self, title, author, date, content, category, status, slug, tags):
        # Who wrote what.
        self.title, self.author = title, author
        # Parse the timestamp once so later code can call strftime() on it.
        self.date = dateutil.parser.parse(date)
        # The post body exactly as handed in by the caller.
        self.content = content
        # Classification and publication state.
        self.category, self.status = category, status
        self.slug, self.tags = slug, tags
def load_doc(filename):
    """Read the UTF-8 encoded file at ``filename`` and return its whole text."""
    print("> Loading document!")
    with io.open(filename, mode='r', encoding='UTF-8') as source:
        return source.read()
def _tag_text(parent, name, default=""):
    """Return the string of child tag ``name``, or ``default`` if it is absent or empty."""
    node = parent.find(name)
    if node is None or node.string is None:
        return default
    return node.string


def parse_doc(doc):
    """Extract posts and attachment URLs from a WordPress export document.

    Every ``<item>`` whose ``wp:post_type`` is ``post`` becomes a ``Post``;
    every item of type ``attachment`` contributes the text of its ``<guid>``.
    Items of any other type are ignored.

    Args:
        doc: The export file contents as a string.

    Returns:
        A ``(posts, attachments)`` tuple: a list of ``Post`` objects and a
        list of attachment guid strings.
    """
    print("> Parsing document!")
    posts = []
    attachments = []
    soup = BeautifulSoup(doc, 'html.parser')
    for item in soup.find_all('item'):
        post_type = _tag_text(item, 'wp:post_type')
        if post_type == "post":
            # Tags are the <category domain="post_tag"> children.
            tags = [tag['nicename']
                    for tag in item.find_all('category', {'domain': 'post_tag'})]
            # Point upload URLs at {attach}images/ instead of the absolute
            # wp-content/uploads address.  An empty body has no string, so
            # fall back to "" rather than handing None to the regex.
            content = uploadregex.sub('{attach}images/',
                                      _tag_text(item, 'content:encoded'))
            # A post without a category element would otherwise raise
            # TypeError on None['nicename']; use a neutral placeholder.
            category_tag = item.find('category', {'domain': 'category'})
            if category_tag is not None:
                category = category_tag['nicename']
            else:
                category = "uncategorized"
            # wp:post_name can be empty (NOTE(review): seen on drafts -
            # confirm); fall back to the numeric post id so the slug, and the
            # file name derived from it, stays usable and unique.
            slug = _tag_text(item, 'wp:post_name') or _tag_text(item, 'wp:post_id')
            posts.append(Post(
                _tag_text(item, 'title'),
                _tag_text(item, 'dc:creator'),
                _tag_text(item, 'wp:post_date'),
                content,
                category,
                _tag_text(item, 'wp:status'),
                slug,
                '; '.join(tags)))
        elif post_type == "attachment":
            guid = _tag_text(item, 'guid')
            if guid:
                attachments.append(guid)
    return posts, attachments
def gen_markdown(post):
    """Render ``post`` as a markdown document with a metadata header.

    The header lists Title, Date, Category, Tags, Slug and Author as
    ``Key: value`` lines.  The body is the post content with WordPress
    ``[sourcecode]`` / ``[code]`` shortcodes turned into ``<pre>`` blocks and
    then converted from HTML to markdown by html2text.

    Args:
        post: A ``Post`` whose ``date`` attribute supports ``strftime``.

    Returns:
        The header, a blank line, and the markdown body as one string.
    """
    converter = html2text.HTML2Text()
    converter.unicode_snob = 1
    converter.body_width = 0
    # html2text's dash-list option is the ``ul_item_mark`` attribute; the
    # attribute ``dash_unordered_list`` is not read by HTML2Text.
    converter.ul_item_mark = "-"

    # Double quotes and colons in the title are written as HTML entities
    # (presumably so they cannot be confused with the ``Key: value`` syntax).
    title = (post.title or "").translate(
        str.maketrans({'"': "&quot;", ":": "&#58;"}))

    header_fields = (
        ("Title", title),
        ("Date", post.date.strftime("%Y-%m-%d %H:%M:%S")),
        ("Category", post.category),
        ("Tags", post.tags),
        ("Slug", post.slug),
        ("Author", post.author),
    )
    header = "".join("%s: %s\n" % field for field in header_fields)

    body = post.content or ""
    # Use the compiled pattern's own sub(): the fourth positional argument of
    # re.sub() is ``count``, so passing re.U there capped the number of
    # replacements instead of setting a flag.
    for pattern in (coderegex1, coderegex2, coderegex3):
        body = pattern.sub(r"<pre>\1</pre>", body)
    body = converter.handle(body)

    # A blank line ends the metadata block, so a first body line that happens
    # to look like "Key: value" is not read as metadata.
    return header + "\n" + body
def save_posts(output, posts):
    """Write every post to ``<output>/<status dir>/<YYYY-MM-DD>-<slug>.md``.

    Published posts go to ``_posts``, drafts to ``_drafts`` and every other
    status to ``_other``.  Missing directories are created.

    Args:
        output: Base output directory, with or without a trailing separator.
        posts: Iterable of ``Post`` objects.
    """
    print("> Saving posts!")
    status_dirs = {"publish": "_posts", "draft": "_drafts"}
    for p in posts:
        # os.path.join keeps the result inside ``output`` even when the
        # caller passes a directory without a trailing slash.
        directory = os.path.join(output, status_dirs.get(p.status, "_other"))
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(directory, exist_ok=True)
        path = os.path.join(
            directory, p.date.strftime("%Y-%m-%d") + "-" + p.slug + ".md")
        print("Saving", path)
        with io.open(path, 'w', encoding='UTF-8') as f:
            f.write(gen_markdown(p))
def download_attachments(output, attachments):
    """Placeholder for fetching attachments; for now it only prints a notice.

    Neither ``output`` nor ``attachments`` is used yet.
    """
    # TODO: actually download the attachments
    notice = "> Wget'ing attachments!"
    print(notice)
def main():
    """Command-line entry point: convert a WordPress export to markdown files.

    Expects ``sys.argv`` to hold the export file name and, optionally, an
    output directory (default ``./``).  With any other number of arguments
    the usage message is printed and nothing else happens.
    """
    args = sys.argv[1:]
    # Reject both "no arguments" and "too many arguments"; with extra
    # arguments ``filename`` would otherwise never be assigned.
    if not 1 <= len(args) <= 2:
        print("Parameters: filename for wordpress .xml export file, optional output directory")
        return

    filename = args[0]
    # Joining with "" appends a trailing separator when one is missing, so
    # the output directory is always treated as a directory prefix.
    output = os.path.join(args[1], "") if len(args) == 2 else "./"

    doc = load_doc(filename)
    posts, attachments = parse_doc(doc)
    save_posts(output, posts)
    download_attachments(output, attachments)


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Originally from wordpress-to-markdown, this makes a few changes for Pelican's markdown specs for posts. It also grabs tags. I had issues with Pelican's version not working with the latest Pandoc, so with a couple of tweaks to this script it worked fine for my needs.
Setup:
I used pipenv, so I made a new directory and put this python file in it, then used pipenv to install the packages the script imports (BeautifulSoup/bs4, html2text, wget and dateutil).
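Usage: run the script with the WordPress .xml export file as the first argument and, optionally, an output directory as the second.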
It should generate a folder like
_posts
that contains the new .md files. These should be copied to your Pelican project's content
directory. For images, copy the contents of your wp-content/uploads
directory to content/images
(the folders directly in images should be the year/month named folders).