-
-
Save aharonium/1d148b57e2b8488f68e2f2781ce92e00 to your computer and use it in GitHub Desktop.
Script to convert WordPress posts to plain text files
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
"""This script converts WXR file to a number of plain text files. | |
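WXR stands for "WordPress eXtended RSS", which is basically just a
regular XML file. This script extracts every post and page from the WXR
file into its own file, placed in a directory tree derived from the
entry's permalink. Output format: permalink slug plus ".html.md" for
posts, permalink slug plus ".html" for pages. In this fork the -o
option is still validated but is not used as the destination.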
Usage: wxr2txt.py filename [-o output_dir] | |
""" | |
import os | |
import re | |
import sys | |
from xml.etree import ElementTree | |
from urlparse import urlparse | |
# XML namespace prefixes -> URIs, handed to ElementTree's find*() calls so
# that prefixed tags such as 'wp:post_type' can be looked up.
NAMESPACES = dict(
    content='http://purl.org/rss/1.0/modules/content/',
    wp='http://wordpress.org/export/1.2/',
    excerpt='http://wordpress.org/export/1.2/excerpt/',
    wfw='http://wellformedweb.org/CommentAPI/',
    dc='http://purl.org/dc/elements/1.1/',
)

# Text printed after every error message. The trailing note is part of the
# printed string: it was originally written as an adjacent triple-quoted
# literal, which Python concatenates onto the usage line.
USAGE_STRING = (
    "Usage: wxr2txt.py filename [-o output_dir]"
    " note: output argument no longer works "
)
def _item_text(item, tag):
    """Return the text of child ``tag`` of ``item``, or '' if missing/empty."""
    return item.findtext(tag, '', NAMESPACES) or ''


def _split_permalink(link, fallback_slug):
    """Split a permalink into (relative directory components, file slug).

    The host name becomes the first directory, every path segment but the
    last becomes a sub-directory, and the last segment is the slug. Empty,
    '.' and '..' segments are dropped so the result can never be absolute
    or climb out of the destination tree. ``fallback_slug`` is used when
    the link has no path segment at all (e.g. 'http://host/?p=12').
    """
    parsed = urlparse(link or '')
    segments = [s for s in parsed.path.split('/') if s not in ('', '.', '..')]
    slug = segments.pop() if segments else fallback_slug
    directories = [parsed.netloc] if parsed.netloc else []
    directories.extend(segments)
    return directories, slug


def main(argv):
    """Convert the posts and pages of a WXR export into an HTML file tree.

    ``argv`` is the full command line (program name first). Every <item>
    whose wp:post_type is 'post' or 'page' is written to its own file:
    '<slug>.html.md' for posts and '<slug>.html' for pages, inside a
    directory tree that mirrors the item's permalink (host, then path).
    The tree is rooted at the current working directory; the -o value is
    validated but, as in the rest of this fork, not used as destination.

    Exits through _error() when the input cannot be parsed as XML or has
    no <channel> element. Prints a summary line when done.
    """
    # Local import: io.open gives text-mode files with an explicit encoding
    # on both Python 2 and Python 3.
    import io

    filename, _output_dir = _parse_and_validate_output(argv)
    try:
        root = ElementTree.parse(filename).getroot()
    except ElementTree.ParseError:
        _error("Invalid input file format. Can not parse the input.")
    channel = root.find('channel')
    if channel is None:
        _error("Invalid input file format. Can not parse the input.")

    # TODO: root the tree at the -o directory to restore that option.
    root_dir = os.getcwd()
    page_counter, post_counter = 0, 0

    for item in channel.findall('item'):
        post_type = _item_text(item, 'wp:post_type')
        if post_type not in ('post', 'page'):
            continue

        content = _item_text(item, 'content:encoded')
        postid = _item_text(item, 'wp:post_id')
        author = _item_text(item, 'dc:creator')
        title = _item_text(item, 'title')
        postidlink = 'http://opensiddur.org/?p=' + postid

        directories, slug = _split_permalink(
            _item_text(item, 'link'), postid or 'untitled')
        # Components are all relative, so the join stays below root_dir
        # (a leading '/' would make os.path.join discard root_dir).
        target_dir = os.path.join(root_dir, *directories)
        if not os.path.isdir(target_dir):
            os.makedirs(target_dir)

        extension = '.html.md' if post_type == 'post' else '.html'

        # Assemble the page as unicode and encode once, at the file
        # boundary, so non-ASCII titles, authors and bodies all survive.
        document = u''.join((
            u'<html>\n<head></head>\n<body>\nTitle: ', title, u'<br />\n',
            u'Primary contributor: ', author, u'<br />\n',
            u'For attribution and license, please consult the following '
            u'URL: <a href="', postidlink, u'">', postidlink,
            u'</a>\n<p />\n<hr />\n\n',
            content,
            u'\n</body>\n</html>',
        ))
        target = os.path.join(target_dir, slug + extension)
        with io.open(target, 'w', encoding='utf-8') as post_file:
            post_file.write(document)

        # Count each item exactly once, after it has been written.
        if post_type == 'post':
            post_counter += 1
        else:
            page_counter += 1

    print("Saved {} posts and {} pages in directory '{}'.".format(
        post_counter, page_counter, root_dir))
def _parse_and_validate_output(argv):
    """Validate the command line and return ``(filename, output_dir)``.

    ``argv`` is the full command line: either ``[prog, filename]`` or
    ``[prog, filename, '-o', output_dir]``. ``output_dir`` defaults to the
    current working directory when no -o option is given.

    Exits through _error() when the argument count is wrong, the option
    flag is not '-o', the input file does not exist, or the output
    directory does not exist.
    """
    if len(argv) not in (2, 4):
        _error("Wrong number of arguments.")
    filename = argv[1]
    if not os.path.isfile(filename):
        _error("Input file does not exist (or not enough permissions).")
    if len(argv) == 4:
        # Reject unknown flags instead of silently falling back to the
        # working directory and ignoring the directory the user supplied.
        if argv[2] != '-o':
            _error("Unknown option '{}'.".format(argv[2]))
        output_dir = argv[3]
    else:
        output_dir = os.getcwd()
    if not os.path.isdir(output_dir):
        _error("Output directory does not exist (or not enough permissions).")
    return filename, output_dir
def _error(text):
    """Report a fatal problem and stop the script.

    Prints ``text`` and then the usage string to standard output, and
    terminates the interpreter with exit status 1. Never returns.
    """
    for message in (text, USAGE_STRING):
        print(message)
    raise SystemExit(1)
# Run the conversion only when this file is executed as a script, passing
# the complete command line (program name included) to main().
if __name__ == "__main__":
    command_line = sys.argv
    main(command_line)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This fork breaks the output directory option that ruslanosipov had in his original script. Instead, the script creates a directory tree of WordPress posts based on the categories and sub-categories indicated in each post's permalink.
I still want to figure out how I can parse elements with multiple values (categories, tags, co-authors) as well as postmeta info. Please help if you can.