@aharonium
Forked from ruslanosipov/wxr2txt.py
Last active July 19, 2019 01:31
Script to convert WordPress posts to plain text files
#!/usr/bin/env python
"""This script converts WXR file to a number of plain text files.
WXR stands for "WordPress eXtended RSS", which basically is just a
regular XML file. This script extracts entries from the WXR file into
plain text files. Output format: article name prefixed by date for
posts, article name for pages.
Usage: wxr2txt.py filename [-o output_dir]
"""
import os
import re
import sys
from xml.etree import ElementTree
NAMESPACES = {
    'content': 'http://purl.org/rss/1.0/modules/content/',
    'wp': 'http://wordpress.org/export/1.2/',
    'excerpt': 'http://wordpress.org/export/1.2/excerpt/',
    'wfw': 'http://wellformedweb.org/CommentAPI/',
    'dc': 'http://purl.org/dc/elements/1.1/',
}
USAGE_STRING = "Usage: wxr2txt.py filename [-o output_dir]"  # note: the -o output_dir argument no longer works in this fork

def main(argv):
    filename, output_dir = _parse_and_validate_output(argv)
    try:
        data = ElementTree.parse(filename).getroot()
    except ElementTree.ParseError:
        _error("Invalid input file format. Can not parse the input.")
    page_counter, post_counter = 0, 0
    cwd = os.getcwd()
    for post in data.find('channel').findall('item'):
        post_type = post.find('wp:post_type', namespaces=NAMESPACES).text
        if post_type not in ('post', 'page'):
            continue
        content = post.find('content:encoded', namespaces=NAMESPACES).text
        date = post.find('wp:post_date', namespaces=NAMESPACES).text
        postid = post.find('wp:post_id', namespaces=NAMESPACES).text
        author = post.find('dc:creator', namespaces=NAMESPACES).text
        title = post.find('title').text
        linktitle = post.find('link').text
        url = post.find('link').text
        # date = date.split(' ')[0].replace('-', '')
        date = date.split(' ')[0]
        postidlink = 'http://opensiddur.org/?p=' + postid
        # The last path segment of the permalink becomes the file name.
        linktitle = re.sub(r'^https:\/.*\/(.+)\/$', r'\1', linktitle)
        # The domain and category path of the permalink become the directory
        # tree, relative to the current working directory.
        url = re.sub(r'^https:\/\/(.*\/).+\/$', r'\1', url)
        title = title.encode('utf8')
        fullname = os.path.join(cwd, url)
        path, basename = os.path.split(fullname)
        if not os.path.exists(path):
            os.makedirs(path)
        if post_type == 'post':
            # post_filename = linktitle + '_(' + author + '_' + date + ').html.md'
            post_filename = linktitle + '.html.md'
            post_counter += 1
        else:
            post_filename = linktitle + '.html'
            page_counter += 1
        # with open(os.path.join(output_dir, post_filename), 'w') as post_file:
        with open(os.path.join(fullname, post_filename), 'w') as post_file:
            post_file.write('<html>\n<head></head>\n<body>\nTitle: ' + title + '<br />\n')
            post_file.write('Primary contributor: ' + author + '<br />\n')
            # post_file.write('Sharing terms: ' + license + '<br />\n')
            post_file.write('For attribution and license, please consult the following URL: <a href="' + postidlink + '">' + postidlink + '</a>\n<p />\n<hr />\n\n')
            post_file.write(content.encode('utf8'))
            post_file.write('\n</body>\n</html>')
    print "Saved {} posts and {} pages under directory '{}'.".format(
        post_counter, page_counter, cwd)

def _parse_and_validate_output(argv):
    if len(argv) not in (2, 4):
        _error("Wrong number of arguments.")
    filename = argv[1]
    if not os.path.isfile(filename):
        _error("Input file does not exist (or not enough permissions).")
    output_dir = argv[3] if len(argv) == 4 and argv[2] == '-o' else os.getcwd()
    if not os.path.isdir(output_dir):
        _error("Output directory does not exist (or not enough permissions).")
    return filename, output_dir

def _error(text):
    print text
    print USAGE_STRING
    sys.exit(1)

if __name__ == "__main__":
    main(sys.argv)
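
Example run (with a hypothetical export file name; the generated directory tree lands under the current working directory):

python wxr2txt.py wordpress-export.xml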
@aharonium (Author)

This fork breaks the output directory option that ruslanosipov had in his original script. Instead, the script creates a directory tree of WordPress posts based on the categories and sub-categories indicated in each post's permalink.
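For illustration, here is a minimal sketch of the permalink-to-path mapping, using the same regular expressions as in the script above and a hypothetical opensiddur.org permalink:

import re

# Hypothetical permalink from a post's <link> element (illustrative only).
link = 'https://opensiddur.org/prayers/weekday/example-prayer/'

slug = re.sub(r'^https:\/.*\/(.+)\/$', r'\1', link)     # 'example-prayer'
tree = re.sub(r'^https:\/\/(.*\/).+\/$', r'\1', link)   # 'opensiddur.org/prayers/weekday/'

# The post would be written to ./opensiddur.org/prayers/weekday/example-prayer.html.md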

I still want to figure out how I can parse elements with multiple values (categories, tags, co-authors) as well as postmeta info. Please help if you can.
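
A possible starting point for the multi-valued elements and postmeta (an untested sketch, assuming the same post element and NAMESPACES dict as in the script above; co-author handling would depend on how the co-authors plugin exports its data):

# Untested sketch: gather categories, tags, and postmeta from one <item>.
categories = [c.text for c in post.findall('category')
              if c.get('domain') == 'category']
tags = [c.text for c in post.findall('category')
        if c.get('domain') == 'post_tag']
postmeta = {}
for meta in post.findall('wp:postmeta', namespaces=NAMESPACES):
    key = meta.find('wp:meta_key', namespaces=NAMESPACES).text
    value = meta.find('wp:meta_value', namespaces=NAMESPACES).text
    postmeta[key] = value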
