Skip to content

Instantly share code, notes, and snippets.

@chosak
Created January 18, 2017 11:54
Show Gist options
  • Save chosak/3717fb120c592c1b78b6f3ee6dc366aa to your computer and use it in GitHub Desktop.
Save chosak/3717fb120c592c1b78b6f3ee6dc366aa to your computer and use it in GitHub Desktop.
WordPress export script
#!/usr/bin/env python
from __future__ import absolute_import, print_function, unicode_literals
import argparse
import os
import re
import requests
from getpass import getpass
from wordpress_xmlrpc import Client
from wordpress_xmlrpc.methods.media import GetMediaLibrary
from wordpress_xmlrpc.methods.posts import GetPosts
def export(domain, username, password):
    """Export the posts of a WordPress site into a local ``export`` directory.

    Every post yielded by ``generate_posts`` gets its own sub-directory,
    named after ``post.date.isoformat()``. That directory receives a
    ``post.txt`` file (URL, title and the cleaned-up post text) and one file
    per image URL yielded by ``generate_images`` for the post.

    Args:
        domain: Site URL, e.g. ``https://www.site.com``. ``/xmlrpc.php`` is
            appended to it to form the XML-RPC endpoint.
        username: User name passed to the XML-RPC client.
        password: Password passed to the XML-RPC client.

    Raises:
        RuntimeError: If two images of the same post share a base file name.
        requests.HTTPError: If an image download returns an HTTP error status.
    """
    # Local import: io.open accepts ``encoding`` on both Python 2 and 3.
    import io

    rootdir = 'export'
    if not os.path.exists(rootdir):
        os.mkdir(rootdir)

    # Strip trailing slashes explicitly. The previous re.sub('/?$', ...)
    # replaced both the slash and the empty match after it on Python 3.7+,
    # producing ".../xmlrpc.php/xmlrpc.php" for domains ending in "/".
    url = domain.rstrip('/') + '/xmlrpc.php'
    client = Client(url, username, password)

    for post in generate_posts(client):
        text = cleanup_post_content(post.content)
        images = list(generate_images(client, post))

        dirname = os.path.join(rootdir, post.date.isoformat())
        if not os.path.exists(dirname):
            os.mkdir(dirname)

        metadata_filename = os.path.join(dirname, 'post.txt')
        print('writing', metadata_filename)
        # Explicit UTF-8 so non-ASCII titles/text do not depend on the
        # interpreter version or the platform's default encoding.
        with io.open(metadata_filename, 'w', encoding='utf-8') as f:
            f.write('url: {}\n'.format(post.link))
            f.write('title: {}\n\n'.format(post.title))
            f.write(text)

        seen_filenames = set()
        # Count from 1 so the progress output ends at "N/N".
        for i, image in enumerate(images, 1):
            filename = os.path.basename(image)
            # Check for a clash before spending time on the download.
            if filename in seen_filenames:
                raise RuntimeError('duplicate filename: {}'.format(filename))
            seen_filenames.add(filename)

            response = requests.get(image)
            # Do not save an HTTP error page as if it were the image.
            response.raise_for_status()

            image_filename = os.path.join(dirname, filename)
            print('-- {}/{} writing'.format(i, len(images)), image_filename)
            with open(image_filename, 'wb') as f:
                f.write(response.content)
def generate_posts(client):
    """Yield posts with status ``publish``, fetched ten at a time.

    Pages are requested through ``client.call(GetPosts(...))`` with a
    growing offset; iteration ends at the first empty page.
    """
    page_size = 10
    position = 0
    while True:
        query = {
            'offset': position,
            'number': page_size,
            'post_status': 'publish',
        }
        page = client.call(GetPosts(query))
        if not page:
            return
        for entry in page:
            yield entry
        position += page_size
def cleanup_post_content(content):
    """Return the post body as plain text.

    Non-breaking spaces (the character and the ``&nbsp;`` entity) become
    ordinary spaces, HTML tags and ``[gallery ...]`` shortcodes are removed,
    and surrounding whitespace is stripped.

    Args:
        content: Raw post content as a text string.

    Returns:
        The cleaned-up text.
    """
    # Replace non-breaking spaces.
    content = content.replace('\xa0', ' ')
    # Remove HTML tags.
    content = re.sub(r'<[^<]+?>', '', content)
    content = content.replace('&nbsp;', ' ')
    # Remove image galleries. The match stops at the shortcode's own closing
    # bracket; a greedy ".*]" would also delete any text up to the last "]"
    # on the same line.
    content = re.sub(r'\[gallery[^\]\n]*\]', '', content)
    return content.strip()
def generate_images(client, post):
    """Yield the ``link`` of each media-library item whose parent is ``post``."""
    media_query = {'parent_id': post.id}
    attachments = client.call(GetMediaLibrary(media_query))
    for attachment in attachments:
        yield attachment.link
if __name__ == '__main__':
    # The site URL and user name come from the command line; the password
    # is requested separately through getpass().
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument(
        'domain',
        help='site URL, e.g. https://www.site.com',
    )
    arg_parser.add_argument('username')
    options = arg_parser.parse_args()
    export(options.domain, options.username, getpass())
python-wordpress-xmlrpc==2.3
requests==2.12.4
$ python export.py https://subdomain.wordpress.com username
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment