Created
January 18, 2017 11:54
-
-
Save chosak/3717fb120c592c1b78b6f3ee6dc366aa to your computer and use it in GitHub Desktop.
WordPress export script
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
from __future__ import absolute_import, print_function, unicode_literals | |
import argparse | |
import os | |
import re | |
import requests | |
from getpass import getpass | |
from wordpress_xmlrpc import Client | |
from wordpress_xmlrpc.methods.media import GetMediaLibrary | |
from wordpress_xmlrpc.methods.posts import GetPosts | |
def export(domain, username, password):
    """Export every published post and its attached media to disk.

    For each post a directory ``export/<post date, ISO format>/`` is created
    holding ``post.txt`` (URL, title and cleaned-up text) together with the
    files returned by ``generate_images`` for that post.

    Args:
        domain: Site URL, e.g. ``https://www.site.com``. ``/xmlrpc.php`` is
            appended to form the XML-RPC endpoint.
        username: WordPress account name.
        password: Password for ``username``.

    Raises:
        RuntimeError: If two attachments of one post share a file name.
        requests.HTTPError: If an attachment download returns an error
            status.
    """
    # Local import keeps this function self-contained; ``io.open`` accepts
    # ``encoding`` on both Python 2 and Python 3.
    import io

    rootdir = 'export'
    if not os.path.isdir(rootdir):
        os.makedirs(rootdir)

    # Plain string handling instead of re.sub('/?$', ...): on Python 3.7+
    # the regex also replaces the empty match after a trailing slash, which
    # produced ".../xmlrpc.php/xmlrpc.php".
    url = domain.rstrip('/') + '/xmlrpc.php'
    client = Client(url, username, password)

    for post in generate_posts(client):
        text = cleanup_post_content(post.content)
        images = list(generate_images(client, post))

        dirname = os.path.join(rootdir, post.date.isoformat())
        if not os.path.isdir(dirname):
            os.makedirs(dirname)

        metadata_filename = os.path.join(dirname, 'post.txt')
        print('writing', metadata_filename)

        # Explicit UTF-8 so non-ASCII titles/content are written the same
        # way regardless of interpreter version or locale.
        with io.open(metadata_filename, 'w', encoding='utf-8') as f:
            f.write('url: {}\ntitle: {}\n\n{}'.format(
                post.link, post.title, text))

        seen_filenames = set()
        total = len(images)

        # Count from 1 so progress reads "1/N" .. "N/N".
        for i, image in enumerate(images, 1):
            filename = os.path.basename(image)

            # Check for a clash before downloading anything.
            if filename in seen_filenames:
                raise RuntimeError('duplicate filename: {}'.format(filename))
            seen_filenames.add(filename)

            image_filename = os.path.join(dirname, filename)
            print('-- {}/{} writing'.format(i, total), image_filename)

            response = requests.get(image, stream=True)
            try:
                # Do not save an HTTP error page as if it were the image.
                response.raise_for_status()
                with open(image_filename, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=65536):
                        f.write(chunk)
            finally:
                response.close()
def generate_posts(client):
    """Yield published posts one by one, fetching them in pages of 10.

    Pages are requested through ``client.call`` with an increasing offset;
    iteration ends at the first empty page.
    """
    page_size = 10
    start = 0

    def fetch_page(position):
        # One GetPosts request for the page beginning at ``position``.
        return client.call(GetPosts({
            'offset': position,
            'number': page_size,
            'post_status': 'publish',
        }))

    batch = fetch_page(start)
    while batch:
        for entry in batch:
            yield entry
        start += page_size
        batch = fetch_page(start)
def cleanup_post_content(content):
    """Return post content reduced to plain text.

    Non-breaking spaces (the U+00A0 character and the ``&nbsp;`` entity)
    become regular spaces, HTML tags and ``[gallery ...]`` shortcodes are
    removed, and leading/trailing whitespace is stripped.

    Args:
        content: Raw post body as a text string.

    Returns:
        The cleaned-up text.
    """
    # Replace non-breaking spaces.
    content = content.replace('\xa0', ' ')

    # Remove HTML tags.
    content = re.sub(r'<[^<]+?>', '', content)

    # Tag removal leaves entities behind; normalise the non-breaking one too.
    content = content.replace('&nbsp;', ' ')

    # Remove image galleries. Stop at the shortcode's own closing bracket:
    # a greedy ".*" would also swallow any text up to the last "]" on the
    # line.
    content = re.sub(r'\[gallery[^\]\n]*\]', '', content)

    return content.strip()
def generate_images(client, post):
    """Yield the link of each media-library item whose parent is ``post``."""
    query = GetMediaLibrary({'parent_id': post.id})
    for attachment in client.call(query):
        yield attachment.link
if __name__ == '__main__':
    # Command-line entry point: site URL and user name come from argv.
    cli = argparse.ArgumentParser()
    cli.add_argument('domain', help='site URL, e.g. https://www.site.com')
    cli.add_argument('username')
    options = cli.parse_args()

    # The password is prompted for interactively via getpass.
    export(options.domain, options.username, getpass())
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
python-wordpress-xmlrpc==2.3
requests==2.12.4
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
$ python export.py https://subdomain.wordpress.com username |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment