Skip to content

Instantly share code, notes, and snippets.

@tomconte
Created April 16, 2011 16:18
Show Gist options
  • Select an option

  • Save tomconte/923238 to your computer and use it in GitHub Desktop.

Select an option

Save tomconte/923238 to your computer and use it in GitHub Desktop.
WordPress to Tumblr migration
#!/usr/bin/env python
"""
wp2tumblr.py
Python script for migrating your Wordpress blog to Tumblr
Karteek Edamadaka
Modified by Thomas Conte for his own needs
"""
import os
import sys
import urllib
import urllib2
import datetime
import time
import logging
import pickle
import re
import json
# Module-level configuration. Edit the values below before running.

# NOTE(review): DRYRUN is not referenced by any code visible in this file,
# so changing it does not stop posts from being sent.
DRYRUN = True
# Sent as the 'generator' field of every write request (see post_to_tumblr).
GENERATOR = 'KMigrator 0.3'
# Endpoint that post_to_tumblr POSTs new posts to.
TUMBLRWRITE = 'http://www.tumblr.com/api/write'
# Update with your TumblrSite/api/read/json (queried by get_post_info).
TUMBLRREAD = 'http://blogatom.tumblr.com/api/read/json'
# Update with your Tumblr login email.
TUMBLREMAIL = 'foo@bar.com'
# Update with your Tumblr login password.
TUMBLRPASS = 'password'
# Update with the file name of your WordPress eXtended RSS (WXR) export.
WXR = 'blogatom.wordpress.2011-04-11.xml'
# Update with your TumblrSite/post.
# NOTE(review): BASEURL is not referenced by any code visible in this file.
BASEURL = "http://blogatom.tumblr.com/post/"
# Pickle file in which main caches the posts parsed from the WXR export.
OBJECTFILE = "posts.obj"
# Pickle file listing the slugs of the posts already sent to Tumblr, so that
# a re-run after a failure does not post them a second time.
SUCCESSFILE = "success.obj"
# Root logger: a StreamHandler is attached and the threshold is set to INFO.
logger = logging.getLogger()
logger.addHandler(logging.StreamHandler())
logger.setLevel(logging.INFO)
def main():
    """Migrate every parsed WordPress post to Tumblr.

    The posts are loaded from the pickle cache ``OBJECTFILE`` when it exists;
    otherwise they are parsed from the WXR export ``WXR`` and the result is
    cached.  Every post whose slug is not yet recorded as done has its upload
    links rewritten to the new storage location, is sent to Tumblr, gets the
    returned value stored under ``'tumblr_id'`` and is recorded with
    ``update_progress`` so that a later run skips it.  Finally the post list
    is pickled to ``'migrated.' + OBJECTFILE`` and every post that carries an
    ``'attachments'`` key is reported.

    Exits with status 1 when neither the cache nor the WXR file exists.
    """
    print("Welcome to Wordpress to Tumblr Migrator")
    # Test for the same file that is written and read below, rather than a
    # second hard-coded copy of its name.
    if not os.path.exists(OBJECTFILE):
        logger.info("Couldn't find serialized object, so parsing the WXR")
        if not os.path.exists(WXR):
            logger.critical("Wordpress Archive - %s is not found in this folder" % WXR)
            sys.exit(1)
        posts = parse_wxr(WXR)
        with open(OBJECTFILE, 'w') as cache:
            pickle.dump(posts, cache)
    else:
        logger.info("Serialized Posts Object found. Loading it")
        # NOTE: unpickling can execute arbitrary code; only load a cache file
        # that this script wrote itself.
        with open(OBJECTFILE, 'r') as cache:
            posts = pickle.load(cache)

    # Compiled once: the pattern is the same for every post.
    upload_link = re.compile(r'''http://www.cont..net/wp/wp-content/uploads/([\w\_\-\.\/]+)''')
    for post in posts:
        if check_progress(post['slug']):
            logger.info("Post %s already found to be proccessed. Skipping to next one" % post['title'])
            continue
        # Rewrite links to new storage location
        post['content'] = upload_link.sub(r"http://blogatom.blob.core.windows.net/img/\1", post['content'])
        logger.info(post['content'])
        logger.info("post_to_tumblr %s" % post['title'])
        tumblr_postid = post_to_tumblr('regular', post['title'], post['content'], post['date'], post['tags'])
        post['tumblr_id'] = tumblr_postid
        logger.info("Posted %s to Tumblr with ID - %s" %(post['title'], post['tumblr_id']))
        update_progress(post['slug'])

    logger.info("Writing the Posts Object after Migration")
    with open('migrated.' + OBJECTFILE, 'w') as migrated:
        pickle.dump(posts, migrated)
    print("Migration is done. But, few files might not have been migrated. You might have to migrate them manually")
    # NOTE(review): nothing visible in this file sets 'attachments' on a post,
    # so this report only fires for caches produced by other code.
    for post in posts:
        if 'attachments' in post:
            print("On post - %s I couldn't migrate" % post['title'])
            for a in post['attachments']:
                print(a)
def do_wait(stime, reason=None):
    """Pause for *stime* seconds, printing one dot per second to stdout.

    stime  -- whole number of seconds to wait (it is passed to ``range``).
    reason -- optional text logged, preceded by a newline, before waiting.
    """
    if reason is not None:
        logger.info("\n"+ reason)
    logger.info("... waiting for %s seconds" % stime)
    for _ in range(0, stime):
        sys.stdout.write('.')
        # Flush before sleeping so the dot shows up while we wait instead of
        # one second late.
        sys.stdout.flush()
        time.sleep(1)
    logger.info(" Continuing")
def update_progress(post):
    """Append *post* to the list pickled in ``SUCCESSFILE``.

    post -- the value to remember; ``main`` passes the post's slug.

    When the existing list cannot be loaded (for instance on the first run,
    when the file does not exist yet) a warning is logged and a fresh list is
    started.  The file is then rewritten with the extended list.
    """
    done = []
    try:
        with open(SUCCESSFILE, 'r') as fh:
            done = pickle.load(fh)
    except Exception:
        # Best effort by design, but no longer swallows KeyboardInterrupt or
        # SystemExit the way a bare ``except:`` did.
        logger.warning("Unable to load successful posts from file - %s" % SUCCESSFILE)
    done.append(post)
    with open(SUCCESSFILE, 'w') as fh:
        pickle.dump(done, fh)
def check_progress(title):
    """Return True when *title* is in the list pickled in ``SUCCESSFILE``.

    title -- the value to look up; ``main`` passes the post's slug.

    Returns False when the file cannot be opened or unpickled, so a missing
    progress file simply means "nothing has been posted yet".
    """
    try:
        with open(SUCCESSFILE, 'r') as fh:
            return title in pickle.load(fh)
    except Exception:
        # Best effort by design, but Ctrl-C and sys.exit still propagate.
        return False
def get_post_info(post_id):
    """Fetch one post from the Tumblr read API, retrying until it succeeds.

    post_id -- identifier of the post; sent as the ``'id'`` field.

    Returns the first entry of the ``'posts'`` list in the decoded response.
    Whenever the request fails or the list is empty, the function waits 30
    seconds and tries again; it never gives up.  A response that is not valid
    JSON after trimming makes ``json.loads`` raise.
    """
    # A loop instead of self-recursion: the stack no longer grows with every
    # failed attempt.
    while True:
        res = do_http_request(TUMBLRREAD, {'email':TUMBLREMAIL, 'password':TUMBLRPASS, 'id':str(post_id)}, 'POST')
        if res is not False:
            logger.info("Extracting Info")
            # Presumably drops the 22-character "var tumblr_api_read = "
            # JavaScript prefix and the trailing ";\n" -- TODO confirm
            # against a real response.
            info = json.loads(res[22:-2])
            if len(info['posts']) > 0:
                logger.info("Extracted!")
                return info['posts'][0]
            logger.error("Extracted information doesn't contain photo info. Failure")
            logger.debug("Received info from Tumblr - %s" % str(info))
        logger.error("Getting post info failed. Retrying")
        do_wait(30)
def post_to_tumblr(post_type='regular', title=None, body=None, date=None, tags=None, source=None, private=0):
    """Send one post to the Tumblr write API, retrying until it succeeds.

    post_type -- value of the ``'type'`` field.
    title     -- post title; left out of the request when None.
    body      -- post body; left out of the request when None.
    date      -- value of the ``'date'`` field; defaults to the current local
                 time (``ctime()`` format) at the moment of the call.
    tags      -- iterable of tag strings, sent joined with ``', '``.
    source    -- value of the ``'source'`` field; left out when None.
    private   -- 1 to send ``'private'`` as ``'1'``; anything else sends ``'0'``.

    Returns the response body handed back by ``do_http_request`` (``main``
    stores it as the Tumblr post ID).  Whenever the request fails the
    function waits 35 seconds and sends the same data again; it never gives
    up.
    """
    # Resolved per call: a default written in the signature would be frozen
    # at the time the module was imported.
    if date is None:
        date = datetime.datetime.now().ctime()
    post_data = {
        'email': TUMBLREMAIL,
        'password': TUMBLRPASS,
        'type': post_type,
        'generator': GENERATOR,
        'date': date,
    }
    # do_http_request calls .encode() on every value, so None must not be
    # passed through.
    if title is not None:
        post_data['title'] = title
    if body is not None:
        post_data['body'] = body
    if source is not None:
        post_data['source'] = source
    post_data['private'] = '1' if private == 1 else '0'
    if tags is not None:
        post_data['tags'] = ', '.join(tags)

    # A loop instead of self-recursion: the stack no longer grows with every
    # failed attempt.
    while True:
        tumblr_post_id = do_http_request(TUMBLRWRITE, post_data, "POST")
        if tumblr_post_id is not False:
            logger.info("Posted %s to Tumblr" %(title))
            return tumblr_post_id
        logger.error("Server acted weird while posting %s" % title)
        do_wait(35)
def do_http_request(url, post_data=None, method="GET"):
    """Perform an HTTP request and return the response body.

    url       -- address to request.
    post_data -- mapping of field name to string value; every value is
                 UTF-8 encoded and the mapping is form-encoded.  Sent as the
                 request body for POST, appended as a query string otherwise.
    method    -- ``"POST"`` to send a body; any other value issues a request
                 without a body.

    Returns the response body, or False when opening or reading the URL
    raises.  Errors while encoding *post_data* are not caught.
    """
    if post_data is None:
        post_data = {}
    params = urllib.urlencode(dict((k, v.encode('utf-8')) for k, v in post_data.items()))
    if method == "POST":
        request = urllib2.Request(url, params)
    else:
        if len(post_data) > 0:
            url = url + '?' + params
        request = urllib2.Request(url)

    # Keep the account password out of the log output.
    loggable = dict(post_data)
    if 'password' in loggable:
        loggable['password'] = '********'
    try:
        logger.debug("Requesting %s using HTTP %s with data %s" %(url, method, str(loggable)))
        response = urllib2.urlopen(request)
        try:
            return response.read()
        finally:
            response.close()
    except Exception as e:
        logger.error("Error requesting %s using HTTP %s with data %s with Exception %s" %(url, method, str(loggable), e))
        return False
def parse_wxr(wxr):
    """Parse a WordPress eXtended RSS file into a list of post dicts.

    wxr -- path (or file object) handed to ``xml.dom.minidom.parse``.

    Only ``<item>`` elements whose ``wp:post_type`` is ``"post"`` are kept.
    Each returned dict has the keys ``'title'``, ``'slug'``, ``'link'``,
    ``'date'``, ``'content'`` and ``'tags'``.  ``'tags'`` lists the text of
    every ``<category>`` element with ``domain="category"`` and a non-empty
    ``nicename``.  Posts tagged ``"Twitter"`` are dropped.  A missing or
    empty element yields an empty string instead of raising.
    """
    import xml.dom.minidom

    def _text(node, tag):
        """Return the first child's data of the first <tag> under node, or u''."""
        found = node.getElementsByTagName(tag)
        if not found or found[0].firstChild is None:
            return u''
        return found[0].firstChild.data

    posts = []
    doc = xml.dom.minidom.parse(wxr)
    items = doc.getElementsByTagName("item")
    logger.info("Total Number of Entries (posts, pages and attachments) in the Wordpress eXtended Rss file : %s" %(len(items)))
    for item in items:
        if _text(item, "wp:post_type") != "post":
            continue
        _post = {
            'title': _text(item, "title"),
            'slug': _text(item, "wp:post_name"),
            'link': _text(item, "link"),
            'date': _text(item, "wp:post_date"),
            'content': _text(item, "content:encoded"),
            'tags': [],
        }
        if not _post['slug']:
            # main tracks progress by slug, so posts sharing an empty slug
            # would be treated as one and the same post.
            logger.warning("Post %s has an empty slug" % _post['title'])
        for term in item.getElementsByTagName("category"):
            if (term.getAttribute("domain") == "category"
                    and term.getAttribute("nicename") != ""
                    and term.firstChild is not None):
                _post['tags'].append(term.firstChild.data)
        # Filter out the Twitter category
        if "Twitter" not in _post['tags']:
            posts.append(_post)
    logger.info("Finished parsing WXR. Returning the Posts")
    return posts
# Start the migration only when this file is executed as a script.
if "__main__" == __name__:
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment