Created
April 16, 2011 16:18
-
-
Save tomconte/923238 to your computer and use it in GitHub Desktop.
WordPress to Tumblr migration
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python | |
| """ | |
| wp2tumblr.py | |
| Python script for migrating your Wordpress blog to Tumblr | |
| Karteek Edamadaka | |
| Modified by Thomas Conte for his own needs | |
| """ | |
| import os | |
| import sys | |
| import urllib | |
| import urllib2 | |
| import datetime | |
| import time | |
| import logging | |
| import pickle | |
| import re | |
| import json | |
# --- Migration configuration -------------------------------------------------

# NOTE(review): DRYRUN is defined but never referenced anywhere in this file;
# setting it does NOT actually suppress posting — confirm before relying on it.
DRYRUN = True
# Sent as the 'generator' field of each Tumblr write request.
GENERATOR = 'KMigrator 0.3'
# Tumblr legacy (pre-OAuth) write endpoint — same URL for every blog.
TUMBLRWRITE = 'http://www.tumblr.com/api/write'
# Blog-specific JSON read endpoint, used to fetch info on a posted entry.
TUMBLRREAD = 'http://blogatom.tumblr.com/api/read/json'
# Update with your TumblrSite/api/read/json
TUMBLREMAIL = 'foo@bar.com'
# Update with your Tumblr login Email
# NOTE(review): credentials are stored in plain text and sent over plain
# HTTP by the legacy API — keep this file private.
TUMBLRPASS = 'password'
# Update with your Tumblr login password
WXR = 'blogatom.wordpress.2011-04-11.xml'
# Update with your file name of your Wordpress Extended RSS file
# NOTE(review): BASEURL is not referenced elsewhere in this file.
BASEURL = "http://blogatom.tumblr.com/post/"
# Update with your TumblrSite/post
OBJECTFILE = "posts.obj"
# Above object remembers all the parsed posts and their comments
SUCCESSFILE = "success.obj"
# Above object remembers all the posts which are posted to Tumblr to avoid
# reposting in case of failures

# Root logger: INFO and above to stderr via a bare StreamHandler.
logger = logging.getLogger()
logger.addHandler(logging.StreamHandler())
logger.setLevel(logging.INFO)
def main():
    """Run the WordPress-to-Tumblr migration end to end.

    Parses the WXR export (or reloads a previously parsed OBJECTFILE),
    rewrites image links, posts each entry to Tumblr, records per-post
    progress so a re-run skips already-migrated posts, then reports
    attachments that must be migrated by hand.
    """
    print("Welcome to Wordpress to Tumblr Migrator")
    # Bug fix: this used to test the hard-coded name 'posts.obj' instead of
    # the OBJECTFILE constant, so changing the constant would break resume.
    if not os.path.exists(OBJECTFILE):
        logger.info("Couldn't find serialized object, so parsing the WXR")
        if os.path.exists(WXR):
            posts = parse_wxr(WXR)
            with open(OBJECTFILE, 'w') as f:
                pickle.dump(posts, f)
        else:
            logger.critical("Wordpress Archive - %s is not found in this folder" % WXR)
            sys.exit(1)
    else:
        logger.info("Serialized Posts Object found. Loading it")
        with open(OBJECTFILE, 'r') as f:
            posts = pickle.load(f)
    # Hoisted out of the loop (was recompiled per post): rewrite links from
    # the old WordPress uploads folder to the new blob-storage location.
    pattern = re.compile('''http://www.cont..net/wp/wp-content/uploads/([\w\_\-\.\/]+)''')
    for post in posts:
        if not check_progress(post['slug']):
            post['content'] = pattern.sub(r"http://blogatom.blob.core.windows.net/img/\1", post['content'])
            logger.info(post['content'])
            logger.info("post_to_tumblr %s" % post['title'])
            tumblr_postid = post_to_tumblr('regular', post['title'], post['content'], post['date'], post['tags'])
            post['tumblr_id'] = tumblr_postid
            logger.info("Posted %s to Tumblr with ID - %s" % (post['title'], post['tumblr_id']))
            update_progress(post['slug'])
        else:
            # Typo fix in the log message ("proccessed" -> "processed").
            logger.info("Post %s already found to be processed. Skipping to next one" % post['title'])
    logger.info("Writing the Posts Object after Migration")
    with open('migrated.' + OBJECTFILE, 'w') as f:
        pickle.dump(posts, f)
    print("Migration is done. But, few files might not have been migrated. You might have to migrate them manually")
    for post in posts:
        # 'in' replaces dict.has_key(), which was removed in Python 3.
        if 'attachments' in post:
            print("On post - %s I couldn't migrate" % post['title'])
            for a in post['attachments']:
                print(a)
def do_wait(stime, reason=None):
    """Sleep for *stime* seconds, printing one dot per second.

    An optional *reason* is logged first so the operator knows why the
    script is pausing (typically backing off after an API failure).
    """
    if reason is not None:
        logger.info("\n" + reason)
    logger.info("... waiting for %s seconds" % stime)
    remaining = stime
    while remaining > 0:
        sys.stdout.write('.')
        time.sleep(1)
        sys.stdout.flush()
        remaining -= 1
    logger.info(" Continuing")
def update_progress(post):
    """Append *post* (a post slug) to the persisted success list.

    The list lives in SUCCESSFILE so that a crashed run can resume
    without re-posting entries that already made it to Tumblr.
    """
    posts = []
    try:
        with open(SUCCESSFILE, 'r') as f:
            posts = pickle.load(f)
    # Narrowed from a bare except: only "file missing/unreadable/corrupt"
    # should fall back to an empty list; anything else is a real bug.
    except (IOError, OSError, EOFError, pickle.PickleError):
        logger.warn("Unable to load successful posts from file - %s" % SUCCESSFILE)
        posts = []
    posts.append(post)
    with open(SUCCESSFILE, 'w') as f:
        pickle.dump(posts, f)
def check_progress(title):
    """Return True if *title* (a post slug) is in the persisted success list.

    A missing or unreadable SUCCESSFILE simply means nothing has been
    migrated yet, so load failures yield False.
    """
    try:
        with open(SUCCESSFILE, 'r') as f:
            posts = pickle.load(f)
    # Narrowed from a bare except: swallow only load failures, not e.g.
    # KeyboardInterrupt or genuine programming errors.
    except (IOError, OSError, EOFError, pickle.PickleError):
        return False
    return title in posts
def get_post_info(post_id):
    """Fetch a single post's record from the Tumblr read API, retrying forever.

    Returns the first element of the 'posts' array in the JSON response.
    Rewritten from unbounded recursion to a loop so a long Tumblr outage
    cannot exhaust the interpreter's recursion limit.
    """
    while True:
        res = do_http_request(TUMBLRREAD, {'email': TUMBLREMAIL, 'password': TUMBLRPASS, 'id': str(post_id)}, 'POST')
        if res is not False:
            logger.info("Extracting Info")
            # res[22:-2] strips the non-JSON wrapper the read API puts around
            # the payload — presumably 'var tumblr_api_read = {...};' (TODO
            # confirm the exact wrapper length against a live response).
            info = json.loads(res[22:-2])
            if len(info['posts']) > 0:
                logger.info("Extracted!")
                return info['posts'][0]
            else:
                logger.error("Extracted information doesn't contain photo info. Failure")
                logger.debug("Received info from Tumblr - %s" % str(info))
        logger.error("Getting post info failed. Retrying")
        do_wait(30)
def post_to_tumblr(post_type='regular', title=None, body=None, date=None, tags=None, source=None, private=0):
    """Post one entry to Tumblr's legacy write API, retrying until it succeeds.

    Returns the Tumblr post id (the raw response body from the write call).
    *date* defaults to the current time; *tags* is an iterable joined with
    ', '; *private* of 1 marks the post private.
    """
    # Bug fix: the old default 'date=datetime.datetime.now().ctime()' was
    # evaluated ONCE at import time, stamping every defaulted post with the
    # moment the script started. A None sentinel computes it per call.
    if date is None:
        date = datetime.datetime.now().ctime()
    post_data = {
        'email': TUMBLREMAIL,
        'password': TUMBLRPASS,
        'type': post_type,
        'generator': GENERATOR,
        'date': date,
        'title': title
    }
    if body is not None:
        post_data['body'] = body
    if source is not None:
        post_data['source'] = source
    post_data['private'] = '1' if private == 1 else '0'
    if tags is not None:
        post_data['tags'] = ', '.join(tags)
    # Iterative retry (was unbounded recursion): a long outage no longer
    # risks blowing the recursion limit.
    while True:
        tumblr_post_id = do_http_request(TUMBLRWRITE, post_data, "POST")
        if tumblr_post_id is not False:
            logger.info("Posted %s to Tumblr" % (title))
            return tumblr_post_id
        logger.error("Server acted weird while posting %s" % title)
        do_wait(35)
def do_http_request(url, post_data=None, method="GET"):
    """Issue an HTTP request; return the response body, or False on any error.

    *post_data* values are UTF-8 encoded and form-urlencoded; for GET they
    are appended to the URL as a query string. The broad exception handler
    is deliberate: callers treat False as "retry later" and this is the
    sole network boundary of the script.
    """
    # Bug fix: the mutable default 'post_data={}' was shared across calls.
    if post_data is None:
        post_data = {}
    params = urllib.urlencode(dict([k, v.encode('utf-8')] for k, v in post_data.items()))
    if method == "POST":
        request = urllib2.Request(url, params)
    else:
        if len(post_data) > 0:
            url = url + '?' + params
        request = urllib2.Request(url)
    try:
        logger.debug("Requesting %s using HTTP %s with data %s" % (url, method, str(post_data)))
        response = urllib2.urlopen(request)
        return response.read()
    # 'as e' replaces the Python-2-only 'except Exception, e' syntax; it is
    # accepted by Python 2.6+ as well.
    except Exception as e:
        logger.error("Error requesting %s using HTTP %s with data %s with Exception %s" % (url, method, str(post_data), e))
        return False
def parse_wxr(wxr):
    """Parse a WordPress eXtended RSS (WXR) export file into a list of dicts.

    Each returned dict has 'title', 'slug', 'link', 'date', 'content' and
    'tags' keys. Only <item>s whose wp:post_type is "post" are kept, and
    posts carrying the "Twitter" category are filtered out entirely.
    """
    # Local import kept deliberately: the parser is only needed on the first
    # run, before OBJECTFILE exists. The unused 'Node' import was dropped.
    import xml.dom.minidom

    def _text(node, tag):
        # Text of the first <tag> child element. Raises IndexError /
        # AttributeError on a malformed item, matching the original's
        # fail-fast behaviour.
        return node.getElementsByTagName(tag)[0].firstChild.data

    posts = []
    doc = xml.dom.minidom.parse(wxr)
    items = doc.getElementsByTagName("item")
    logger.info("Total Number of Entries (posts, pages and attachments) in the Wordpress eXtended Rss file : %s" % (len(items)))
    for item in items:
        # Guard clause: skip pages and attachments early.
        if _text(item, "wp:post_type") != "post":
            continue
        _post = {
            'title': _text(item, "title"),
            'slug': _text(item, "wp:post_name"),
            'link': _text(item, "link"),
            'date': _text(item, "wp:post_date"),
            'content': _text(item, "content:encoded"),
            'tags': [],
        }
        for term in item.getElementsByTagName("category"):
            if term.getAttribute("domain") == "category" and term.getAttribute("nicename") != "":
                _post['tags'].append(term.firstChild.data)
        # Filter out the Twitter category
        if "Twitter" not in _post['tags']:
            posts.append(_post)
    logger.info("Finished parsing WXR. Returning the Posts")
    return posts
# Script entry point: run the migration only when executed directly,
# not when imported as a module.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment