tomconte · April 16, 2011 16:18
diff --git a/wp2tumblr.py b/wp2tumblr.py
 #!/usr/bin/env python
 """
    wp2tumblr.py
    Python script for migrating your Wordpress blog to Tumblr
    Karteek Edamadaka
    Modified by Thomas Conte for his own needs
 """

 import os
 import sys
 import urllib
 import urllib2
 import datetime
 import time
 import logging
 import pickle
 import re
 import json

 # Leftover switch: no function visible in this file reads it, so setting it
 # does not stop posts from being sent.
 DRYRUN = True

 # Sent as the 'generator' field of every write request.
 GENERATOR = 'KMigrator 0.3'

 # Endpoint that post_to_tumblr() posts to.
 TUMBLRWRITE = 'http://www.tumblr.com/api/write'

 # Endpoint that get_post_info() queries.
 # Update with your TumblrSite/api/read/json
 TUMBLRREAD = 'http://blogatom.tumblr.com/api/read/json'

 # Update with your Tumblr login Email
 TUMBLREMAIL = 'foo@bar.com'

 # Update with your Tumblr login password
 TUMBLRPASS = 'password'

 # Update with your file name of your Wordpress Extended RSS file
 WXR = 'blogatom.wordpress.2011-04-11.xml'

 # Update with your TumblrSite/post
 BASEURL = "http://blogatom.tumblr.com/post/"

 # Pickle cache of the posts parsed from the WXR file
 OBJECTFILE = "posts.obj"

 # Pickle list of the slugs already posted to Tumblr, kept so that a rerun
 # after a failure does not repost them
 SUCCESSFILE = "success.obj"

 # Root logger: INFO and above, written through a default StreamHandler.
 logger = logging.getLogger()
 logger.setLevel(logging.INFO)
 logger.addHandler(logging.StreamHandler())

 def main():
     """Run the WordPress-to-Tumblr migration from start to finish.

     Steps:
       1. Load the parsed posts from OBJECTFILE, or parse the WXR export and
          cache the result in OBJECTFILE when no cache exists yet.
       2. For every post whose slug check_progress() does not report as
          migrated: rewrite its upload links, publish it with
          post_to_tumblr() and record the slug with update_progress().
       3. Dump the posts (now carrying 'tumblr_id') to 'migrated.' + OBJECTFILE
          and print the attachments of any post that has an 'attachments' key.

     Exits with status 1 when neither the cache nor the WXR file exists.
     """
     print("Welcome to Wordpress to Tumblr Migrator")

     # Test for the same file that is written and read below, instead of a
     # second hard-coded copy of its name.
     if not os.path.exists(OBJECTFILE):
         logger.info("Couldn't find serialized object, so parsing the WXR")
         if not os.path.exists(WXR):
             logger.critical("Wordpress Archive - %s is not found in this folder" % WXR)
             sys.exit(1)
         posts = parse_wxr(WXR)
         # File modes are left as they were so existing caches stay readable;
         # the 'with' blocks only make sure the handles get closed.
         with open(OBJECTFILE, 'w') as cache:
             pickle.dump(posts, cache)
     else:
         logger.info("Serialized Posts Object found. Loading it")
         with open(OBJECTFILE, 'r') as cache:
             posts = pickle.load(cache)

     # Compiled once rather than once per post. The pattern text is unchanged;
     # the raw-string prefix only keeps the backslashes literal.
     pattern = re.compile(r'''http://www.cont..net/wp/wp-content/uploads/([\w\_\-\.\/]+)''')

     for post in posts:
         if check_progress(post['slug']):
             logger.info("Post %s already found to be proccessed. Skipping to next one" % post['title'])
             continue

         # Rewrite links to new storage location
         post['content'] = pattern.sub(r"http://blogatom.blob.core.windows.net/img/\1", post['content'])
         logger.info(post['content'])

         logger.info("post_to_tumblr %s" % post['title'])
         tumblr_postid = post_to_tumblr('regular', post['title'], post['content'], post['date'], post['tags'])
         post['tumblr_id'] = tumblr_postid
         logger.info("Posted %s to Tumblr with ID - %s" %(post['title'], post['tumblr_id']))

         # Record the slug straight away so a crash later on does not cause
         # this post to be published twice on the next run.
         update_progress(post['slug'])

     logger.info("Writing the Posts Object after Migration")
     with open('migrated.'+OBJECTFILE, 'w') as migrated:
         pickle.dump(posts, migrated)
     print("Migration is done. But, few files might not have been migrated. You might have to migrate them manually")

     for post in posts:
         if 'attachments' in post:
             print("On post - %s I couldn't migrate" % post['title'])
             for a in post['attachments']:
                 print(a)
            
 def do_wait(stime, reason=None):
     """Pause for `stime` seconds, printing one dot to stdout per second.

     stime  -- whole number of seconds to wait.
     reason -- optional text logged (after a newline) before the wait starts.
     """
     if reason is not None:
         logger.info("\n"+ reason)
     logger.info("... waiting for %s seconds" % stime)
     for _ in range(stime):
         sys.stdout.write('.')
         # Flush before sleeping so each dot shows up at the start of its
         # second instead of one second late.
         sys.stdout.flush()
         time.sleep(1)
     logger.info("  Continuing")

 def update_progress(post):
     """Record `post` as successfully migrated.

     main() passes the post's slug. The list stored in SUCCESSFILE is loaded,
     extended with `post` and written back. When the file cannot be loaded
     (for instance before it has ever been written) a warning is logged and a
     fresh list is started.
     """
     posts = []
     try:
         with open(SUCCESSFILE, 'r') as progress_file:
             posts = pickle.load(progress_file)
     except Exception:
         # Still best effort, but Exception (unlike a bare except) lets
         # KeyboardInterrupt and SystemExit propagate.
         logger.warning("Unable to load successful posts from file - %s" % SUCCESSFILE)
     posts.append(post)
     with open(SUCCESSFILE, 'w') as progress_file:
         pickle.dump(posts, progress_file)

 def check_progress(title):
     """Return True when `title` is listed in SUCCESSFILE, otherwise False.

     main() passes the post's slug. Any failure to load or search the file is
     treated as "not migrated yet" and yields False.
     """
     try:
         with open(SUCCESSFILE, 'r') as progress_file:
             return title in pickle.load(progress_file)
     except Exception:
         # Exception rather than a bare except, so Ctrl-C is not swallowed.
         return False

 def get_post_info(post_id):
     """Fetch one post from the Tumblr read endpoint, retrying until it works.

     post_id -- id of the post; sent as the 'id' field after str().

     Returns the first entry of the 'posts' list in the decoded response.
     A failed request, or a response whose 'posts' list is empty, is logged
     and retried after a 30 second pause. There is no retry limit; the loop
     replaces the earlier self-recursion so that a long outage cannot exhaust
     the call stack.
     """
     while True:
         res = do_http_request(TUMBLRREAD, {'email':TUMBLREMAIL, 'password':TUMBLRPASS, 'id':str(post_id)}, 'POST')
         if res is not False:
             logger.info("Extracting Info")
             # The body is not decoded as-is: the first 22 and last 2
             # characters are dropped first (presumably a JavaScript
             # "var ... = " wrapper and trailing ";\n" -- TODO confirm).
             info = json.loads(res[22:-2])
             if len(info['posts']) > 0:
                 logger.info("Extracted!")
                 return info['posts'][0]
             logger.error("Extracted information doesn't contain photo info. Failure")
             logger.debug("Received info from Tumblr - %s" % str(info))
         logger.error("Getting post info failed. Retrying")
         do_wait(30)

 def post_to_tumblr(post_type='regular', title=None, body=None, date=None, tags=None, source=None, private=0):
     """Create a post through the Tumblr write endpoint, retrying until it works.

     post_type -- value of the 'type' field.
     title     -- post title; left out of the request when None.
     body      -- post body; left out of the request when None.
     date      -- value of the 'date' field; defaults to the current local
                  time (ctime format) taken when the function is called.
     tags      -- iterable of tag strings, sent joined with ', '.
     source    -- optional value of the 'source' field.
     private   -- 1 marks the post private; anything else sends '0'.

     Returns whatever do_http_request() returns for the successful request.
     A failed request is logged and retried after a 35 second pause, without
     any retry limit.
     """
     # Resolved per call: a default computed in the signature would be frozen
     # at the moment the module was loaded.
     if date is None:
         date = datetime.datetime.now().ctime()

     post_data = {
         'email':TUMBLREMAIL,
         'password':TUMBLRPASS,
         'type':post_type,
         'generator':GENERATOR,
         'date':date
     }

     # A None title cannot be encoded by do_http_request(), so only send the
     # field when there is one.
     if title is not None:
         post_data['title'] = title

     if body is not None:
         post_data['body'] = body

     if source is not None:
         post_data['source'] = source

     if private == 1:
         post_data['private'] = '1'
     else:
         post_data['private'] = '0'

     if tags is not None:
         post_data['tags'] = ', '.join(tags)

     # Loop instead of calling ourselves again, so that a long outage cannot
     # exhaust the call stack.
     while True:
         tumblr_post_id = do_http_request(TUMBLRWRITE, post_data, "POST")
         if tumblr_post_id is not False:
             logger.info("Posted %s to Tumblr" %(title))
             return tumblr_post_id

         logger.error("Server acted weird while posting %s" % title)
         do_wait(35)

 def do_http_request(url, post_data=None, method="GET"):
     """Send an HTTP request and return the response body.

     url       -- address to request.
     post_data -- dict of form fields. Sent as the request body when method
                  is "POST", otherwise appended to the URL as a query string.
                  None (the default) means no fields.
     method    -- "POST" issues a POST; any other value issues a GET.

     Returns the response body, or False when the request raised an
     exception (the error is logged).
     """
     if post_data is None:
         post_data = {}

     encoded = {}
     for key, value in post_data.items():
         if isinstance(value, bytes):
             # Already a byte string: calling .encode() on it would first
             # decode it as ASCII and fail on any non-ASCII byte.
             encoded[key] = value
         elif hasattr(value, 'encode'):
             encoded[key] = value.encode('utf-8')
         else:
             # Numbers and other non-text values have no .encode().
             encoded[key] = str(value)
     params = urllib.urlencode(encoded)

     if method == "POST":
         request = urllib2.Request(url, params)
     else:
         if post_data:
             url = url + '?' + params
         request = urllib2.Request(url)

     # Keep the account password out of the log output.
     loggable = dict(post_data)
     if 'password' in loggable:
         loggable['password'] = '***'

     try:
         logger.debug("Requesting %s using HTTP %s with data %s" %(url, method, str(loggable)))
         response = urllib2.urlopen(request)
         try:
             return response.read()
         finally:
             response.close()
     except Exception as e:
         logger.error("Error requesting %s using HTTP %s with data %s with Exception %s" %(url, method, str(loggable), e))
         return False

 def _node_text(node):
     """Return the text and CDATA directly under `node`, joined (u"" if none)."""
     from xml.dom import Node
     wanted = (Node.TEXT_NODE, Node.CDATA_SECTION_NODE)
     return u"".join(child.data for child in node.childNodes if child.nodeType in wanted)

 def parse_wxr(wxr):
     """Parse a WordPress eXtended RSS export into a list of post dicts.

     wxr -- the export, passed straight to xml.dom.minidom.parse().

     Only <item> elements whose wp:post_type is "post" are kept. Each result
     has the keys 'title', 'slug', 'link', 'date', 'content' and 'tags';
     'tags' holds the text of every <category domain="category"> element
     that has a non-empty nicename. Posts tagged "Twitter" are dropped.

     Element text is read from all text/CDATA children, so an empty element
     gives u"" instead of raising, and content split across several CDATA
     sections is not cut off after the first one.
     """
     import xml.dom.minidom

     def first_text(item, tag):
         # Text of the first `tag` element under `item`; u"" when absent.
         found = item.getElementsByTagName(tag)
         return _node_text(found[0]) if found else u""

     posts = []
     doc = xml.dom.minidom.parse(wxr)
     items = doc.getElementsByTagName("item")
     logger.info("Total Number of Entries (posts, pages and attachments) in the Wordpress eXtended Rss file : %s" %(len(items)))
     for post in items:
         if first_text(post, "wp:post_type") != "post":
             continue
         _post = {
             'title': first_text(post, "title"),
             'slug': first_text(post, "wp:post_name"),
             'link': first_text(post, "link"),
             'date': first_text(post, "wp:post_date"),
             'content': first_text(post, "content:encoded"),
         }
         _post['tags'] = [
             _node_text(term)
             for term in post.getElementsByTagName("category")
             if term.getAttribute("domain") == "category" and term.getAttribute("nicename") != ""
         ]
         # Filter out the Twitter category
         if "Twitter" not in _post['tags']:
             posts.append(_post)
     logger.info("Finished parsing WXR. Returning the Posts")
     return posts

 # Run the migration only when this file is executed as a script, not when
 # it is imported.
 if __name__ == '__main__':
    main()
	#!/usr/bin/env python
	"""
	wp2tumblr.py
	Python script for migrating your Wordpress blog to Tumblr
	Karteek Edamadaka
	Modified by Thomas Conte for his own needs
	"""

	import os
	import sys
	import urllib
	import urllib2
	import datetime
	import time
	import logging
	import pickle
	import re
	import json

	# Leftover switch: no function visible in this file reads it, so setting it
	# does not stop posts from being sent.
	DRYRUN = True

	# Sent as the 'generator' field of every write request.
	GENERATOR = 'KMigrator 0.3'

	# Endpoint that post_to_tumblr() posts to.
	TUMBLRWRITE = 'http://www.tumblr.com/api/write'

	# Endpoint that get_post_info() queries.
	# Update with your TumblrSite/api/read/json
	TUMBLRREAD = 'http://blogatom.tumblr.com/api/read/json'

	# Update with your Tumblr login Email
	TUMBLREMAIL = 'foo@bar.com'

	# Update with your Tumblr login password
	TUMBLRPASS = 'password'

	# Update with your file name of your Wordpress Extended RSS file
	WXR = 'blogatom.wordpress.2011-04-11.xml'

	# Update with your TumblrSite/post
	BASEURL = "http://blogatom.tumblr.com/post/"

	# Pickle cache of the posts parsed from the WXR file
	OBJECTFILE = "posts.obj"

	# Pickle list of the slugs already posted to Tumblr, kept so that a rerun
	# after a failure does not repost them
	SUCCESSFILE = "success.obj"

	# Root logger: INFO and above, written through a default StreamHandler.
	logger = logging.getLogger()
	logger.setLevel(logging.INFO)
	logger.addHandler(logging.StreamHandler())

	def main():
	    """Run the WordPress-to-Tumblr migration from start to finish.

	    Steps:
	      1. Load the parsed posts from OBJECTFILE, or parse the WXR export and
	         cache the result in OBJECTFILE when no cache exists yet.
	      2. For every post whose slug check_progress() does not report as
	         migrated: rewrite its upload links, publish it with
	         post_to_tumblr() and record the slug with update_progress().
	      3. Dump the posts (now carrying 'tumblr_id') to 'migrated.' + OBJECTFILE
	         and print the attachments of any post that has an 'attachments' key.

	    Exits with status 1 when neither the cache nor the WXR file exists.
	    """
	    print("Welcome to Wordpress to Tumblr Migrator")

	    # Test for the same file that is written and read below, instead of a
	    # second hard-coded copy of its name.
	    if not os.path.exists(OBJECTFILE):
	        logger.info("Couldn't find serialized object, so parsing the WXR")
	        if not os.path.exists(WXR):
	            logger.critical("Wordpress Archive - %s is not found in this folder" % WXR)
	            sys.exit(1)
	        posts = parse_wxr(WXR)
	        # File modes are left as they were so existing caches stay readable;
	        # the 'with' blocks only make sure the handles get closed.
	        with open(OBJECTFILE, 'w') as cache:
	            pickle.dump(posts, cache)
	    else:
	        logger.info("Serialized Posts Object found. Loading it")
	        with open(OBJECTFILE, 'r') as cache:
	            posts = pickle.load(cache)

	    # Compiled once rather than once per post. The pattern text is unchanged;
	    # the raw-string prefix only keeps the backslashes literal.
	    pattern = re.compile(r'''http://www.cont..net/wp/wp-content/uploads/([\w\_\-\.\/]+)''')

	    for post in posts:
	        if check_progress(post['slug']):
	            logger.info("Post %s already found to be proccessed. Skipping to next one" % post['title'])
	            continue

	        # Rewrite links to new storage location
	        post['content'] = pattern.sub(r"http://blogatom.blob.core.windows.net/img/\1", post['content'])
	        logger.info(post['content'])

	        logger.info("post_to_tumblr %s" % post['title'])
	        tumblr_postid = post_to_tumblr('regular', post['title'], post['content'], post['date'], post['tags'])
	        post['tumblr_id'] = tumblr_postid
	        logger.info("Posted %s to Tumblr with ID - %s" %(post['title'], post['tumblr_id']))

	        # Record the slug straight away so a crash later on does not cause
	        # this post to be published twice on the next run.
	        update_progress(post['slug'])

	    logger.info("Writing the Posts Object after Migration")
	    with open('migrated.'+OBJECTFILE, 'w') as migrated:
	        pickle.dump(posts, migrated)
	    print("Migration is done. But, few files might not have been migrated. You might have to migrate them manually")

	    for post in posts:
	        if 'attachments' in post:
	            print("On post - %s I couldn't migrate" % post['title'])
	            for a in post['attachments']:
	                print(a)

	def do_wait(stime, reason=None):
	    """Pause for `stime` seconds, printing one dot to stdout per second.

	    stime  -- whole number of seconds to wait.
	    reason -- optional text logged (after a newline) before the wait starts.
	    """
	    if reason is not None:
	        logger.info("\n"+ reason)
	    logger.info("... waiting for %s seconds" % stime)
	    for _ in range(stime):
	        sys.stdout.write('.')
	        # Flush before sleeping so each dot shows up at the start of its
	        # second instead of one second late.
	        sys.stdout.flush()
	        time.sleep(1)
	    logger.info(" Continuing")

	def update_progress(post):
	    """Record `post` as successfully migrated.

	    main() passes the post's slug. The list stored in SUCCESSFILE is loaded,
	    extended with `post` and written back. When the file cannot be loaded
	    (for instance before it has ever been written) a warning is logged and a
	    fresh list is started.
	    """
	    posts = []
	    try:
	        with open(SUCCESSFILE, 'r') as progress_file:
	            posts = pickle.load(progress_file)
	    except Exception:
	        # Still best effort, but Exception (unlike a bare except) lets
	        # KeyboardInterrupt and SystemExit propagate.
	        logger.warning("Unable to load successful posts from file - %s" % SUCCESSFILE)
	    posts.append(post)
	    with open(SUCCESSFILE, 'w') as progress_file:
	        pickle.dump(posts, progress_file)

	def check_progress(title):
	    """Return True when `title` is listed in SUCCESSFILE, otherwise False.

	    main() passes the post's slug. Any failure to load or search the file is
	    treated as "not migrated yet" and yields False.
	    """
	    try:
	        with open(SUCCESSFILE, 'r') as progress_file:
	            return title in pickle.load(progress_file)
	    except Exception:
	        # Exception rather than a bare except, so Ctrl-C is not swallowed.
	        return False

	def get_post_info(post_id):
	    """Fetch one post from the Tumblr read endpoint, retrying until it works.

	    post_id -- id of the post; sent as the 'id' field after str().

	    Returns the first entry of the 'posts' list in the decoded response.
	    A failed request, or a response whose 'posts' list is empty, is logged
	    and retried after a 30 second pause. There is no retry limit; the loop
	    replaces the earlier self-recursion so that a long outage cannot exhaust
	    the call stack.
	    """
	    while True:
	        res = do_http_request(TUMBLRREAD, {'email':TUMBLREMAIL, 'password':TUMBLRPASS, 'id':str(post_id)}, 'POST')
	        if res is not False:
	            logger.info("Extracting Info")
	            # The body is not decoded as-is: the first 22 and last 2
	            # characters are dropped first (presumably a JavaScript
	            # "var ... = " wrapper and trailing ";\n" -- TODO confirm).
	            info = json.loads(res[22:-2])
	            if len(info['posts']) > 0:
	                logger.info("Extracted!")
	                return info['posts'][0]
	            logger.error("Extracted information doesn't contain photo info. Failure")
	            logger.debug("Received info from Tumblr - %s" % str(info))
	        logger.error("Getting post info failed. Retrying")
	        do_wait(30)

	def post_to_tumblr(post_type='regular', title=None, body=None, date=None, tags=None, source=None, private=0):
	    """Create a post through the Tumblr write endpoint, retrying until it works.

	    post_type -- value of the 'type' field.
	    title     -- post title; left out of the request when None.
	    body      -- post body; left out of the request when None.
	    date      -- value of the 'date' field; defaults to the current local
	                 time (ctime format) taken when the function is called.
	    tags      -- iterable of tag strings, sent joined with ', '.
	    source    -- optional value of the 'source' field.
	    private   -- 1 marks the post private; anything else sends '0'.

	    Returns whatever do_http_request() returns for the successful request.
	    A failed request is logged and retried after a 35 second pause, without
	    any retry limit.
	    """
	    # Resolved per call: a default computed in the signature would be frozen
	    # at the moment the module was loaded.
	    if date is None:
	        date = datetime.datetime.now().ctime()

	    post_data = {
	        'email':TUMBLREMAIL,
	        'password':TUMBLRPASS,
	        'type':post_type,
	        'generator':GENERATOR,
	        'date':date
	    }

	    # A None title cannot be encoded by do_http_request(), so only send the
	    # field when there is one.
	    if title is not None:
	        post_data['title'] = title

	    if body is not None:
	        post_data['body'] = body

	    if source is not None:
	        post_data['source'] = source

	    if private == 1:
	        post_data['private'] = '1'
	    else:
	        post_data['private'] = '0'

	    if tags is not None:
	        post_data['tags'] = ', '.join(tags)

	    # Loop instead of calling ourselves again, so that a long outage cannot
	    # exhaust the call stack.
	    while True:
	        tumblr_post_id = do_http_request(TUMBLRWRITE, post_data, "POST")
	        if tumblr_post_id is not False:
	            logger.info("Posted %s to Tumblr" %(title))
	            return tumblr_post_id

	        logger.error("Server acted weird while posting %s" % title)
	        do_wait(35)

	def do_http_request(url, post_data=None, method="GET"):
	    """Send an HTTP request and return the response body.

	    url       -- address to request.
	    post_data -- dict of form fields. Sent as the request body when method
	                 is "POST", otherwise appended to the URL as a query string.
	                 None (the default) means no fields.
	    method    -- "POST" issues a POST; any other value issues a GET.

	    Returns the response body, or False when the request raised an
	    exception (the error is logged).
	    """
	    if post_data is None:
	        post_data = {}

	    encoded = {}
	    for key, value in post_data.items():
	        if isinstance(value, bytes):
	            # Already a byte string: calling .encode() on it would first
	            # decode it as ASCII and fail on any non-ASCII byte.
	            encoded[key] = value
	        elif hasattr(value, 'encode'):
	            encoded[key] = value.encode('utf-8')
	        else:
	            # Numbers and other non-text values have no .encode().
	            encoded[key] = str(value)
	    params = urllib.urlencode(encoded)

	    if method == "POST":
	        request = urllib2.Request(url, params)
	    else:
	        if post_data:
	            url = url + '?' + params
	        request = urllib2.Request(url)

	    # Keep the account password out of the log output.
	    loggable = dict(post_data)
	    if 'password' in loggable:
	        loggable['password'] = '***'

	    try:
	        logger.debug("Requesting %s using HTTP %s with data %s" %(url, method, str(loggable)))
	        response = urllib2.urlopen(request)
	        try:
	            return response.read()
	        finally:
	            response.close()
	    except Exception as e:
	        logger.error("Error requesting %s using HTTP %s with data %s with Exception %s" %(url, method, str(loggable), e))
	        return False

	def _node_text(node):
	    """Return the text and CDATA directly under `node`, joined (u"" if none)."""
	    from xml.dom import Node
	    wanted = (Node.TEXT_NODE, Node.CDATA_SECTION_NODE)
	    return u"".join(child.data for child in node.childNodes if child.nodeType in wanted)

	def parse_wxr(wxr):
	    """Parse a WordPress eXtended RSS export into a list of post dicts.

	    wxr -- the export, passed straight to xml.dom.minidom.parse().

	    Only <item> elements whose wp:post_type is "post" are kept. Each result
	    has the keys 'title', 'slug', 'link', 'date', 'content' and 'tags';
	    'tags' holds the text of every <category domain="category"> element
	    that has a non-empty nicename. Posts tagged "Twitter" are dropped.

	    Element text is read from all text/CDATA children, so an empty element
	    gives u"" instead of raising, and content split across several CDATA
	    sections is not cut off after the first one.
	    """
	    import xml.dom.minidom

	    def first_text(item, tag):
	        # Text of the first `tag` element under `item`; u"" when absent.
	        found = item.getElementsByTagName(tag)
	        return _node_text(found[0]) if found else u""

	    posts = []
	    doc = xml.dom.minidom.parse(wxr)
	    items = doc.getElementsByTagName("item")
	    logger.info("Total Number of Entries (posts, pages and attachments) in the Wordpress eXtended Rss file : %s" %(len(items)))
	    for post in items:
	        if first_text(post, "wp:post_type") != "post":
	            continue
	        _post = {
	            'title': first_text(post, "title"),
	            'slug': first_text(post, "wp:post_name"),
	            'link': first_text(post, "link"),
	            'date': first_text(post, "wp:post_date"),
	            'content': first_text(post, "content:encoded"),
	        }
	        _post['tags'] = [
	            _node_text(term)
	            for term in post.getElementsByTagName("category")
	            if term.getAttribute("domain") == "category" and term.getAttribute("nicename") != ""
	        ]
	        # Filter out the Twitter category
	        if "Twitter" not in _post['tags']:
	            posts.append(_post)
	    logger.info("Finished parsing WXR. Returning the Posts")
	    return posts

	# Run the migration only when this file is executed as a script, not when
	# it is imported.
	if __name__ == '__main__':
	    main()
No results found