Skip to content

Instantly share code, notes, and snippets.

@fajran
Last active August 21, 2016 16:28
Show Gist options
  • Save fajran/5659455 to your computer and use it in GitHub Desktop.
Save fajran/5659455 to your computer and use it in GitHub Desktop.
A converter from a Blogger backup file to WordPress' WXR format. Only tested with posts and comments, and NOT with pages. May not be efficient for huge blogs since the script keeps all content in memory during conversion. Released as public domain.
# Blogger's backup file to WordPress' WXR converter.
#
# Only tested with posts and comments, and NOT with pages.
# May not be efficient for huge blogs since the script keeps
# all content in memory during conversion.
#
# Released as public domain.
#
# Please note that labels in Blogspot are converted
# to tags in WordPress. Two categories are also hardcoded for the
# WordPress posts. Adjust these first to suit your needs.
import sys
from datetime import datetime
from xml.dom.minidom import parse, parseString
from xml.dom import Node
import cgi
import dateutil.parser
# Path of the Blogger backup (Atom XML) file to convert, taken from the
# first command-line argument.  Exit with a usage message instead of an
# IndexError traceback when the argument is missing.
if len(sys.argv) < 2:
    sys.exit('usage: %s BLOGGER_BACKUP_FILE > OUTPUT_WXR_FILE' % sys.argv[0])
inp = sys.argv[1]
def d(*msg):
    """Write a debug line to standard error.

    The arguments are converted to text, joined with single spaces and
    terminated with a newline.  Text (unicode) arguments are used as they
    are, so non-ASCII post titles no longer raise ``UnicodeEncodeError``.
    Works on both Python 2 and Python 3.
    """
    text_type = type(u'')
    parts = []
    for part in msg:
        if not isinstance(part, text_type):
            part = str(part)
            if not isinstance(part, text_type):
                # Python 2: str() produced a byte string; decode leniently.
                part = part.decode('utf-8', 'replace')
        parts.append(part)
    line = u' '.join(parts) + u'\n'
    if str is bytes:
        # Python 2: stderr takes bytes; encode explicitly so a non-tty
        # stderr does not fall back to the ASCII codec.
        line = line.encode('utf-8')
    sys.stderr.write(line)
class Blog(object):
    """In-memory model of a blog read from a Blogger backup.

    Attributes:
        blog_id, title, updated: feed-level metadata; ``None`` until set.
        author: the blog's ``Blog.Author``.
        posts: list of ``Blog.Post`` objects.
    """

    class Author(object):
        """Name, e-mail address and URI of an author."""
        name = None
        email = None
        uri = None

    class Entry(object):
        """Fields shared by posts and comments; all default to ``None``."""
        entry_id = None
        url = None
        permalink = None
        title = None
        title_type = None
        content = None
        content_type = None
        published = None
        updated = None
        author = None

    class Post(Entry):
        """A blog post with its labels and attached comments."""
        draft = False

        def __init__(self):
            # Per-instance lists so posts never share labels or comments.
            self.labels = []
            self.comments = []

    class Comment(Entry):
        """A comment; ``post_entry_id`` is the entry id of its post."""
        # Default so comments lacking a reply reference can still be read.
        post_entry_id = None

    # Feed-level metadata defaults, so readers do not hit AttributeError
    # when the feed omits one of these elements.
    blog_id = None
    title = None
    updated = None

    def __init__(self):
        # Created per instance: as class attributes these mutable objects
        # would be shared by every Blog.
        self.author = Blog.Author()
        self.posts = []
class BlogParser(object):
    """Parse a Blogger backup file (an Atom feed) into a ``Blog`` object.

    Typical use is ``blog = BlogParser(path).parse()``.  Posts and comments
    are told apart by their "kind" category, and each comment is attached to
    the post named by its ``in-reply-to`` element.  Element names are matched
    on their local part, so a namespace prefix such as ``atom:`` is ignored.
    """

    KIND_SCHEME = 'http://schemas.google.com/g/2005#kind'
    KIND_POST = 'http://schemas.google.com/blogger/2008/kind#post'
    KIND_COMMENT = 'http://schemas.google.com/blogger/2008/kind#comment'
    LABEL_SCHEME = 'http://www.blogger.com/atom/ns#'
    # Legacy namespace plus the RFC 5023 one for <app:control>.
    APP_NAMESPACES = ('http://purl.org/atom/app#', 'http://www.w3.org/2007/app')
    THREAD_NAMESPACE = 'http://purl.org/syndication/thread/1.0'

    def __init__(self, atom_file):
        """Remember *atom_file*, the path of the backup file to read."""
        self.atom_file = atom_file

    @staticmethod
    def _local_name(node):
        """Return *node*'s name with any namespace prefix removed."""
        return node.nodeName.split(':')[-1]

    def parse(self):
        """Read the backup file and return the resulting ``Blog``.

        When the document has no top-level ``feed`` element the blog is
        returned exactly as constructed.
        """
        self.blog = Blog()
        # Binary mode lets the XML parser honour the declared encoding, and
        # the ``with`` block guarantees the file handle is closed.
        with open(self.atom_file, 'rb') as source:
            dom = parse(source)
        feed = None
        for child in dom.childNodes:
            if self._local_name(child) == 'feed':
                feed = child
                break
        if feed is not None:
            self.parse_metadata(feed)
            self.parse_entries(feed)
        return self.blog

    def get_text(self, el):
        """Return the concatenated text found in *el* and its descendants.

        Text and CDATA nodes both contribute; any other non-element node
        yields an empty string.
        """
        if el.nodeType in (Node.TEXT_NODE, Node.CDATA_SECTION_NODE):
            return el.nodeValue
        if el.nodeType == Node.ELEMENT_NODE:
            return ''.join(self.get_text(child) for child in el.childNodes)
        return ''

    def parse_date(self, txt):
        """Convert the timestamp string *txt* with ``dateutil``."""
        return dateutil.parser.parse(txt)

    def parse_metadata(self, feed):
        """Copy the feed-level id, updated, title and author onto the blog.

        Scanning stops at the first ``entry`` element.
        """
        for child in feed.childNodes:
            name = self._local_name(child)
            if name == 'entry':
                break
            if name == 'id':
                self.blog.blog_id = self.get_text(child)
            elif name == 'updated':
                self.blog.updated = self.parse_date(self.get_text(child))
            elif name == 'title':
                self.blog.title = self.get_text(child)
            elif name == 'author':
                self.blog.author = self.parse_author(child)

    def parse_author(self, author):
        """Build a ``Blog.Author`` from an ``author`` element."""
        data = Blog.Author()
        for child in author.childNodes:
            name = self._local_name(child)
            if name == 'name':
                data.name = self.get_text(child)
            elif name == 'uri':
                data.uri = self.get_text(child)
            elif name == 'email':
                data.email = self.get_text(child)
        return data

    def parse_entries(self, feed):
        """Parse every ``entry`` of *feed*, link comments to their posts,
        store the posts on the blog and log a summary through ``d``."""
        self.posts = []
        self.comments = []
        self.post_ids = {}
        self.comment_ids = {}
        for child in feed.childNodes:
            # Match on the local name, consistent with the other parsers.
            if self._local_name(child) == 'entry':
                self.parse_entry(child)
        self.assign_comments()
        self.blog.posts = self.posts
        total = len(self.blog.posts)
        d('total posts:', total)
        d('total comments:', len(self.comments))
        for i, post in enumerate(self.blog.posts):
            d('(%d / %d) -> %d: %s' % (i+1, total, len(post.comments), post.title))

    def assign_comments(self):
        """Append each comment to the post it replies to.

        Comments without a reply reference, or whose post is unknown, are
        skipped.
        """
        assigned = 0
        for comment in self.comments:
            # getattr: a comment without <in-reply-to> may lack the attribute.
            post = self.post_ids.get(getattr(comment, 'post_entry_id', None))
            if post is None:
                continue
            post.comments.append(comment)
            assigned += 1
            d('%s. comment: %s -> %s' % (assigned, id(comment), id(post)))

    def parse_category(self, category):
        """Return ``(scheme, term)`` of a ``category`` element.

        A missing attribute is returned as an empty string.
        """
        return category.getAttribute('scheme'), category.getAttribute('term')

    def get_kind(self, entry):
        """Return the term of *entry*'s "kind" category, or ``None``."""
        for child in entry.childNodes:
            if self._local_name(child) != 'category':
                continue
            scheme, term = self.parse_category(child)
            if scheme == self.KIND_SCHEME:
                return term
        return None

    def parse_entry(self, entry):
        """Parse *entry* as a post or a comment according to its kind.

        Entries of any other kind are ignored.
        """
        kind = self.get_kind(entry)
        if kind == self.KIND_POST:
            post = self.parse_post(entry)
            self.posts.append(post)
            self.post_ids[post.entry_id] = post
        elif kind == self.KIND_COMMENT:
            comment = self.parse_comment(entry)
            self.comments.append(comment)
            self.comment_ids[comment.entry_id] = comment

    def get_draft(self, control):
        """Return ``True`` when *control* holds a ``draft`` child reading "yes"."""
        for child in control.childNodes:
            if self._local_name(child) == 'draft':
                return self.get_text(child) == 'yes'
        return False

    def parse_entry_common(self, entry, target):
        """Fill *target* with the fields shared by posts and comments.

        A missing ``type`` attribute is recorded as ``'text'`` and a link
        without ``rel`` is treated as ``'alternate'`` (the Atom defaults).
        """
        for child in entry.childNodes:
            name = self._local_name(child)
            if name == 'id':
                target.entry_id = self.get_text(child)
            elif name == 'published':
                target.published = self.parse_date(self.get_text(child))
            elif name == 'updated':
                target.updated = self.parse_date(self.get_text(child))
            elif name == 'title':
                target.title = self.get_text(child)
                target.title_type = child.getAttribute('type') or 'text'
            elif name == 'content':
                target.content = self.get_text(child)
                target.content_type = child.getAttribute('type') or 'text'
            elif name == 'author':
                target.author = self.parse_author(child)
            elif name == 'link':
                rel = child.getAttribute('rel') or 'alternate'
                href = child.getAttribute('href')
                if rel == 'self':
                    target.permalink = href
                elif rel == 'alternate':
                    target.url = href

    def parse_post(self, entry):
        """Build a ``Blog.Post`` from *entry*, including labels and draft flag."""
        post = Blog.Post()
        self.parse_entry_common(entry, post)
        for child in entry.childNodes:
            name = self._local_name(child)
            ns = child.namespaceURI
            if name == 'category':
                scheme, term = self.parse_category(child)
                if scheme == self.LABEL_SCHEME:
                    post.labels.append(term)
            elif ns in self.APP_NAMESPACES and name == 'control':
                post.draft = self.get_draft(child)
        return post

    def parse_comment(self, entry):
        """Build a ``Blog.Comment`` from *entry*, recording the entry id of
        the post it replies to in ``post_entry_id``."""
        comment = Blog.Comment()
        self.parse_entry_common(entry, comment)
        for child in entry.childNodes:
            name = self._local_name(child)
            if child.namespaceURI == self.THREAD_NAMESPACE and name == 'in-reply-to':
                comment.post_entry_id = child.getAttribute('ref')
        return comment
class WXRWriter(object):
    """Render a parsed ``Blog`` as a WordPress WXR 1.2 export document.

    Attributes:
        comment_status: value written to ``<wp:comment_status>`` of each post.
        categories: ``(nicename, title)`` pairs written as categories on
            every post; override on a subclass or an instance to change them.
    """

    comment_status = 'open'
    categories = (
        ('id', 'Bahasa Indonesia'),
        ('hacking', 'Hacking'),
    )

    def __init__(self, blog):
        """Remember *blog*, the object whose title and posts are written."""
        self.blog = blog

    def write(self):
        """Return the whole WXR document as UTF-8 encoded bytes.

        Post and comment id counters restart at zero on every call, and
        each generated line is stripped of surrounding whitespace.
        """
        self.post_id = 0
        self.comment_id = 0
        lines = self.get_header() + self.get_entries() + self.get_footer()
        doc = '\n'.join(line.strip() for line in lines)
        return doc.encode('utf-8')

    @staticmethod
    def _text(value):
        """Return *value* escaped for use as XML character data.

        ``None`` becomes an empty string instead of the text "None".
        """
        from xml.sax.saxutils import escape as xml_escape
        if value is None:
            return ''
        return xml_escape(value)

    @staticmethod
    def _utc(ts):
        """Return *ts* as a naive UTC datetime; naive input is unchanged."""
        offset = ts.utcoffset()
        if offset is None:
            return ts
        return (ts - offset).replace(tzinfo=None)

    @staticmethod
    def _slug(url):
        """Return the last path segment of *url* without a trailing
        ``.html``, or ``None`` when there is nothing usable."""
        if not url:
            return None
        slug = url.split('/')[-1]
        if slug.endswith('.html'):
            slug = slug[:-len('.html')]
        return slug or None

    def _meta(self, element, key, value):
        """Return the lines of one ``<wp:ELEMENT>`` key/value entry."""
        return [
            ' <wp:%s>' % element,
            ' <wp:meta_key>%s</wp:meta_key>' % key,
            ' <wp:meta_value>%s</wp:meta_value>' % self._text(value),
            ' </wp:%s>' % element,
        ]

    def get_header(self):
        """Return the lines that open the document and the channel."""
        return [
            '<?xml version="1.0" encoding="UTF-8" ?>',
            '<rss version="2.0"',
            ' xmlns:excerpt="http://wordpress.org/export/1.2/excerpt/"',
            ' xmlns:content="http://purl.org/rss/1.0/modules/content/"',
            ' xmlns:wfw="http://wellformedweb.org/CommentAPI/"',
            ' xmlns:dc="http://purl.org/dc/elements/1.1/"',
            ' xmlns:wp="http://wordpress.org/export/1.2/">',
            '<channel>',
            '<title>%s</title>' % self._text(getattr(self.blog, 'title', None)),
            '<wp:wxr_version>1.2</wp:wxr_version>',
        ]

    def get_footer(self):
        """Return the lines that close the channel and the document."""
        return ['</channel>', '</rss>']

    def get_entries(self):
        """Return the lines for every post of the blog, in order."""
        res = []
        for post in self.blog.posts:
            res += self.get_post(post)
        return res

    def get_date(self, ts):
        """Format *ts* as an RFC 822 style date in UTC.

        Timezone-aware values are converted to UTC first, so the literal
        ``+0000`` suffix is accurate.
        """
        return self._utc(ts).strftime("%a, %d %b %Y %H:%M:%S +0000")

    def get_date_wp(self, ts):
        """Format *ts* as ``YYYY-MM-DD HH:MM:SS`` using its own wall time."""
        return ts.strftime("%Y-%m-%d %H:%M:%S")

    def escape(self, s):
        """Return *s* made safe for embedding in a CDATA section.

        Markup is left untouched; only the terminator ``]]>`` is split over
        two CDATA sections so it cannot end the section early.  ``None``
        becomes an empty string.
        """
        if s is None:
            return ''
        return s.replace(']]>', ']]]]><![CDATA[>')

    def get_comment(self, comment):
        """Return the ``<wp:comment>`` lines for *comment*.

        Increments the running comment id.  Author fields are written only
        when present; the comment is always marked approved.
        """
        status = 1
        text = self._text
        author = comment.author
        self.comment_id += 1
        res = [
            ' <wp:comment>',
            ' <wp:comment_id>%s</wp:comment_id>' % self.comment_id,
        ]
        if author is not None:
            if author.name:
                res.append(' <wp:comment_author>%s</wp:comment_author>' % text(author.name))
            if author.email:
                res.append(' <wp:comment_author_email>%s</wp:comment_author_email>' % text(author.email))
            if author.uri:
                res.append(' <wp:comment_author_url>%s</wp:comment_author_url>' % text(author.uri))
        res.append(' <wp:comment_author_IP>%s</wp:comment_author_IP>' % '')
        res.append(' <wp:comment_date>%s</wp:comment_date>' % self.get_date_wp(comment.published))
        # The *_gmt field must hold UTC, not the blog's local wall time.
        res.append(' <wp:comment_date_gmt>%s</wp:comment_date_gmt>' % self.get_date_wp(self._utc(comment.published)))
        res.append(' <wp:comment_content><![CDATA[%s]]></wp:comment_content>' % self.escape(comment.content))
        res.append(' <wp:comment_approved>%s</wp:comment_approved>' % status)
        res += self._meta('commentmeta', 'blogger_id', comment.entry_id)
        if comment.permalink:
            res += self._meta('commentmeta', 'blogger_permalink', comment.permalink)
        if comment.url:
            res += self._meta('commentmeta', 'blogger_url', comment.url)
        res.append(' </wp:comment>')
        return res

    def get_post(self, post):
        """Return the ``<item>`` lines for *post*, including its comments.

        Posts whose content is missing or blank produce no output and do
        not consume a post id.
        """
        from xml.sax.saxutils import quoteattr
        if not (post.content or '').strip():
            return []
        text = self._text
        slug = self._slug(post.url)
        status = 'draft' if post.draft else 'publish'
        creator = post.author.name if post.author is not None else None
        body = self.escape(post.content)
        self.post_id += 1
        res = [
            '<item>',
            ' <title>%s</title>' % text(post.title),
            ' <pubDate>%s</pubDate>' % self.get_date(post.published),
            ' <dc:creator>%s</dc:creator>' % text(creator),
            ' <guid isPermaLink="true">%s</guid>' % text(post.permalink),
            ' <description></description>',
            ' <content:encoded><![CDATA[%s]]></content:encoded>' % body,
            ' <excerpt:encoded><![CDATA[%s]]></excerpt:encoded>' % body,
            ' <wp:post_id>%s</wp:post_id>' % self.post_id,
            ' <wp:post_date>%s</wp:post_date>' % self.get_date_wp(post.published),
            # The *_gmt field must hold UTC, not the blog's local wall time.
            ' <wp:post_date_gmt>%s</wp:post_date_gmt>' % self.get_date_wp(self._utc(post.published)),
            ' <wp:comment_status>%s</wp:comment_status>' % self.comment_status,
            ' <wp:ping_status>closed</wp:ping_status>',
        ]
        if slug:
            res.append(' <wp:post_name>%s</wp:post_name>' % text(slug))
        res.append(' <wp:status>%s</wp:status>' % status)
        res.append(' <wp:post_parent>0</wp:post_parent>')
        res.append(' <wp:menu_order>0</wp:menu_order>')
        res.append(' <wp:post_type>post</wp:post_type>')
        res.append(' <wp:post_password></wp:post_password>')
        res.append(' <wp:is_sticky>0</wp:is_sticky>')
        for nicename, title in self.categories:
            res.append(' <category domain="category" nicename=%s><![CDATA[%s]]></category>'
                       % (quoteattr(nicename), self.escape(title)))
        for label in post.labels:
            # quoteattr adds the surrounding quotes and escapes the value.
            res.append(' <category domain="post_tag" nicename=%s><![CDATA[%s]]></category>'
                       % (quoteattr(label), self.escape(label)))
        res += self._meta('postmeta', 'blogger_id', post.entry_id)
        if post.permalink:
            res += self._meta('postmeta', 'blogger_permalink', post.permalink)
        if post.url:
            res += self._meta('postmeta', 'blogger_url', post.url)
        for comment in post.comments:
            res += self.get_comment(comment)
        res.append('</item>')
        return res
# Convert the backup file named on the command line and write the WXR
# document to standard output; redirect stdout to save it to a file.
p = BlogParser(inp)
blog = p.parse()
writer = WXRWriter(blog)
xml = writer.write()
# write() returns UTF-8 encoded bytes.  Send them to the binary layer of
# stdout (``sys.stdout.buffer`` on Python 3, ``sys.stdout`` itself on
# Python 2) so the output is the same on both, followed by a newline as
# the former ``print`` statement produced.
stdout_bytes = getattr(sys.stdout, 'buffer', sys.stdout)
stdout_bytes.write(xml)
stdout_bytes.write(b'\n')
@BYK
Copy link

BYK commented Aug 21, 2016

@adem0x - just convert that str to unicode on line 24 and you're good.

Also there's a typo in the script. Replace line 361 with the following:

 res.append('  <description></description>')

(basically remove the / at the end of the closing tag) and you're good.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment