Last active
August 21, 2016 16:28
-
-
Save fajran/5659455 to your computer and use it in GitHub Desktop.
A Blogger backup file to WordPress WXR converter. Only tested with posts and comments, and NOT with pages. May not be efficient for huge blogs, since the script keeps all content in memory during conversion. Released as public domain.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Blogger's backup file to WordPress' WXR converter.
#
# Only tested with posts and comments, and NOT with pages.
# May not be efficient for huge blogs since the script keeps
# all content in memory during conversion.
#
# Released as public domain.
#
# Please note that I converted the labels in Blogspot
# to tags in WordPress. I also hardcoded two categories for the
# WordPress posts. Adjust these first to suit your needs.
import sys | |
from datetime import datetime | |
from xml.dom.minidom import parse, parseString | |
from xml.dom import Node | |
import cgi | |
import dateutil.parser | |
# Path of the Blogger Atom export to convert, taken from the first
# command-line argument. Exit with a usage message instead of an
# IndexError traceback when the argument is missing.
if len(sys.argv) < 2:
    sys.exit('usage: %s <blogger-export.xml>' % sys.argv[0])
inp = sys.argv[1]
def d(*msg):
    """Write the given values to stderr as one space-separated debug line.

    Each value is converted to text before joining, so unicode values that
    contain non-ASCII characters (for example post titles) no longer raise
    UnicodeEncodeError the way ``map(str, msg)`` did.

    Works on both Python 2 and Python 3.
    """
    text_type = type(u'')
    parts = []
    for item in msg:
        if isinstance(item, text_type):
            parts.append(item)
        elif isinstance(item, bytes):
            # Byte strings: decode leniently rather than crash on bad input.
            parts.append(item.decode('utf-8', 'replace'))
        else:
            parts.append(text_type(item))
    line = u' '.join(parts) + u'\n'
    if str is bytes:
        # Python 2: stderr takes bytes; encode explicitly so a missing or
        # ASCII-only stream encoding cannot raise UnicodeEncodeError.
        encoding = getattr(sys.stderr, 'encoding', None) or 'utf-8'
        line = line.encode(encoding, 'replace')
    sys.stderr.write(line)
class Blog(object):
    """In-memory model of a Blogger export: blog metadata plus its posts.

    Plain attribute containers; the parser fills them in and the writer
    reads them back. Every scalar attribute defaults to ``None`` so that
    a field missing from the export reads as ``None`` instead of raising
    AttributeError.
    """

    class Author(object):
        """Author details of a blog, post or comment."""
        name = None
        email = None
        uri = None

    class Entry(object):
        """Fields shared by posts and comments."""
        entry_id = None
        url = None            # href of the rel="alternate" link
        permalink = None      # href of the rel="self" link
        title = None
        title_type = None
        content = None
        content_type = None
        published = None
        updated = None
        author = None

    class Post(Entry):
        """A blog post with its labels and the comments attached to it."""
        draft = False

        def __init__(self):
            # Per-instance lists; class-level lists would be shared.
            self.labels = []
            self.comments = []

    class Comment(Entry):
        """A comment; ``post_entry_id`` is the entry id of its post."""
        # Default so comments without an in-reply-to reference can still
        # be inspected safely.
        post_entry_id = None

    # Metadata defaults for exports that lack the corresponding element.
    blog_id = None
    title = None
    updated = None

    def __init__(self):
        # Created per instance: the original class-level ``Author()`` and
        # ``[]`` were shared by every Blog object.
        self.author = self.Author()
        self.posts = []
class BlogParser(object):
    """Parse a Blogger Atom export file into a ``Blog`` object.

    Usage: ``BlogParser(path).parse()``. The whole document is loaded with
    ``xml.dom.minidom``, so memory use grows with the size of the export.
    Only entries whose kind category marks them as a post or a comment
    are kept; all other entry kinds are ignored.
    """

    KIND_SCHEME = 'http://schemas.google.com/g/2005#kind'
    KIND_POST = 'http://schemas.google.com/blogger/2008/kind#post'
    KIND_COMMENT = 'http://schemas.google.com/blogger/2008/kind#comment'
    LABEL_SCHEME = 'http://www.blogger.com/atom/ns#'
    APP_NS = 'http://purl.org/atom/app#'
    THREAD_NS = 'http://purl.org/syndication/thread/1.0'

    def __init__(self, atom_file):
        """Remember the path of the Atom export; nothing is read yet."""
        self.atom_file = atom_file

    @staticmethod
    def _local_name(node):
        """Return the node name with any namespace prefix removed."""
        return node.nodeName.split(':')[-1]

    def parse(self):
        """Read the export file and return the populated ``Blog``.

        If the document has no ``feed`` element at its top level, an
        empty ``Blog`` is returned.
        """
        self.blog = Blog()
        # Binary mode lets the XML parser honour the declared encoding;
        # the ``with`` block closes the handle the original leaked.
        with open(self.atom_file, 'rb') as source:
            dom = parse(source)

        feed = None
        for child in dom.childNodes:
            if self._local_name(child) == 'feed':
                feed = child
                break

        if feed is not None:
            self.parse_metadata(feed)
            self.parse_entries(feed)
        return self.blog

    def get_text(self, el):
        """Return the concatenated text found below ``el``.

        Text and CDATA nodes contribute their value, elements are walked
        recursively, and every other node type contributes nothing.
        """
        if el.nodeType in (Node.TEXT_NODE, Node.CDATA_SECTION_NODE):
            return el.nodeValue
        if el.nodeType == Node.ELEMENT_NODE:
            return ''.join(self.get_text(child) for child in el.childNodes)
        return ''

    def parse_date(self, txt):
        """Parse a timestamp string into a datetime using dateutil."""
        return dateutil.parser.parse(txt)

    def parse_metadata(self, feed):
        """Copy the feed-level id, updated, title and author to the blog.

        Scanning stops at the first ``entry`` element, so only elements
        that precede the entries are treated as blog metadata.
        """
        for child in feed.childNodes:
            name = self._local_name(child)
            if name == 'entry':
                break
            if name == 'id':
                self.blog.blog_id = self.get_text(child)
            elif name == 'updated':
                self.blog.updated = self.parse_date(self.get_text(child))
            elif name == 'title':
                self.blog.title = self.get_text(child)
            elif name == 'author':
                self.blog.author = self.parse_author(child)

    def parse_author(self, author):
        """Build a ``Blog.Author`` from an ``author`` element."""
        data = Blog.Author()
        for child in author.childNodes:
            name = self._local_name(child)
            if name == 'name':
                data.name = self.get_text(child)
            elif name == 'uri':
                data.uri = self.get_text(child)
            elif name == 'email':
                data.email = self.get_text(child)
        return data

    def parse_entries(self, feed):
        """Parse every entry of the feed and attach comments to posts.

        Stores the resulting posts on ``self.blog.posts`` and logs a
        summary (totals, then one line per post) through ``d``.
        """
        self.posts = []
        self.comments = []
        self.post_ids = {}
        self.comment_ids = {}

        for child in feed.childNodes:
            if self._local_name(child) == 'entry':
                self.parse_entry(child)

        self.assign_comments()
        self.blog.posts = self.posts

        total = len(self.blog.posts)
        d('total posts:', total)
        d('total comments:', len(self.comments))
        for i, post in enumerate(self.blog.posts):
            d('(%d / %d) -> %d: %s'
              % (i + 1, total, len(post.comments), post.title))

    def assign_comments(self):
        """Append each comment to the post it replies to.

        Comments with no reference, or whose referenced post is not in
        the export, are skipped.
        """
        attached = 0
        for comment in self.comments:
            # getattr: a comment without an in-reply-to element may never
            # have had post_entry_id assigned.
            post = self.post_ids.get(getattr(comment, 'post_entry_id', None))
            if post is None:
                continue
            post.comments.append(comment)
            attached += 1
            d('%s. comment: %s -> %s' % (attached, id(comment), id(post)))

    def parse_category(self, category):
        """Return ``(scheme, term)`` of a ``category`` element.

        A missing attribute yields an empty string rather than KeyError.
        """
        return category.getAttribute('scheme'), category.getAttribute('term')

    def get_kind(self, entry):
        """Return the entry's kind term, or ``None`` if it has none."""
        for child in entry.childNodes:
            if self._local_name(child) != 'category':
                continue
            scheme, term = self.parse_category(child)
            if scheme == self.KIND_SCHEME:
                return term
        return None

    def parse_entry(self, entry):
        """Parse one entry and file it as a post or a comment by kind."""
        kind = self.get_kind(entry)
        if kind == self.KIND_POST:
            post = self.parse_post(entry)
            self.posts.append(post)
            self.post_ids[post.entry_id] = post
        elif kind == self.KIND_COMMENT:
            comment = self.parse_comment(entry)
            self.comments.append(comment)
            self.comment_ids[comment.entry_id] = comment

    def get_draft(self, control):
        """Return True if the control element has a draft child of 'yes'."""
        for child in control.childNodes:
            if self._local_name(child) == 'draft':
                return self.get_text(child) == 'yes'
        return False

    def parse_entry_common(self, entry, target):
        """Fill ``target`` with the fields shared by posts and comments.

        Sets entry_id, published, updated, title, content, author, and
        the link hrefs (``rel="self"`` -> permalink, ``rel="alternate"``
        -> url). A missing ``type`` attribute is read as ``'text'`` and a
        missing ``rel`` as ``'alternate'``, the defaults Atom specifies.
        """
        for child in entry.childNodes:
            name = self._local_name(child)
            if name == 'id':
                target.entry_id = self.get_text(child)
            elif name == 'published':
                target.published = self.parse_date(self.get_text(child))
            elif name == 'updated':
                target.updated = self.parse_date(self.get_text(child))
            elif name == 'title':
                target.title = self.get_text(child)
                target.title_type = child.getAttribute('type') or 'text'
            elif name == 'content':
                target.content = self.get_text(child)
                target.content_type = child.getAttribute('type') or 'text'
            elif name == 'author':
                target.author = self.parse_author(child)
            elif name == 'link':
                rel = child.getAttribute('rel') or 'alternate'
                href = child.getAttribute('href')
                if rel == 'self':
                    target.permalink = href
                elif rel == 'alternate':
                    target.url = href

    def parse_post(self, entry):
        """Build a ``Blog.Post`` including its labels and draft flag."""
        post = Blog.Post()
        self.parse_entry_common(entry, post)
        for child in entry.childNodes:
            name = self._local_name(child)
            if name == 'category':
                scheme, term = self.parse_category(child)
                # Categories in the Blogger label scheme are the labels.
                if scheme == self.LABEL_SCHEME:
                    post.labels.append(term)
            elif name == 'control' and child.namespaceURI == self.APP_NS:
                post.draft = self.get_draft(child)
        return post

    def parse_comment(self, entry):
        """Build a ``Blog.Comment`` and record the post it replies to."""
        comment = Blog.Comment()
        self.parse_entry_common(entry, comment)
        for child in entry.childNodes:
            if (self._local_name(child) == 'in-reply-to'
                    and child.namespaceURI == self.THREAD_NS):
                comment.post_entry_id = child.getAttribute('ref')
        return comment
class WXRWriter(object):
    """Serialize a parsed blog as a WordPress eXtended RSS (WXR 1.2) file.

    ``WXRWriter(blog).write()`` returns the whole document as UTF-8
    encoded bytes. ``blog`` needs a ``title`` and a ``posts`` list; each
    post and comment is read through the attributes the ``Blog`` model
    exposes (title, content, published, author, labels, comments, ...).

    Text placed in elements and attributes is XML-escaped, while content
    placed in CDATA sections is passed through (see ``escape``).
    """

    # Written to <wp:comment_status> of every post.
    comment_status = 'open'

    # (nicename, display name) of the categories attached to every post.
    # Override on the class or an instance to suit the target site.
    categories = (
        ('id', 'Bahasa Indonesia'),
        ('hacking', 'Hacking'),
    )

    # '&' must come first so already-produced entities are not re-escaped.
    _XML_ENTITIES = (
        ('&', '&amp;'),
        ('<', '&lt;'),
        ('>', '&gt;'),
        ('"', '&quot;'),
    )

    def __init__(self, blog):
        self.blog = blog

    def write(self):
        """Return the complete WXR document as UTF-8 encoded bytes.

        Post and comment ids are numbered from 1 on every call.
        """
        self.post_id = 0
        self.comment_id = 0
        lines = self.get_header() + self.get_entries() + self.get_footer()
        doc = u'\n'.join(line.strip() for line in lines)
        return doc.encode('utf-8')

    def get_header(self):
        """Return the XML declaration, <rss>/<channel> opening and title."""
        title = getattr(self.blog, 'title', None)
        return [
            '<?xml version="1.0" encoding="UTF-8" ?>',
            '<rss version="2.0"',
            ' xmlns:excerpt="http://wordpress.org/export/1.2/excerpt/"',
            ' xmlns:content="http://purl.org/rss/1.0/modules/content/"',
            ' xmlns:wfw="http://wellformedweb.org/CommentAPI/"',
            ' xmlns:dc="http://purl.org/dc/elements/1.1/"',
            ' xmlns:wp="http://wordpress.org/export/1.2/">',
            '<channel>',
            '<title>%s</title>' % self.escape_text(title),
            '<wp:wxr_version>1.2</wp:wxr_version>',
        ]

    def get_footer(self):
        """Return the lines closing <channel> and <rss>."""
        return ['</channel>', '</rss>']

    def get_entries(self):
        """Return the <item> lines of all posts, in blog order."""
        res = []
        for post in self.blog.posts:
            res.extend(self.get_post(post))
        return res

    def _to_utc(self, ts):
        """Return ``ts`` as a naive UTC datetime.

        Naive datetimes carry no offset and are returned unchanged.
        """
        offset = ts.utcoffset()
        if offset is None:
            return ts
        return (ts - offset).replace(tzinfo=None)

    def get_date(self, ts):
        """Format ``ts`` for <pubDate>, converted to UTC.

        The format string hardcodes ``+0000``, so the value is converted
        first; the original printed local-offset digits labelled as UTC.
        """
        return self._to_utc(ts).strftime("%a, %d %b %Y %H:%M:%S +0000")

    def get_date_wp(self, ts, gmt=False):
        """Format ``ts`` as ``YYYY-MM-DD HH:MM:SS``.

        With ``gmt=True`` the value is converted to UTC first; otherwise
        the wall-clock time of ``ts`` is used unchanged.
        """
        if gmt:
            ts = self._to_utc(ts)
        return ts.strftime("%Y-%m-%d %H:%M:%S")

    def escape(self, s):
        """Make ``s`` safe to embed in a CDATA section.

        CDATA content is literal, so markup is left untouched. Only the
        terminator ``]]>`` is split across two sections so it cannot end
        the section early. ``None`` becomes an empty string.
        """
        if s is None:
            return ''
        return s.replace(']]>', ']]]]><![CDATA[>')

    def escape_text(self, s):
        """Escape ``s`` for use as element text or an attribute value.

        ``None`` becomes an empty string.
        """
        if s is None:
            return ''
        for char, entity in self._XML_ENTITIES:
            s = s.replace(char, entity)
        return s

    def _meta(self, tag, key, value):
        """Return the lines of one key/value metadata element ``tag``."""
        return [
            ' <%s>' % tag,
            ' <wp:meta_key>%s</wp:meta_key>' % key,
            ' <wp:meta_value>%s</wp:meta_value>' % self.escape_text(value),
            ' </%s>' % tag,
        ]

    def _blogger_meta(self, tag, entry):
        """Return metadata lines recording the entry's Blogger origin."""
        res = self._meta(tag, 'blogger_id', entry.entry_id)
        if entry.permalink:
            res += self._meta(tag, 'blogger_permalink', entry.permalink)
        if entry.url:
            res += self._meta(tag, 'blogger_url', entry.url)
        return res

    def get_comment(self, comment):
        """Return the <wp:comment> lines for one comment.

        Every comment is written as approved and receives the next
        sequential comment id.
        """
        status = 1
        author = comment.author
        text = self.escape_text
        self.comment_id += 1

        res = [' <wp:comment>']
        res.append(' <wp:comment_id>%s</wp:comment_id>' % self.comment_id)
        # The author is optional; emit only the details that are present.
        if author is not None:
            if author.name:
                res.append(' <wp:comment_author>%s</wp:comment_author>'
                           % text(author.name))
            if author.email:
                res.append(' <wp:comment_author_email>%s</wp:comment_author_email>'
                           % text(author.email))
            if author.uri:
                res.append(' <wp:comment_author_url>%s</wp:comment_author_url>'
                           % text(author.uri))
        res.append(' <wp:comment_author_IP>%s</wp:comment_author_IP>' % '')
        res.append(' <wp:comment_date>%s</wp:comment_date>'
                   % self.get_date_wp(comment.published))
        res.append(' <wp:comment_date_gmt>%s</wp:comment_date_gmt>'
                   % self.get_date_wp(comment.published, gmt=True))
        res.append(' <wp:comment_content><![CDATA[%s]]></wp:comment_content>'
                   % self.escape(comment.content))
        res.append(' <wp:comment_approved>%s</wp:comment_approved>' % status)
        res += self._blogger_meta('wp:commentmeta', comment)
        res.append(' </wp:comment>')
        return res

    def get_post(self, post):
        """Return the <item> lines for one post with its comments.

        Posts whose content is missing or blank are skipped (an empty
        list is returned and no post id is consumed).
        """
        content = post.content or ''
        if content.strip() == '':
            return []

        # The slug is the last URL path segment without a '.html' suffix.
        slug = None
        if post.url:
            slug = post.url.split('/')[-1]
            if slug.endswith('.html'):
                slug = slug[:-len('.html')]

        status = 'draft' if post.draft else 'publish'
        creator = post.author.name if post.author is not None else None
        text = self.escape_text
        self.post_id += 1

        res = ['<item>']
        res.append(' <title>%s</title>' % text(post.title))
        res.append(' <pubDate>%s</pubDate>' % self.get_date(post.published))
        res.append(' <dc:creator>%s</dc:creator>' % text(creator))
        res.append(' <guid isPermaLink="true">%s</guid>' % text(post.permalink))
        res.append(' <description></description>')
        res.append(' <content:encoded><![CDATA[%s]]></content:encoded>'
                   % self.escape(content))
        res.append(' <excerpt:encoded><![CDATA[%s]]></excerpt:encoded>'
                   % self.escape(content))
        res.append(' <wp:post_id>%s</wp:post_id>' % self.post_id)
        res.append(' <wp:post_date>%s</wp:post_date>'
                   % self.get_date_wp(post.published))
        res.append(' <wp:post_date_gmt>%s</wp:post_date_gmt>'
                   % self.get_date_wp(post.published, gmt=True))
        res.append(' <wp:comment_status>%s</wp:comment_status>'
                   % self.comment_status)
        res.append(' <wp:ping_status>closed</wp:ping_status>')
        if slug:
            res.append(' <wp:post_name>%s</wp:post_name>' % text(slug))
        res.append(' <wp:status>%s</wp:status>' % status)
        res.append(' <wp:post_parent>0</wp:post_parent>')
        res.append(' <wp:menu_order>0</wp:menu_order>')
        res.append(' <wp:post_type>post</wp:post_type>')
        res.append(' <wp:post_password></wp:post_password>')
        res.append(' <wp:is_sticky>0</wp:is_sticky>')
        for nicename, label in self.categories:
            res.append(' <category domain="category" nicename="%s"><![CDATA[%s]]></category>'
                       % (text(nicename), self.escape(label)))
        # Blogger labels become WordPress tags.
        for label in post.labels:
            res.append(' <category domain="post_tag" nicename="%s"><![CDATA[%s]]></category>'
                       % (text(label), self.escape(label)))
        res += self._blogger_meta('wp:postmeta', post)
        for comment in post.comments:
            res += self.get_comment(comment)
        res.append('</item>')
        return res
# Script entry point: convert the export named on the command line and
# print the resulting WXR document on standard output.
if __name__ == '__main__':
    p = BlogParser(inp)
    blog = p.parse()

    writer = WXRWriter(blog)
    xml = writer.write()

    # write() returns UTF-8 encoded bytes. Send them to the byte-level
    # stream (sys.stdout.buffer on Python 3, sys.stdout itself on
    # Python 2) so they are emitted unchanged, followed by a newline as
    # the original print statement did.
    out = getattr(sys.stdout, 'buffer', sys.stdout)
    out.write(xml)
    out.write(b'\n')
@adem0x - just convert that str
to unicode
on line 24 and you're good.
Also there's a typo in the script. Replace line 361 with the following:
res.append(' <description></description>')
(basically remove the /
at the end of the closing tag) and you're good.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
I was simply trying to use this code to convert a blogger backup (xml) but I am getting this error. (run under ubuntu with python v2.7)
I don't see why it should fail when it meets a non-ascii character
Traceback (most recent call last):
File "blogger-to-wordpress.py", line 403, in
blog = p.parse()
File "blogger-to-wordpress.py", line 73, in parse
self.parse_entries(feed)
File "blogger-to-wordpress.py", line 142, in parse_entries
d('(%d / %d) -> %d: %s' % (i+1, total, len(post.comments), post.title))
File "blogger-to-wordpress.py", line 24, in d
print >>sys.stderr, ' '.join(map(str, msg))
UnicodeEncodeError: 'ascii' codec can't encode character u'\u0131' in position 43: ordinal not in range(128)