Last active
August 21, 2016 16:28
-
-
Save fajran/5659455 to your computer and use it in GitHub Desktop.
A converter from a Blogger backup file to WordPress' WXR format. Only tested with posts and comments, and NOT with pages. May not be efficient for huge blogs since the script keeps all content in memory during conversion. Released into the public domain.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Blogger's backup file to WordPress' WXR converter. | |
# | |
# Only tested with posts and comments, and NOT with pages. | |
# May not be efficient for huge blogs since the script keeps
# all content in memory during conversion.
# | |
# Released as public domain. | |
# | |
# Please note that I converted the Blogspot labels
# to tags in WordPress. I also hardcoded two categories for the
# WordPress posts. Adjust these first to suit your needs.
import sys | |
from datetime import datetime | |
from xml.dom.minidom import parse, parseString | |
from xml.dom import Node | |
import cgi | |
import dateutil.parser | |
# Path of the Blogger export (Atom XML) file to convert, taken from the
# first command-line argument. Exit with a usage message instead of an
# IndexError traceback when the argument is missing.
if len(sys.argv) < 2:
    sys.exit('usage: %s BLOGGER_EXPORT.xml' % sys.argv[0])
inp = sys.argv[1]
def d(*msg):
    """Write one debug line to stderr.

    The arguments are converted to text and joined with single spaces.
    The text (unicode) type is used for the conversion rather than the
    byte ``str`` type, so that non-ASCII values such as post titles do not
    raise ``UnicodeEncodeError`` on Python 2.
    """
    try:
        to_text = unicode  # Python 2 text type
    except NameError:
        to_text = str  # Python 3: str already is the text type
    line = u' '.join(to_text(part) for part in msg) + u'\n'
    if str is bytes:
        # Python 2 file objects take bytes; encode explicitly so that a
        # redirected stderr cannot fail on non-ASCII characters.
        line = line.encode('utf-8', 'replace')
    sys.stderr.write(line)
class Blog(object):
    """In-memory model of a Blogger export: blog metadata plus its posts.

    The nested classes describe the pieces of the export. Every field
    defaults to ``None`` (or an empty list) so that consumers can read it
    even when the corresponding element was absent from the feed.
    """

    class Author(object):
        """Author of the blog, of a post or of a comment."""
        name = None
        email = None
        uri = None

    class Entry(object):
        """Fields shared by posts and comments."""
        entry_id = None
        url = None
        permalink = None
        title = None
        title_type = None
        content = None
        content_type = None
        published = None
        updated = None
        author = None

    class Post(Entry):
        """A blog post with its labels and the comments attached to it."""
        draft = False

        def __init__(self):
            # Per-instance lists: these must never be shared between posts.
            self.labels = []
            self.comments = []

    class Comment(Entry):
        """A comment; ``post_entry_id`` is the entry id of the post it
        replies to, or ``None`` when that is unknown."""
        post_entry_id = None

    # Blog-level metadata defaults.
    blog_id = None
    title = None
    updated = None

    def __init__(self):
        # Created per instance so that two Blog objects never share the
        # same author object or the same list of posts.
        self.author = Blog.Author()
        self.posts = []
class BlogParser(object):
    """Parse a Blogger export (an Atom feed file) into a ``Blog`` object.

    Usage: ``BlogParser(path).parse()`` returns the populated ``Blog``.
    Element names are compared by local name (any namespace prefix is
    stripped), and optional Atom attributes fall back to defaults rather
    than raising ``KeyError``.
    """

    KIND_SCHEME = 'http://schemas.google.com/g/2005#kind'
    KIND_POST = 'http://schemas.google.com/blogger/2008/kind#post'
    KIND_COMMENT = 'http://schemas.google.com/blogger/2008/kind#comment'
    LABEL_SCHEME = 'http://www.blogger.com/atom/ns#'
    APP_NS = 'http://purl.org/atom/app#'
    THREAD_NS = 'http://purl.org/syndication/thread/1.0'

    def __init__(self, atom_file):
        """Remember the path of the export file; nothing is read yet."""
        self.atom_file = atom_file

    @staticmethod
    def _local_name(node):
        """Return the node name without its namespace prefix."""
        return node.nodeName.split(':')[-1]

    def parse(self):
        """Read the export file and return the resulting ``Blog``.

        The returned blog is left at its defaults when the document has
        no ``feed`` element.
        """
        self.blog = Blog()
        # Binary mode lets the XML parser honour the encoding declared in
        # the document; the ``with`` block closes the file afterwards.
        with open(self.atom_file, 'rb') as source:
            dom = parse(source)
        feed = None
        for child in dom.childNodes:
            if self._local_name(child) == 'feed':
                feed = child
                break
        if feed is not None:
            self.parse_metadata(feed)
            self.parse_entries(feed)
        return self.blog

    def get_text(self, el):
        """Return the concatenated text below ``el``.

        Text and CDATA nodes contribute their value, elements contribute
        the text of their children, and any other node type contributes
        an empty string.
        """
        if el.nodeType in (Node.TEXT_NODE, Node.CDATA_SECTION_NODE):
            return el.nodeValue
        if el.nodeType == Node.ELEMENT_NODE:
            return ''.join(self.get_text(child) for child in el.childNodes)
        return ''

    def parse_date(self, txt):
        """Parse a timestamp string into a ``datetime`` via dateutil."""
        return dateutil.parser.parse(txt)

    def parse_metadata(self, feed):
        """Copy id, updated, title and author from ``feed`` to the blog.

        Only the children that precede the first ``entry`` are examined.
        """
        for child in feed.childNodes:
            name = self._local_name(child)
            if name == 'entry':
                break
            if name == 'id':
                self.blog.blog_id = self.get_text(child)
            elif name == 'updated':
                self.blog.updated = self.parse_date(self.get_text(child))
            elif name == 'title':
                self.blog.title = self.get_text(child)
            elif name == 'author':
                self.blog.author = self.parse_author(child)

    def parse_author(self, author):
        """Build a ``Blog.Author`` from an ``author`` element."""
        data = Blog.Author()
        for child in author.childNodes:
            name = self._local_name(child)
            if name == 'name':
                data.name = self.get_text(child)
            elif name == 'uri':
                data.uri = self.get_text(child)
            elif name == 'email':
                data.email = self.get_text(child)
        return data

    def parse_entries(self, feed):
        """Parse every ``entry`` of ``feed`` and attach comments to posts.

        The parsed posts end up in ``self.blog.posts``; a summary is
        written through ``d``.
        """
        self.posts = []
        self.comments = []
        self.post_ids = {}
        self.comment_ids = {}
        for child in feed.childNodes:
            if self._local_name(child) != 'entry':
                continue
            self.parse_entry(child)
        self.assign_comments()
        self.blog.posts = self.posts
        total = len(self.blog.posts)
        d('total posts:', total)
        d('total comments:', len(self.comments))
        for i, post in enumerate(self.blog.posts):
            d('(%d / %d) -> %d: %s' % (i+1, total, len(post.comments), post.title))

    def assign_comments(self):
        """Append each comment to the post whose entry id it refers to.

        Comments without a reference, or whose reference matches no
        parsed post, are skipped.
        """
        assigned = 0
        for comment in self.comments:
            # getattr: a comment without an in-reply-to element may not
            # carry the attribute at all.
            post = self.post_ids.get(getattr(comment, 'post_entry_id', None))
            if post is None:
                continue
            post.comments.append(comment)
            assigned += 1
            d('%s. comment: %s -> %s' % (assigned, id(comment), id(post)))

    def parse_category(self, category):
        """Return ``(scheme, term)`` of a ``category`` element.

        A missing attribute yields an empty string (``scheme`` is
        optional in Atom).
        """
        scheme = category.getAttribute('scheme')
        term = category.getAttribute('term')
        return scheme, term

    def get_kind(self, entry):
        """Return the term of the entry's "kind" category, or ``None``."""
        for child in entry.childNodes:
            if self._local_name(child) != 'category':
                continue
            scheme, term = self.parse_category(child)
            if scheme == self.KIND_SCHEME:
                return term
        return None

    def parse_entry(self, entry):
        """Parse one entry and record it as a post or a comment.

        Entries of any other kind are ignored.
        """
        kind = self.get_kind(entry)
        if kind == self.KIND_POST:
            post = self.parse_post(entry)
            self.posts.append(post)
            self.post_ids[post.entry_id] = post
        elif kind == self.KIND_COMMENT:
            comment = self.parse_comment(entry)
            self.comments.append(comment)
            self.comment_ids[comment.entry_id] = comment

    def get_draft(self, control):
        """Return True when ``control`` has a ``draft`` child reading "yes"."""
        for child in control.childNodes:
            if self._local_name(child) == 'draft':
                return self.get_text(child) == 'yes'
        return False

    def parse_entry_common(self, entry, target):
        """Fill ``target`` with the fields shared by posts and comments.

        Sets entry_id, published, updated, title, content, author, and
        the link targets: ``rel="self"`` goes to ``permalink`` and
        ``rel="alternate"`` to ``url``.
        """
        for child in entry.childNodes:
            name = self._local_name(child)
            if name == 'id':
                target.entry_id = self.get_text(child)
            elif name == 'published':
                target.published = self.parse_date(self.get_text(child))
            elif name == 'updated':
                target.updated = self.parse_date(self.get_text(child))
            elif name == 'title':
                target.title = self.get_text(child)
                # Atom treats a missing type attribute as "text".
                target.title_type = child.getAttribute('type') or 'text'
            elif name == 'content':
                target.content = self.get_text(child)
                target.content_type = child.getAttribute('type') or 'text'
            elif name == 'author':
                target.author = self.parse_author(child)
            elif name == 'link':
                # Atom treats a link without rel as rel="alternate".
                rel = child.getAttribute('rel') or 'alternate'
                href = child.getAttribute('href')
                if not href:
                    continue
                if rel == 'self':
                    target.permalink = href
                elif rel == 'alternate':
                    target.url = href

    def parse_post(self, entry):
        """Build a ``Blog.Post`` from ``entry``, with labels and draft flag."""
        post = Blog.Post()
        self.parse_entry_common(entry, post)
        for child in entry.childNodes:
            name = self._local_name(child)
            ns = child.namespaceURI
            if name == 'category':
                scheme, term = self.parse_category(child)
                if scheme == self.LABEL_SCHEME:
                    post.labels.append(term)
            elif ns == self.APP_NS and name == 'control':
                post.draft = self.get_draft(child)
        return post

    def parse_comment(self, entry):
        """Build a ``Blog.Comment`` from ``entry``.

        The ``ref`` of a threading ``in-reply-to`` element is stored as
        ``post_entry_id``.
        """
        comment = Blog.Comment()
        self.parse_entry_common(entry, comment)
        for child in entry.childNodes:
            name = self._local_name(child)
            if child.namespaceURI == self.THREAD_NS and name == 'in-reply-to':
                comment.post_entry_id = child.getAttribute('ref')
        return comment
class WXRWriter(object):
    """Serialize a parsed blog as a WordPress WXR 1.2 export document.

    Usage: ``WXRWriter(blog).write()`` returns the document as UTF-8
    bytes. ``blog`` must provide ``title`` and ``posts``.
    """

    # Value written to <wp:comment_status> of every post.
    comment_status = 'open'

    # (nicename, display name) pairs of the categories attached to every
    # exported post. Override on a subclass or instance to suit the blog.
    categories = (
        ('id', 'Bahasa Indonesia'),
        ('hacking', 'Hacking'),
    )

    def __init__(self, blog):
        self.blog = blog

    def write(self):
        """Return the whole WXR document encoded as UTF-8.

        Post and comment ids are numbered from 1 on every call.
        """
        self.post_id = 0
        self.comment_id = 0
        lines = self.get_header() + self.get_entries() + self.get_footer()
        doc = u'\n'.join(line.strip() for line in lines)
        return doc.encode('utf-8')

    @staticmethod
    def _text(value):
        """Escape ``value`` for XML character data or a quoted attribute.

        ``None`` becomes an empty string.
        """
        # Local import: keeps this class independent of the file's
        # top-level import block.
        from xml.sax.saxutils import escape
        if value is None:
            return ''
        return escape(value, {'"': '&quot;'})

    @staticmethod
    def _to_utc(ts):
        """Return ``ts`` as a naive UTC datetime.

        A naive ``ts`` has no known offset and is returned unchanged.
        """
        offset = ts.utcoffset()
        if offset is None:
            return ts
        return (ts - offset).replace(tzinfo=None)

    def get_header(self):
        """Return the lines that open the document and the channel."""
        res = []
        res.append('<?xml version="1.0" encoding="UTF-8" ?>')
        res.append('<rss version="2.0"')
        res.append(' xmlns:excerpt="http://wordpress.org/export/1.2/excerpt/"')
        res.append(' xmlns:content="http://purl.org/rss/1.0/modules/content/"')
        res.append(' xmlns:wfw="http://wellformedweb.org/CommentAPI/"')
        res.append(' xmlns:dc="http://purl.org/dc/elements/1.1/"')
        res.append(' xmlns:wp="http://wordpress.org/export/1.2/">')
        res.append('<channel>')
        res.append('<title>%s</title>' % self._text(getattr(self.blog, 'title', None)))
        res.append('<wp:wxr_version>1.2</wp:wxr_version>')
        return res

    def get_footer(self):
        """Return the lines that close the channel and the document."""
        return ['</channel>', '</rss>']

    def get_entries(self):
        """Return the lines of all posts, in blog order."""
        res = []
        for post in self.blog.posts:
            res += self.get_post(post)
        return res

    def get_date(self, ts):
        """Format ``ts`` as an RFC 822 style date in UTC.

        The value is converted to UTC first so that the literal ``+0000``
        suffix is correct for timestamps carrying another offset.
        """
        return self._to_utc(ts).strftime("%a, %d %b %Y %H:%M:%S +0000")

    def get_date_wp(self, ts):
        """Format ``ts`` as ``YYYY-MM-DD HH:MM:SS``, without conversion."""
        return ts.strftime("%Y-%m-%d %H:%M:%S")

    def escape(self, s):
        """Make ``s`` safe to embed in a CDATA section.

        The text is otherwise left untouched (HTML markup must survive).
        A literal ``]]>`` would end the section early, so it is split
        across two adjacent CDATA sections. ``None`` becomes ''.
        """
        if s is None:
            return ''
        return s.replace(']]>', ']]]]><![CDATA[>')

    def get_comment(self, comment):
        """Return the ``<wp:comment>`` lines for one comment."""
        status = 1
        res = []
        self.comment_id += 1
        res.append(' <wp:comment>')
        res.append(' <wp:comment_id>%s</wp:comment_id>' % self.comment_id)
        author = comment.author
        if author is not None:
            if author.name:
                res.append(' <wp:comment_author>%s</wp:comment_author>' % self._text(author.name))
            if author.email:
                res.append(' <wp:comment_author_email>%s</wp:comment_author_email>' % self._text(author.email))
            if author.uri:
                res.append(' <wp:comment_author_url>%s</wp:comment_author_url>' % self._text(author.uri))
        res.append(' <wp:comment_author_IP>%s</wp:comment_author_IP>' % '')
        res.append(' <wp:comment_date>%s</wp:comment_date>' % self.get_date_wp(comment.published))
        res.append(' <wp:comment_date_gmt>%s</wp:comment_date_gmt>' % self.get_date_wp(self._to_utc(comment.published)))
        res.append(' <wp:comment_content><![CDATA[%s]]></wp:comment_content>' % self.escape(comment.content))
        res.append(' <wp:comment_approved>%s</wp:comment_approved>' % status)
        res += self._meta('commentmeta', 'blogger_id', comment.entry_id)
        if comment.permalink:
            res += self._meta('commentmeta', 'blogger_permalink', comment.permalink)
        if comment.url:
            res += self._meta('commentmeta', 'blogger_url', comment.url)
        res.append(' </wp:comment>')
        return res

    def _meta(self, tag, key, value):
        """Return the lines of one ``<wp:TAG>`` key/value metadata block."""
        return [
            ' <wp:%s>' % tag,
            ' <wp:meta_key>%s</wp:meta_key>' % key,
            ' <wp:meta_value>%s</wp:meta_value>' % self._text(value),
            ' </wp:%s>' % tag,
        ]

    def get_post(self, post):
        """Return the ``<item>`` lines for one post and its comments.

        Posts whose content is missing or blank are skipped (an empty
        list is returned and no post id is consumed).
        """
        if not (post.content or '').strip():
            return []
        res = []
        slug = None
        if post.url is not None:
            slug = post.url.split('/')[-1]
            # Only drop a real ".html" suffix; never chop other endings.
            if slug.endswith('.html'):
                slug = slug[:-len('.html')]
        status = 'publish'
        if post.draft:
            status = 'draft'
        creator = post.author.name if post.author is not None else None
        self.post_id += 1
        res.append('<item>')
        res.append(' <title>%s</title>' % self._text(post.title))
        res.append(' <pubDate>%s</pubDate>' % self.get_date(post.published))
        res.append(' <dc:creator>%s</dc:creator>' % self._text(creator))
        res.append(' <guid isPermaLink="true">%s</guid>' % self._text(post.permalink))
        res.append(' <description></description>')
        res.append(' <content:encoded><![CDATA[%s]]></content:encoded>' % self.escape(post.content))
        res.append(' <excerpt:encoded><![CDATA[%s]]></excerpt:encoded>' % self.escape(post.content))
        res.append(' <wp:post_id>%s</wp:post_id>' % self.post_id)
        res.append(' <wp:post_date>%s</wp:post_date>' % self.get_date_wp(post.published))
        res.append(' <wp:post_date_gmt>%s</wp:post_date_gmt>' % self.get_date_wp(self._to_utc(post.published)))
        res.append(' <wp:comment_status>%s</wp:comment_status>' % self.comment_status)
        res.append(' <wp:ping_status>closed</wp:ping_status>')
        if slug:
            res.append(' <wp:post_name>%s</wp:post_name>' % self._text(slug))
        res.append(' <wp:status>%s</wp:status>' % status)
        res.append(' <wp:post_parent>0</wp:post_parent>')
        res.append(' <wp:menu_order>0</wp:menu_order>')
        res.append(' <wp:post_type>post</wp:post_type>')
        res.append(' <wp:post_password></wp:post_password>')
        res.append(' <wp:is_sticky>0</wp:is_sticky>')
        for nicename, label in self.categories:
            res.append(' <category domain="category" nicename="%s"><![CDATA[%s]]></category>' % (self._text(nicename), self.escape(label)))
        for label in post.labels:
            res.append(' <category domain="post_tag" nicename="%s"><![CDATA[%s]]></category>' % (self._text(label), self.escape(label)))
        res += self._meta('postmeta', 'blogger_id', post.entry_id)
        if post.permalink:
            res += self._meta('postmeta', 'blogger_permalink', post.permalink)
        if post.url:
            res += self._meta('postmeta', 'blogger_url', post.url)
        for comment in post.comments:
            res += self.get_comment(comment)
        res.append('</item>')
        return res
# Run the conversion: parse the Blogger export named on the command line,
# serialize it as WXR and emit the document on standard output.
p = BlogParser(inp)
blog = p.parse()
writer = WXRWriter(blog)
xml = writer.write()
# write() returns encoded bytes, so send them to the byte-oriented stream
# (sys.stdout itself where no separate ``buffer`` exists), followed by the
# newline that a print statement would add.
_stdout = getattr(sys.stdout, 'buffer', sys.stdout)
_stdout.write(xml)
_stdout.write(b'\n')
# To store the result in a file instead, redirect standard output.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
@adem0x - just convert that `str` to `unicode` on line 24 and you're good.
Also, there's a typo in the script. Replace line 361 with the following:
(basically remove the `/` at the end of the closing tag) and you're good.