Skip to content

Instantly share code, notes, and snippets.

@MinchinWeb
Last active January 11, 2017 00:57
Show Gist options
  • Save MinchinWeb/7b2e4cf4d1b935c62ce8d6d1968270ff to your computer and use it in GitHub Desktop.
Save MinchinWeb/7b2e4cf4d1b935c62ce8d6d1968270ff to your computer and use it in GitHub Desktop.
Blogger Comments Exported
#! python3.6
"""
Export Comments from Blogger XML
Takes in a Blogger export XML file and spits out each comment in a separate
file, so that they can be used with the [Pelican Comment System]
(https://bernhard.scheirle.de/posts/2014/March/29/static-comments-via-email/).
May be simple to extend to export posts as well.
For a more detailed description, read my blog post at
http://blog.minchin.ca/2016/12/blogger-comments-exported.html
Author: Wm. Minchin -- [email protected]
License: MIT
Changes:
- 2016.12.29 -- initial release
- 2017.01.10 -- clean-up for addition in Pelican Comment System repo
"""
from pathlib import Path
import untangle
###############################################################################
# Constants #
###############################################################################
# Path of the Blogger export file handed to `untangle.parse()` in `main()`.
BLOGGER_EXPORT = r'c:\tmp\blog.xml'
# Root output folder; `process_comment()` creates one sub-folder per article
# slug below it, and `export_authors()` writes the authors file into it.
COMMENTS_DIR = 'comments'
# Suffix appended to the comment's short id to form the comment's filename.
COMMENT_EXT = '.md'
# Name of the file (inside COMMENTS_DIR) listing authors and their pictures.
AUTHORS_FILENAME = 'authors.txt'
###############################################################################
# Main Code Body #
###############################################################################
# Accumulates one `(author name, author picture)` tuple per processed comment;
# filled by `process_comment()` and de-duplicated by `export_authors()`.
authors_and_pics = []
def main():
    """Parse the Blogger export and write out every comment it contains.

    Each entry of the feed is classified by the part of its category term
    that follows the first ``#`` (``settings``, ``post``, ``comment`` or
    ``template``; anything else is tallied as "other").  Comment entries are
    handed to ``process_comment()``.  Once all entries have been visited,
    ``export_authors()`` is called and a tally of the entry kinds is printed.
    """
    obj = untangle.parse(BLOGGER_EXPORT)

    templates = 0
    posts = 0
    comments = 0
    settings = 0
    others = 0

    for entry in obj.feed.entry:
        try:
            full_type = entry.category['term']
        except TypeError:
            # if a post is under multiple categories: use the first category
            # whose term carries a '#'
            for my_category in entry.category:
                full_type = my_category['term']
                if '#' in full_type:
                    break
            else:
                # No category names an entry kind.  Count the entry once and
                # move on; falling through to the classification below would
                # tally the same entry a second time (or misfile it under
                # whatever its last category term happens to be).
                others += 1
                continue

        # Everything after the first '#'.  When there is no '#',
        # `find()` returns -1 and the whole term is kept.
        simple_type = full_type[full_type.find('#') + 1:]

        if 'settings' == simple_type:
            settings += 1
        elif 'post' == simple_type:
            posts += 1
            # process posts here
        elif 'comment' == simple_type:
            comments += 1
            process_comment(entry, obj)
        elif 'template' == simple_type:
            templates += 1
        else:
            others += 1

    export_authors()

    print('''
{} template
{} posts (including drafts)
{} comments
{} settings
{} other entries'''.format(templates,
                           posts,
                           comments,
                           settings,
                           others))
def process_comment(entry, obj):
    """Write a single Blogger comment to a file in its article's folder.

    The comment is saved as
    ``COMMENTS_DIR/<article slug>/<comment short id><COMMENT_EXT>`` and holds
    a ``date`` / ``author`` / ``email`` header followed by the comment body.
    The author's name and picture are appended to the module-level
    ``authors_and_pics`` list (even when no matching article is found).

    Args:
        entry: the comment's feed entry, as parsed by untangle.
        obj: the whole parsed export; its feed is searched for the entry
            whose id equals the comment's reply-to reference.

    Returns:
        None.  When no entry matches the reply-to reference, a message is
        printed and no file is written.
    """
    # e.g. "tag:blogger.com,1999:blog-26967745.post-4115122471434984978"
    comment_id = entry.id.cdata
    # in ISO 8601 format, usable as is
    comment_published = entry.published.cdata
    comment_body = entry.content.cdata
    comment_post_id = entry.thr_in_reply_to['ref']
    comment_author = entry.author.name.cdata
    comment_author_pic = entry.author.gd_image['src']
    comment_author_email = entry.author.email.cdata

    # The module-level list is mutated in place (never rebound), so no
    # `global` statement is required.
    authors_and_pics.append((comment_author, comment_author_pic))

    # use this for a filename for the comment
    # e.g. "4115122471434984978"
    comment_short_id = comment_id[comment_id.find('post-') + 5:]

    comment_text = "date: {}\nauthor: {}\nemail: {}\n\n{}\n".format(
        comment_published,
        comment_author,
        comment_author_email,
        comment_body,
    )

    # article: the feed entry this comment replies to
    for candidate in obj.feed.entry:
        if candidate.id.cdata == comment_post_id:
            article_entry = candidate
            break
    else:
        print("No matching article for comment", comment_id, comment_post_id)
        # don't process comment further
        return

    # article slug: taken from the "alternate" link, else from the title
    for link in article_entry.link:
        if link['rel'] == 'alternate':
            article_link = link['href']
            break
    else:
        article_title = article_entry.title.cdata
        print('Could not find slug for', article_title)
        article_link = article_title.lower().replace(' ', '-')

    # Keep the last path segment, minus a ".html" suffix if there is one.
    # `str.find()` returns -1 when ".html" is absent (always the case for the
    # title-based fallback); using -1 as the slice end would silently drop
    # the slug's final character, so fall back to the end of the string.
    slug_start = article_link.rfind('/') + 1
    slug_end = article_link.find('.html', slug_start)
    if slug_end == -1:
        slug_end = len(article_link)
    article_slug = article_link[slug_start:slug_end]

    # folder; if it doesn't exist, create it
    comment_dir = Path(COMMENTS_DIR).resolve() / article_slug
    comment_dir.mkdir(parents=True, exist_ok=True)

    # write the comment file; the encoding is explicit so that non-ASCII
    # comments do not depend on (or fail under) the platform's default
    comment_file = comment_dir / (comment_short_id + COMMENT_EXT)
    comment_file.write_text(comment_text, encoding='utf-8')
def export_authors():
    """Write the distinct comment authors and their pictures to a file.

    Reads the module-level ``authors_and_pics`` list, drops duplicate
    ``(author, picture)`` pairs, sorts them, and writes one line per pair
    (author, two tabs, picture) to ``COMMENTS_DIR/AUTHORS_FILENAME``.
    """
    lines = [
        author + '\t\t' + picture + '\n'
        for author, picture in sorted(set(authors_and_pics))
    ]

    authors_dir = Path(COMMENTS_DIR).resolve()
    # The output folder is otherwise only created as a side effect of writing
    # a comment file; make sure it exists so an export without any written
    # comment does not fail with FileNotFoundError.
    authors_dir.mkdir(parents=True, exist_ok=True)

    authors_filename = authors_dir / AUTHORS_FILENAME
    # explicit encoding: author names may contain non-ASCII characters
    authors_filename.write_text(''.join(lines), encoding='utf-8')
# Run the export only when this file is executed as a script.
if "__main__" == __name__:
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment