Last active
January 11, 2017 00:57
-
-
Save MinchinWeb/7b2e4cf4d1b935c62ce8d6d1968270ff to your computer and use it in GitHub Desktop.
Blogger Comments Exported
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! python3.6 | |
""" | |
Export Comments from Blogger XML
Takes in a Blogger export XML file and spits out each comment in a separate
file, so that they can be used with the [Pelican Comment System]
(https://bernhard.scheirle.de/posts/2014/March/29/static-comments-via-email/).
May be simple to extend to export posts as well. | |
For a more detailed description, read my blog post at
http://blog.minchin.ca/2016/12/blogger-comments-exported.html
Author: Wm. Minchin -- [email protected] | |
License: MIT | |
Changes: | |
- 2016.12.29 -- initial release | |
- 2017.01.10 -- clean-up for addition in Pelican Comment System repo | |
""" | |
from pathlib import Path | |
import untangle | |
# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------

# Blogger export file that gets parsed
BLOGGER_EXPORT = 'c:\\tmp\\blog.xml'
# directory the comment folders and the authors file are written under
COMMENTS_DIR = "comments"
# extension appended to each comment's file name
COMMENT_EXT = ".md"
# name of the authors list file, placed inside COMMENTS_DIR
AUTHORS_FILENAME = "authors.txt"

# ---------------------------------------------------------------------------
# Main Code Body
# ---------------------------------------------------------------------------

# (author name, author picture) pairs, appended to as comments are processed
authors_and_pics = list()


def main():
    """Classify every entry of the Blogger export and export its comments.

    Parses the file named by ``BLOGGER_EXPORT``, sorts each feed entry into
    template / post / comment / settings / other by the part of its category
    term that follows ``#``, passes every comment entry to
    ``process_comment``, then calls ``export_authors`` and prints the
    tallies.
    """
    obj = untangle.parse(BLOGGER_EXPORT)

    counts = {'template': 0, 'post': 0, 'comment': 0, 'settings': 0}
    others = 0

    for entry in obj.feed.entry:
        try:
            full_type = entry.category['term']
        except TypeError:
            # if a post is under multiple categories, use the first term
            # that contains a '#'
            for my_category in entry.category:
                full_type = my_category['term']
                if '#' in full_type:
                    break
            else:
                # No category names an entry type.  Count the entry exactly
                # once and skip the classification below; falling through
                # would count it a second time (or reuse a stale
                # `full_type` from the previous entry).
                others += 1
                continue

        # str.find() returns -1 when there is no '#', so the slice then
        # starts at 0 and keeps the whole term
        simple_type = full_type[full_type.find('#') + 1:]

        if simple_type in counts:
            counts[simple_type] += 1
            if simple_type == 'comment':
                process_comment(entry, obj)
            # posts could be processed here as well
        else:
            others += 1

    export_authors()

    summary = (
        '\n'
        '{} template\n'
        '{} posts (including drafts)\n'
        '{} comments\n'
        '{} settings\n'
        '{} other entries'
    )
    print(summary.format(counts['template'],
                         counts['post'],
                         counts['comment'],
                         counts['settings'],
                         others))


def process_comment(entry, obj):
    """Write a single comment entry to its own file.

    The comment is written to
    ``COMMENTS_DIR/<article slug>/<comment short id><COMMENT_EXT>`` as
    ``date``, ``author`` and ``email`` header lines, a blank line, and the
    comment body.  The author's name and picture are also appended to the
    module-level ``authors_and_pics`` list (before the article lookup, so
    authors of unmatched comments are recorded too).

    Args:
        entry: the comment's feed entry, as parsed by untangle.
        obj: the parsed export; its ``feed.entry`` elements are searched for
            the article the comment refers to.

    Returns:
        None.  When no entry's id equals the comment's ``thr_in_reply_to``
        reference, a message is printed and no file is written.
    """
    # e.g. "tag:blogger.com,1999:blog-26967745.post-4115122471434984978"
    comment_id = entry.id.cdata
    # in ISO 8601 format, usable as is
    comment_published = entry.published.cdata
    comment_body = entry.content.cdata
    comment_post_id = entry.thr_in_reply_to['ref']
    comment_author = entry.author.name.cdata
    comment_author_pic = entry.author.gd_image['src']
    comment_author_email = entry.author.email.cdata

    # the list is only mutated, never rebound, so no `global` is needed
    authors_and_pics.append((comment_author, comment_author_pic))

    # used as the comment's file name, e.g. "4115122471434984978"
    # NOTE(review): assumes the id contains "post-" -- confirm for all exports
    comment_short_id = comment_id[comment_id.find('post-') + 5:]

    comment_text = "date: {}\nauthor: {}\nemail: {}\n\n{}\n".format(
        comment_published,
        comment_author,
        comment_author_email,
        comment_body,
    )

    # find the article this comment belongs to
    for candidate in obj.feed.entry:
        if candidate.id.cdata == comment_post_id:
            article_entry = candidate
            break
    else:
        print("No matching article for comment", comment_id, comment_post_id)
        # don't process comment further
        return

    # the article's link; fall back to a slug made from its title
    for link in article_entry.link:
        if link['rel'] == 'alternate':
            article_link = link['href']
            break
    else:
        article_title = article_entry.title.cdata
        print('Could not find slug for', article_title)
        article_link = article_title.lower().replace(' ', '-')

    # Slug = last path segment, cut at ".html" when present.  The position
    # is checked explicitly: slicing with find()'s -1 "not found" result
    # would silently drop the final character (as happens for the
    # title-based fallback above, which has no ".html").
    last_segment = article_link[article_link.rfind('/') + 1:]
    html_at = last_segment.find('.html')
    if html_at == -1:
        article_slug = last_segment
    else:
        article_slug = last_segment[:html_at]

    # folder; if it doesn't exist, create it
    comment_dir = Path(COMMENTS_DIR).resolve() / article_slug
    comment_dir.mkdir(parents=True, exist_ok=True)

    # write the comment file; explicit UTF-8 so the result does not depend
    # on the platform's default encoding
    comment_file = comment_dir / (comment_short_id + COMMENT_EXT)
    comment_file.write_text(comment_text, encoding='utf-8')


def export_authors():
    """Write the collected authors to ``COMMENTS_DIR/AUTHORS_FILENAME``.

    The pairs in the module-level ``authors_and_pics`` list are
    de-duplicated and sorted, then written one per line as the author name,
    two tab characters, and the author picture.
    """
    lines = [
        name + '\t\t' + pic + '\n'
        for name, pic in sorted(set(authors_and_pics))
    ]

    # The directory is otherwise only created when a comment gets written,
    # so make sure it exists before writing into it.
    comments_dir = Path(COMMENTS_DIR).resolve()
    comments_dir.mkdir(parents=True, exist_ok=True)

    # explicit UTF-8 so the result does not depend on the platform's
    # default encoding
    authors_file = comments_dir / AUTHORS_FILENAME
    authors_file.write_text(''.join(lines), encoding='utf-8')


# run the export only when executed as a script, not when imported
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment