Created
October 25, 2011 14:35
-
-
Save hrpunio/1312945 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl
# HTML to Blogger conversion script.
#
# Usage: blogspot-import.pl file1 file2 file3... > import-file.xml
#
# The resulting file import-file.xml is Atom [ http://tools.ietf.org/html/rfc4287 ]
# compatible and thus ready to be imported with the
# Blogger import facility.
#
# Each HTML file is assumed to have the following structure:
#
# <title> one-line-for-title </title>
# <!-- Tags: one-line-for-tags -->
# </head><body><!-- ##Published : 2011-10-20T07:20:26CEST ##-->
# ... document body
# </body></html>
#
# Everything before <title> is ignored, tags are comma-separated,
# and `##Published DateAndTime' is the publication date/time.
#
# Progress and diagnostics go to STDERR; the Atom feed goes to STDOUT.
# A file without a ##Published marker is skipped with a warning; a file
# without a title or without tags aborts the run.
#
# (c) 2011/10 t.przechlewski
#
use strict;
use warnings;
use Digest::MD5 qw(md5_hex);

# Return a copy of $text with the XML markup characters &, < and >
# replaced by their entities, so that HTML can be carried as character
# data inside <content type='html'>.
sub escape_markup {
    my ($text) = @_;
    $text =~ s/&/&amp;/g;    # must run first, or it would re-escape the entities below
    $text =~ s/</&lt;/g;
    $text =~ s/>/&gt;/g;
    return $text;
}

# Feed prologue. The XML comments below are emitted verbatim into the output.
print '<?xml version="1.0" encoding="UTF-8"?>
<!-- id, title/updated jest wymagane w elementach feed/entry reszta opcjonalna -->
<!-- wyglada na minimalne oznakowanie -->
<feed xmlns="http://www.w3.org/2005/Atom"
xmlns:openSearch="http://a9.com/-/spec/opensearchrss/1.0/"
xmlns:georss="http://www.georss.org/georss"
xmlns:gd="http://schemas.google.com/g/2005"
xmlns:thr="http://purl.org/syndication/thread/1.0">';
print "<id>tag:blogger.com,1999:blog-1928418645181504144.archive</id>";
print "<updated>2011-10-22T12:34:14.746-07:00</updated>";
print "<title type='text'>pinkaccordions.blogspot.com</title>";
# the following is required by Blogger import facility:
print "<generator version='7.00' uri='http://www.blogger.com'>Blogger</generator>\n";

foreach my $post_file (@ARGV) {
    # All per-post state is lexical and reset for every file, so nothing
    # (in particular the tag list) can leak from one post into the next.
    my $post_title       = '';
    my $post_content     = '';
    my $tags             = '';
    my $published        = '';
    my $body             = 0;    # true while between <body> and </body>
    my $in_pre           = 0;    # true while inside a <pre> element
    my $has_suspect_urls = 0;

    # `or` (not `||`) so that the die is attached to open() itself.
    open my $post_fh, '<', $post_file
        or die "*** cannot open $post_file: $! ***\n";
    print STDERR "\n$post_file opened!\n";

    while ( my $line = <$post_fh> ) {
        chomp $line;

        if ( $line =~ /<title>(.+)<\/title>/ ) { $post_title = $1; next; }
        if ( $line =~ /<!--[ \t]*Tags:[ \t]*(.+)[ \t]*-->/ ) { $tags = $1; next; }

        if ( $line =~ /<\/head><body>/ ) {
            $body = 1;
            ## </head><body><!-- ##Published : 2011-10-20T07:20:26CEST ##-->
            # NOTE(review): only digits, 'T', '-' and ':' are captured, so the
            # zone suffix (e.g. CEST) is dropped from the emitted timestamp.
            if ( $line =~ /##Published[ \t]+:[ \t]+([0-9T\-\:]+).+##/ ) {
                $published = $1;
            }
            print STDERR "Published: $published\n";
            next;
        }

        if ( $line =~ /<\/body><\/html>/ ) { $body = 0; next; }
        next unless $body;

        ## Images from pinkaccordions.homelinux.org or with relative URLs
        ## should be reported.
        if ( $line =~ /src[ \t]*=/ ) {
            if (   $line =~ /pinkaccordions\.homelinux\.org/
                || $line !~ m{https?://} )
            {
                $has_suspect_urls = 1;
            }
        }

        ## <pre> must preserve line breaks; elsewhere lines are joined with
        ## a single space. Whichever of <pre> / </pre> occurs last on the
        ## line decides the state, so a one-line <pre>...</pre> does not
        ## leave the flag stuck.
        my $last_open  = rindex $line, '<pre>';
        my $last_close = rindex $line, '</pre>';
        if ( $last_open >= 0 || $last_close >= 0 ) {
            $in_pre = $last_open > $last_close ? 1 : 0;
        }

        # ** the separating space is essential outside <pre> **
        $post_content .= $line . ( $in_pre ? "\n" : ' ' );
    }
    close $post_fh;

    ### ### ###
    if ( $published eq '' ) {
        warn "*** something wrong with: $post_file. Not published? Skipping....\n";
        next;
    }
    if ( $tags eq '' || $post_title eq '' ) {
        die "*** something wrong with: $post_file (tags: $tags/title: $post_title)\n";
    }
    if ($has_suspect_urls) { warn "*** suspicious relative URIs: $post_file\n"; }

    # The body is escaped before hashing, so the post ID is derived from
    # exactly the text that is written into <content>.
    $post_content = escape_markup($post_content);

    print STDERR "Title: $post_title Tags: $tags\n";

    # Comma-separated tags; surrounding whitespace is trimmed and empty
    # items are dropped.
    # NOTE(review): title and tags are written into the XML as-is; a tag
    # containing & < or ' would produce malformed output.
    my @post_kws = grep { length }
        map { my $kw = $_; $kw =~ s/^\s+|\s+$//g; $kw }
        split /,/, $tags;

    my $md5sum = md5_hex($post_content);
    print STDERR "MD5sum: $md5sum\n";

    print "<entry>";    ## We use MD5sum as post ID
    print "<id>tag:blogger.com,1999:post-$md5sum</id>";
    print "<published>$published</published>";
    print "<updated>$published</updated>";
    print '<category scheme="http://schemas.google.com/g/2005#kind" term="http://schemas.google.com/blogger/2008/kind#post"/>';
    ## tags:
    foreach my $kw (@post_kws) {
        print "<category scheme='http://www.blogger.com/atom/ns#' term='${kw}'/>";
    }
    print "<title type='text'>$post_title</title>";
    print "<content type='html'>$post_content</content></entry>";
}
print "</feed>";
## end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment