Created
December 22, 2010 18:59
-
-
Save hippietrail/751921 to your computer and use it in GitHub Desktop.
Strip headers/footers from Project Gutenberg texts
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl
# stripgutenberg.pl < in.txt > out.txt
#
# designed for piping
# Written by Andrew Dunbar (hippietrail), released into the public domain, Dec 2010
#
# Reads a Project Gutenberg / Project Gutenberg of Australia plain-text file
# line by line from STDIN (or the files named on the command line) and writes
# only the lines of the e-text body to STDOUT.  Header and footer lines are
# recognised by a small state machine:
#
#   beginning                first line: a PG title line or a blank line
#   normal pg header         skip until the "*** START OF ..." marker
#   end of normal header     skip blanks; optional "Produced by" credit
#   post header              skip the (possibly multi-line) credit
#   blanks after post header skip blanks, then the body starts
#   beginning blanks         leading blanks: PG Australia layout
#   header include           skip blanks after the header include marker
#   aus header               skip until the PG Australia end-of-header line
#   end of aus header        skip Title:/Author:/blank lines, then body
#   etext body               print until a footer marker is seen
#   footer                   skip everything that remains
#
# The script dies on a first line, or on content after leading blank lines,
# that it does not recognise.
use strict;
use warnings;

my $debug = 0;              # set to 1 to trace state changes and skipped lines
my $state = 'beginning';
my $print = 0;              # true while the current line belongs to the body

# A blank line.  An optional CR is tolerated before the newline so that input
# with CRLF line endings is handled the same way as LF-only input.
my $BLANK = qr/^\r?$/;

# Emit trace output on STDERR so it can never end up in the stripped text
# that is written to STDOUT.
sub trace {
    my (@messages) = @_;
    print STDERR @messages if $debug;
    return;
}

# Test for definedness, not truth: a final line consisting of "0" with no
# trailing newline is false but is still a line of input.
while (defined(my $line = <>)) {

    # strip UTF-8 BOM
    if ($. == 1 && index($line, "\xef\xbb\xbf") == 0) {
        $line = substr($line, 3);
    }

    if ($state eq 'beginning') {
        if ($line =~ /^(The Project Gutenberg [Ee]Book( of|,)|Project Gutenberg's )/) {
            $state = 'normal pg header';
            trace("state: beginning -> normal pg header\n");
            $print = 0;
        }
        elsif ($line =~ $BLANK) {
            $state = 'beginning blanks';
            trace("state: beginning -> beginning blanks\n");
        }
        else {
            die "unrecognized beginning: $line";
        }
    }
    elsif ($state eq 'normal pg header') {
        # Every other line here is body of the normal pg header and is skipped.
        if ($line =~ /^\*\*\*\ ?START OF TH(IS|E) PROJECT GUTENBERG EBOOK,? /) {
            $state = 'end of normal header';
            trace("state: normal pg header -> end of normal pg header\n");
        }
    }
    elsif ($state eq 'end of normal header') {
        if ($line =~ /^(Produced by|Transcribed from)/) {
            $state = 'post header';
            trace("state: end of normal pg header -> post header\n");
        }
        elsif ($line =~ $BLANK) {
            # blank lines
        }
        else {
            $state = 'etext body';
            trace("state: end of normal header -> etext body\n");
            $print = 1;     # this very line is the first body line
        }
    }
    elsif ($state eq 'post header') {
        # Non-blank lines continue a multiline Produced / Transcribed credit.
        if ($line =~ $BLANK) {
            $state = 'blanks after post header';
            trace("state: post header -> blanks after post header\n");
        }
    }
    elsif ($state eq 'blanks after post header') {
        # Further blank lines are skipped; the first non-blank starts the body.
        if ($line !~ $BLANK) {
            $state = 'etext body';
            trace("state: blanks after post header -> etext body\n");
            $print = 1;
        }
    }
    elsif ($state eq 'beginning blanks') {
        if ($line =~ m{<!-- #INCLUDE virtual="/include/ga-books-texth\.html" -->}) {
            $state = 'header include';
            trace("state: beginning blanks -> header include\n");
        }
        elsif ($line =~ /^Title: /) {
            $state = 'aus header';
            trace("state: beginning blanks -> aus header\n");
        }
        elsif ($line =~ $BLANK) {
            # more blanks
        }
        else {
            die "unexpected stuff after beginning blanks: $line";
        }
    }
    elsif ($state eq 'header include') {
        # Blanks after the header include are skipped; anything else is
        # treated as the start of the aus header.
        if ($line !~ $BLANK) {
            $state = 'aus header';
            trace("state: header include -> aus header\n");
        }
    }
    elsif ($state eq 'aus header') {
        # Either of these two lines marks the end of the aus header.
        if ($line =~ m{^To contact Project Gutenberg of Australia go to http://gutenberg\.net\.au\r?$}) {
            $state = 'end of aus header';
            trace("state: aus header -> end of aus header\n");
        }
        elsif ($line =~ /^A Project Gutenberg of Australia eBook\r?$/) {
            $state = 'end of aus header';
            trace("state: aus header -> end of aus header\n");
        }
    }
    elsif ($state eq 'end of aus header') {
        if ($line =~ /^((Title|Author): .*)?\r?$/) {
            # title, author, or blank line
        }
        else {
            $state = 'etext body';
            trace("state: end of aus header -> etext body\n");
            $print = 1;
        }
    }
    elsif ($state eq 'etext body') {
        # here's the stuff; watch for either footer marker.  $print is
        # cleared before the print step below, so the marker line itself is
        # not written out.
        if ($line =~ m{^<!-- #INCLUDE virtual="/include/ga-books-textf\.html" -->\r?$}) {
            $state = 'footer';
            trace("state: etext body -> footer\n");
            $print = 0;
        }
        elsif ($line =~ /^(\*\*\* ?)?end of (the )?project/i) {
            $state = 'footer';
            trace("state: etext body -> footer\n");
            $print = 0;
        }
    }
    elsif ($state eq 'footer') {
        # nothing more of interest
    }
    else {
        die "unknown state '$state'";
    }

    if ($print) {
        print $line;
    }
    else {
        trace("## $line");
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Sorry Apollo2X, I haven't played with this script, or indeed with Perl, for a few years now.
You should be able to do it with no changes to the script if you call it from another script in your favourite language. bash on Unix/Linux would be easy.
But changing the Perl would also be pretty easy for any Perl coder. Find a programmer friend and buy them a coffee or something (-;