Created
March 18, 2018 21:03
-
-
Save mcnewton/f8ff21b9b16b7e307d08f214f674a9d1 to your computer and use it in GitHub Desktop.
Extract Scribus document text and output HTML for converting to ePub
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/perl | |
# | |
# Go some way to extracting text from a Scribus document and | |
# converting it to simple HTML that can be used for an ePub file. | |
# | |
# Copyright (c) 2018 Matthew Newton | |
# | |
# Permission is hereby granted, free of charge, to any person | |
# obtaining a copy of this software and associated documentation | |
# files (the "Software"), to deal in the Software without | |
# restriction, including without limitation the rights to use, copy, | |
# modify, merge, publish, distribute, sublicense, and/or sell copies | |
# of the Software, and to permit persons to whom the Software is | |
# furnished to do so, subject to the following conditions: | |
# | |
# The above copyright notice and this permission notice shall be | |
# included in all copies or substantial portions of the Software. | |
# | |
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | |
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | |
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | |
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT | |
# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, | |
# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER | |
# DEALINGS IN THE SOFTWARE. | |
# | |
# | |
# This script certainly doesn't do everything, but it's better | |
# than having the options of "save as ASCII text" or nothing. | |
# | |
# The %styles hash defines what actions to take for a paragraph | |
# with name matching the hash key. | |
# | |
# tag: use this tag <around>the</around> paragraph | |
# pre: output this text before the paragraph (and before the opening tag) | |
# post: output this text after the paragraph (and closing tag) | |
# attr: add this attribute to the opening tag | |
# sub: regex substitute this pattern in the paragraph text... | |
# repl: ...and replace with this text instead | |
# | |
# The %fonts hash defines what to do when a font style is found in | |
# paragraph text It will append text before and after the styled | |
# text. This is usually going to be a <span> tag setting the | |
# correct style. | |
# | |
# The suggestion here is to use something like "fixme", which can | |
# then be searched easily in Sigil or similar, and replaced with | |
# the correct style name, as the same font might be used for | |
# multiple things in the source document. | |
use XML::LibXML; | |
use XML::LibXML::Iterator; | |
use utf8; | |
use open ':std', ':encoding(UTF-8)'; | |
my $debug = 0; | |
# | |
# Define paragraph styles that we might find in the source | |
# document. | |
# | |
my %styles = ( | |
"ChapterHeaderNumber" => { | |
"tag" => "", | |
"pre" => '<hr class="sigil_split_marker"/><h1>', | |
"post" => '. ', | |
}, | |
"ChapterHeaderText" => { | |
"tag" => "", | |
"post" => "</h1>\n", | |
}, | |
"SectionHeading" => { | |
"tag" => "h2", | |
"pre" => "\n", | |
}, | |
"SectionSub" => { | |
"tag" => "h3", | |
"pre" => "\n", | |
}, | |
"Default" => { | |
"tag" => "p", | |
}, | |
"Quotation" => { | |
"tag" => "p", | |
"attr" => "class=\"bible\"", | |
}, | |
"Question" => { | |
"tag" => "p", | |
"attr" => "class=\"q\"", | |
}, | |
"BulletList" => { | |
"tag" => "li", | |
"pre" => "\n<ul>\n", | |
"post" => "</ul>\n\n", | |
"sub" => qr/^.$/, | |
"repl" => "", | |
}, | |
); | |
# | |
# What to do when we find text that is a different font. | |
# | |
my %fonts = ( | |
"Delicious Roman" => { | |
"pre" => "<span class=\"fixme\">", | |
"post" => "</span>", | |
}, | |
"Delicious Bold" => { | |
"pre" => "<span class=\"fixme\">", | |
"post" => "</span>", | |
}, | |
"Delicious Italic" => { | |
"pre" => "<span class=\"fixme\">", | |
"post" => "</span>", | |
}, | |
"Delicious Bold Italic" => { | |
"pre" => "<span class=\"fixme\">", | |
"post" => "</span>", | |
}, | |
"Delicious Heavy Regular" => { | |
"pre" => "<span class=\"fixme\">", | |
"post" => "</span>", | |
}, | |
"Delicious SmallCaps Regular" => { | |
"pre" => "<span class=\"fixme\">", | |
"post" => "</span>", | |
}, | |
); | |
# | |
# Load the document and find the bits with text. | |
# | |
my $dom = XML::LibXML->load_xml(location => $ARGV[0]); | |
my $page = $dom->findnodes('/SCRIBUSUTF8NEW/DOCUMENT/PAGEOBJECT/ITEXT'); | |
my $iter = XML::LibXML::Iterator->new($dom); | |
my @output = (); | |
my @text = (); | |
my $prevclass = undef; | |
my $prevstylepost = ""; | |
# | |
# Iterate over all the nodes and look for paragraphs. | |
# | |
while ($iter->nextNode()) { | |
my $node = $iter->current(); | |
my $nn = lc $node->nodeName; | |
# Tab or non-breaking space? | |
# | |
push @text, "\t" if $nn eq "tab"; | |
push @text, " " if $nn eq "nbspace"; | |
next unless $node->hasAttributes(); | |
# Some text | |
# | |
if ($nn eq "itext") { | |
my $line = $node->getAttribute("CH"); | |
my $font = $node->getAttribute("FONT"); | |
if (defined $font) { | |
if (defined $fonts{$font}) { | |
my $pre = $fonts{$font}{"pre"} || ""; | |
my $post = $fonts{$font}{"post"} || ""; | |
$line = "$pre$line$post"; | |
} else { | |
$line = "<span style=\"font-name: $font\">$line</span>"; | |
} | |
} | |
my @attrs = $node->attributes; | |
foreach my $attr (@attrs) { | |
next if $attr->nodeName eq "CH"; | |
next if $attr->nodeName eq "FONT"; | |
print "unparsed attribute '". $attr->nodeName() . "': " . $attr->getValue() . "\n" if $debug; | |
} | |
push @text, $line; | |
} | |
# A paragraph to hold all the preceeding text | |
# | |
elsif ($nn eq "para" or $nn eq "trail") { | |
my $class = $node->getAttribute("PARENT"); | |
$style = $styles{$class}; | |
if ($prevclass and $class ne $prevclass) { | |
push @output, $prevstylepost; | |
push @output, $$style{"pre"} || ""; | |
} | |
$tag = defined $$style{"tag"} ? $$style{"tag"} : "p"; | |
$attr = $$style{"attr"}; | |
$pre = $$style{"pre"} || ""; | |
$post = $$style{"post"} || ""; | |
if (scalar @text) { | |
if ($$style{"sub"}) { | |
$sub = $$style{"sub"}; | |
$repl = $$style{"repl"} || ""; | |
for (my $i=0; $i<=$#text; $i++) { | |
$text[$i] =~ s/$sub/$repl/s; | |
} | |
} | |
if (defined $attr and not defined $tag) { | |
die "style $class with attr but no tag"; | |
} | |
push @output, "<$tag" . (defined $attr ? " $attr" : "") . ">" if $tag; | |
push @output, join("", @text); | |
push @output, "</$tag>\n" if $tag; | |
} | |
@text = (); | |
$prevclass = $class; | |
$prevstylepost = $$style{"post"} || ""; | |
} | |
# Don't handle other things yet | |
# | |
else { | |
next if $nn eq "color"; | |
next if $nn eq "document"; | |
next if $nn eq "style"; | |
next if $nn eq "checkprofile"; | |
next if $nn eq "tabs"; | |
next if $nn eq "effekte"; | |
next if $nn eq "pdf"; | |
next if $nn eq "subset"; | |
next if $nn eq "fonts"; | |
print "unknown nodename:" . $nn . "\n" if $debug; | |
} | |
} | |
push @output, $prevstylepost; | |
my $bibleregex = qr/(?:[123](?: |\s))?[A-Z][a-z]+(?: |\s)\d+(?::\d+)?(?:[-–]\d+(?::\d+)?)?/; | |
foreach my $line (@output) { | |
# | |
# Can process all text before it gets printed out. In this | |
# case we substitute things that *look* like Bible | |
# book/chapter/verse references and put a <span> around | |
# them. | |
# | |
$line =~ s/($bibleregex)/<span class="ref">$1<\/span>/g; | |
print $line; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment