mcnewton · March 18, 2018 21:03
diff --git a/scribus2epub.pl b/scribus2epub.pl
 #! /usr/bin/perl

 #
 # Go some way to extracting text from a Scribus document and
 # converting it to simple HTML that can be used for an ePub file.
 #
 # Copyright (c) 2018 Matthew Newton
 #
 # Permission is hereby granted, free of charge, to any person
 # obtaining a copy of this software and associated documentation
 # files (the "Software"), to deal in the Software without
 # restriction, including without limitation the rights to use, copy,
 # modify, merge, publish, distribute, sublicense, and/or sell copies
 # of the Software, and to permit persons to whom the Software is
 # furnished to do so, subject to the following conditions:
 #
 # The above copyright notice and this permission notice shall be
 # included in all copies or substantial portions of the Software.
 #
 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
 # HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 # WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 # DEALINGS IN THE SOFTWARE.
 #
 #
 # This script certainly doesn't do everything, but it's better
 # than having the options of "save as ASCII text" or nothing.
 #
 # The %styles hash defines what actions to take for a paragraph
 # with name matching the hash key.
 #
 #   tag:  use this tag <around>the</around> paragraph
 #   pre:  output this text before the paragraph (and before the opening tag)
 #   post: output this text after the paragraph (and closing tag)
 #   attr: add this attribute to the opening tag
 #   sub:  regex substitute this pattern in the paragraph text...
 #   repl: ...and replace with this text instead
 #
 # The %fonts hash defines what to do when a font style is found in
 # paragraph text It will append text before and after the styled
 # text. This is usually going to be a <span> tag setting the
 # correct style.
 #
 # The suggestion here is to use something like "fixme", which can
 # then be searched easily in Sigil or similar, and replaced with
 # the correct style name, as the same font might be used for
 # multiple things in the source document.

 use XML::LibXML;
 use XML::LibXML::Iterator;

 use utf8;
 use open ':std', ':encoding(UTF-8)';

 my $debug = 0;

 #
 # Define paragraph styles that we might find in the source
 # document.
 #

 my %styles = (
 	"ChapterHeaderNumber" => {
 		"tag" => "",
 		"pre" => '<hr class="sigil_split_marker"/><h1>',
 		"post" => '. ',
 	},
 	"ChapterHeaderText" => {
 		"tag" => "",
 		"post" => "</h1>\n",
 	},
 	"SectionHeading" => {
 		"tag" => "h2",
 		"pre" => "\n",
 	},
 	"SectionSub" => {
 		"tag" => "h3",
 		"pre" => "\n",
 	},
 	"Default" => {
 		"tag" => "p",
 	},
 	"Quotation" => {
 		"tag" => "p",
 		"attr" => "class=\"bible\"",
 	},
 	"Question" => {
 		"tag" => "p",
 		"attr" => "class=\"q\"",
 	},
 	"BulletList" => {
 		"tag" => "li",
 		"pre" => "\n<ul>\n",
 		"post" => "</ul>\n\n",
 		"sub" => qr/^.$/,
 		"repl" => "",
 	},
 );

 #
 # What to do when we find text that is a different font.
 #

 my %fonts = (
 	"Delicious Roman" => {
 		"pre" => "<span class=\"fixme\">",
 		"post" => "</span>",
 	},
 	"Delicious Bold" => {
 		"pre" => "<span class=\"fixme\">",
 		"post" => "</span>",
 	},
 	"Delicious Italic" => {
 		"pre" => "<span class=\"fixme\">",
 		"post" => "</span>",
 	},
 	"Delicious Bold Italic" => {
 		"pre" => "<span class=\"fixme\">",
 		"post" => "</span>",
 	},
 	"Delicious Heavy Regular" => {
 		"pre" => "<span class=\"fixme\">",
 		"post" => "</span>",
 	},
 	"Delicious SmallCaps Regular" => {
 		"pre" => "<span class=\"fixme\">",
 		"post" => "</span>",
 	},
 );

 #
 # Load the document and find the bits with text.
 #

 my $dom = XML::LibXML->load_xml(location => $ARGV[0]);
 my $page = $dom->findnodes('/SCRIBUSUTF8NEW/DOCUMENT/PAGEOBJECT/ITEXT');
 my $iter = XML::LibXML::Iterator->new($dom);

 my @output = ();
 my @text = ();
 my $prevclass = undef;
 my $prevstylepost = "";

 #
 # Iterate over all the nodes and look for paragraphs.
 #

 while ($iter->nextNode()) {
 	my $node = $iter->current();
 	my $nn = lc $node->nodeName;

 	# Tab or non-breaking space?
 	#
 	push @text, "\t" if $nn eq "tab";
 	push @text, "&nbsp;" if $nn eq "nbspace";

 	next unless $node->hasAttributes();

 	# Some text
 	#
 	if ($nn eq "itext") {
 		my $line = $node->getAttribute("CH");
 		my $font = $node->getAttribute("FONT");
 		if (defined $font) {
 			if (defined $fonts{$font}) {
 				my $pre = $fonts{$font}{"pre"} || "";
 				my $post = $fonts{$font}{"post"} || "";
 				$line = "$pre$line$post";
 			} else {
 				$line = "<span style=\"font-name: $font\">$line</span>";
 			}
 		}
 		my @attrs = $node->attributes;
 		foreach my $attr (@attrs) {
 			next if $attr->nodeName eq "CH";
 			next if $attr->nodeName eq "FONT";
 			print "unparsed attribute '". $attr->nodeName() . "': " . $attr->getValue() . "\n" if $debug;
 		}
 		push @text, $line;
 	}

 	# A paragraph to hold all the preceeding text
 	#
 	elsif ($nn eq "para" or $nn eq "trail") {
 		my $class = $node->getAttribute("PARENT");
 		$style = $styles{$class};

 		if ($prevclass and $class ne $prevclass) {
 			push @output, $prevstylepost;
 			push @output, $$style{"pre"} || "";
 		}

 		$tag = defined $$style{"tag"} ? $$style{"tag"} : "p";
 		$attr = $$style{"attr"};
 		$pre = $$style{"pre"} || "";
 		$post = $$style{"post"} || "";
 		if (scalar @text) {
 			if ($$style{"sub"}) {
 				$sub = $$style{"sub"};
 				$repl = $$style{"repl"} || "";
 				for (my $i=0; $i<=$#text; $i++) {
 					$text[$i] =~ s/$sub/$repl/s;
 				}
 			}
 			if (defined $attr and not defined $tag) {
 				die "style $class with attr but no tag";
 			}
 			push @output, "<$tag" . (defined $attr ? " $attr" : "") . ">" if $tag;
 			push @output, join("", @text);
 			push @output, "</$tag>\n" if $tag;
 		}

 		@text = ();
 		$prevclass = $class;
 		$prevstylepost = $$style{"post"} || "";
 	}

 	# Don't handle other things yet
 	#
 	else {
 		next if $nn eq "color";
 		next if $nn eq "document";
 		next if $nn eq "style";
 		next if $nn eq "checkprofile";
 		next if $nn eq "tabs";
 		next if $nn eq "effekte";
 		next if $nn eq "pdf";
 		next if $nn eq "subset";
 		next if $nn eq "fonts";
 		print "unknown nodename:" . $nn . "\n" if $debug;
 	}
 }
 push @output, $prevstylepost;


 my $bibleregex = qr/(?:[123](?:&nbsp;|\s))?[A-Z][a-z]+(?:&nbsp;|\s)\d+(?::\d+)?(?:[-–]\d+(?::\d+)?)?/;

 foreach my $line (@output) {

 	#
 	# Can process all text before it gets printed out. In this
 	# case we substitute things that *look* like Bible
 	# book/chapter/verse references and put a <span> around
 	# them.
 	#
 	$line =~ s/($bibleregex)/<span class="ref">$1<\/span>/g;

 	print $line;
 }
	#! /usr/bin/perl

	#
	# Go some way to extracting text from a Scribus document and
	# converting it to simple HTML that can be used for an ePub file.
	#
	# Copyright (c) 2018 Matthew Newton
	#
	# Permission is hereby granted, free of charge, to any person
	# obtaining a copy of this software and associated documentation
	# files (the "Software"), to deal in the Software without
	# restriction, including without limitation the rights to use, copy,
	# modify, merge, publish, distribute, sublicense, and/or sell copies
	# of the Software, and to permit persons to whom the Software is
	# furnished to do so, subject to the following conditions:
	#
	# The above copyright notice and this permission notice shall be
	# included in all copies or substantial portions of the Software.
	#
	# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
	# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
	# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
	# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
	# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
	# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
	# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
	# DEALINGS IN THE SOFTWARE.
	#
	#
	# This script certainly doesn't do everything, but it's better
	# than having the options of "save as ASCII text" or nothing.
	#
	# The %styles hash defines what actions to take for a paragraph
	# with name matching the hash key.
	#
	# tag: use this tag <around>the</around> paragraph
	# pre: output this text before the paragraph (and before the opening tag)
	# post: output this text after the paragraph (and closing tag)
	# attr: add this attribute to the opening tag
	# sub: regex substitute this pattern in the paragraph text...
	# repl: ...and replace with this text instead
	#
	# The %fonts hash defines what to do when a font style is found in
	# paragraph text It will append text before and after the styled
	# text. This is usually going to be a <span> tag setting the
	# correct style.
	#
	# The suggestion here is to use something like "fixme", which can
	# then be searched easily in Sigil or similar, and replaced with
	# the correct style name, as the same font might be used for
	# multiple things in the source document.

	use XML::LibXML;
	use XML::LibXML::Iterator;

	use utf8;
	use open ':std', ':encoding(UTF-8)';

	my $debug = 0;

	#
	# Define paragraph styles that we might find in the source
	# document.
	#

	my %styles = (
	"ChapterHeaderNumber" => {
	"tag" => "",
	"pre" => '<hr class="sigil_split_marker"/><h1>',
	"post" => '. ',
	},
	"ChapterHeaderText" => {
	"tag" => "",
	"post" => "</h1>\n",
	},
	"SectionHeading" => {
	"tag" => "h2",
	"pre" => "\n",
	},
	"SectionSub" => {
	"tag" => "h3",
	"pre" => "\n",
	},
	"Default" => {
	"tag" => "p",
	},
	"Quotation" => {
	"tag" => "p",
	"attr" => "class=\"bible\"",
	},
	"Question" => {
	"tag" => "p",
	"attr" => "class=\"q\"",
	},
	"BulletList" => {
	"tag" => "li",
	"pre" => "\n<ul>\n",
	"post" => "</ul>\n\n",
	"sub" => qr/^.$/,
	"repl" => "",
	},
	);

	#
	# What to do when we find text that is a different font.
	#

	my %fonts = (
	"Delicious Roman" => {
	"pre" => "<span class=\"fixme\">",
	"post" => "</span>",
	},
	"Delicious Bold" => {
	"pre" => "<span class=\"fixme\">",
	"post" => "</span>",
	},
	"Delicious Italic" => {
	"pre" => "<span class=\"fixme\">",
	"post" => "</span>",
	},
	"Delicious Bold Italic" => {
	"pre" => "<span class=\"fixme\">",
	"post" => "</span>",
	},
	"Delicious Heavy Regular" => {
	"pre" => "<span class=\"fixme\">",
	"post" => "</span>",
	},
	"Delicious SmallCaps Regular" => {
	"pre" => "<span class=\"fixme\">",
	"post" => "</span>",
	},
	);

	#
	# Load the document and find the bits with text.
	#

	my $dom = XML::LibXML->load_xml(location => $ARGV[0]);
	my $page = $dom->findnodes('/SCRIBUSUTF8NEW/DOCUMENT/PAGEOBJECT/ITEXT');
	my $iter = XML::LibXML::Iterator->new($dom);

	my @output = ();
	my @text = ();
	my $prevclass = undef;
	my $prevstylepost = "";

	#
	# Iterate over all the nodes and look for paragraphs.
	#

	while ($iter->nextNode()) {
	my $node = $iter->current();
	my $nn = lc $node->nodeName;

	# Tab or non-breaking space?
	#
	push @text, "\t" if $nn eq "tab";
	push @text, " " if $nn eq "nbspace";

	next unless $node->hasAttributes();

	# Some text
	#
	if ($nn eq "itext") {
	my $line = $node->getAttribute("CH");
	my $font = $node->getAttribute("FONT");
	if (defined $font) {
	if (defined $fonts{$font}) {
	my $pre = $fonts{$font}{"pre"} \|\| "";
	my $post = $fonts{$font}{"post"} \|\| "";
	$line = "$pre$line$post";
	} else {
	$line = "<span style=\"font-name: $font\">$line</span>";
	}
	}
	my @attrs = $node->attributes;
	foreach my $attr (@attrs) {
	next if $attr->nodeName eq "CH";
	next if $attr->nodeName eq "FONT";
	print "unparsed attribute '". $attr->nodeName() . "': " . $attr->getValue() . "\n" if $debug;
	}
	push @text, $line;
	}

	# A paragraph to hold all the preceeding text
	#
	elsif ($nn eq "para" or $nn eq "trail") {
	my $class = $node->getAttribute("PARENT");
	$style = $styles{$class};

	if ($prevclass and $class ne $prevclass) {
	push @output, $prevstylepost;
	push @output, $$style{"pre"} \|\| "";
	}

	$tag = defined $$style{"tag"} ? $$style{"tag"} : "p";
	$attr = $$style{"attr"};
	$pre = $$style{"pre"} \|\| "";
	$post = $$style{"post"} \|\| "";
	if (scalar @text) {
	if ($$style{"sub"}) {
	$sub = $$style{"sub"};
	$repl = $$style{"repl"} \|\| "";
	for (my $i=0; $i<=$#text; $i++) {
	$text[$i] =~ s/$sub/$repl/s;
	}
	}
	if (defined $attr and not defined $tag) {
	die "style $class with attr but no tag";
	}
	push @output, "<$tag" . (defined $attr ? " $attr" : "") . ">" if $tag;
	push @output, join("", @text);
	push @output, "</$tag>\n" if $tag;
	}

	@text = ();
	$prevclass = $class;
	$prevstylepost = $$style{"post"} \|\| "";
	}

	# Don't handle other things yet
	#
	else {
	next if $nn eq "color";
	next if $nn eq "document";
	next if $nn eq "style";
	next if $nn eq "checkprofile";
	next if $nn eq "tabs";
	next if $nn eq "effekte";
	next if $nn eq "pdf";
	next if $nn eq "subset";
	next if $nn eq "fonts";
	print "unknown nodename:" . $nn . "\n" if $debug;
	}
	}
	push @output, $prevstylepost;


	my $bibleregex = qr/(?:[123](?: \|\s))?[A-Z][a-z]+(?: \|\s)\d+(?::\d+)?(?:[-–]\d+(?::\d+)?)?/;

	foreach my $line (@output) {

	#
	# Can process all text before it gets printed out. In this
	# case we substitute things that look like Bible
	# book/chapter/verse references and put a <span> around
	# them.
	#
	$line =~ s/($bibleregex)/<span class="ref">$1<\/span>/g;

	print $line;
	}