Last active
October 22, 2018 22:05
-
-
Save egardner/56934254ef84a8dedce29fc9d370bd50 to your computer and use it in GitHub Desktop.
Work in progress Epub Parser in Ruby
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# lib/tasks/epub.rake | |
require "pathname" | |
# Rake tasks for working with EPUB files.
namespace :epub do
  desc "Import data from a target epub file"
  task import: :environment do
    # The EPUB location is read from the last command-line argument.
    epub_path = Pathname.new(ARGV.last)
    unless epub_path.exist?
      raise ArgumentError, "Please provide a path to a valid EPUB file."
    end

    EpubParser.parse(epub_path)

    # Terminate the process explicitly once the import has run.
    exit 0
  end
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require "nokogiri" | |
require "pathname" | |
require "yaml" | |
require "zip" | |
require "zip/filesystem" | |
## | |
# This class is responsible for breaking down a zipped .epub file into a data | |
# structure that can be stored in the application's database. | |
# | |
class EpubParser
  # Whitelist of block-level HTML elements we will recognize. Frozen so the
  # shared list cannot be mutated at runtime.
  BLOCK_TYPES = %w[h1 h2 h3 h4 h5 h6 p blockquote ul ol img].freeze

  ##
  # Parses an epub file. This method is the "public interface" of this class.
  #
  # * +epub+ - Path (String or Pathname) to a valid epub file.
  #
  # Raises ArgumentError when nothing exists at +epub+.
  #
  def self.parse(epub)
    new(epub).parse
  end

  ##
  # Opens the archive and eagerly parses the container file, the package
  # document it points to, and every chapter returned by the package's
  # +contents+.
  #
  # * +epub+ - Path (String or Pathname) to a valid epub file.
  #
  # Raises ArgumentError when nothing exists at +epub+.
  #
  def initialize(epub)
    # Validate the argument we were handed (not ARGV) so the class behaves the
    # same whether it is driven from the rake task or from other Ruby code.
    archive_path = Pathname.new(epub)
    raise ArgumentError, "Please provide a path to a valid EPUB file." unless archive_path.exist?

    @epub = Zip::File.open(archive_path.to_s)
    @container = parse_zipped_xml_file('META-INF/container.xml')
    @package_path = Pathname.new(@container.css('rootfile')[0]['full-path'])
    @package = EpubPackage.new(parse_zipped_xml_file(@package_path.to_s))

    # Chapter hrefs are resolved against the directory holding the package
    # document.
    @chapters = @package.contents.map do |href|
      chapter_path = @package_path.dirname + href
      EpubChapter.new(parse_zipped_html_file(chapter_path.to_s))
    end
  end

  ##
  # Actually parse the epub; the main sequence of operations in this service.
  # Prints one "title: block count" line per chapter, closes the archive, and
  # then prints a YAML dump of one randomly sampled block.
  #
  def parse
    puts "Parsing!"
    puts "\n"
    chapters.each { |chapter| puts "#{chapter.title}: #{chapter.contents.size}" }
    close_archive
    puts "\n"

    # Guard the sampling so a book without chapters does not raise; in that
    # case the YAML representation of nil is printed.
    sample_chapter = chapters.sample
    sample_block = sample_chapter && sample_chapter.contents.sample
    puts sample_block.to_yaml
    puts "Finished!"
  end

  private

  # Zip::File object representing the .epub archive
  attr_reader :epub

  # Nokogiri::XML::Document (container) and EpubPackage (package)
  attr_reader :container, :package

  # Array of EpubParser::EpubChapter objects
  attr_reader :chapters

  ##
  # Closes the Zip::File instance after we are done using it.
  #
  def close_archive
    epub.close
  end

  ##
  # Feeds a Zip::Entry object into Nokogiri to be parsed as XML.
  # Returns Nokogiri::XML::Document.
  # Zip::File#get_entry throws Errno::ENOENT if no file is found
  #
  def parse_zipped_xml_file(filename)
    entry = epub.get_entry(filename)
    entry.get_input_stream { |f| Nokogiri::XML(f) }
  end

  ##
  # Feeds a Zip::Entry object into Nokogiri to be parsed as HTML.
  # Returns Nokogiri::HTML::Document.
  # Zip::File#get_entry throws Errno::ENOENT if no file is found
  #
  def parse_zipped_html_file(filename)
    entry = epub.get_entry(filename)
    entry.get_input_stream { |f| Nokogiri::HTML(f) }
  end

  ##
  # XML Package document of an EPUB file.
  #
  class EpubPackage
    # Maps metadata keys to the namespaced CSS selector used to look them up.
    METADATA_ATTRIBUTES = {
      id: "dc|identifier",
      title: "dc|title",
      creator: "dc|creator",
      language: "dc|language",
      description: "dc|description",
      date: "dc|date",
      rights: "dc|rights",
      publisher: "dc|publisher",
      subjects: "dc|subject"
    }.freeze

    ##
    # ==== Attributes
    # * +package_doc+ Nokogiri::XML document
    #
    # Raises ArgumentError unless +package_doc+ reports itself as XML.
    #
    def initialize(package_doc)
      raise ArgumentError, "Valid Nokogiri::XML node not found" unless package_doc.xml?

      @doc = package_doc
    end

    ##
    # Returns a Hash with keys identical to the METADATA_ATTRIBUTES hash above.
    # Each value is the text of the first matching element, or nil when the
    # package document has no such element.
    #
    def metadata
      pairs = METADATA_ATTRIBUTES.map do |key, selector|
        node = find_first(selector)
        [key, node && node.text]
      end
      pairs.to_h
    end

    ##
    # Returns an Array of strings representing paths to chapter files, in
    # spine order. Spine entries whose idref matches no element, or whose
    # matching element has no href, are skipped.
    #
    def contents
      idrefs = find_all("spine itemref").map { |node| node['idref'] }
      hrefs = idrefs.map do |id|
        item = find_first("[id='#{id}']")
        item && item['href']
      end
      hrefs.compact
    end

    private

    # Nokogiri::XML::Document
    attr_reader :doc

    ##
    # Namespace prefixes declared in the document, collected once and reused
    # for every selector lookup.
    #
    def namespaces
      @namespaces ||= doc.collect_namespaces
    end

    ##
    # Returns a Nokogiri::XML::Node (or nil when nothing matches)
    #
    # ==== Attributes
    # * +selector+ CSS Selector (string) for use with Nokogiri's +css+ method
    #
    def find_first(selector)
      doc.at_css(selector, namespaces)
    end

    ##
    # Returns a Nokogiri::XML::Nodeset
    #
    # ==== Attributes
    # * +selector+ CSS Selector (string) for use with Nokogiri's +css+ method
    #
    def find_all(selector)
      doc.css(selector, namespaces)
    end
  end

  ##
  # (X)HTML document of an EPUB chapter.
  #
  class EpubChapter
    # Array of EpubParser::Block objects
    attr_reader :contents

    ##
    # ==== Attributes
    # * +chapter_doc+ Nokogiri::HTML document
    #
    # Raises ArgumentError unless +chapter_doc+ reports itself as HTML.
    #
    def initialize(chapter_doc)
      raise ArgumentError, "Valid Nokogiri::XML node not found" unless chapter_doc.html?

      @doc = chapter_doc
      @contents = []

      # A document without a <body> simply yields no blocks.
      body = doc.at_css('body')
      parse_block(body) if body
    end

    ##
    # Title reported by the underlying document.
    #
    def title
      doc.title
    end

    private

    # Nokogiri::HTML::Document
    attr_reader :doc

    ##
    # Walks the element tree depth-first. The first whitelisted element found
    # on each branch becomes a Block; anything else is treated as a wrapper
    # and descended into.
    #
    def parse_block(node)
      if BLOCK_TYPES.include?(node.name)
        contents << Block.new(node)
      else
        node.element_children.each { |element| parse_block(element) }
      end
    end
  end

  ##
  # Basic building-block of a book chapter
  #
  class Block
    # Element name this block was built from (e.g. "p")
    attr_reader :tag

    # Array of nested Block and TextNode objects, in document order
    attr_reader :contents

    ##
    # ==== Attributes
    # * +node+ Nokogiri::XML::Node
    #
    def initialize(node)
      @tag = node.name
      @contents = []
      parse_children(node)
    end

    private

    ##
    # Converts each direct child of +node+ into a Block (whitelisted element),
    # a TextNode tagged with the element name (any other element with text),
    # or a plain TextNode (text node). Children with empty text and other
    # node kinds are ignored.
    #
    def parse_children(node)
      node.children.each do |child|
        if child.element?
          if BLOCK_TYPES.include?(child.name)
            contents << Block.new(child)
          elsif !child.text.empty?
            contents << TextNode.new(child.text, child.name)
          end
        elsif child.text?
          contents << TextNode.new(child.text) unless child.text.empty?
        end
      end
    end
  end

  ##
  # Raw text snippet with some basic properties
  #
  class TextNode
    # The text content, and the name of the element it came from ("text" for
    # bare text nodes)
    attr_reader :text, :tag

    ##
    # ==== Attributes
    # * +text+ String content of the snippet
    # * +tag+ Name of the originating element; defaults to "text"
    #
    def initialize(text, tag = "text")
      @text = text
      @tag = tag
    end
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment