Skip to content

Instantly share code, notes, and snippets.

@egardner
Last active October 22, 2018 22:05
Show Gist options
  • Save egardner/56934254ef84a8dedce29fc9d370bd50 to your computer and use it in GitHub Desktop.
Save egardner/56934254ef84a8dedce29fc9d370bd50 to your computer and use it in GitHub Desktop.
Work in progress Epub Parser in Ruby
# lib/tasks/epub.rake
require "pathname"
namespace :epub do
  # Invoked as `rake epub:import <path>`; the path is taken from the final
  # command-line argument.
  desc "Import data from a target epub file"
  task :import => :environment do
    epub_path = Pathname.new(ARGV.last)
    unless epub_path.exist?
      raise ArgumentError, "Please provide a path to a valid EPUB file."
    end

    EpubParser.parse(epub_path)

    # NOTE(review): presumably exits here so rake does not go on to treat the
    # path argument as another task name -- confirm.
    exit(0)
  end
end
require "nokogiri"
require "pathname"
require "yaml"
require "zip"
require "zip/filesystem"
##
# This class is responsible for breaking down a zipped .epub file into a data
# structure that can be stored in the application's database.
#
class EpubParser
# Whitelist of block-level HTML elements we will recognize.
# Element names are compared against Nokogiri node names when chapters and
# blocks are walked. Frozen so the shared list cannot be mutated at runtime.
BLOCK_TYPES = %w[
  h1 h2 h3 h4 h5 h6
  p
  blockquote
  ul ol
  img
].freeze
##
# Convenience entry point and the "public interface" of this class: builds a
# parser for the given file and immediately runs it.
# * +epub+ - Path to a valid epub file.
#
# Returns whatever the instance-level +parse+ returns.
#
def self.parse(epub)
  parser = new(epub)
  parser.parse
end
##
# Opens the archive and eagerly loads everything the parser works with: the
# container document, the package document it points to, and one EpubChapter
# for every entry returned by the package's +contents+.
#
# * +epub+ - Path to a valid epub file.
#
# Raises ArgumentError if +epub+ does not exist on disk.
#
def initialize(epub)
  # Validate the path we were actually given (not ARGV), so the class behaves
  # the same regardless of how the process was started.
  source = Pathname.new(epub)
  raise ArgumentError, "Please provide a path to a valid EPUB file." unless source.exist?

  @epub = Zip::File.open(epub)
  @container = parse_zipped_xml_file('META-INF/container.xml')
  @package_path = Pathname.new(@container.css('rootfile')[0]['full-path'])
  @package = EpubPackage.new(parse_zipped_xml_file(@package_path))

  # Chapter paths are resolved against the directory holding the package
  # document.
  base_dir = @package_path.dirname
  @chapters = @package.contents.map do |item|
    EpubChapter.new(parse_zipped_html_file((base_dir + item).to_s))
  end
rescue StandardError
  # Do not leak the open archive when setup fails part-way through.
  @epub.close if @epub
  raise
end
##
# Actually parse the epub; the main sequence of operations in this service.
# Currently this prints a "title: block count" line per chapter followed by
# one randomly chosen content block rendered as YAML.
#
# The archive is always closed afterwards, even if reporting raises.
#
def parse
  puts "Parsing!"
  puts "\n"
  chapters.each { |chapter| puts "#{chapter.title}: #{chapter.contents.size}" }
  puts "\n"
  # With no chapters there is nothing to sample from.
  sample_chapter = chapters.sample
  puts sample_chapter.contents.sample.to_yaml if sample_chapter
  puts "Finished!"
ensure
  close_archive
end
private

# Internal state, readable only from inside the parser:
#   epub      - Zip::File object representing the .epub archive
#   container - Nokogiri::XML::Document parsed from META-INF/container.xml
#   package   - EpubPackage wrapping the package document
#   chapters  - Array of EpubChapter objects
attr_reader :epub,
            :container,
            :package,
            :chapters
##
# Releases the Zip::File handle once the parser has finished with it.
#
def close_archive
  archive = epub
  archive.close
end
##
# Looks up +filename+ inside the archive and hands its input stream to
# Nokogiri to be parsed as XML.
# Returns Nokogiri::XML::Document.
# Zip::File#get_entry throws Errno::ENOENT if no file is found
#
def parse_zipped_xml_file(filename)
  epub.get_entry(filename).get_input_stream do |stream|
    Nokogiri::XML(stream)
  end
end
##
# Looks up +filename+ inside the archive and hands its input stream to
# Nokogiri to be parsed as HTML.
# Returns Nokogiri::HTML::Document.
# Zip::File#get_entry throws Errno::ENOENT if no file is found
#
def parse_zipped_html_file(filename)
  epub.get_entry(filename).get_input_stream do |stream|
    Nokogiri::HTML(stream)
  end
end
##
# XML Package document of an EPUB file.
#
# Wraps the parsed package document and exposes its +dc+-prefixed metadata
# elements and the chapter files referenced from its spine.
#
class EpubPackage
  # Maps each metadata key to the namespaced CSS selector of the element that
  # holds its value.
  METADATA_ATTRIBUTES = {
    :id => "dc|identifier",
    :title => "dc|title",
    :creator => "dc|creator",
    :language => "dc|language",
    :description => "dc|description",
    :date => "dc|date",
    :rights => "dc|rights",
    :publisher => "dc|publisher",
    :subjects => "dc|subject"
  }.freeze

  ##
  # ==== Attributes
  # * +package_doc+ Nokogiri::XML document
  #
  # Raises ArgumentError unless +package_doc+ reports itself as XML.
  #
  def initialize(package_doc)
    raise ArgumentError, "Valid Nokogiri::XML node not found" unless package_doc.xml?
    @doc = package_doc
  end

  ##
  # Returns a Hash with keys identical to the METADATA_ATTRIBUTES hash above.
  # Each value is the text of the first matching element, or nil when the
  # document has no such element.
  #
  def metadata
    METADATA_ATTRIBUTES.each_with_object({}) do |(key, selector), result|
      node = find_first(selector)
      result[key] = node && node.text
    end
  end

  ##
  # Returns an Array of strings representing paths to chapter files: for every
  # +itemref+ in the spine, the +href+ of the element whose id matches its
  # +idref+.
  #
  def contents
    find_all("spine itemref").map do |itemref|
      find_first("[id='#{itemref['idref']}']")['href']
    end
  end

  private

  # Nokogiri::XML::Document
  attr_reader :doc

  ##
  # Namespace declarations of the document, collected once and reused for
  # every selector lookup.
  #
  def namespaces
    @namespaces ||= doc.collect_namespaces
  end

  ##
  # Returns a Nokogiri::XML::Node
  #
  # ==== Attributes
  # * +selector+ CSS Selector (string) for use with Nokogiri's +css+ method
  #
  def find_first(selector)
    doc.at_css(selector, namespaces)
  end

  ##
  # Returns a Nokogiri::XML::Nodeset
  #
  # ==== Attributes
  # * +selector+ CSS Selector (string) for use with Nokogiri's +css+ method
  #
  def find_all(selector)
    doc.css(selector, namespaces)
  end
end
##
# (X)HTML document of an EPUB chapter.
#
# On construction the document body is walked and every recognised
# block-level element (see BLOCK_TYPES) is collected, in document order, into
# +contents+.
#
class EpubChapter
  # Array of EpubParser::Block objects
  attr_reader :contents

  ##
  # * +chapter_doc+ Nokogiri document that answers true to +html?+
  #
  # Raises ArgumentError unless +chapter_doc+ reports itself as HTML.
  #
  def initialize(chapter_doc)
    raise ArgumentError, "Valid Nokogiri::XML node not found" unless chapter_doc.html?
    @doc = chapter_doc
    @contents = []
    body = doc.at_css('body')
    # A document without a <body> simply contributes no blocks.
    parse_block(body) if body
  end

  ##
  # Returns the title reported by the underlying document.
  #
  def title
    doc.title
  end

  private

  # Array of EpubParser::Block objects
  attr_writer :contents

  # Nokogiri::XML::Document
  attr_reader :doc

  ##
  # Collects +node+ as a Block when its name is whitelisted; otherwise descends
  # into its element children looking for whitelisted elements further down.
  #
  def parse_block(node)
    if BLOCK_TYPES.include?(node.name)
      contents << Block.new(node)
    else
      node.element_children.each { |element| parse_block(element) }
    end
  end

  ##
  # Returns a Nokogiri::XML::Node
  #
  # ==== Attributes
  # * +selector+ CSS Selector (string) for use with Nokogiri's +css+ method
  #
  def find_first(selector)
    namespaces = doc.collect_namespaces
    doc.at_css(selector, namespaces)
  end

  ##
  # Returns a Nokogiri::XML::Nodeset
  #
  # ==== Attributes
  # * +selector+ CSS Selector (string) for use with Nokogiri's +css+ method
  #
  def find_all(selector)
    namespaces = doc.collect_namespaces
    doc.css(selector, namespaces)
  end
end
##
# Basic building-block of a book chapter
#
# Remembers the name of the node it was built from and holds, in order, a
# nested Block for every whitelisted child element and a TextNode for every
# other child that carries non-empty text.
#
class Block
  attr_reader :contents

  ##
  # ==== Attributes
  # * +node+ Nokogiri::XML::Node
  def initialize(node)
    @tag = node.name
    @contents = []
    parse_children(node)
  end

  private

  attr_writer :contents

  ##
  # Walks the direct children of +node+. Node type code 1 is an element node
  # and 3 is a text node; every other type (e.g. 2, attribute) is ignored for
  # now.
  #
  def parse_children(node)
    node.children.each do |child|
      kind = child.type
      if kind == 1 && BLOCK_TYPES.include?(child.name)
        contents << Block.new(child)
      elsif kind == 1
        # Non-block element: keep only its text, tagged with the element name.
        add_text(child.text, child.name)
      elsif kind == 3
        add_text(child.text)
      end
    end
  end

  ##
  # Appends a TextNode for +text+ unless it is empty. When +tag+ is omitted
  # TextNode's own default tag applies.
  #
  def add_text(text, *tag)
    contents << TextNode.new(text, *tag) unless text.empty?
  end
end
##
# Raw text snippet with some basic properties
#
class TextNode
  ##
  # ==== Attributes
  # * +text+ the snippet's text
  # * +tag+ label stored alongside the text; defaults to "text"
  def initialize(text, tag="text")
    @text, @tag = text, tag
  end
end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment