egardner · October 22, 2018 22:05
diff --git a/epub.rake b/epub.rake
 # lib/tasks/epub.rake

 require "pathname"

 namespace :epub do
  desc "Import data from a target epub file"
  task :import => :environment do
    # The EPUB location is read from the last command-line argument.
    epub_path = Pathname.new(ARGV.last)
    unless epub_path.exist?
      raise ArgumentError, "Please provide a path to a valid EPUB file."
    end

    EpubParser.parse(epub_path)

    # NOTE(review): presumably exits here so the path argument is not
    # interpreted as a further task name -- confirm.
    exit 0
  end
 end
diff --git a/epub_parser.rb b/epub_parser.rb
 require "nokogiri"
 require "pathname"
 require "yaml"
 require "zip"
 require "zip/filesystem"

 ##
 # This class is responsible for breaking down a zipped .epub file into a data
 # structure that can be stored in the application's database.
 #
 class EpubParser

  # Whitelist of block-level HTML elements we will recognize. Frozen because
  # the list is shared by every chapter and block and must not change at
  # runtime.
  BLOCK_TYPES = %w[h1 h2 h3 h4 h5 h6 p blockquote ul ol img].freeze

  ##
  # Public entry point of this class: builds a parser for the given archive
  # and runs it.
  # * +epub+ - Path to a valid epub file.
  #
  def self.parse(epub)
    parser = new(epub)
    parser.parse
  end

  ##
  # Opens the archive and eagerly loads its container file, its package
  # document and every chapter the package lists.
  #
  # * +epub+ - Path to a valid epub file.
  #
  # Raises ArgumentError when nothing exists at +epub+.
  #
  def initialize(epub)
    # Validate the path we were actually handed. This used to inspect
    # ARGV.last, which only matches +epub+ when called from the rake task.
    raise ArgumentError, "Please provide a path to a valid EPUB file." unless Pathname.new(epub).exist?

    @epub = Zip::File.open(epub)
    begin
      @container    = parse_zipped_xml_file('META-INF/container.xml')
      @package_path = Pathname.new(@container.css('rootfile')[0]['full-path'])
      @package      = EpubPackage.new(parse_zipped_xml_file(@package_path.to_s))

      # Chapter hrefs are resolved relative to the package document's directory.
      package_dir = @package_path.dirname
      @chapters = @package.contents.map do |item|
        EpubChapter.new(parse_zipped_html_file((package_dir + item).to_s))
      end
    rescue StandardError
      # Do not leak the open archive when loading fails part-way through.
      @epub.close
      raise
    end
  end

  ##
  # Actually parse the epub; the main sequence of operations in this service.
  # Prints each chapter's title with its block count, then a YAML dump of one
  # randomly sampled block. The archive is closed even when reporting fails.
  #
  def parse
    puts "Parsing!"
    puts "\n"
    chapters.each { |chapter| puts "#{chapter.title}: #{chapter.contents.size}" }
    puts "\n"

    # Array#sample returns nil for an empty array, so only dump a block when
    # the book actually has chapters.
    sample_chapter = chapters.sample
    puts sample_chapter.contents.sample.to_yaml if sample_chapter
    puts "Finished!"
  ensure
    close_archive
  end

  private

  # Zip::File object representing the .epub archive
  attr_reader :epub

  # container is the Nokogiri::XML::Document parsed from
  # META-INF/container.xml; package is the EpubPackage wrapping the package
  # document.
  attr_reader :container, :package

  # Array of EpubChapter objects, one per entry returned by the package's
  # contents.
  attr_reader :chapters

  ##
  # Releases the underlying Zip::File handle once the archive is no longer
  # needed.
  #
  def close_archive
    archive = epub
    archive.close
  end

  ##
  # Looks up +filename+ inside the archive and hands the entry's input stream
  # to Nokogiri's XML parser.
  # Returns Nokogiri::XML::Document.
  # Zip::File#get_entry throws Errno::ENOENT if no file is found
  #
  def parse_zipped_xml_file(filename)
    epub.get_entry(filename).get_input_stream do |stream|
      Nokogiri::XML(stream)
    end
  end

  ##
  # Looks up +filename+ inside the archive and hands the entry's input stream
  # to Nokogiri's HTML parser.
  # Returns Nokogiri::HTML::Document.
  # Zip::File#get_entry throws Errno::ENOENT if no file is found
  #
  def parse_zipped_html_file(filename)
    epub.get_entry(filename).get_input_stream do |stream|
      Nokogiri::HTML(stream)
    end
  end

  ##
  # XML Package document of an EPUB file.
  #
  class EpubPackage
    # Maps each metadata key to the namespaced CSS selector of its element.
    METADATA_ATTRIBUTES = {
      :id          => "dc|identifier",
      :title       => "dc|title",
      :creator     => "dc|creator",
      :language    => "dc|language",
      :description => "dc|description",
      :date        => "dc|date",
      :rights      => "dc|rights",
      :publisher   => "dc|publisher",
      :subjects    => "dc|subject"
    }.freeze

    ##
    # ==== Attributes
    # * +package_doc+ Nokogiri::XML document
    #
    # Raises ArgumentError unless +package_doc+ reports itself as XML.
    #
    def initialize(package_doc)
      # The comma matters: without it Ruby parses this as a call to a method
      # named ArgumentError and fails with NoMethodError instead.
      raise ArgumentError, "Valid Nokogiri::XML node not found" unless package_doc.xml?
      @doc = package_doc
    end

    ##
    # Returns a Hash with keys identical to the METADATA_ATTRIBUTES hash above.
    # A value is nil when the package has no matching element. Only the first
    # matching element is read for each key.
    #
    def metadata
      METADATA_ATTRIBUTES.each_with_object({}) do |(key, selector), result|
        node = find_first(selector)
        result[key] = node && node.text
      end
    end

    ##
    # Returns an Array of strings representing paths to chapter files, in
    # spine order. Spine entries without a matching manifest item (or whose
    # item has no href) are skipped.
    #
    def contents
      # Index the manifest once instead of running one selector per spine
      # entry; this also avoids interpolating raw ids into a CSS selector.
      hrefs_by_id = {}
      find_all("manifest item").each do |item|
        hrefs_by_id[item['id']] ||= item['href']
      end

      find_all("spine itemref").map { |ref| hrefs_by_id[ref['idref']] }.compact
    end

    private

    # Nokogiri::XML::Document
    attr_reader :doc

    ##
    # Namespace prefixes declared in the document, collected once and reused
    # by every lookup.
    #
    def namespaces
      @namespaces ||= doc.collect_namespaces
    end

    ##
    # Returns a Nokogiri::XML::Node, or nil when nothing matches.
    #
    # ==== Attributes
    # * +selector+ CSS Selector (string) for use with Nokogiri's +css+ method
    #
    def find_first(selector)
      doc.at_css(selector, namespaces)
    end

    ##
    # Returns a Nokogiri::XML::Nodeset
    #
    # ==== Attributes
    # * +selector+ CSS Selector (string) for use with Nokogiri's +css+ method
    #
    def find_all(selector)
      doc.css(selector, namespaces)
    end
  end

  ##
  # (X)HTML document of an EPUB chapter.
  #
  class EpubChapter

    # Array of EpubParser::Block objects
    attr_reader :contents

    ##
    # Collects the chapter's blocks by walking the document body.
    #
    # * +chapter_doc+ Nokogiri::HTML document
    #
    # Raises ArgumentError unless +chapter_doc+ reports itself as HTML.
    #
    def initialize(chapter_doc)
      raise ArgumentError, "Valid Nokogiri::XML node not found" unless chapter_doc.html?

      @doc      = chapter_doc
      @contents = []

      # at_css returns nil when the document has no <body>; treat that as a
      # chapter with no blocks instead of crashing on nil.
      body = doc.at_css('body')
      parse_block(body) if body
    end

    ##
    # Returns the title reported by the underlying document.
    #
    def title
      doc.title
    end

    private

    # Array of EpubParser::Block objects
    attr_writer :contents

    # Nokogiri::HTML document passed to the constructor
    attr_reader :doc

    ##
    # Depth-first walk: a whitelisted element becomes a Block (which handles
    # its own subtree); any other element is descended into.
    #
    def parse_block(node)
      if BLOCK_TYPES.include?(node.name)
        contents << Block.new(node)
      else
        node.element_children.each { |e| parse_block(e) }
      end
    end

    ##
    # Returns a Nokogiri::XML::Node
    #
    # ==== Attributes
    # * +selector+ CSS Selector (string) for use with Nokogiri's +css+ method
    #
    def find_first(selector)
      namespaces = doc.collect_namespaces
      doc.at_css(selector, namespaces)
    end

    ##
    # Returns a Nokogiri::XML::Nodeset
    #
    # ==== Attributes
    # * +selector+ CSS Selector (string) for use with Nokogiri's +css+ method
    #
    def find_all(selector)
      namespaces = doc.collect_namespaces
      doc.css(selector, namespaces)
    end
  end

  ##
  # Basic building-block of a book chapter
  #
  class Block

    # Array of nested Block and TextNode objects, in document order
    attr_reader :contents

    ##
    # Records the node's tag name and converts its children.
    #
    # ==== Attributes
    # * +node+ Nokogiri::XML::Node
    def initialize(node)
      @tag      = node.name
      @contents = []

      parse_children(node)
    end

    private

    attr_writer :contents

    ##
    # Appends one entry per convertible child of +node+.
    #
    def parse_children(node)
      node.children.each do |child|
        entry = convert(child)
        contents << entry if entry
      end
    end

    ##
    # Turns a single child into a Block (whitelisted element), a TextNode
    # (any other element, or a text node, with non-empty text) or nil
    # (everything else).
    #
    def convert(child)
      if child.element?
        return Block.new(child) if BLOCK_TYPES.include?(child.name)

        TextNode.new(child.text, child.name) unless child.text.empty?
      elsif child.text?
        TextNode.new(child.text) unless child.text.empty?
      end
    end
  end

  ##
  # Raw text snippet with some basic properties
  #
  class TextNode
    # * +text+ - the snippet's text
    # * +tag+  - label stored alongside the text; "text" unless given
    def initialize(text, tag="text")
      @text, @tag = text, tag
    end
  end
 end
	# lib/tasks/epub.rake

	require "pathname"

	namespace :epub do
	  desc "Import data from a target epub file"
	  task :import => :environment do
	    # The EPUB location is read from the last command-line argument.
	    epub_path = Pathname.new(ARGV.last)
	    unless epub_path.exist?
	      raise ArgumentError, "Please provide a path to a valid EPUB file."
	    end

	    EpubParser.parse(epub_path)

	    # NOTE(review): presumably exits here so the path argument is not
	    # interpreted as a further task name -- confirm.
	    exit 0
	  end
	end
	require "nokogiri"
	require "pathname"
	require "yaml"
	require "zip"
	require "zip/filesystem"

	##
	# This class is responsible for breaking down a zipped .epub file into a data
	# structure that can be stored in the application's database.
	#
	class EpubParser

	  # Whitelist of block-level HTML elements we will recognize. Frozen because
	  # the list is shared by every chapter and block.
	  BLOCK_TYPES = %w[h1 h2 h3 h4 h5 h6 p blockquote ul ol img].freeze

	  ##
	  # Parses an epub file. This method is the "public interface" of this class.
	  # * +epub+ - Path to a valid epub file.
	  #
	  def self.parse(epub)
	    new(epub).parse
	  end

	  ##
	  # Opens the archive and eagerly loads its container file, its package
	  # document and every chapter the package lists.
	  #
	  # * +epub+ - Path to a valid epub file.
	  #
	  # Raises ArgumentError when nothing exists at +epub+.
	  #
	  def initialize(epub)
	    # Validate the path we were actually handed rather than ARGV.last, which
	    # only matches +epub+ when called from the rake task.
	    raise ArgumentError, "Please provide a path to a valid EPUB file." unless Pathname.new(epub).exist?

	    @epub = Zip::File.open(epub)
	    begin
	      @container    = parse_zipped_xml_file('META-INF/container.xml')
	      @package_path = Pathname.new(@container.css('rootfile')[0]['full-path'])
	      @package      = EpubPackage.new(parse_zipped_xml_file(@package_path.to_s))

	      # Chapter hrefs are resolved relative to the package document's directory.
	      package_dir = @package_path.dirname
	      @chapters = @package.contents.map do |item|
	        EpubChapter.new(parse_zipped_html_file((package_dir + item).to_s))
	      end
	    rescue StandardError
	      # Do not leak the open archive when loading fails part-way through.
	      @epub.close
	      raise
	    end
	  end

	  ##
	  # Actually parse the epub; the main sequence of operations in this service.
	  # Prints each chapter's title with its block count, then a YAML dump of one
	  # randomly sampled block. The archive is closed even when reporting fails.
	  #
	  def parse
	    puts "Parsing!"
	    puts "\n"
	    chapters.each { |chapter| puts "#{chapter.title}: #{chapter.contents.size}" }
	    puts "\n"

	    # Array#sample returns nil for an empty array, so only dump a block when
	    # the book actually has chapters.
	    sample_chapter = chapters.sample
	    puts sample_chapter.contents.sample.to_yaml if sample_chapter
	    puts "Finished!"
	  ensure
	    close_archive
	  end

	  private

	  # Zip::File object representing the .epub archive
	  attr_reader :epub

	  # container is the Nokogiri::XML::Document parsed from
	  # META-INF/container.xml; package is the EpubPackage wrapping the package
	  # document.
	  attr_reader :container, :package

	  # Array of EpubChapter objects
	  attr_reader :chapters

	  ##
	  # Closes the Zip::File instance after we are done using it.
	  #
	  def close_archive
	    epub.close
	  end

	  ##
	  # Feeds a Zip::Entry object into Nokogiri to be parsed as XML.
	  # Returns Nokogiri::XML::Document.
	  # Zip::File#get_entry throws Errno::ENOENT if no file is found
	  #
	  def parse_zipped_xml_file(filename)
	    entry = epub.get_entry(filename)
	    entry.get_input_stream { |f| Nokogiri::XML(f) }
	  end

	  ##
	  # Feeds a Zip::Entry object into Nokogiri to be parsed as HTML.
	  # Returns Nokogiri::HTML::Document.
	  # Zip::File#get_entry throws Errno::ENOENT if no file is found
	  #
	  def parse_zipped_html_file(filename)
	    entry = epub.get_entry(filename)
	    entry.get_input_stream { |f| Nokogiri::HTML(f) }
	  end

	  ##
	  # XML Package document of an EPUB file.
	  #
	  class EpubPackage
	    # Maps each metadata key to the namespaced CSS selector of its element.
	    METADATA_ATTRIBUTES = {
	      :id          => "dc|identifier",
	      :title       => "dc|title",
	      :creator     => "dc|creator",
	      :language    => "dc|language",
	      :description => "dc|description",
	      :date        => "dc|date",
	      :rights      => "dc|rights",
	      :publisher   => "dc|publisher",
	      :subjects    => "dc|subject"
	    }.freeze

	    ##
	    # ==== Attributes
	    # * +package_doc+ Nokogiri::XML document
	    #
	    # Raises ArgumentError unless +package_doc+ reports itself as XML.
	    #
	    def initialize(package_doc)
	      # The comma matters: without it Ruby parses this as a call to a method
	      # named ArgumentError and fails with NoMethodError instead.
	      raise ArgumentError, "Valid Nokogiri::XML node not found" unless package_doc.xml?
	      @doc = package_doc
	    end

	    ##
	    # Returns a Hash with keys identical to the METADATA_ATTRIBUTES hash above.
	    # A value is nil when the package has no matching element.
	    #
	    def metadata
	      METADATA_ATTRIBUTES.each_with_object({}) do |(key, selector), result|
	        node = find_first(selector)
	        result[key] = node && node.text
	      end
	    end

	    ##
	    # Returns an Array of strings representing paths to chapter files, in
	    # spine order. Spine entries without a matching manifest item (or whose
	    # item has no href) are skipped.
	    #
	    def contents
	      # Index the manifest once instead of running one selector per spine
	      # entry; this also avoids interpolating raw ids into a CSS selector.
	      hrefs_by_id = {}
	      find_all("manifest item").each do |item|
	        hrefs_by_id[item['id']] ||= item['href']
	      end

	      find_all("spine itemref").map { |ref| hrefs_by_id[ref['idref']] }.compact
	    end

	    private

	    # Nokogiri::XML::Document
	    attr_reader :doc

	    ##
	    # Namespace prefixes declared in the document, collected once and reused.
	    #
	    def namespaces
	      @namespaces ||= doc.collect_namespaces
	    end

	    ##
	    # Returns a Nokogiri::XML::Node, or nil when nothing matches.
	    #
	    # ==== Attributes
	    # * +selector+ CSS Selector (string) for use with Nokogiri's +css+ method
	    #
	    def find_first(selector)
	      doc.at_css(selector, namespaces)
	    end

	    ##
	    # Returns a Nokogiri::XML::Nodeset
	    #
	    # ==== Attributes
	    # * +selector+ CSS Selector (string) for use with Nokogiri's +css+ method
	    #
	    def find_all(selector)
	      doc.css(selector, namespaces)
	    end
	  end

	  ##
	  # (X)HTML document of an EPUB chapter.
	  #
	  class EpubChapter

	    # Array of EpubParser::Block objects
	    attr_reader :contents

	    ##
	    # Collects the chapter's blocks by walking the document body.
	    #
	    # * +chapter_doc+ Nokogiri::HTML document
	    #
	    # Raises ArgumentError unless +chapter_doc+ reports itself as HTML.
	    #
	    def initialize(chapter_doc)
	      raise ArgumentError, "Valid Nokogiri::XML node not found" unless chapter_doc.html?

	      @doc      = chapter_doc
	      @contents = []

	      # at_css returns nil when the document has no <body>; treat that as a
	      # chapter with no blocks instead of crashing on nil.
	      body = doc.at_css('body')
	      parse_block(body) if body
	    end

	    ##
	    # Returns the title reported by the underlying document.
	    #
	    def title
	      doc.title
	    end

	    private

	    # Array of EpubParser::Block objects
	    attr_writer :contents

	    # Nokogiri::HTML document passed to the constructor
	    attr_reader :doc

	    ##
	    # Depth-first walk: a whitelisted element becomes a Block (which handles
	    # its own subtree); any other element is descended into.
	    #
	    def parse_block(node)
	      if BLOCK_TYPES.include?(node.name)
	        contents << Block.new(node)
	      else
	        node.element_children.each { |e| parse_block(e) }
	      end
	    end

	    ##
	    # Returns a Nokogiri::XML::Node
	    #
	    # ==== Attributes
	    # * +selector+ CSS Selector (string) for use with Nokogiri's +css+ method
	    #
	    def find_first(selector)
	      namespaces = doc.collect_namespaces
	      doc.at_css(selector, namespaces)
	    end

	    ##
	    # Returns a Nokogiri::XML::Nodeset
	    #
	    # ==== Attributes
	    # * +selector+ CSS Selector (string) for use with Nokogiri's +css+ method
	    #
	    def find_all(selector)
	      namespaces = doc.collect_namespaces
	      doc.css(selector, namespaces)
	    end
	  end

	  ##
	  # Basic building-block of a book chapter
	  #
	  class Block

	    # Array of nested Block and TextNode objects, in document order
	    attr_reader :contents

	    ##
	    # Records the node's tag name and converts its children.
	    #
	    # ==== Attributes
	    # * +node+ Nokogiri::XML::Node
	    def initialize(node)
	      @tag      = node.name
	      @contents = []

	      parse_children(node)
	    end

	    private

	    attr_writer :contents

	    ##
	    # Appends one entry per convertible child: a Block for whitelisted
	    # elements, a TextNode for other elements and text nodes with non-empty
	    # text. Every other node type is ignored.
	    #
	    def parse_children(node)
	      node.children.each do |child|
	        if child.element?
	          if BLOCK_TYPES.include?(child.name)
	            contents << Block.new(child)
	          else
	            contents << TextNode.new(child.text, child.name) unless child.text.empty?
	          end
	        elsif child.text?
	          contents << TextNode.new(child.text) unless child.text.empty?
	        end
	      end
	    end
	  end

	  ##
	  # Raw text snippet with some basic properties
	  #
	  class TextNode
	    # * +text+ - the snippet's text
	    # * +tag+  - label stored alongside the text; "text" unless given
	    def initialize(text, tag="text")
	      @text = text
	      @tag = tag
	    end
	  end
	end