takehiko · October 23, 2015 21:17
diff --git a/aozora-picker.rb b/aozora-picker.rb
 #!/usr/bin/env ruby

 # aozora-picker.rb : 青空文庫収録作品のイントロ収集
 #   by takehikom
 # usage:
 #   ruby aozora-picker.rb
 #   ruby aozora-picker.rb wa
 #   ruby aozora-picker.rb /path/to/github/aozorabunko/index_pages/sakuhin_wa?.html
 # see also:
 #   http://www.aozora.gr.jp/
 #   https://github.com/aozorabunko/aozorabunko

 require "open-uri"
 require "nokogiri"
 require "uri"
 require "kconv"

 # Collects short introductions of works listed on Aozora Bunko index pages.
 #
 # The picker walks index page -> card ("info") page -> XHTML body, keeps the
 # first +intro_length+ characters of each body, and finally writes every
 # collected "uri : intro" line to a text file.
 class AozoraPicker
   # param - selects the index pages to scan:
   #   * a non-empty Array of index page URIs or local file paths;
   #   * the keyword "wa" (a String starting with "wa", or a one-element
   #     Array holding exactly "wa", as received from ARGV) for the six
   #     sakuhin_wa index pages;
   #   * anything else (nil, empty Array, ...) for sakuhin_wa1.html only.
   def initialize(param = nil)
     if Array === param && !param.empty?
       # ARGV always arrives as an Array, so the "wa" keyword has to be
       # recognised here; otherwise it would be treated as a local file path.
       lone_keyword = param.size == 1 && /\Awa\z/i =~ param.first.to_s
       @index_uri_a = lone_keyword ? wa_index_uris : param
     elsif /\Awa/i =~ param.to_s
       @index_uri_a = wa_index_uris
     else
       @index_uri_a = wa_index_uris.first(1)
     end
     @article_a = []
     @article_count_limit = 3
     @article_count = 0
     @sleep_time = 2
     @intro_length = 100
   end
   attr_accessor :index_uri_a, :sleep_time, :intro_length
   attr_accessor :article_count_limit, :article_count
   attr_reader :article_a

   # Resets the article counter, scans each index page until the article
   # limit is reached (remaining pages are reported as skipped), then saves
   # the collected lines with #save.
   def start
     @article_count = 0
     @index_uri_a.each do |index_uri|
       # Anything without a "scheme:/" part is taken to be a local path.
       index_uri = "file:" + File.expand_path(index_uri) unless index_uri.include?(":/")
       puts "#{index_uri}"
       if collected?
         puts "   skipped"
         next
       end
       get_article_by_index(index_uri)
     end

     save
   end

   # Writes the collected "uri : intro" lines to +filename+, one per line.
   def save(filename = "aozora.txt")
     # File.open (not Kernel#open) so a name starting with "|" is never
     # interpreted as a command to run.
     File.open(filename, "w") do |f_out|
       f_out.puts @article_a
     end
   end

   # Reads the index page at +index_uri+ and follows every "../cards...html"
   # link found in it, stopping once the article limit is reached.
   def get_article_by_index(index_uri)
     index_html = open_resource(index_uri) { |io| io.read }
     index_html.scan(/\.\.\/cards.+html/) do |path1|
       info_uri = URI.join(index_uri, path1).to_s
       puts "==> #{info_uri}"
       get_article_by_info(info_uri)
       break if collected?
     end
   end

   # Reads the card page at +info_uri+ and, when it links to an XHTML
   # version of the work, processes that XHTML file.
   def get_article_by_info(info_uri)
     info_html = open_resource(info_uri) { |io| io.read }.toutf8
     return unless /(files.+html)..いますぐXHTML/ =~ info_html

     xhtml_uri = URI.join(info_uri, Regexp.last_match(1)).to_s
     puts "   ==> #{xhtml_uri}"
     get_article_by_xhtml(xhtml_uri)
   end

   # Parses the XHTML body at +xhtml_uri+ (as CP932), strips the title,
   # whitespace runs and ［...］ annotations, and records the first
   # +intro_length+ characters. Sleeps +sleep_time+ seconds afterwards unless
   # the limit has been reached or the source is a local file.
   def get_article_by_xhtml(xhtml_uri)
     xhtml_doc = open_resource(xhtml_uri) do |io|
       Nokogiri::HTML.parse(io, nil, "CP932")
     end
     xhtml_content = xhtml_doc.content.toutf8.strip
     # title is nil when the document has no <title>; to_s avoids a
     # NoMethodError in that case.
     xhtml_title = xhtml_doc.title.to_s.toutf8
     title_pos = xhtml_title.empty? ? nil : xhtml_content.index(xhtml_title)
     if title_pos
       puts "      Title found in content"
       xhtml_content[title_pos, xhtml_title.length] = ""
     end
     # Collapse whitespace (including full-width spaces) first, then drop
     # the ［...］ annotations.
     xhtml_content2 = xhtml_content.gsub(/[\s　]+/m, " ").strip.gsub(/［.*?］/, "")
     xhtml_intro = xhtml_content2[0, @intro_length] + " ..."
     puts "      ==> #{xhtml_intro}"
     @article_a << [xhtml_uri, xhtml_intro].join(" : ")

     @article_count += 1
     return if collected?
     return if xhtml_uri.start_with?("file:")
     sleep @sleep_time if @sleep_time > 0
   end

   # True once the number of collected articles has reached the limit.
   def collected?
     @article_count_limit <= @article_count
   end

   private

   # URIs of the six sakuhin_wa index pages on www.aozora.gr.jp.
   def wa_index_uris
     (1..6).map { |i| "http://www.aozora.gr.jp/index_pages/sakuhin_wa#{i}.html" }
   end

   # Opens +uri+ (with any "file:" scheme trimmed via String#tfs), yields the
   # IO to the block and closes it afterwards; returns the block's value.
   def open_resource(uri, &block)
     target = uri.tfs
     if %r{\A[A-Za-z][A-Za-z0-9+\-.]*://} =~ target
       # Kernel#open stopped handling URIs in Ruby 3.0; prefer URI.open and
       # fall back to Kernel#open only where open-uri lacks it.
       URI.respond_to?(:open) ? URI.open(target, &block) : open(target, &block)
     else
       File.open(target, &block)
     end
   end
 end

 # Core extension used by AozoraPicker to turn "file:" URIs into plain paths.
 class String
   # Returns a copy with a leading "file:" scheme (and any slashes directly
   # after it) replaced by a single "/". Strings that do not start with
   # "file:" are returned unchanged.
   def trim_file_scheme
     # \A anchors at the start of the whole string; ^ would also match a
     # "file:" prefix at the start of any later line.
     sub(%r{\Afile:/*}, "/")
   end
   alias :tfs :trim_file_scheme
 end

 # Script entry point: run the picker with the command-line arguments only
 # when this file is executed directly rather than loaded by another script.
 AozoraPicker.new(ARGV).start if $PROGRAM_NAME == __FILE__
	#!/usr/bin/env ruby

	# aozora-picker.rb : 青空文庫収録作品のイントロ収集
	# by takehikom
	# usage:
	# ruby aozora-picker.rb
	# ruby aozora-picker.rb wa
	# ruby aozora-picker.rb /path/to/github/aozorabunko/index_pages/sakuhin_wa?.html
	# see also:
	# http://www.aozora.gr.jp/
	# https://github.com/aozorabunko/aozorabunko

	require "open-uri"
	require "nokogiri"
	require "uri"
	require "kconv"

	# Collects short introductions of works listed on Aozora Bunko index pages.
	#
	# The picker walks index page -> card ("info") page -> XHTML body, keeps the
	# first +intro_length+ characters of each body, and finally writes every
	# collected "uri : intro" line to a text file.
	class AozoraPicker
	  # param - selects the index pages to scan:
	  #   * a non-empty Array of index page URIs or local file paths;
	  #   * the keyword "wa" (a String starting with "wa", or a one-element
	  #     Array holding exactly "wa", as received from ARGV) for the six
	  #     sakuhin_wa index pages;
	  #   * anything else (nil, empty Array, ...) for sakuhin_wa1.html only.
	  def initialize(param = nil)
	    if Array === param && !param.empty?
	      # ARGV always arrives as an Array, so the "wa" keyword has to be
	      # recognised here; otherwise it would be treated as a local file path.
	      lone_keyword = param.size == 1 && /\Awa\z/i =~ param.first.to_s
	      @index_uri_a = lone_keyword ? wa_index_uris : param
	    elsif /\Awa/i =~ param.to_s
	      @index_uri_a = wa_index_uris
	    else
	      @index_uri_a = wa_index_uris.first(1)
	    end
	    @article_a = []
	    @article_count_limit = 3
	    @article_count = 0
	    @sleep_time = 2
	    @intro_length = 100
	  end
	  attr_accessor :index_uri_a, :sleep_time, :intro_length
	  attr_accessor :article_count_limit, :article_count
	  attr_reader :article_a

	  # Resets the article counter, scans each index page until the article
	  # limit is reached (remaining pages are reported as skipped), then saves
	  # the collected lines with #save.
	  def start
	    @article_count = 0
	    @index_uri_a.each do |index_uri|
	      # Anything without a "scheme:/" part is taken to be a local path.
	      index_uri = "file:" + File.expand_path(index_uri) unless index_uri.include?(":/")
	      puts "#{index_uri}"
	      if collected?
	        puts " skipped"
	        next
	      end
	      get_article_by_index(index_uri)
	    end

	    save
	  end

	  # Writes the collected "uri : intro" lines to +filename+, one per line.
	  def save(filename = "aozora.txt")
	    # File.open (not Kernel#open) so a name starting with "|" is never
	    # interpreted as a command to run.
	    File.open(filename, "w") do |f_out|
	      f_out.puts @article_a
	    end
	  end

	  # Reads the index page at +index_uri+ and follows every "../cards...html"
	  # link found in it, stopping once the article limit is reached.
	  def get_article_by_index(index_uri)
	    index_html = open_resource(index_uri) { |io| io.read }
	    index_html.scan(/\.\.\/cards.+html/) do |path1|
	      info_uri = URI.join(index_uri, path1).to_s
	      puts "==> #{info_uri}"
	      get_article_by_info(info_uri)
	      break if collected?
	    end
	  end

	  # Reads the card page at +info_uri+ and, when it links to an XHTML
	  # version of the work, processes that XHTML file.
	  def get_article_by_info(info_uri)
	    info_html = open_resource(info_uri) { |io| io.read }.toutf8
	    return unless /(files.+html)..いますぐXHTML/ =~ info_html

	    xhtml_uri = URI.join(info_uri, Regexp.last_match(1)).to_s
	    puts " ==> #{xhtml_uri}"
	    get_article_by_xhtml(xhtml_uri)
	  end

	  # Parses the XHTML body at +xhtml_uri+ (as CP932), strips the title,
	  # whitespace runs and ［...］ annotations, and records the first
	  # +intro_length+ characters. Sleeps +sleep_time+ seconds afterwards unless
	  # the limit has been reached or the source is a local file.
	  def get_article_by_xhtml(xhtml_uri)
	    xhtml_doc = open_resource(xhtml_uri) do |io|
	      Nokogiri::HTML.parse(io, nil, "CP932")
	    end
	    xhtml_content = xhtml_doc.content.toutf8.strip
	    # title is nil when the document has no <title>; to_s avoids a
	    # NoMethodError in that case.
	    xhtml_title = xhtml_doc.title.to_s.toutf8
	    title_pos = xhtml_title.empty? ? nil : xhtml_content.index(xhtml_title)
	    if title_pos
	      puts " Title found in content"
	      xhtml_content[title_pos, xhtml_title.length] = ""
	    end
	    # Collapse whitespace (including full-width spaces) first, then drop
	    # the ［...］ annotations.
	    xhtml_content2 = xhtml_content.gsub(/[\s　]+/m, " ").strip.gsub(/［.*?］/, "")
	    xhtml_intro = xhtml_content2[0, @intro_length] + " ..."
	    puts " ==> #{xhtml_intro}"
	    @article_a << [xhtml_uri, xhtml_intro].join(" : ")

	    @article_count += 1
	    return if collected?
	    return if xhtml_uri.start_with?("file:")
	    sleep @sleep_time if @sleep_time > 0
	  end

	  # True once the number of collected articles has reached the limit.
	  def collected?
	    @article_count_limit <= @article_count
	  end

	  private

	  # URIs of the six sakuhin_wa index pages on www.aozora.gr.jp.
	  def wa_index_uris
	    (1..6).map { |i| "http://www.aozora.gr.jp/index_pages/sakuhin_wa#{i}.html" }
	  end

	  # Opens +uri+ (with any "file:" scheme trimmed via String#tfs), yields the
	  # IO to the block and closes it afterwards; returns the block's value.
	  def open_resource(uri, &block)
	    target = uri.tfs
	    if %r{\A[A-Za-z][A-Za-z0-9+\-.]*://} =~ target
	      # Kernel#open stopped handling URIs in Ruby 3.0; prefer URI.open and
	      # fall back to Kernel#open only where open-uri lacks it.
	      URI.respond_to?(:open) ? URI.open(target, &block) : open(target, &block)
	    else
	      File.open(target, &block)
	    end
	  end
	end

	# Core extension used by AozoraPicker to turn "file:" URIs into plain paths.
	class String
	  # Returns a copy with a leading "file:" scheme (and any slashes directly
	  # after it) replaced by a single "/". Strings that do not start with
	  # "file:" are returned unchanged.
	  def trim_file_scheme
	    # \A anchors at the start of the whole string; ^ would also match a
	    # "file:" prefix at the start of any later line.
	    sub(%r{\Afile:/*}, "/")
	  end
	  alias :tfs :trim_file_scheme
	end

	# Script entry point: run the picker with the command-line arguments only
	# when this file is executed directly rather than loaded by another script.
	AozoraPicker.new(ARGV).start if $PROGRAM_NAME == __FILE__