Skip to content

Instantly share code, notes, and snippets.

@takehiko
Created October 23, 2015 21:17
Show Gist options
  • Save takehiko/51b0e9623bdf314821b0 to your computer and use it in GitHub Desktop.
Save takehiko/51b0e9623bdf314821b0 to your computer and use it in GitHub Desktop.
Intro Extractor for Articles in Aozora Bunko
#!/usr/bin/env ruby
# aozora-picker.rb : 青空文庫収録作品のイントロ収集
# by takehikom
# usage:
# ruby aozora-picker.rb
# ruby aozora-picker.rb wa
# ruby aozora-picker.rb /path/to/github/aozorabunko/index_pages/sakuhin_wa?.html
# see also:
# http://www.aozora.gr.jp/
# https://github.com/aozorabunko/aozorabunko
require "open-uri"
require "nokogiri"
require "uri"
require "kconv"
# Collects the opening text ("intro") of works linked from Aozora Bunko index
# pages and saves one "<xhtml uri> : <intro> ..." line per work.
#
# For every index page the picker follows each "../cards/....html" link to a
# card page, follows that page's XHTML link, extracts the document text with
# Nokogiri, and keeps the first +intro_length+ characters. Collection stops
# once +article_count_limit+ works have been gathered.
class AozoraPicker
  # Template of the index page URIs; %d is the page number.
  INDEX_URI_FORMAT = "http://www.aozora.gr.jp/index_pages/sakuhin_wa%d.html".freeze
  # Number of index pages used when the "wa" keyword is given.
  WA_PAGE_COUNT = 6
  # A single relative link to a card page inside an index page. The match is
  # bounded by quote/bracket/space characters and is non-greedy so that several
  # links on one line are reported separately.
  CARD_LINK_PATTERN = %r{\.\./cards[^"'<>\s]+?\.html}
  # Link to the XHTML version on a card page (the two wildcard characters skip
  # whatever separates the href value from the link text).
  XHTML_LINK_PATTERN = /(files.+html)..いますぐXHTML/
  # Runs of whitespace, including the ideographic space U+3000.
  WHITESPACE_PATTERN = /[\s\u3000]+/
  # Text enclosed in full-width square brackets (U+FF3B ... U+FF3D).
  # NOTE(review): the previous pattern /[.*?]/ was a character class that
  # deleted every ".", "*" and "?"; presumably the full-width brackets of
  # Aozora-style annotations were intended -- confirm.
  ANNOTATION_PATTERN = /\uFF3B.*?\uFF3D/

  attr_accessor :index_uri_a, :sleep_time, :intro_length
  attr_accessor :article_count_limit, :article_count
  attr_reader :article_a

  # param - selects the index pages to read:
  #         * a non-empty Array of URIs / local paths is used as given, except
  #           that the one-element Array ["wa"] (what ARGV looks like for
  #           `ruby aozora-picker.rb wa`) is treated as the "wa" keyword;
  #         * anything whose to_s starts with "wa" (case-insensitive) selects
  #           index pages 1..WA_PAGE_COUNT;
  #         * everything else selects index page 1 only.
  def initialize(param = nil)
    @index_uri_a =
      if wa_request?(param)
        (1..WA_PAGE_COUNT).map { |i| format(INDEX_URI_FORMAT, i) }
      elsif Array === param && !param.empty?
        param
      else
        [format(INDEX_URI_FORMAT, 1)]
      end
    @article_a = []
    @article_count_limit = 3
    @article_count = 0
    @sleep_time = 2
    @intro_length = 100
  end

  # Walks every index page, collecting intros until the limit is reached,
  # then writes the result with #save. Entries without ":/" are treated as
  # local paths and converted to "file:" URIs.
  def start
    @article_count = 0
    @index_uri_a.each do |entry|
      index_uri = entry.index(":/") ? entry : "file:" + File.expand_path(entry)
      puts index_uri
      if collected?
        puts " skipped"
        next
      end
      get_article_by_index(index_uri)
    end
    save
  end

  # Writes the collected lines to +filename+, one per line.
  # File.open is used (not Kernel#open) so the name is always a plain path.
  def save(filename = "aozora.txt")
    File.open(filename, "w") do |f_out|
      f_out.puts @article_a
    end
  end

  # Reads an index page and processes each card link found in it, stopping
  # as soon as enough articles have been collected.
  def get_article_by_index(index_uri)
    index_html = with_source(index_uri) { |io| io.read }
    index_html.scan(CARD_LINK_PATTERN) do |path1|
      info_uri = URI.join(index_uri, path1).to_s
      puts "==> #{info_uri}"
      get_article_by_info(info_uri)
      break if collected?
    end
  end

  # Reads a card page and, if it links to an XHTML version, processes that
  # document. Pages without such a link are silently ignored.
  def get_article_by_info(info_uri)
    info_html = with_source(info_uri) { |io| io.read }.toutf8
    return unless XHTML_LINK_PATTERN =~ info_html
    xhtml_uri = URI.join(info_uri, Regexp.last_match(1)).to_s
    puts " ==> #{xhtml_uri}"
    get_article_by_xhtml(xhtml_uri)
  end

  # Parses the XHTML document (as CP932), records its intro and, for
  # non-"file:" sources, sleeps +sleep_time+ seconds before the next fetch.
  def get_article_by_xhtml(xhtml_uri)
    xhtml_doc = with_source(xhtml_uri) { |io| Nokogiri::HTML.parse(io, nil, "CP932") }
    xhtml_content = xhtml_doc.content.toutf8.strip
    # title is nil when the document has no <title>; treat that as empty.
    xhtml_title = xhtml_doc.title.to_s.toutf8
    xhtml_intro = build_intro(xhtml_content, xhtml_title)
    puts " ==> #{xhtml_intro}"
    @article_a << [xhtml_uri, xhtml_intro].join(" : ")
    @article_count += 1
    return if collected?
    return if xhtml_uri.start_with?("file:")
    sleep @sleep_time if @sleep_time > 0
  end

  # True once at least +article_count_limit+ articles have been collected.
  def collected?
    @article_count_limit <= @article_count
  end

  private

  # True when +param+ asks for the full set of "wa" index pages.
  def wa_request?(param)
    if Array === param
      param.size == 1 && param.first.to_s.casecmp("wa") == 0
    else
      (/^wa/i =~ param.to_s) ? true : false
    end
  end

  # Opens +uri+, yields the IO to the block, closes it afterwards and returns
  # the block's value. http/https/ftp URIs go through open-uri; "file:" URIs
  # and plain paths are opened as local files in binary mode so that bytes in
  # any encoding can be scanned or converted later.
  def with_source(uri, &block)
    if uri =~ /\A(?:https?|ftp):/i
      URI.parse(uri).open(&block)
    else
      File.open(uri.sub(/\Afile:\/*/, "/"), "rb", &block)
    end
  end

  # Builds the intro line from the document text: drops the first occurrence
  # of the title, collapses whitespace, removes bracketed annotations and
  # truncates to +intro_length+ characters followed by " ...".
  def build_intro(content, title)
    text = content.dup
    unless title.empty?
      title_pos = text.index(title)
      if title_pos
        puts " Title found in content"
        text[title_pos, title.length] = ""
      end
    end
    text.gsub!(WHITESPACE_PATTERN, " ")
    text.strip!
    text.gsub!(ANNOTATION_PATTERN, "")
    text[0, @intro_length] + " ..."
  end
end
# Core-class extension that turns "file:" URIs into plain filesystem paths.
class String
  # Returns a copy in which a leading "file:" scheme, together with any
  # slashes that follow it, is replaced by a single "/". Strings that do not
  # start with "file:" are returned unchanged.
  #
  # The pattern is anchored with \A (start of string) rather than ^ (start of
  # any line) so that "file:" appearing after an embedded newline is left alone.
  def trim_file_scheme
    sub(/\Afile:\/*/, "/")
  end
  alias tfs trim_file_scheme
end
# Run the picker with the command-line arguments, but only when this file is
# the program being executed (not when it is loaded by another script).
AozoraPicker.new(ARGV).start if $PROGRAM_NAME == __FILE__
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment