Created
October 23, 2015 21:17
-
-
Save takehiko/51b0e9623bdf314821b0 to your computer and use it in GitHub Desktop.
Intro Extractor for Articles in Aozora Bunko
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby | |
# aozora-picker.rb : collects the opening text (intro) of works published in Aozora Bunko (青空文庫)
# by takehikom
# usage:
#   ruby aozora-picker.rb
#   ruby aozora-picker.rb wa
#   ruby aozora-picker.rb /path/to/github/aozorabunko/index_pages/sakuhin_wa?.html
# see also:
#   http://www.aozora.gr.jp/
#   https://github.com/aozorabunko/aozorabunko
require "open-uri" | |
require "nokogiri" | |
require "uri" | |
require "kconv" | |
# Walks Aozora Bunko index pages, follows each listed work to its XHTML
# edition, and records the first +intro_length+ characters of its text.
#
# Flow: index page -> "../cards/...html" info page -> "files/...html" XHTML
# page. Each collected work becomes one "URI : intro ..." string in
# +article_a+, and +start+ finally writes them to a text file via +save+.
class AozoraPicker
  # Index page used when no usable argument is supplied.
  DEFAULT_INDEX_URI = "http://www.aozora.gr.jp/index_pages/sakuhin_wa1.html"

  attr_accessor :index_uri_a, :sleep_time, :intro_length
  attr_accessor :article_count_limit, :article_count
  attr_reader :article_a

  # @param param [Array<String>, String, nil] one of:
  #   * a non-empty Array of index page URIs or local file paths,
  #   * the keyword "wa" (as a String, or as the only element of an Array such
  #     as ARGV) to select the six sakuhin_wa1..6 index pages,
  #   * anything else (nil, empty Array) to use the single default index page.
  def initialize(param = nil)
    @index_uri_a = resolve_index_uris(param)
    @article_a = []
    @article_count_limit = 3
    @article_count = 0
    @sleep_time = 2
    @intro_length = 100
  end

  # Processes every index page in turn, then writes the results with +save+.
  # Index pages reached after the article limit has been hit are only
  # reported as skipped.
  def start
    @article_count = 0
    @index_uri_a.each do |index_uri|
      # An entry without ":/" is taken to be a local path, not a URI.
      index_uri = "file:" + File.expand_path(index_uri) unless index_uri.include?(":/")
      puts index_uri
      if collected?
        puts " skipped"
        next
      end
      get_article_by_index(index_uri)
    end
    save
  end

  # Writes the collected "URI : intro" lines to +filename+, one per line.
  #
  # @param filename [String] output path (overwritten if it exists)
  def save(filename = "aozora.txt")
    # File.open rather than Kernel#open: the latter would run a command if
    # the name began with "|".
    File.open(filename, "w") do |f_out|
      f_out.puts @article_a
    end
  end

  # Scans an index page for "../cards...html" links and visits each one
  # until the article limit is reached.
  #
  # @param index_uri [String] URI of the index page ("file:" URIs allowed)
  def get_article_by_index(index_uri)
    index_html = read_resource(index_uri)
    index_html.scan(/\.\.\/cards.+html/) do |path1|
      info_uri = URI.join(index_uri, path1).to_s
      puts "==> #{info_uri}"
      get_article_by_info(info_uri)
      break if collected?
    end
  end

  # Looks on a work's info page for the link labelled "いますぐXHTML" and, if
  # present, collects the intro from the XHTML page it points at.
  #
  # @param info_uri [String] URI of the info ("cards") page
  def get_article_by_info(info_uri)
    info_html = read_resource(info_uri).toutf8
    found = /(files.+html)..いますぐXHTML/.match(info_html)
    return unless found

    xhtml_uri = URI.join(info_uri, found[1]).to_s
    puts " ==> #{xhtml_uri}"
    get_article_by_xhtml(xhtml_uri)
  end

  # Parses the XHTML page as CP932, strips the first occurrence of the page
  # title from the body text, and appends "URI : intro ..." to +article_a+.
  # Sleeps +sleep_time+ seconds afterwards unless the limit has been reached
  # or the page came from a "file:" URI.
  #
  # @param xhtml_uri [String] URI of the XHTML edition of the work
  def get_article_by_xhtml(xhtml_uri)
    xhtml_doc = open_resource(xhtml_uri) { |io| Nokogiri::HTML.parse(io, nil, "CP932") }
    xhtml_content = xhtml_doc.content.toutf8.strip
    # A page without <title> yields nil; treat that as "no title to remove".
    xhtml_title = xhtml_doc.title.to_s.toutf8

    title_pos = xhtml_title.empty? ? nil : xhtml_content.index(xhtml_title)
    if title_pos
      puts " Title found in content"
      xhtml_content[title_pos, xhtml_title.length] = ""
    end

    xhtml_intro = build_intro(xhtml_content)
    puts " ==> #{xhtml_intro}"
    @article_a << [xhtml_uri, xhtml_intro].join(" : ")
    @article_count += 1

    return if collected?
    return if xhtml_uri.start_with?("file:")

    sleep @sleep_time if @sleep_time > 0
  end

  # @return [Boolean] true once +article_count+ has reached +article_count_limit+
  def collected?
    @article_count_limit <= @article_count
  end

  private

  # Maps the constructor argument to the list of index pages to visit.
  def resolve_index_uris(param)
    if param.is_a?(Array)
      # ARGV arrives as an Array, so the documented "wa" keyword shows up as
      # a one-element Array rather than as a bare String.
      return wa_index_uris if param.size == 1 && /\Awa\z/i =~ param.first.to_s
      return param unless param.empty?
    elsif /\Awa/i =~ param.to_s
      return wa_index_uris
    end
    [DEFAULT_INDEX_URI]
  end

  # The six sakuhin_wa1..sakuhin_wa6 index pages.
  def wa_index_uris
    (1..6).map { |i| "http://www.aozora.gr.jp/index_pages/sakuhin_wa#{i}.html" }
  end

  # Opens +uri+ and yields the IO, closing it when the block returns.
  # "file:" URIs are opened as local files; everything else goes through
  # open-uri's URI.open, because Kernel#open no longer accepts URLs on
  # Ruby 3.0 and later.
  def open_resource(uri, &block)
    if uri.start_with?("file:")
      File.open(uri.sub(%r{\Afile:/*}, "/"), &block)
    else
      URI.open(uri, &block)
    end
  end

  # Returns the whole body of +uri+ as a String.
  def read_resource(uri)
    open_resource(uri) { |io| io.read }
  end

  # Collapses whitespace runs to single spaces, removes unwanted characters,
  # and truncates to +intro_length+ characters followed by " ...".
  def build_intro(content)
    # NOTE(review): the space inside [\s ] is redundant with \s as written;
    # it may originally have been a full-width space (U+3000) - confirm
    # against the upstream script.
    text = content.gsub(/[\s ]+/m, " ").strip
    # NOTE(review): as written this deletes the literal characters ".", "*"
    # and "?". It may have been meant to strip bracketed annotations instead -
    # confirm against the upstream script before changing.
    text = text.gsub(/[.*?]/, "")
    text[0, @intro_length] + " ..."
  end
end
# Core extension used to turn "file:" URIs back into local paths.
class String
  # Returns a copy of the string in which a leading "file:" scheme, together
  # with any slashes that follow it, is replaced by a single "/".
  # Strings that do not start with "file:" are returned unchanged.
  #
  # Anchored with \A (start of string) rather than ^ (start of any line) so
  # that a "file:" appearing at the start of a later line is left alone.
  #
  # @return [String]
  def trim_file_scheme
    sub(%r{\Afile:/*}, "/")
  end
  alias tfs trim_file_scheme
end
# Run the picker with the command-line arguments only when this file is
# executed directly, not when it is loaded by another script.
AozoraPicker.new(ARGV).start if $PROGRAM_NAME == __FILE__
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment