Last active
December 18, 2015 13:09
-
-
Save rochefort/5788276 to your computer and use it in GitHub Desktop.
wikiの小倉百人一首及び画像をダウンロードする
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# encoding: utf-8 | |
#!/usr/bin/env ruby | |
require 'rubygems'
require 'fileutils'
require 'open-uri'
require 'pismo'
# Scrapes the Ogura Hyakunin Isshu page on Japanese Wikisource and downloads
# the accompanying images.
module Ogura100
  # Wikisource page that holds the poem table.
  URL = 'http://ja.wikisource.org/wiki/%E5%B0%8F%E5%80%89%E7%99%BE%E4%BA%BA%E4%B8%80%E9%A6%96'.freeze

  # Reads the poem table and fetches the 100 numbered images.
  class Scrape
    # Parses every table row except the first one into a poem hash, prints
    # the resulting list with `p`, and returns it.
    #
    # @return [Array<Hash>] one hash per row with the keys :id, :first,
    #   :second, :first_kana, :second_kana, :kimariji, :author, :author_kana
    def run
      page = Pismo::Document.new(URL)
      table = page.doc.css('div#mw-content-text table')
      # The first row is skipped; the row index doubles as the poem id.
      numbered_rows = table.css('tr').each_with_index.drop(1)
      poems = numbered_rows.map { |row, number| parse_row(row, number) }
      p poems
    end

    # Downloads the images 001.jpg .. 100.jpg into the "images" directory
    # (created if missing), using ten threads of ten images each.
    #
    # The process working directory is left untouched; files are written
    # through an absolute path instead of chdir-ing into the target directory.
    #
    # @return [Array<Thread>] the joined worker threads
    def download_public_images
      base_url = 'http://ja.wikisource.org/wiki/%E3%83%95%E3%82%A1%E3%82%A4%E3%83%AB:Hyakuninisshu_'
      # Resolved once up front so worker threads do not depend on the cwd.
      save_dir = File.expand_path('images')
      FileUtils.mkdir_p(save_dir)

      workers = (1..100).each_slice(10).map do |chunk|
        Thread.new do
          chunk.each { |number| download_image(base_url, save_dir, number) }
        end
      end
      # Thread#join re-raises any exception raised inside the worker.
      workers.each(&:join)
    end

    private

    # Builds the poem hash for one table row.
    # The first cell carries the verse lines, the second the author lines.
    def parse_row(row, number)
      cells = row.css('td')
      verse_cell = cells[0]
      author_cell = cells[1]
      first, second, first_kana, second_kana = cell_lines(verse_cell)
      author, author_kana = cell_lines(author_cell)

      {
        id: number,
        first: first,
        second: second,
        first_kana: first_kana,
        second_kana: second_kana,
        kimariji: verse_cell.css('b').text,
        author: author,
        author_kana: author_kana
      }
    end

    # Splits a cell's text into its lines.
    #
    # NOTE(review): as written the pattern below only matches the empty
    # string, so the gsub changes nothing. Presumably it was meant to strip
    # parenthesis characters that were lost in transcription - confirm
    # against the original file before changing it.
    def cell_lines(cell)
      cell.text.gsub(/(|)/, '').split("\n")
    end

    # Resolves the real image URL from the image's Wikisource file page and
    # saves the image under save_dir.
    def download_image(base_url, save_dir, number)
      image = format('%03d.jpg', number)
      file_page = Pismo::Document.new("#{base_url}#{image}")
      href = file_page.doc.css('div#file a').attr('href').text
      # The href is prefixed with a scheme to form the download URL.
      image_url = "http:#{href}"
      # URI#read (open-uri) is used instead of Kernel#open, which no longer
      # accepts URLs on Ruby 3.0+. Fetching before writing avoids leaving an
      # empty file behind when the download fails.
      data = URI.parse(image_url).read
      File.binwrite(File.join(save_dir, image), data)
    end
  end
end
# Scrape the poem table and fetch the images only when this file is executed
# directly rather than loaded by another file.
if __FILE__ == $PROGRAM_NAME
  scraper = Ogura100::Scrape.new
  scraper.run
  scraper.download_public_images
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment