Last active
January 20, 2019 05:40
-
-
Save itkq/11d9953777cc2a2ba5f433cf8a60f408 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'net/http' | |
require 'open-uri' | |
require 'fileutils' | |
require 'nokogiri' | |
# Downloads the images attached to the articles listed under
# "#{BASE_URL}/column/color/pN/" and stores them in one sub-directory per
# article date.
class KonoiroIinaCollector
  BASE_URL = 'https://webnewtype.com'.freeze
  DEFAULT_RESOLUTION = 'w3600h2700'.freeze
  # Trailing "/w<digits>h<digits>/" path segment of an image URL. It is
  # stripped from scraped URLs and replaced by the requested resolution.
  RESOLUTION_SUFFIX = %r{/w\d+h\d+/\z}.freeze

  # Convenience entry point: builds a collector and downloads everything.
  #
  # @param save_dir [String, nil] root output directory (nil => './images')
  # @param resolution [String, nil] resolution segment (nil => DEFAULT_RESOLUTION)
  def self.save_all_images(save_dir = nil, resolution = nil)
    new(save_dir, resolution).save_all_images
  end

  # @param save_dir [String, nil] root output directory (nil => './images')
  # @param resolution [String, nil] resolution segment (nil => DEFAULT_RESOLUTION)
  def initialize(save_dir, resolution)
    @save_dir = save_dir || './images'
    @resolution = resolution || DEFAULT_RESOLUTION
    create_dir(@save_dir)
  end

  # Creates +dir+ (and any missing parents). mkdir_p is a no-op for an
  # existing directory, so no existence check is needed.
  def create_dir(dir)
    FileUtils.mkdir_p(dir)
  end

  # Walks every listed article, logs it, and downloads its images.
  def save_all_images
    enumerate_article do |href, date|
      puts "#{date}: #{href}"
      save_images_in_article(href, date)
    end
  end

  # Downloads all images of one article into "<save_dir>/<date>/".
  #
  # @param href [String] article path as found in the listing
  # @param date [String] normalized article date, used as directory name
  # @param sleep_sec [Numeric] seconds to pause after each image
  def save_images_in_article(href, date, sleep_sec: 0.5)
    create_dir(File.join(@save_dir, date))

    enumerate_image_sources(href) do |src|
      save_image(date, src)
      sleep(sleep_sec)
    end
  end

  # Downloads a single image unless the target file already exists.
  # Progress is printed to stdout.
  #
  # @param date [String] normalized article date (sub-directory name)
  # @param src [String] image URL as scraped from the article page
  def save_image(date, src)
    target_src = image_source_with_resolution(src)
    filename = File.join(@save_dir, date, image_file_name(src))

    print target_src

    if File.exist?(filename)
      puts " => already exists (#{filename})"
      return
    end

    # URI.open (from open-uri) replaces Kernel#open, which no longer accepts
    # URLs on Ruby 3.0+.
    URI.open(target_src) do |img|
      File.binwrite(filename, img.read)
    end

    puts " => saved (to #{filename})"
  end

  # Builds the local file name "<name>_<resolution><ext>" for an image URL.
  # Only the last extension is treated as the extension, so names containing
  # additional dots are kept intact.
  #
  # @param src [String] image URL, with or without a resolution suffix
  # @return [String]
  def image_file_name(src)
    base = File.basename(src.sub(RESOLUTION_SUFFIX, ''))
    ext = File.extname(base)
    "#{File.basename(base, ext)}_#{@resolution}#{ext}"
  end

  # Rewrites an image URL so that it ends with the configured resolution.
  #
  # @param src [String] image URL, with or without a resolution suffix
  # @return [String] "<image url>/<resolution>/"
  # @raise [RuntimeError] if the URL without its suffix is not a .jpg
  def image_source_with_resolution(src)
    genuine_img_src = src.sub(RESOLUTION_SUFFIX, '')
    assert_genuine_image_source!(genuine_img_src)
    "#{genuine_img_src}/#{@resolution}/"
  end

  # @param genuine_img_src [String] image URL without resolution suffix
  # @raise [RuntimeError] unless the URL ends with ".jpg"
  def assert_genuine_image_source!(genuine_img_src)
    return if genuine_img_src.end_with?('.jpg')

    raise "#{genuine_img_src} is not genuine image source"
  end

  # Yields the src attribute of every image found in the related-image area
  # of the page at "<article_href>1/".
  # NOTE(review): the "1/" suffix presumably addresses the article's image
  # page - confirm against the site.
  #
  # @param article_href [String] article path as found in the listing
  # @yieldparam src [String] image URL
  def enumerate_image_sources(article_href)
    url = URI.join(BASE_URL, "#{article_href}1/")
    img_area = fetch_document(url).css('div.related_imgArea')

    img_area.css('div.imgBox > img').each do |img|
      yield(img.attr('src'))
    end
  end

  # Yields every article of the listing, page by page, starting at page 1.
  # Stops at the first page without entries or at the first entry whose
  # title is empty.
  #
  # @yieldparam href [String] article path
  # @yieldparam date [String] article date as yyyy-mm-dd
  def enumerate_article
    page = 1

    loop do
      url = URI.join(BASE_URL, "/column/color/p#{page}/")
      anchors = fetch_document(url).css('div.listBox > ul > li > a')
      # Without this guard an empty page would be requested forever.
      return if anchors.empty?

      anchors.each do |a|
        title = a.css('p.columnTitle').text.strip
        return if title.empty?

        href = a.attr('href')
        date = a.css('span.columnDate').text.strip
        yield(href, normalize_date(date))
      end

      # Advance once per listing page (not once per article).
      page += 1
    end
  end

  # Converts a date beginning with "YYYY年MM月DD日" into "yyyy-mm-dd".
  #
  # @param date [String]
  # @return [String]
  # @raise [ArgumentError] if the text does not start with such a date
  def normalize_date(date)
    matched = date.match(/\A(\d{4})年(\d{2})月(\d{2})日/)
    raise ArgumentError, "unexpected date format: #{date.inspect}" unless matched

    yyyy, mm, dd = matched[1..3]
    "#{yyyy}-#{mm}-#{dd}"
  end

  private

  # Fetches +url+ and parses the body as HTML.
  # Net::HTTPResponse#value raises when the response status is not 2xx.
  def fetch_document(url)
    res = Net::HTTP.get_response(url)
    res.value
    Nokogiri::HTML(res.body)
  end
end
# Script entry point: download every listed article's images, letting the
# collector fall back to its default directory and resolution.
KonoiroIinaCollector.save_all_images(nil, nil)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment