Created
July 16, 2012 18:32
-
-
Save fukayatsu/3124191 to your computer and use it in GitHub Desktop.
画像取得スクリプト
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding: utf-8 | |
require 'cgi'
require 'fileutils'
require 'kconv'
require 'yaml'

require 'bbs2ch'
# Image harvesting script.
#
# For every thread whose title matches /猫画像/ on boards matching /犬猫大好き/:
#   1. mirror the thread's dat file under dat/<board path>/ with wget,
#   2. parse only the lines that were added since the previous run,
#   3. download every image referenced by those lines into tmp/, appending the
#      image's metadata (YAML) to a "<image>.txt" sidecar,
#   4. move plausible images (and their sidecars) to img/ and delete the rest.

# Staging directory for freshly downloaded images and their metadata sidecars.
TMP_DIR = 'tmp'
# Final destination for downloads that passed the sanity check.
IMG_DIR = 'img'
# Downloads smaller than this many bytes are treated as failed or placeholders.
MIN_IMAGE_BYTES = 5 * 1024

# Runs wget with an explicit argument vector so that no shell is involved;
# URLs come from remote data and must never be interpolated into a shell line.
# wget's stdout is discarded; its stderr still reaches the terminal.
# Returns true on success, false on a non-zero exit status, and nil when wget
# could not be executed at all.
def run_wget(*args)
  system('wget', *args, out: File::NULL)
end

# Reads a dat file and converts its contents to UTF-8 (Kconv guesses the
# source encoding).
def read_dat(path)
  File.read(path).toutf8
end

# Mirrors the thread dat at +url+ into +dat_dir+ and returns the lines that
# were not present before this run.
#
# Only newline-terminated lines are counted, so a partially downloaded final
# line is left for the next run. Returns an empty array when nothing is new,
# when the local copy shrank, or when the dat could not be downloaded.
def fetch_new_lines(url, dat_dir, dat_path)
  FileUtils.mkdir_p(dat_dir)

  already_there = File.exist?(dat_path)
  seen = already_there ? read_dat(dat_path).count("\n") : 0

  wget_args = ['-t', '3', '-N', '-P', dat_dir]
  wget_args << '-c' if already_there # resume on top of the existing copy
  run_wget(*wget_args, url)

  unless File.exist?(dat_path)
    warn "dat download failed, skipping: #{url}"
    return []
  end

  text = read_dat(dat_path)
  # Array#[] returns nil when the start index is past the end of the array.
  text.split("\n")[seen...text.count("\n")] || []
end

# Downloads one image into TMP_DIR under its URL-escaped name and appends the
# image's metadata as YAML to the "<name>.txt" sidecar. The metadata is
# appended even when wget declines (-nc) or fails, so several posts that
# reference the same image accumulate in one sidecar.
def download_image(img)
  target = File.join(TMP_DIR, CGI.escape(img.url))
  run_wget('-t', '1', '--random-wait', '--timeout=5', '-nc', '-O', target, img.url)
  File.open("#{target}.txt", 'a') { |sidecar| sidecar << img.extra.to_yaml }
end

# True when the file is at least MIN_IMAGE_BYTES long and does not start with
# '<' (a leading '<' is taken to mean an HTML page was served instead).
def plausible_image?(path)
  return false if File.size(path) < MIN_IMAGE_BYTES

  File.open(path, 'rb') { |io| io.read(1) } != '<'
end

# Moves plausible jpg/png/gif downloads and their sidecars from TMP_DIR to
# IMG_DIR; deletes empty, tiny, or HTML downloads together with their
# sidecars. A missing sidecar is tolerated in both cases.
def sort_downloads
  Dir.glob(File.join(TMP_DIR, '*.{jpg,png,gif}')).each do |image|
    sidecar = "#{image}.txt"
    if plausible_image?(image)
      FileUtils.mv(image, IMG_DIR)
      FileUtils.mv(sidecar, IMG_DIR) if File.exist?(sidecar)
    else
      FileUtils.rm_f([image, sidecar])
    end
  end
end

BBS2ch::Menu.new.boards(/犬猫大好き/).each do |board|
  board.threads(/猫画像/).each do |thr|
    # Board URL without its scheme and without its last character
    # (presumably a trailing slash -- confirm against bbs2ch's board URLs).
    dat_dir = "dat/#{thr.extra[:board][:url].split('//')[1][0...-1]}"
    dat_path = "#{dat_dir}/#{thr.url.split('/').last}"

    FileUtils.mkdir_p(TMP_DIR)
    FileUtils.mkdir_p(IMG_DIR)

    fetch_new_lines(thr.url, dat_dir, dat_path).each do |line|
      response = BBS2ch::Response.parse(line, thr.extra)
      response.images.each { |img| download_image(img) }
    end

    sort_downloads
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment