Skip to content

Instantly share code, notes, and snippets.

@quanon
Created April 11, 2017 14:09
Show Gist options
  • Save quanon/64739b5b73272949bf0a4e4b01931eea to your computer and use it in GitHub Desktop.
Save quanon/64739b5b73272949bf0a4e4b01931eea to your computer and use it in GitHub Desktop.
require 'digest/md5'
require 'fileutils'
require 'json'
require 'open-uri'
require 'uri'

require 'selenium-webdriver'
# Searches Google Images for +keyword+ in a Selenium-driven Chrome session and
# writes the collected image URLs, one per line, to "urls/<label>.txt".
#
# The page is scrolled to the bottom repeatedly until its height stops
# growing; whenever the element with id "smb" is displayed it is clicked
# (presumably the "show more results" button -- confirm against the live page).
# URLs are read from the "ou" key of the JSON stored in each "rg_meta" div,
# stripped of their query string, and kept only when they end in .png, .jpg
# or .jpeg.
#
# @param label [Symbol, String] name used for the output file "urls/<label>.txt"
# @param keyword [String] text typed into the search box
# @return [void]
def write_urls(label, keyword)
  driver = Selenium::WebDriver.for(:chrome)
  begin
    driver.navigate.to('https://www.google.co.jp/imghp')
    driver.find_element(:id, 'lst-ib').send_keys(keyword)
    driver.find_element(:name, 'btnG').click

    scroll_height = 0
    loop do
      driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
      sleep(1)
      # find_elements returns an empty list when nothing matches, whereas
      # find_element raises; a missing button must not abort the crawl.
      more = driver.find_elements(:id, 'smb').first
      more.click if more && more.displayed?
      current_scroll_height = driver.execute_script('return document.body.scrollHeight;')
      # An unchanged height after scrolling means no more results were loaded.
      break if scroll_height == current_scroll_height

      scroll_height = current_scroll_height
    end

    elements = driver.find_elements(:xpath, '//div[@class="rg_meta"]')
    urls =
      elements
      .map { |element| JSON.parse(element.attribute('innerHTML'))['ou'] }
      .compact # entries without an "ou" key
      .map { |url| url[%r{\A[^?\n]+}] } # drop the query string
      .compact # nothing left before the "?"
      .select { |url| url.end_with?('.png', '.jpg', '.jpeg') }

    url_filepath = "urls/#{label}.txt"
    FileUtils.mkdir_p(File.dirname(url_filepath))
    File.write(url_filepath, urls.join("\n"))
  ensure
    # Always close the browser, even when a lookup or script call raised.
    driver.quit
  end
end
# Downloads every image listed in "urls/<label>.txt" into "images/<label>/".
#
# Each image is stored as "<MD5 of the URL><original extension>", so a URL
# whose file already exists is skipped. Blank lines are ignored. A failed or
# empty download leaves no file behind, so it is retried on the next run.
#
# @param label [Symbol, String] selects "urls/<label>.txt" and "images/<label>/"
# @param keyword [String] unused; kept so existing callers keep working
# @return [void]
def download_images(label, keyword)
  url_filepath = "urls/#{label}.txt"
  image_dir = "images/#{label}"
  FileUtils.mkdir_p(image_dir)

  lines = File.readlines(url_filepath)
  line_count = lines.length

  lines.each_with_index do |line, index|
    progress = " (#{index + 1}/#{line_count})"
    url = line.strip[%r{\A[^?\n]+}] # drop surrounding whitespace and the query string
    next if url.nil? # blank line, or nothing before the "?"

    hash = Digest::MD5.hexdigest(url)
    ext = File.extname(url)
    image_filepath = File.join(image_dir, "#{hash}#{ext}")

    if File.exist?(image_filepath)
      puts("#{progress} 同じファイルが存在するのでスキップ: #{url}")
      next
    end

    puts("#{progress} ダウンロード中: #{url}")
    data =
      begin
        fetch_image_data(url)
      rescue StandardError => e
        puts("#{progress} ダウンロード失敗: #{url}")
        warn("    #{e.class}: #{e.message}")
        nil
      end

    # Write only after a complete, non-empty download so that no partial or
    # zero-byte file can be mistaken for a finished image later.
    File.binwrite(image_filepath, data) if data && !data.empty?
  end
end

# Fetches the body of an HTTP(S) URL and returns it as a String.
#
# The URL is parsed first and opened through the URI object rather than
# Kernel#open, which would treat a string starting with "|" as a command.
#
# @param url [String]
# @return [String] response body
# @raise [ArgumentError] when +url+ is not an http/https URL
def fetch_image_data(url)
  uri = URI.parse(url)
  raise ArgumentError, "not an HTTP(S) URL: #{url}" unless uri.is_a?(URI::HTTP)

  uri.open(&:read)
end
# Randomly splits the files in "images/<label>/" into a training set (three
# quarters, rounded down) and a validation set (the rest), copying them into
# "data/train/<label>/" and "data/validate/<label>/". Both target directories
# are deleted and recreated first.
#
# @param label [Symbol, String] selects the source and target directories
# @return [Array<String>] paths of the files copied to the validation set
def divide_images(label)
  image_dir = "images/#{label}"
  # Only regular files: FileUtils.cp cannot copy a directory.
  all = Dir.glob(File.join(image_dir, '*')).select { |path| File.file?(path) }

  # Multiply before dividing; (n / 4) * 3 would truncate first and turn
  # 7 images into a 3/4 split instead of 5/2.
  train_count = all.length * 3 / 4
  shuffled = all.shuffle
  train = shuffled.take(train_count)
  validate = shuffled.drop(train_count)

  replace_dir_with_copies("data/train/#{label}", train)
  replace_dir_with_copies("data/validate/#{label}", validate)
end

# Recreates +dir+ from scratch and copies +filepaths+ into it.
#
# @param dir [String] directory to wipe and refill
# @param filepaths [Array<String>] files to copy
# @return [Array<String>] +filepaths+
def replace_dir_with_copies(dir, filepaths)
  FileUtils.rm_r(dir) if File.exist?(dir)
  FileUtils.mkdir_p(dir)
  filepaths.each { |filepath| FileUtils.cp(filepath, dir) }
end
# Script entry point: for each label => search keyword pair, collect the image
# URLs and then download the images.
targets = {
  pikachu: 'Pikachu',
  raichu: 'Raichu',
  dedenne: 'Dedenne'
}

targets.each_pair do |label, keyword|
  write_urls(label, keyword)
  download_images(label, keyword)
  # divide_images(label)
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment