Skip to content

Instantly share code, notes, and snippets.

@katoy
Created April 21, 2012 08:01
Show Gist options
  • Save katoy/2435464 to your computer and use it in GitHub Desktop.
Save katoy/2435464 to your computer and use it in GitHub Desktop.
#
# Ruby port of https://github.com/katoy/CaptureSite (Java).
#
require 'rubygems'
require 'watir-webdriver' # gem install watir-webdriver
require 'selenium-webdriver' # gem install selenium-webdriver
require 'uri'
require 'pp'
$KDOCE ="utf8"
CAPTURE_DIR = "./screen/"
MAX_CAP = 20
SLEEP = 2
# IGNORE_FRAGMENT = false # case raphael
IGNORE_FRAGMENT = true
# String START_URL = 'http://raphaeljs.com/'
# String START_URL = 'http://coffeescript.org/'
# String START_URL = 'http://www.sinatrarb.com/'
String START_URL = 'http://www.google.co.jp/'
# String START_URL = 'https://github.com/katoy/CaptureSite'
# String START_URL = 'http://www.youtube.com/results?search_query=%E8%A5%BF%E6%9D%91%E7%94%B1%E8%B5%B7%E6%B1%9F' # 西村由起江
PRE_NAME = START_URL
class Crawler
def initialize(driver, browser)
@driver = driver
@browser = browser
@cap_num = 0
@visited = []
@captured = []
Dir::mkdir(CAPTURE_DIR) unless File.exists?(CAPTURE_DIR)
end
def close()
@browser.close
open("#{CAPTURE_DIR}00-index.txt", "w") {|f|
@captured.each{ |c|
f.write "{c}\n"
}
}
end
def add_visited(href)
if IGNORE_FRAGMENT
sp = URI.split(href)
sp[8] = nil
@visited << sp.join('/')
else
@visited << URI.parse(href)
end
end
def process_page(href)
return if @cap_num >= MAX_CAP
@browser.goto href
add_visited(href)
begin
cap_name = "#{CAPTURE_DIR}cap#{sprintf('%04d', @cap_num)}.png"
sleep(SLEEP)
@driver.save_screenshot(cap_name)
@captured << [cap_name, href]
pp "#{cap_name}: #{href}"
@cap_num += 1
rescue => e
pp href
pp e
end
links = []
@browser.links().each { |link|
begin
to_href = link.href
next if (to_href == nil) or (to_href.length == 0)
next if self.visited?(to_href)
add_visited(to_href)
next unless valid(to_href)
links << to_href
rescue => e
pp "-----------------------"
pp "[#{link}]"
pp e
pp e.backtrace
end
}
links.each { |h| process_page(h) }
end
def visited?(href)
if IGNORE_FRAGMENT
sp = URI.split(href)
sp[8] = nil
u = sp.join('/')
indx = @visited.index(u)
else
uri = URI.parse(href)
indx = @visited.index(uri)
end
end
end
def valid(href)
return false unless href.index(PRE_NAME)
# google
return false if href.index("http://www.google.co.jp/news/")
return false if href.index("http://www.google.co.jp/products/")
# github
return false if href.index("https://github.com/katoy/CaptureSite/issues")
# rapahel
# return false if IGNORE_FRAGMENT and isSamed(uri)
# youtube
return false if href.index("&search_filter=")
true
end
# proxy
# profile = Selenium::WebDriver::Firefox::Profile.new
# proxy = Selenium::WebDriver::Proxy.new(:http => "www-gw:80")
# profile.proxy = proxy
#
#driver = Selenium::WebDriver.for :firefox, :profile => profile
Selenium::WebDriver::Firefox.path = "/Applications/Firefox.app/Contents/MacOS/firefox-bin-x"
driver = Selenium::WebDriver.for :firefox
browser = Watir::Browser.new(driver)
# browser.goto "http://www.google.com/"
# driver.save_screenshot 'test.png'
# pp driver.executeScript("return document.links.length")
crawler = Crawler.new(driver, browser)
begin
crawler.process_page(START_URL)
ensure
crawler.close()
end
#!/bin/sh
# Prints the "%[mean]" property of diff.jpg using the `identify` tool
# (presumably ImageMagick's -- TODO confirm which package provides it here).
identify -format "%[mean]" diff.jpg
require 'rubygems'
require 'rmagick'
require 'fileutils'
require 'benchmark'
require 'pp'
OUT_DIR = "./thumbnail"
puts Benchmark::CAPTION
puts Benchmark.measure {
FileUtils::mkdir_p(OUT_DIR) unless File.exists?(OUT_DIR)
Dir::glob("./screen/*.png").each {|f|
toFile = "#{OUT_DIR}/#{File::basename(f)}"
image = Magick::Image.read(f).first
image.resize_to_fill(120).write(toFile)
}
}
#!/bin/sh
# Combines every PNG in thumbnail/ into the single image all.png using the
# `montage` tool (presumably ImageMagick's; "-tile 4x" presumably means four
# tiles per row -- TODO confirm).
montage thumbnail/*.png -tile 4x all.png
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment