Created
April 21, 2012 08:01
-
-
Save katoy/2435464 to your computer and use it in GitHub Desktop.
https://github.com/katoy/CaptureSite (java) の ruby 版
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#
# Ruby port of https://github.com/katoy/CaptureSite (Java)
#
require 'rubygems' | |
require 'watir-webdriver' # gem install watir-webdriver | |
require 'selenium-webdriver' # gem install selenium-webdriver | |
require 'uri' | |
require 'pp' | |
# Crawler configuration.

# Ruby 1.8 multibyte switch. The original spelled it $KDOCE, so it never took
# effect. $KCODE is meaningless on Ruby 1.9+, hence the version guard.
$KCODE = 'utf8' if RUBY_VERSION < '1.9'

# Directory that receives the screenshots and the 00-index.txt file.
CAPTURE_DIR = "./screen/"
# Maximum number of pages to capture in one run.
MAX_CAP = 20
# Seconds to wait before each screenshot is taken.
SLEEP = 2

# When true, the #fragment part of a URL is dropped before the URL is compared
# against the already-visited list.
# IGNORE_FRAGMENT = false # case raphael
IGNORE_FRAGMENT = true

# Page the crawl starts from; enable exactly one.
# START_URL = 'http://raphaeljs.com/'
# START_URL = 'http://coffeescript.org/'
# START_URL = 'http://www.sinatrarb.com/'
START_URL = 'http://www.google.co.jp/'
# START_URL = 'https://github.com/katoy/CaptureSite'
# START_URL = 'http://www.youtube.com/results?search_query=%E8%A5%BF%E6%9D%91%E7%94%B1%E8%B5%B7%E6%B1%9F' # 西村由起江

# URL text a link must match to be crawled (checked by valid()).
PRE_NAME = START_URL
# Depth-first crawler that takes a screenshot of every page it visits and, on
# #close, writes an index of the screenshots it took.
#
# Relies on the file-level constants CAPTURE_DIR, MAX_CAP, SLEEP and
# IGNORE_FRAGMENT, and on the top-level valid(href) filter.
class Crawler
  # @param driver  [#save_screenshot] object that writes a screenshot to a path
  # @param browser [#goto, #links, #close] object that navigates and lists links
  def initialize(driver, browser)
    @driver = driver
    @browser = browser
    @cap_num = 0    # number of screenshots saved so far
    @visited = []   # keys (see #visit_key) of every URL already seen
    @captured = []  # [screenshot path, url] pairs, in capture order
    Dir.mkdir(CAPTURE_DIR) unless File.exist?(CAPTURE_DIR)
  end

  # Closes the browser and writes "#{CAPTURE_DIR}00-index.txt" with one
  # "<screenshot path>\t<url>" line per captured page. The index is written
  # even if closing the browser raises.
  def close
    @browser.close
  ensure
    File.open("#{CAPTURE_DIR}00-index.txt", 'w') do |f|
      @captured.each { |cap_name, href| f.write "#{cap_name}\t#{href}\n" }
    end
  end

  # Records href as seen.
  #
  # @param href [String] URL to remember
  # @raise [URI::InvalidURIError] if href cannot be parsed
  def add_visited(href)
    @visited << visit_key(href)
  end

  # Navigates to href, captures it, then recurses into every not-yet-seen link
  # on the page that passes valid(). Does nothing once MAX_CAP screenshots
  # have been taken.
  #
  # @param href [String] URL of the page to process
  def process_page(href)
    return if @cap_num >= MAX_CAP

    @browser.goto href
    # Links are already recorded when they are discovered; avoid duplicates.
    add_visited(href) unless visited?(href)
    capture(href)
    # Once the cap is reached no further page is processed, so scanning this
    # page's links would be wasted work.
    return if @cap_num >= MAX_CAP

    # Hrefs are collected before recursing because process_page navigates the
    # browser away from this page.
    collect_links.each { |next_href| process_page(next_href) }
  end

  # @param href [String] URL to look up
  # @return [Boolean] whether href (ignoring its fragment when IGNORE_FRAGMENT
  #   is set) has already been recorded
  # @raise [URI::InvalidURIError] if href cannot be parsed
  def visited?(href)
    @visited.include?(visit_key(href))
  end

  private

  # Key under which a URL is stored in @visited.
  def visit_key(href)
    return URI.parse(href) unless IGNORE_FRAGMENT

    parts = URI.split(href)
    parts[8] = nil # index 8 of URI.split is the fragment
    parts.join('/')
  end

  # Saves a screenshot of the current page and records it. Failures are
  # printed and otherwise ignored so that the crawl continues.
  def capture(href)
    cap_name = "#{CAPTURE_DIR}cap#{format('%04d', @cap_num)}.png"
    sleep(SLEEP)
    @driver.save_screenshot(cap_name)
    @captured << [cap_name, href]
    pp "#{cap_name}: #{href}"
    @cap_num += 1
  rescue => e
    pp href
    pp e
  end

  # Returns the hrefs on the current page that are new and pass valid().
  # Every new href is marked as visited, whether or not it is valid. A link
  # that raises is reported and skipped.
  def collect_links
    @browser.links.each_with_object([]) do |link, found|
      begin
        to_href = link.href
        next if to_href.nil? || to_href.empty?
        next if visited?(to_href)

        add_visited(to_href)
        found << to_href if valid(to_href)
      rescue => e
        pp "-----------------------"
        pp "[#{link}]"
        pp e
        pp e.backtrace
      end
    end
  end
end
# Substrings that exclude a URL from the crawl even when it is on-site.
CRAWL_EXCLUDED_SUBSTRINGS = [
  'http://www.google.co.jp/news/',
  'http://www.google.co.jp/products/',
  # github
  'https://github.com/katoy/CaptureSite/issues',
  # youtube
  '&search_filter='
].freeze

# Decides whether a discovered link should be crawled.
#
# A link qualifies when it starts with PRE_NAME and contains none of the
# CRAWL_EXCLUDED_SUBSTRINGS. The prefix test (rather than a substring search)
# keeps out off-site URLs that merely mention PRE_NAME, e.g. in a query string.
#
# @param href [String] absolute URL of the link
# @return [Boolean] true when the link should be followed
def valid(href)
  return false unless href.start_with?(PRE_NAME)

  # raphael (disabled in the original):
  # return false if IGNORE_FRAGMENT and isSamed(uri)
  CRAWL_EXCLUDED_SUBSTRINGS.none? { |fragment| href.include?(fragment) }
end
# Entry point: start Firefox, crawl from START_URL, always clean up.

# proxy
# profile = Selenium::WebDriver::Firefox::Profile.new
# proxy = Selenium::WebDriver::Proxy.new(:http => "www-gw:80")
# profile.proxy = proxy
#
#driver = Selenium::WebDriver.for :firefox, :profile => profile

# Firefox binary to launch. Set the FIREFOX_BIN environment variable to
# override the default path.
Selenium::WebDriver::Firefox.path =
  ENV.fetch('FIREFOX_BIN', "/Applications/Firefox.app/Contents/MacOS/firefox-bin-x")
driver = Selenium::WebDriver.for :firefox
browser = Watir::Browser.new(driver)
# browser.goto "http://www.google.com/"
# driver.save_screenshot 'test.png'
# pp driver.executeScript("return document.links.length")

crawler = nil
begin
  crawler = Crawler.new(driver, browser)
  crawler.process_page(START_URL)
ensure
  # If the crawler could not be constructed, still close the browser.
  if crawler
    crawler.close
  else
    browser.close
  end
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/sh
# Print the "mean" statistic of diff.jpg using ImageMagick's identify.
# NOTE(review): diff.jpg is presumably a difference image of two captures - confirm.
identify -format "%[mean]" diff.jpg
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'rubygems' | |
require 'rmagick' | |
require 'fileutils' | |
require 'benchmark' | |
require 'pp' | |
# Directory that receives the generated thumbnails.
OUT_DIR = "./thumbnail"

# Create a thumbnail in OUT_DIR for every PNG in ./screen, keeping the file
# name, and print how long the whole run took.
puts Benchmark::CAPTION
puts Benchmark.measure {
  # mkdir_p is a no-op when the directory already exists.
  FileUtils.mkdir_p(OUT_DIR)
  Dir.glob("./screen/*.png").each do |src|
    dest = File.join(OUT_DIR, File.basename(src))
    image = Magick::Image.read(src).first
    # NOTE(review): with a single argument resize_to_fill presumably uses 120
    # for both width and height - confirm against the RMagick documentation.
    image.resize_to_fill(120).write(dest)
  end
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/sh
# Combine every PNG in thumbnail/ into a single image, all.png, laid out
# four tiles per row (ImageMagick montage).
montage thumbnail/*.png -tile 4x all.png
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment