katoy · April 21, 2012 08:01
diff --git a/captureSite.rb b/captureSite.rb
 #
 #  https://github.com/katoy/CaptureSite (java) の ruby 版
 # 

 require 'rubygems'
 require 'watir-webdriver'     # gem install watir-webdriver
 require 'selenium-webdriver'  # gem install selenium-webdriver
 require 'uri'
 require 'pp'

 # Ruby 1.8 used $KCODE to select UTF-8 handling; the variable has no effect
 # on Ruby 1.9+, so it is only assigned on old interpreters.
 $KCODE = "utf8" if RUBY_VERSION < "1.9"

 # Directory (with trailing slash) that receives the screenshots and the index file.
 CAPTURE_DIR = "./screen/".freeze
 # Upper bound on the number of pages captured in one run.
 MAX_CAP = 20
 # Seconds to sleep before each screenshot is taken.
 SLEEP = 2

 # When true, URLs that differ only in their #fragment count as the same page.
 # IGNORE_FRAGMENT = false # case raphael
 IGNORE_FRAGMENT = true

 # Page where the crawl starts; exactly one START_URL line should be active.
 # START_URL = 'http://raphaeljs.com/'.freeze
 # START_URL = 'http://coffeescript.org/'.freeze
 # START_URL = 'http://www.sinatrarb.com/'.freeze
 START_URL = 'http://www.google.co.jp/'.freeze
 # START_URL = 'https://github.com/katoy/CaptureSite'.freeze
 # START_URL = 'http://www.youtube.com/results?search_query=%E8%A5%BF%E6%9D%91%E7%94%B1%E8%B5%B7%E6%B1%9F'.freeze # query decodes to "西村由起江" (Yukie Nishimura)

 # String that valid() looks for in a link before the crawler follows it.
 PRE_NAME = START_URL

 # Depth-first crawler that takes a screenshot of every page it visits.
 #
 # Depends on the file-level constants CAPTURE_DIR, MAX_CAP, SLEEP and
 # IGNORE_FRAGMENT and on the top-level valid(href) predicate.
 class Crawler
   # Position of the fragment component in the array returned by URI.split.
   FRAGMENT_INDEX = 8

   # driver  - object responding to save_screenshot(path).
   # browser - object responding to goto(url), links and close.
   #
   # Creates CAPTURE_DIR when it does not exist yet.
   def initialize(driver, browser)
     @driver = driver
     @browser = browser
     @cap_num = 0
     @visited = []
     @captured = []

     Dir.mkdir(CAPTURE_DIR) unless File.exist?(CAPTURE_DIR)
   end

   # Closes the browser and writes the capture index (00-index.txt).
   # The index is written even when closing the browser raises.
   def close
     @browser.close
   ensure
     write_index
   end

   # Records href as seen so that later links to it are skipped.
   def add_visited(href)
     @visited << visit_key(href)
   end

   # Navigates to href, captures it, then recurses into every not-yet-seen
   # link for which valid() is true. Does nothing once MAX_CAP captures exist.
   def process_page(href)
     return if @cap_num >= MAX_CAP

     @browser.goto href
     # Links are marked visited when discovered, so avoid a duplicate entry.
     add_visited(href) unless visited?(href)
     capture(href)

     # All hrefs are collected before recursing because process_page
     # navigates the shared browser away from this page.
     collect_links.each { |target| process_page(target) }
   end

   # Returns true when href (modulo fragment if IGNORE_FRAGMENT) was seen.
   def visited?(href)
     @visited.include?(visit_key(href))
   end

   private

   # Normalised form of href used for the visited list.
   def visit_key(href)
     return URI.parse(href) unless IGNORE_FRAGMENT

     parts = URI.split(href)
     parts[FRAGMENT_INDEX] = nil
     parts.join('/')
   end

   # Saves a screenshot of the current page; failures are logged, not raised.
   def capture(href)
     cap_name = "#{CAPTURE_DIR}cap#{format('%04d', @cap_num)}.png"
     sleep(SLEEP)
     @driver.save_screenshot(cap_name)
     @captured << [cap_name, href]
     pp "#{cap_name}: #{href}"
     @cap_num += 1
   rescue => e
     pp href
     pp e
   end

   # Returns the hrefs on the current page that are new and pass valid().
   # Every new href is marked visited, whether or not it passes valid().
   def collect_links
     found = []
     @browser.links.each do |link|
       begin
         to_href = link.href
         next if to_href.nil? || to_href.empty?
         next if visited?(to_href)

         add_visited(to_href)
         found << to_href if valid(to_href)
       rescue => e
         pp "-----------------------"
         pp "[#{link}]"
         pp e
         pp e.backtrace
       end
     end
     found
   end

   # Writes one "<capture file>: <url>" line per screenshot, the same format
   # that is logged while crawling.
   def write_index
     File.open("#{CAPTURE_DIR}00-index.txt", "w") do |f|
       @captured.each { |cap_name, href| f.write "#{cap_name}: #{href}\n" }
     end
   end
 end

 # Decides whether the crawler should follow href.
 #
 # href - URL string taken from a link on the current page.
 #
 # Returns true when href starts with PRE_NAME and contains none of the
 # site-specific exclusions listed below, false otherwise.
 def valid(href)
   # Anchor the match at the start: a plain substring test would also accept
   # foreign URLs that merely embed PRE_NAME, e.g. in a redirect parameter.
   return false unless href.start_with?(PRE_NAME)

   excluded_substrings = [
     "http://www.google.co.jp/news/",               # google
     "http://www.google.co.jp/products/",           # google
     "https://github.com/katoy/CaptureSite/issues", # github
     "&search_filter="                              # youtube
   ]
   return false if excluded_substrings.any? { |part| href.include?(part) }

   # raphael
   # return false if IGNORE_FRAGMENT and isSamed(uri)

   true
 end

 # Optional proxy configuration, kept for reference:
 # profile = Selenium::WebDriver::Firefox::Profile.new
 # proxy = Selenium::WebDriver::Proxy.new(:http => "www-gw:80")
 # profile.proxy = proxy
 #
 #driver = Selenium::WebDriver.for :firefox, :profile => profile

 # Tell Selenium which Firefox binary to launch.
 Selenium::WebDriver::Firefox.path = "/Applications/Firefox.app/Contents/MacOS/firefox-bin-x"
 web_driver = Selenium::WebDriver.for(:firefox)

 # Debugging aids:
 # browser.goto "http://www.google.com/"
 # driver.save_screenshot 'test.png'
 # pp driver.executeScript("return document.links.length")

 # The same driver is handed to both Watir (navigation) and the crawler (screenshots).
 site_crawler = Crawler.new(web_driver, Watir::Browser.new(web_driver))
 begin
   site_crawler.process_page(START_URL)
 ensure
   # Always close the crawler, even when the crawl aborts with an error.
   site_crawler.close
 end
diff --git a/diffimage.sh b/diffimage.sh
 #!/bin/sh

 # Print the "%[mean]" property of diff.jpg using the `identify` command.
 # NOTE(review): presumably ImageMagick's mean pixel value, used to judge how
 # much two screenshots differ; diff.jpg is not created here - confirm where
 # it comes from.
 identify -format "%[mean]" diff.jpg
diff --git a/makethumbnail.rb b/makethumbnail.rb
 require 'rubygems'
 require 'rmagick'
 require 'fileutils'
 require 'benchmark'
 require 'pp'

 # Directory that receives the generated thumbnails.
 OUT_DIR = "./thumbnail"

 # Write a thumbnail of every ./screen/*.png into OUT_DIR under the same
 # file name, and print how long the whole run took.
 puts Benchmark::CAPTION
 elapsed = Benchmark.measure do
   # mkdir_p is a no-op for an existing directory, so no existence check is needed.
   FileUtils.mkdir_p(OUT_DIR)
   Dir.glob("./screen/*.png").each do |src|
     to_file = "#{OUT_DIR}/#{File.basename(src)}"
     image = Magick::Image.read(src).first
     # NOTE(review): resize_to_fill(120) presumably yields a 120x120 crop - confirm against RMagick docs.
     image.resize_to_fill(120).write(to_file)
   end
 end
 puts elapsed
diff --git a/maketile.sh b/maketile.sh
 #!/bin/sh

 # Combine every PNG under thumbnail/ into the single image all.png.
 # NOTE(review): `montage -tile 4x` presumably lays the images out in a
 # 4-column grid (ImageMagick) - confirm.
 montage thumbnail/*.png -tile 4x all.png
	#
	# https://github.com/katoy/CaptureSite (java) の ruby 版
	#

	require 'rubygems'
	require 'watir-webdriver' # gem install watir-webdriver
	require 'selenium-webdriver' # gem install selenium-webdriver
	require 'uri'
	require 'pp'

	# Ruby 1.8 used $KCODE to select UTF-8 handling; the variable has no effect
	# on Ruby 1.9+, so it is only assigned on old interpreters.
	$KCODE = "utf8" if RUBY_VERSION < "1.9"

	# Directory (with trailing slash) that receives the screenshots and the index file.
	CAPTURE_DIR = "./screen/".freeze
	# Upper bound on the number of pages captured in one run.
	MAX_CAP = 20
	# Seconds to sleep before each screenshot is taken.
	SLEEP = 2

	# When true, URLs that differ only in their #fragment count as the same page.
	# IGNORE_FRAGMENT = false # case raphael
	IGNORE_FRAGMENT = true

	# Page where the crawl starts; exactly one START_URL line should be active.
	# START_URL = 'http://raphaeljs.com/'.freeze
	# START_URL = 'http://coffeescript.org/'.freeze
	# START_URL = 'http://www.sinatrarb.com/'.freeze
	START_URL = 'http://www.google.co.jp/'.freeze
	# START_URL = 'https://github.com/katoy/CaptureSite'.freeze
	# START_URL = 'http://www.youtube.com/results?search_query=%E8%A5%BF%E6%9D%91%E7%94%B1%E8%B5%B7%E6%B1%9F'.freeze # query decodes to "西村由起江" (Yukie Nishimura)

	# String that valid() looks for in a link before the crawler follows it.
	PRE_NAME = START_URL

	# Depth-first crawler that takes a screenshot of every page it visits.
	#
	# Depends on the file-level constants CAPTURE_DIR, MAX_CAP, SLEEP and
	# IGNORE_FRAGMENT and on the top-level valid(href) predicate.
	class Crawler
	  # Position of the fragment component in the array returned by URI.split.
	  FRAGMENT_INDEX = 8

	  # driver  - object responding to save_screenshot(path).
	  # browser - object responding to goto(url), links and close.
	  #
	  # Creates CAPTURE_DIR when it does not exist yet.
	  def initialize(driver, browser)
	    @driver = driver
	    @browser = browser
	    @cap_num = 0
	    @visited = []
	    @captured = []

	    Dir.mkdir(CAPTURE_DIR) unless File.exist?(CAPTURE_DIR)
	  end

	  # Closes the browser and writes the capture index (00-index.txt).
	  # The index is written even when closing the browser raises.
	  def close
	    @browser.close
	  ensure
	    write_index
	  end

	  # Records href as seen so that later links to it are skipped.
	  def add_visited(href)
	    @visited << visit_key(href)
	  end

	  # Navigates to href, captures it, then recurses into every not-yet-seen
	  # link for which valid() is true. Does nothing once MAX_CAP captures exist.
	  def process_page(href)
	    return if @cap_num >= MAX_CAP

	    @browser.goto href
	    # Links are marked visited when discovered, so avoid a duplicate entry.
	    add_visited(href) unless visited?(href)
	    capture(href)

	    # All hrefs are collected before recursing because process_page
	    # navigates the shared browser away from this page.
	    collect_links.each { |target| process_page(target) }
	  end

	  # Returns true when href (modulo fragment if IGNORE_FRAGMENT) was seen.
	  def visited?(href)
	    @visited.include?(visit_key(href))
	  end

	  private

	  # Normalised form of href used for the visited list.
	  def visit_key(href)
	    return URI.parse(href) unless IGNORE_FRAGMENT

	    parts = URI.split(href)
	    parts[FRAGMENT_INDEX] = nil
	    parts.join('/')
	  end

	  # Saves a screenshot of the current page; failures are logged, not raised.
	  def capture(href)
	    cap_name = "#{CAPTURE_DIR}cap#{format('%04d', @cap_num)}.png"
	    sleep(SLEEP)
	    @driver.save_screenshot(cap_name)
	    @captured << [cap_name, href]
	    pp "#{cap_name}: #{href}"
	    @cap_num += 1
	  rescue => e
	    pp href
	    pp e
	  end

	  # Returns the hrefs on the current page that are new and pass valid().
	  # Every new href is marked visited, whether or not it passes valid().
	  def collect_links
	    found = []
	    @browser.links.each do |link|
	      begin
	        to_href = link.href
	        next if to_href.nil? || to_href.empty?
	        next if visited?(to_href)

	        add_visited(to_href)
	        found << to_href if valid(to_href)
	      rescue => e
	        pp "-----------------------"
	        pp "[#{link}]"
	        pp e
	        pp e.backtrace
	      end
	    end
	    found
	  end

	  # Writes one "<capture file>: <url>" line per screenshot, the same format
	  # that is logged while crawling.
	  def write_index
	    File.open("#{CAPTURE_DIR}00-index.txt", "w") do |f|
	      @captured.each { |cap_name, href| f.write "#{cap_name}: #{href}\n" }
	    end
	  end
	end

	# Decides whether the crawler should follow href.
	#
	# href - URL string taken from a link on the current page.
	#
	# Returns true when href starts with PRE_NAME and contains none of the
	# site-specific exclusions listed below, false otherwise.
	def valid(href)
	  # Anchor the match at the start: a plain substring test would also accept
	  # foreign URLs that merely embed PRE_NAME, e.g. in a redirect parameter.
	  return false unless href.start_with?(PRE_NAME)

	  excluded_substrings = [
	    "http://www.google.co.jp/news/",               # google
	    "http://www.google.co.jp/products/",           # google
	    "https://github.com/katoy/CaptureSite/issues", # github
	    "&search_filter="                              # youtube
	  ]
	  return false if excluded_substrings.any? { |part| href.include?(part) }

	  # raphael
	  # return false if IGNORE_FRAGMENT and isSamed(uri)

	  true
	end

	# Optional proxy configuration, kept for reference:
	# profile = Selenium::WebDriver::Firefox::Profile.new
	# proxy = Selenium::WebDriver::Proxy.new(:http => "www-gw:80")
	# profile.proxy = proxy
	#
	#driver = Selenium::WebDriver.for :firefox, :profile => profile

	# Tell Selenium which Firefox binary to launch.
	Selenium::WebDriver::Firefox.path = "/Applications/Firefox.app/Contents/MacOS/firefox-bin-x"
	web_driver = Selenium::WebDriver.for(:firefox)

	# Debugging aids:
	# browser.goto "http://www.google.com/"
	# driver.save_screenshot 'test.png'
	# pp driver.executeScript("return document.links.length")

	# The same driver is handed to both Watir (navigation) and the crawler (screenshots).
	site_crawler = Crawler.new(web_driver, Watir::Browser.new(web_driver))
	begin
	  site_crawler.process_page(START_URL)
	ensure
	  # Always close the crawler, even when the crawl aborts with an error.
	  site_crawler.close
	end
	require 'rubygems'
	require 'rmagick'
	require 'fileutils'
	require 'benchmark'
	require 'pp'

	# Directory that receives the generated thumbnails.
	OUT_DIR = "./thumbnail"

	# Write a thumbnail of every ./screen/*.png into OUT_DIR under the same
	# file name, and print how long the whole run took.
	puts Benchmark::CAPTION
	elapsed = Benchmark.measure do
	  # mkdir_p is a no-op for an existing directory, so no existence check is needed.
	  FileUtils.mkdir_p(OUT_DIR)
	  Dir.glob("./screen/*.png").each do |src|
	    to_file = "#{OUT_DIR}/#{File.basename(src)}"
	    image = Magick::Image.read(src).first
	    # NOTE(review): resize_to_fill(120) presumably yields a 120x120 crop - confirm against RMagick docs.
	    image.resize_to_fill(120).write(to_file)
	  end
	end
	puts elapsed