Created
November 17, 2009 18:52
-
-
Save codeprimate/237147 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'rubygems'
# Pre-1.0 Mechanize namespace (WWW::Mechanize); modern gems use: require 'mechanize'
require 'www/mechanize'
require 'open-uri'
# Scrapes Google's cached copies of pages matching a search and saves each
# one as an HTML file on disk.
#
# Configure via the accessors, then call #go:
#   query         - Google search string (used when search_url is nil)
#   namer         - Regexp whose first capture group names the output file
#   user_agent    - Mechanize user-agent alias (default: 'Windows Mozilla')
#   output_prefix - directory to write files into (default: '.')
#   search_url    - optional explicit results-page URL (overrides query)
#   ignore        - namer-derived names that should not be downloaded
class GoogleCacheDump
  attr_accessor :query, :namer, :user_agent, :output_prefix, :search_url, :ignore

  def initialize
    @query = nil
    @namer = nil
    @user_agent = 'Windows Mozilla'
    @output_prefix = '.'
    @search_url = nil
    @ignore = []
  end

  # Runs the search (either @query submitted through the Google home-page
  # form, or the explicit @search_url), downloads the cached copy of every
  # result, and follows "Next" links until the last results page.
  def go
    next_link = nil
    a = WWW::Mechanize.new { |agent| agent.user_agent_alias = @user_agent }
    if @search_url.nil?
      a.get('http://google.com/') do |page|
        search_result = page.form_with(:name => 'f') do |search|
          search.q = @query
        end.submit
        next_link = get_content(search_result)
      end
    else
      puts "Getting #{@search_url}"
      # Visit the home page first, presumably to pick up cookies/session
      # state before hitting the raw results URL — TODO confirm.
      a.get('http://www.google.com/')
      a.get(@search_url) do |page|
        next_link = get_content(page)
      end
    end
    while next_link
      a.get(next_link) do |page|
        next_link = get_content(page)
      end
    end
  end

  private

  # Downloads the target of every "Cached" link on a results page (visited
  # in random order) and returns the href of the "Next" page link, or nil
  # when there is no further results page.
  def get_content(search_result)
    next_link = nil
    # BUGFIX: the original used sort { |x,y| rand(2) }, a comparator that
    # can never return -1 and so is not a fair (or even valid) shuffle.
    search_result.links.shuffle.each do |link|
      if link.text == "Cached"
        print "Fetching => #{link.href}"
        match = @namer.match(link.href)
        if match.nil?
          # BUGFIX: a cached URL the namer regex cannot name raised
          # NoMethodError (nil[1]) in the original; skip it instead.
          puts " == Unnamed, skipped."
          next
        end
        outfile_name = match[1].gsub('/', '-')
        outfile_filename = "#{@output_prefix}/#{outfile_name}.html"
        unless @ignore.include?(outfile_name) || File.exist?(outfile_filename)
          # Block form guarantees the file is closed even if the
          # open-uri download raises mid-write (original leaked the
          # handle on error).
          # NOTE(review): Kernel#open on a remote URL is an open-uri
          # idiom but is unsafe on untrusted URLs (pipe/command forms).
          File.open(outfile_filename, "w") do |outfile|
            outfile.puts open(link.href).read
          end
          puts " == Done."
          sleep rand(5) + 2 # polite random delay between fetches
        else
          puts " == Skipped."
        end
      end
      if link.text == "Next"
        next_link = link.href
        break
      end
    end
    next_link
  end
end
# Usage:
# Example: The following will look up pages in example.com whose title contains "Articles".
# The individual pages will be saved as XXX.html, with XXX being defined as the first group
# in the "namer" regex.
# The "ignore" setting specifies strings matched by the namer which will not be downloaded.
# The "go" method starts the magic.
# gcd = GoogleCacheDump.new
# gcd.query = "intitle:'Articles' site:example.com"
# gcd.namer = /articles\/(\d+).*$/
# gcd.ignore = ["123","456"]
# gcd.go
# You may also specify a specific search URL instead of a search string.
# gcd.search_url = 'http://www.google.com/search?hl=en&q=%22intitle:profile%22+site:example.com&start=0&sa=N&filter=0'
# gcd.go
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment