Skip to content

Instantly share code, notes, and snippets.

@codeprimate
Created November 17, 2009 18:52
Show Gist options
  • Save codeprimate/237147 to your computer and use it in GitHub Desktop.
Save codeprimate/237147 to your computer and use it in GitHub Desktop.
require 'rubygems'
require 'www/mechanize'
require 'open-uri'
# Walks the result pages of a Google search and saves the page behind every
# "Cached" link to a local .html file.
#
# Configure through the accessors, then call #go:
#   query         - search string typed into the Google form (used only when
#                   search_url is nil)
#   search_url    - results-page URL to start from instead of submitting query
#   namer         - Regexp applied to each "Cached" link href; its first capture
#                   group, with "/" replaced by "-", becomes the file name
#   ignore        - names (as produced by namer) that must not be downloaded
#   output_prefix - directory the .html files are written into
#   user_agent    - Mechanize user agent alias
class GoogleCacheDump
  attr_accessor :query, :namer, :user_agent, :output_prefix, :search_url, :ignore

  def initialize
    @query = nil
    @namer = nil
    @user_agent = 'Windows Mozilla'
    @output_prefix = '.'
    @search_url = nil
    @ignore = []
  end

  # Runs the dump: loads the first results page (by submitting query, or by
  # fetching search_url), then keeps following the "Next" link until a page
  # has none.
  def go
    next_link = nil
    agent = WWW::Mechanize.new { |mech| mech.user_agent_alias = @user_agent }

    if @search_url.nil?
      agent.get('http://google.com/') do |page|
        search_result = page.form_with(:name => 'f') do |search|
          search.q = @query
        end.submit
        next_link = get_content(search_result)
      end
    else
      puts "Getting #{@search_url}"
      # The home page is requested first, before the search URL itself.
      agent.get('http://www.google.com/')
      agent.get(@search_url) do |page|
        next_link = get_content(page)
      end
    end

    while next_link
      agent.get(next_link) do |page|
        next_link = get_content(page)
      end
    end
  end

  private

  # Saves every "Cached" link found on one results page, in random order.
  #
  # search_result - object responding to #links, each link responding to
  #                 #text and #href.
  #
  # Returns the href of the page's "Next" link, or nil when there is none.
  def get_content(search_result)
    links = search_result.links

    # sort_by { rand } is a real shuffle; a sort comparator that returns random
    # values is not. Cached links are handled separately from the "Next" lookup
    # so that the random order can never cause some of them to be skipped.
    cached_links = links.select { |link| link.text == "Cached" }
    cached_links.sort_by { rand }.each { |link| save_cached(link.href) }

    next_link = links.find { |link| link.text == "Next" }
    next_link && next_link.href
  end

  # Downloads one cached page into "<output_prefix>/<name>.html" unless the
  # name is ignored, the file already exists, or no name can be derived.
  def save_cached(href)
    print "Fetching => #{href}"

    outfile_name = output_name_for(href)
    if outfile_name.nil?
      puts " == Skipped (no name match)."
      return
    end

    outfile_filename = "#{@output_prefix}/#{outfile_name}.html"
    if @ignore.include?(outfile_name) || File.exist?(outfile_filename)
      puts " == Skipped."
      return
    end

    # Download before creating the file: a failed request must not leave an
    # empty file behind, because an existing file is treated as already done.
    # URI#open (from open-uri) is used instead of Kernel#open so that the href,
    # which comes from the fetched page, can never be run as a "|command".
    content = URI.parse(href).open { |io| io.read }
    File.open(outfile_filename, "w") { |outfile| outfile.puts content }

    puts " == Done."
    sleep rand(5) + 2 # random 2-6 second pause after each download
  end

  # Returns the first capture group of namer matched against href, with "/"
  # replaced by "-", or nil when href does not match or has no such group.
  def output_name_for(href)
    match = href.match(@namer)
    name = match && match[1]
    name && name.gsub('/', '-')
  end
end
# Usage:
# Example: The following looks up pages on example.com whose title contains "Articles".
# Each page is saved as XXX.html, where XXX is the first capture group
# of the "namer" regex.
# The "ignore" setting lists names (as matched by the namer) that will not be downloaded.
# The "go" method starts the magic.
# gcd = GoogleCacheDump.new
# gcd.query = "intitle:'Articles' site:example.com"
# gcd.namer = /articles\/(\d+).*$/
# gcd.ignore = ["123","456"]
# gcd.go
# You may also specify a specific search URL instead of a search string.
# gcd.search_url = 'http://www.google.com/search?hl=en&q=%22intitle:profile%22+site:example.com&start=0&sa=N&filter=0'
# gcd.go
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment