Created
November 17, 2009 18:52
-
-
Save codeprimate/237147 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'rubygems'
# Pre-1.0 Mechanize namespace (WWW::Mechanize); modern gems use: require 'mechanize'
require 'www/mechanize'
require 'open-uri'
# Scrapes Google's cached copies of pages matching a search and saves each
# one as an HTML file on disk.
#
# Configure via the accessors, then call #go:
#   query         - Google search string (used when search_url is nil)
#   namer         - Regexp whose first capture group names the output file
#   user_agent    - Mechanize user-agent alias (default: 'Windows Mozilla')
#   output_prefix - directory to write files into (default: '.')
#   search_url    - optional explicit results-page URL (overrides query)
#   ignore        - namer-derived names that should not be downloaded
class GoogleCacheDump
  attr_accessor :query, :namer, :user_agent, :output_prefix, :search_url, :ignore

  def initialize
    @query = nil
    @namer = nil
    @user_agent = 'Windows Mozilla'
    @output_prefix = '.'
    @search_url = nil
    @ignore = []
  end

  # Runs the search (either @query submitted through the Google home-page
  # form, or the explicit @search_url), downloads the cached copy of every
  # result, and follows "Next" links until the last results page.
  def go
    next_link = nil
    a = WWW::Mechanize.new { |agent| agent.user_agent_alias = @user_agent }
    if @search_url.nil?
      a.get('http://google.com/') do |page|
        search_result = page.form_with(:name => 'f') do |search|
          search.q = @query
        end.submit
        next_link = get_content(search_result)
      end
    else
      puts "Getting #{@search_url}"
      # Visit the home page first, presumably to pick up cookies/session
      # state before hitting the raw results URL — TODO confirm.
      a.get('http://www.google.com/')
      a.get(@search_url) do |page|
        next_link = get_content(page)
      end
    end
    while next_link
      a.get(next_link) do |page|
        next_link = get_content(page)
      end
    end
  end

  private

  # Downloads the target of every "Cached" link on a results page (visited
  # in random order) and returns the href of the "Next" page link, or nil
  # when there is no further results page.
  def get_content(search_result)
    next_link = nil
    # BUGFIX: the original used sort { |x,y| rand(2) }, a comparator that
    # can never return -1 and so is not a fair (or even valid) shuffle.
    search_result.links.shuffle.each do |link|
      if link.text == "Cached"
        print "Fetching => #{link.href}"
        match = @namer.match(link.href)
        if match.nil?
          # BUGFIX: a cached URL the namer regex cannot name raised
          # NoMethodError (nil[1]) in the original; skip it instead.
          puts " == Unnamed, skipped."
          next
        end
        outfile_name = match[1].gsub('/', '-')
        outfile_filename = "#{@output_prefix}/#{outfile_name}.html"
        unless @ignore.include?(outfile_name) || File.exist?(outfile_filename)
          # Block form guarantees the file is closed even if the
          # open-uri download raises mid-write (original leaked the
          # handle on error).
          # NOTE(review): Kernel#open on a remote URL is an open-uri
          # idiom but is unsafe on untrusted URLs (pipe/command forms).
          File.open(outfile_filename, "w") do |outfile|
            outfile.puts open(link.href).read
          end
          puts " == Done."
          sleep rand(5) + 2 # polite random delay between fetches
        else
          puts " == Skipped."
        end
      end
      if link.text == "Next"
        next_link = link.href
        break
      end
    end
    next_link
  end
end
# Usage:
# Example: The following will look up pages in example.com whose title contains "Articles".
# The individual pages will be saved as XXX.html, with XXX being defined as the first group
# in the "namer" regex.
# The "ignore" setting specifies strings matched by the namer which will not be downloaded.
# The "go" method starts the magic.
# gcd = GoogleCacheDump.new
# gcd.query = "intitle:'Articles' site:example.com"
# gcd.namer = /articles\/(\d+).*$/
# gcd.ignore = ["123","456"]
# gcd.go
# You may also specify a specific search URL instead of a search string.
# gcd.search_url = 'http://www.google.com/search?hl=en&q=%22intitle:profile%22+site:example.com&start=0&sa=N&filter=0'
# gcd.go
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment