Skip to content

Instantly share code, notes, and snippets.

@redsquirrel
Created November 17, 2008 15:59
Show Gist options
  • Select an option

  • Save redsquirrel/25808 to your computer and use it in GitHub Desktop.

Select an option

Save redsquirrel/25808 to your computer and use it in GitHub Desktop.
require 'rubygems'
require 'mechanize'
require 'logger'
require 'band'
# Spiders the purevolume.com browse listing one location at a time, paging
# through each location's results and storing every band not already known
# to Band (link, name, and the e-mail address when the band page has a
# mailto link).

# Runs the given block, retrying on network timeouts up to max_attempts times.
# Returns the block's value, or nil (after printing a warning) when every
# attempt timed out, so the caller can skip that item instead of spinning
# forever on an unreachable host.
def with_timeout_retries(description, max_attempts = 5)
  attempts = 0
  begin
    attempts += 1
    yield
  rescue Timeout::Error, Errno::ETIMEDOUT
    retry if attempts < max_attempts
    warn "Giving up on #{description} after #{attempts} timeouts"
    nil
  end
end

agent = WWW::Mechanize.new
agent.max_history = 1
agent.user_agent_alias = 'Mac Safari'
site = "http://www.purevolume.com"
search_page = agent.get("#{site}/browse/")
search_form = search_page.forms[1] # the second form on the browse page
location_select = search_form.field('location')

# Resume point of a previous run: the location option to restart at and the
# results page to restart from within that location.
resume_location = "Sweden"
start_page = 16

index = location_select.options.map { |option| option.value }.index(resume_location)
if index
  # Drop every location listed before the resume point.
  location_select.options.slice!(0, index)
else
  # The resume location is not offered by the form: spider everything from
  # the first page rather than crashing on slice!(0, nil).
  warn "Resume location #{resume_location.inspect} not found; starting from the first location"
  start_page = nil
end

location_select.options.each do |location|
  next if location.value.nil? || location.value.strip == ""

  location_select.value = location.value
  search_results = with_timeout_retries("POST #{search_form.action}") do
    agent.submit(search_form, search_form.buttons.first)
  end
  next unless search_results
  puts "POST-ing #{search_form.action} for #{location}"

  # The results URI carries the identifier used to page through this location.
  location_uri = search_results.uri.path
  location_match = location_uri.match('/browse/(\w+)/')
  location_hash = location_match.captures.first if location_match

  # Only the first location processed starts at the resume page.
  page_number = start_page || 1
  start_page = nil

  unless location_hash
    warn "No browse identifier in #{location_uri}; skipping #{location_select.value}"
    next
  end

  loop do
    listing_url = "#{site}/browse/#{location_hash}/#{page_number}"
    location_page = with_timeout_retries(listing_url) { agent.get(listing_url) }
    break unless location_page # listing unreachable: move on to the next location
    puts "GET-ing #{site}/browse/#{location_hash}/#{page_number} for #{location_select.value}"
    break if location_page.body.match('"No Results Found"')

    bands = []
    # each_line rather than String#each, which no longer exists in Ruby 1.9+.
    location_page.body.each_line do |line|
      data = line.match('<p><a href="(\/\w+)"><strong>(.+)</strong>')
      next unless data

      link, name = data.captures
      # let's not re-spider a band we've already grabbed
      bands << Band.new(:link => link, :name => name) unless Band.find_by_link(link)
    end

    bands.each do |band|
      band_url = "#{site}#{band[:link]}"
      puts "GET-ing #{site}#{band[:link]}"
      page = with_timeout_retries(band_url) { agent.get(band_url) }
      # Leave the band unsaved when its page is unreachable, so the
      # find_by_link check above does not exclude it on a later run.
      next unless page

      page.body.each_line do |line|
        data = line.match('<a href="mailto:([\w@\.]+)">email</a>')
        band[:email] = data.captures.first if data
      end
      band.save
    end

    page_number += 1
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment