Skip to content

Instantly share code, notes, and snippets.

@gmgent
Created March 8, 2011 19:08
Show Gist options
  • Select an option

  • Save gmgent/860806 to your computer and use it in GitHub Desktop.

Select an option

Save gmgent/860806 to your computer and use it in GitHub Desktop.
Basically a proof of concept for playing with Hpricot.
desc 'Scrape away.'
require 'config/boot'
require 'config/environment'
require 'application'
require 'net/ftp'
#include Utils
require 'rubygems'
require 'open-uri'
require 'hpricot'
namespace :import do
  # Fetches WeGotTickets search-result pages and prints, for each listing slot,
  # the artist name, location, date, price and availability text to stdout.
  #
  # The number of pages scraped defaults to 2 and may be overridden from the
  # command line, e.g. `rake import:get_events MAX_PAGES=5`.
  task :get_events do
    # Max page depth for scraping (a local, not a constant: constants assigned
    # inside a task body leak onto Object and warn if the task is re-invoked).
    max_pages = Integer(ENV['MAX_PAGES'] || 2)

    # NOTE(review): the "From" value looks like a scrubbed placeholder --
    # confirm and set a real contact address before running against the site.
    request_headers = {
      "User-Agent" => "Ruby/#{RUBY_VERSION}",
      "From" => "[email protected]",
      "Referer" => "http://www.igvita.com/blog/"
    }

    (1..max_pages).each do |page|
      path = "http://www.wegottickets.com/searchresults/page/#{page}/all"

      # Rescue per page so one failed fetch/parse does not abort the remaining pages.
      begin
        # Fetch the HTML page; the block's value (the response body) is returned by open.
        # NOTE(review): on Ruby >= 3.0 open-uri no longer patches Kernel#open;
        # this call must become URI.open(path, request_headers) there.
        html = open(path, request_headers) do |f|
          puts "Fetched document: #{f.base_uri}"
          puts "\t Content Type: #{f.content_type}"
          puts "\t Charset: #{f.charset}"
          puts "\t Content-Encoding: #{f.content_encoding}"
          puts "\t Last Modified: #{f.last_modified}\n\n"
          f.read
        end

        doc = Hpricot(html)

        # Cycle through the listing containers on the page: div[3] .. div[12].
        (3..12).each do |slot|
          listing = "/html/body/div/div/div[3]/div[3]/div/div[#{slot}]/div"
          details = "#{listing}/div[2]/blockquote"

          artist_name  = (doc / "#{details}/h3/a").inner_html
          gig_location = (doc / "#{details}/p/span").inner_html
          date_markup  = (doc / "#{details}/").inner_html

          # NOTE(review): split("") splits into single characters, so this picks
          # the second character of the blockquote markup. The delimiter was
          # presumably lost (an HTML tag?) -- confirm the intended separator.
          gig_date = date_markup.split("")[1]

          # Strip the pound sign so only the amount remains.
          gig_price = (doc / "#{listing}/div/span/strong").inner_html.delete "£"

          # An (almost) empty availability span is treated as "still on sale".
          gig_available = (doc / "#{listing}/div/span[2]").inner_html
          gig_available = "Tickets are available" if gig_available.size < 2

          # Here the fields artist_name, gig_location, gig_date, gig_price and
          # gig_available could be inserted into a datastore instead of printed.
          puts artist_name
          puts gig_location
          puts gig_date
          puts gig_price
          puts gig_available
          puts "\n\n"
        end
      rescue => ex
        puts "Failed to scrape #{path}: #{ex.message}"
      end
    end
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment