Skip to content

Instantly share code, notes, and snippets.

@deepthawtz
Created December 5, 2010 06:40
Show Gist options
  • Select an option

  • Save deepthawtz/728877 to your computer and use it in GitHub Desktop.

Select an option

Save deepthawtz/728877 to your computer and use it in GitHub Desktop.
%w[
open-uri
nokogiri
mongo
].map { |x| require x }
# Open the "tennis_network" database (no host/port is given, so the driver's
# defaults apply) and keep a handle on its "courts" collection.
db = Mongo::Connection.new.db("tennis_network")
Courts = db.collection("courts")

# One single-field ascending index per attribute, created in this order.
%w[state city location_type court_type lights indoor].each do |field|
  Courts.create_index([[field, Mongo::ASCENDING]])
end

# Geospatial (2d) index on the stored coordinates.
Courts.create_index([["location", Mongo::GEO2D]])

# Site root; every path handed to get_doc is appended to this.
BASE_URL = "http://www.globaltennisnetwork.com"
# Entry point of the crawl: the United States listing page.
USA_URL = "/tennis-courts/find-a-tennis-court/country/223-united-states/"
# Sent as the User-Agent header on every request.
USER_AGENT = "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_5; en-US) AppleWebKit/534.7 (KHTML, like Gecko) Chrome/7.0.517.44 Safari/534.7"
# Fetches a page of the site and parses it as HTML.
#
# @param url [String] path (with leading slash) appended to BASE_URL
# @return the document produced by Nokogiri::HTML for the response body
#
# The block form of open is used so the underlying IO (open-uri may back it
# with a Tempfile) is closed as soon as parsing finishes instead of leaking
# one handle per request over the whole crawl.
def get_doc(url)
  target = BASE_URL + url
  headers = { "User-Agent" => USER_AGENT }
  if URI.respond_to?(:open)
    # Newer open-uri: Kernel#open no longer accepts URLs on Ruby 3.0+.
    URI.open(target, headers) { |io| Nokogiri::HTML(io) }
  else
    # Older open-uri only patches Kernel#open.
    open(target, headers) { |io| Nokogiri::HTML(io) }
  end
end
# Pulls the marker coordinates out of a page's map script.
#
# Looks for a `GMarker` followed on the same line by `GLatLng(<lat>, <long>`
# and converts both numbers to floats. Integer and decimal values are
# accepted, with optional whitespace around the comma.
#
# @param script [String, #to_str, nil] script text to search; the crawl loop
#   passes the last `td script` node of a court page
# @return [Array<Float>] a tuple [lat, long]
# @raise [ArgumentError] if no coordinates can be found (including a nil script)
def get_geo_coords(script)
  m = /GMarker.*GLatLng\(\s*(-?\d+(?:\.\d+)?)\s*,\s*(-?\d+(?:\.\d+)?)/.match(script)
  # Fail with a descriptive error rather than a NoMethodError on nil.
  raise ArgumentError, "no GMarker/GLatLng coordinates found in script" unless m
  [m[1].to_f, m[2].to_f]
end
# Crawl the directory four levels deep and store one document per court:
# 1) country -> 2) state -> 3) city -> 4) location -> {{ court info nugget }}

# Selector for the listing links; the country, state and city pages all use it.
LISTING_LINKS = "tr td.leftspace a"

# The first six `td.leftAlign` cells of a court page hold these values, in
# this order (found by trial and error, per the original author).
COURT_FIELDS = [:location_type, :number_of_courts, :indoor, :membership_fees, :court_type, :lights]

# Builds a display name from a listing href: takes the last path segment,
# drops its first dash-separated token (e.g. the "223" in "223-united-states")
# and joins the remaining tokens with spaces.
def name_from_href(href)
  href.split("/")[-1].split("-")[1..-1].join(" ")
end

# Converts the text of one detail cell into the value stored for +key+:
# "Yes"/"No" become booleans, :indoor becomes true when the text starts with
# a positive integer, any other text containing a digit is stored as
# String#to_i of the whole text, and everything else is stored lowercased.
def court_field_value(key, val)
  return true if val == "Yes"
  return false if val == "No"
  return val.to_i > 0 if key == :indoor
  return val.to_i if val =~ /\d/
  val.downcase
end

# 1) country
get_doc(USA_URL).css(LISTING_LINKS).each do |state|
  # 2) state
  state_name = name_from_href(state["href"])
  puts "...getting #{state_name}"
  get_doc(state["href"]).css(LISTING_LINKS).each do |city|
    # 3) city
    city_name = name_from_href(city["href"])
    puts "...getting #{city_name}"
    get_doc(city["href"]).css(LISTING_LINKS).each do |location|
      # 4) court location
      puts "...getting #{location['href']}"
      court = get_doc(location["href"])

      # Fresh local hash per court, so nothing carries over between iterations.
      data = {}
      data[:state] = state_name
      # The state name is removed as a literal string, not as a regex.
      # NOTE(review): every occurrence is removed, so a city whose own name
      # contains the state name loses that part too; confirm the slug layout
      # before anchoring this to one end.
      data[:city] = city_name.gsub(state_name, "").strip
      # title
      data[:name] = court.at_css("tr td.leftspace h1").content

      # details
      cells = court.css("tr td.leftAlign")
      COURT_FIELDS.each_with_index do |key, i|
        data[key] = court_field_value(key, cells[i].content.strip)
      end

      # The last script inside a td is the one searched for the map marker.
      # NOTE(review): stored latitude-first; MongoDB's 2d index documentation
      # describes longitude-first ordering, so confirm queries expect this.
      lat, long = get_geo_coords(court.css("td script")[-1])
      data[:location] = { :lat => lat, :long => long }
      data[:country] = "usa"

      puts "inserting #{data}"
      # finally, insert into mongo for later
      Courts.insert data
    end
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment