Skip to content

Instantly share code, notes, and snippets.

@D3MZ
Created March 19, 2013 18:11
Show Gist options
  • Save D3MZ/5198592 to your computer and use it in GitHub Desktop.
Save D3MZ/5198592 to your computer and use it in GitHub Desktop.
# encoding: UTF-8
%w{mechanize pp mongo peach}.each { |x| require x }
# Pull the Mongo namespace into scope so MongoClient can be referenced directly.
include Mongo

# Shared Mechanize HTTP client; identifies itself with the 'Mac Safari' user-agent alias.
@agent = Mechanize.new do |browser|
  browser.user_agent_alias = 'Mac Safari'
end

# Handle to 'cities' inside 'google' on the local MongoDB server (localhost:27017).
@coll = MongoClient.new('localhost', 27017)['google']['cities']
# Returns the links on +page+ whose string form is one or two characters long
# (presumably the two-letter state abbreviations -- confirm against the site).
#
# @param page [#links] a fetched page exposing its links
# @return [Array] the links whose +to_s+ is non-empty and at most 2 characters
def states_in(page)
  page.links.select do |link|
    label = link.to_s
    !label.empty? && label.length <= 2
  end
end
# Follows a state link and returns the links on the resulting page that look
# like city pages: the link URI must start with a capital letter, followed by
# at least one word character, followed by ".htm".
#
# @param state [#click] a link that, when clicked, returns the state's page
# @return [Array] the matching links from the state's page
def cities_in(state)
  state_page = state.click
  state_page.links.select do |link|
    href = link.uri.to_s.encode('utf-8', :invalid => :replace, :undef => :replace)
    href[/^[A-Z]\w+\.htm/]
  end
end
# Converts a coordinate carrying a hemisphere letter into a signed numeric
# string. Despite the method name, the result is a String, not an Integer.
#
# All N/S/E/W letters (any case) and all whitespace are removed. If the input
# contained a W or S, a single "-" is prepended, so the sign ends up at the
# front whether the hemisphere letter came before or after the number
# (e.g. both "W 86.80" and "86.80 W" give "-86.80").
#
# @param string [String] raw coordinate text
# @return [String] the coordinate without letters/whitespace, "-"-prefixed for W/S
def coordinate_to_integer(string)
  magnitude = string.gsub(/[NSEW\s]/i, "")
  string =~ /[WS]/i ? "-#{magnitude}" : magnitude
end
# Extracts a location record from a city page.
#
# Latitude and longitude are read from the first and second matching table
# cells and passed through coordinate_to_integer. The city and state come from
# the "Location of <city>, <state>" heading, which is split on commas so that
# multi-word names ("Fort Payne", "New Hampshire") are kept whole.
#
# @param page [#search, #uri] a fetched city page
# @return [Hash] the document to store; +_id+ is "<lat>,<lng>", and +city+ /
#   +state+ are nil when the heading is empty
def location_in(page)
  lat = coordinate_to_integer(page.search("td div td:nth-child(1) strong").text.strip)
  lng = coordinate_to_integer(page.search('td div td:nth-child(2) strong').text.strip)

  heading = page.search('p font strong').text.strip.gsub(/Location of\n\s*/, "")
  # First comma-separated part is the city, last is the state; with no comma
  # both are the same string.
  parts = heading.split(",").map(&:strip).reject(&:empty?)

  {
    latitude: lat,
    # NOTE(review): "longtitude" is misspelled, but it is the field name this
    # script writes to the collection, so it is left unchanged here.
    longtitude: lng,
    uri: page.uri.to_s,
    _id: "#{lat},#{lng}",
    date: Time.now,
    city: parts.first,
    state: parts.last
  }
end
# Follows a city link, extracts its location record and saves it to @coll,
# printing whatever the save call returns.
#
# Best-effort: any StandardError raised along the way is printed and followed
# by a 3-second pause instead of being propagated.
#
# @param page [#click] a link to a city page
# @return [Object] the value returned by @coll.save, or the result of +sleep+
#   when an error was rescued
def get_and_save_location_in(page)
  begin
    city_page = page.click
    document = location_in(city_page)
    p(@coll.save(document))
  rescue => error
    p(error)
    sleep(3)
  end
end
# Reports whether a document with the given +uri+ is already stored in @coll.
#
# @param uri [String] the page URI to look up (compared against the +uri+ field)
# @return [Boolean] true when find_one returns a document, false when it returns nil
def in_db?(uri)
  # The fields option limits what is fetched; only the existence of a match matters.
  !@coll.find_one({ uri: uri }, fields: { id: 1 }).nil?
end
# Entry point: start from the Alabama index page, walk every state link found
# there, and save each city page that is not stored yet.
page = @agent.get "http://citylatitudelongitude.com/AL/index.htm"
states_in(page).each do |state|
  # peach comes from the peach gem; presumably it runs the block on up to 10
  # threads at a time -- confirm against the gem's documentation.
  cities_in(state).peach(10) do |city|
    # Resolve the city link against the page it was found on, so the lookup
    # uses the same absolute URI that location_in stores. A fixed "/AL/" prefix
    # only matches pages that actually live under /AL/.
    city_url = city.page.uri.merge(city.uri.to_s).to_s
    get_and_save_location_in city unless in_db? city_url
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment