Created
March 19, 2013 18:11
-
-
Save D3MZ/5198592 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# encoding: UTF-8 | |
%w{mechanize pp mongo peach}.each { |x| require x } | |
include Mongo | |
# Shared HTTP client used for every page fetch; it presents Mechanize's
# 'Mac Safari' user-agent alias.
@agent = Mechanize.new do |browser|
  browser.user_agent_alias = 'Mac Safari'
end

# Destination collection: database 'google', collection 'cities' on the
# MongoDB server at localhost:27017.
mongo = MongoClient.new('localhost', 27017)
@coll = mongo['google']['cities']
# Picks the links on +page+ whose text (+link.to_s+) is one or two characters
# long -- presumably the two-letter state abbreviations in the site's
# navigation (verify against the live page).
#
# @param page [#links] page object exposing a list of links
# @return [Array] the links with non-empty text of at most two characters
def states_in(page)
  page.links.select do |link|
    (1..2).cover?(link.to_s.length)
  end
end
# Follows a state link and returns the city links found on the resulting page.
#
# A link counts as a city link when its href, re-encoded to UTF-8 with
# invalid/undefined bytes replaced, begins with an uppercase letter followed
# by word characters and ".htm" (e.g. "Abbeville.htm").
#
# @param state [#click] link to a state's listing page
# @return [Array] links on the listing page whose href matches the pattern
def cities_in(state)
  listing = state.click
  listing.links.select do |link|
    href = link.uri.to_s.encode('utf-8', :invalid => :replace, :undef => :replace)
    href[/^[A-Z]\w+\.htm/]
  end
end
# Converts a hemisphere-tagged coordinate into a signed decimal string.
#
# The hemisphere letter (N/S/E/W, any case) and all whitespace are removed.
# When the text contains W or S a single "-" is placed in FRONT of the
# number, wherever the letter appeared: both "W 86.30" and "86.30 W" give
# "-86.30". (Previously the letter was swapped for "-" in place, so a
# trailing hemisphere produced "86.30-".)
#
# Despite the name, the result is a String, not an Integer.
#
# @param string [String] coordinate text such as "32.36 N" or "W 86.30"
# @return [String] the coordinate with an optional leading minus sign
def coordinate_to_integer(string)
  sign = (string =~ /[WS]/i) ? "-" : ""
  magnitude = string.gsub(/[NSEW]/i, "").gsub(/\s+/, "")
  "#{sign}#{magnitude}"
end
# Builds the document stored for one city page.
#
# Latitude and longitude are read from the first and second coordinate cells
# and normalised with +coordinate_to_integer+. The heading text has its
# "Location of" prefix removed and is then split on commas; the first piece
# is the city and the last is the state. Splitting on the comma keeps
# multi-word and hyphenated names intact ("New York", "Winston-Salem").
#
# @param page [#search, #uri] a fetched city page
# @return [Hash] document with :latitude, :longtitude, :uri, :_id, :date,
#   :city and :state; :city/:state are nil when the heading is empty
def location_in(page)
  lat = coordinate_to_integer page.search("td div td:nth-child(1) strong").text.strip
  lng = coordinate_to_integer page.search('td div td:nth-child(2) strong').text.strip

  heading = page.search('p font strong').text.strip.gsub(/Location of\n\s*/, "")
  parts = heading.split(",").map(&:strip).reject(&:empty?)

  {
    latitude: lat,
    # NOTE(review): key is misspelled ("longtitude"); kept as-is because
    # documents already stored may use this name.
    longtitude: lng,
    uri: page.uri.to_s,
    # The coordinate pair serves as the document id.
    _id: "#{lat},#{lng}",
    date: Time.now,
    city: parts.first,
    state: parts.last
  }
end
# Follows a city link, extracts its location document and saves it to @coll,
# printing the result of the save.
#
# Best effort: any StandardError is printed and followed by a 3-second pause
# instead of being raised to the caller.
#
# @param page [#click] link to a city page
def get_and_save_location_in(page)
  begin
    record = location_in(page.click)
    p(@coll.save(record))
  rescue => error
    p error
    sleep 3
  end
end
# Reports whether a document with the given page URI is already stored.
#
# Only the +_id+ field is requested, since the document's existence is all
# that matters here.
#
# @param uri [String] absolute URL of a city page
# @return [Boolean] true when @coll holds a document whose :uri equals +uri+
def in_db?(uri)
  !@coll.find_one({ uri: uri }, fields: { _id: 1 }).nil?
end
# Entry point: start from the Alabama index page, follow every state link
# found on it, and store each city page that is not yet in the database.
page = @agent.get "http://citylatitudelongitude.com/AL/index.htm"
states_in(page).each do |state|
  # peach comes from the 'peach' gem; presumably it runs the block on up to
  # 10 threads at a time -- confirm against the gem's documentation.
  cities_in(state).peach(10) do |city|
    # Resolve the relative city href against the page that listed it, so the
    # duplicate check uses the right state directory. (A hard-coded "/AL/"
    # prefix only ever matched Alabama cities.)
    city_url = city.page.uri.merge(city.uri).to_s
    get_and_save_location_in city unless in_db? city_url
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment