Created
November 15, 2010 06:19
-
-
Save jkeefe/700103 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'rubygems'
require 'mongo'

# Prints long, early-morning taxi trips from the "trips" collection of the
# "taxi" MongoDB database, one comma-separated line per trip:
#
#   pickup time, distance, lat lon, $fare, pickup address, dropoff address
#
# Records are found with: coll.find({selector}, {options})

db = Mongo::Connection.new.db("taxi")
coll = db.collection("trips")

# The pickup-time field name begins with the bytes EF BB BF (octal
# \357\273\277), which is the UTF-8 byte-order mark, so the name has to be
# spelled out byte for byte every time the field is referenced.
# NOTE(review): presumably the BOM leaked in from the header row of the
# imported data file -- confirm.
PICKUP_TIME_FIELD = "\357\273\277Trip_Pickup_DateTime"

# Only times between 4 a.m. and 6 a.m. are wanted. In a value such as
# "2009-03-06 04:03:42" the hour is either 04 or 05, so the pattern means:
# (exactly 11 of any character) (then a 0) (then 4 to 5)
# (then exactly 6 more of any character).
EARLY_MORNING = /(.{11})([0])([4-5])(.{6})/

# Selector:
#   "Trip_Distance" => {"$not" => {"$type" => 2}, "$gt" => 10}
#     skips records whose distance isn't a number but a string ($type 2),
#     like "8.99999999999E-2", and keeps only trips over 10 miles.
#     Both conditions live in ONE hash on purpose: writing "Trip_Distance"
#     as two separate keys of the selector makes Ruby keep only the last
#     one, silently dropping the other condition.
#   "Fare_Amt" => {"$gt" => 4}
#     picks only cases where the fare was greater than $4 (because there
#     were some long, zero or $3 fares. Driver's trip in?)
SELECTOR = {
  "Trip_Distance" => {"$not" => {"$type" => 2}, "$gt" => 10},
  PICKUP_TIME_FIELD => EARLY_MORNING,
  "Fare_Amt" => {"$gt" => 4}
}

# Pull out only certain fields.
FIELDS = [
  PICKUP_TIME_FIELD,
  "Trip_Distance",
  "Start_Lon",
  "Start_Lat",
  "Fare_Amt",
  "Trip_Pickup_Location",
  "Trip_Dropoff_Location"
]

# History: an earlier version of this query had no distance threshold.
# Instead it sorted with the longest trips at the top/first and capped the
# result, using the options
#   {:sort => ["Trip_Distance", :descending], :fields => [...]}
# followed by .limit(3000). The sort was then taken out in favour of just
# choosing distances over 10 miles, and Trip_Pickup_Location and
# Trip_Dropoff_Location were added to the output.

# Tidies an address for comma-separated output.
#
# Removes commas (they would be mistaken for field separators), collapses
# each run of whitespace to a single space and strips the padding that was
# tacked onto the end of the stored values. A missing address (nil) becomes
# an empty string instead of raising.
def clean_address(address)
  address.to_s.delete(',').gsub(/\s+/, ' ').strip
end

# Builds the output line for one trip document:
#   pickup time, distance, lat lon, $fare, pickup address, dropoff address
def format_trip(row)
  [
    row[PICKUP_TIME_FIELD],
    row["Trip_Distance"],
    "#{row["Start_Lat"]} #{row["Start_Lon"]}",
    "$#{row["Fare_Amt"]}",
    clean_address(row["Trip_Pickup_Location"]),
    clean_address(row["Trip_Dropoff_Location"])
  ].join(", ")
end

coll.find(SELECTOR, {:fields => FIELDS}).each do |row|
  puts format_trip(row)
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment