Skip to content

Instantly share code, notes, and snippets.

@camallen
Created July 26, 2018 10:12
Show Gist options
  • Save camallen/3092097383fe8971ab233348c6304f5b to your computer and use it in GitHub Desktop.
Save camallen/3092097383fe8971ab233348c6304f5b to your computer and use it in GitHub Desktop.
Extract Galaxy Zoo Subject location data
# Yields every document matching +query+ in +collection+ to the given block,
# one at a time, printing a progress line to stdout after every 100th document.
#
# NOTE(review): the cursor calls used here (has_next? / next_document) look
# like the legacy 1.x Mongo Ruby driver API -- confirm against the Gemfile.
#
# @param collection [#find] object whose +find(query)+ result responds to
#   +count+ and whose +find(query, opts)+ yields a cursor to its block
# @param query [Hash] selector handed to +find+ unchanged
# @param opts [Hash] extra options for +find+; +timeout: false+ is added unless
#   the caller supplies its own +:timeout+. The caller's hash is never modified.
# @param message [String] prefix for the progress line
# @yield [doc] each document returned by the cursor
# @raise [ArgumentError] when no collection is given
def _iterate_cursor(collection: nil, query: { }, opts: { }, message: '')
  raise ArgumentError, 'collection is required' if collection.nil?

  # Build a fresh hash instead of merging in place, so the caller's opts
  # (possibly shared or frozen) stay untouched; caller-supplied keys win.
  find_opts = { timeout: false }.merge(opts)
  index = 0
  total = collection.find(query).count
  message = "#{ message } Galaxy Zoo Subjects"

  collection.find(query, find_opts) do |cursor|
    while cursor.has_next?
      index += 1
      # Only report every 100th document to keep the output readable.
      if index % 100 == 0
        progress = '%0.3f' % ((index / total.to_f) * 100)
        puts "#{ message }: [#{ progress }%]"
      end
      yield cursor.next_document
    end
  end
end
require 'csv'

# Writes one CSV row per Galaxy Zoo subject document: the document id followed
# by the "standard", "thumbnail" and "inverted" entries of its "location" hash.
loc_headers = %w(standard thumbnail inverted)
# Plain concatenation: Array#| is a set union and silently drops duplicates.
csv_headers = ["id"] + loc_headers

CSV.open("log/gz_subject_locations.csv", "w") do |csv|
  csv << csv_headers
  _iterate_cursor(collection: GalaxyZooSubject.collection, message: 'Processing') do |doc|
    # A document without a "location" entry yields blank cells rather than
    # aborting the whole export with a NoMethodError on nil.
    locations = doc["location"] || {}
    subject_locs = loc_headers.map { |attr_name| locations[attr_name] }
    # Always emit id + one cell per header so columns stay aligned, even when
    # several locations are equal or missing (a union would collapse them).
    csv << [doc["_id"].to_s, *subject_locs]
    # break # uncomment to stop after the first row while testing
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment