Skip to content

Instantly share code, notes, and snippets.

@camallen
Created June 2, 2014 15:23
Show Gist options
  • Save camallen/ed512ee92d2e2d61320c to your computer and use it in GitHub Desktop.
Example project JSON manifest script
#!/usr/bin/env ruby
# Create a JSON file of all the penguin subject images in the s3 bucket
# combined with the per site/filename provided metadata in CSV files.
require 'aws-sdk'
require 'json'
require 'csv'
require 'active_support'
require 'zooniverse_data'
require 'pry'
# Builds a JSON manifest of the penguin subject images stored in the
# zooniverse-data S3 bucket, joining each image path with the per-site
# metadata parsed from the CSV files that live alongside the images.
class PenguinJsonMetdata
  # Raised when a metadata CSV's header row contains unexpected columns.
  InvalidPenguinMetadata = Class.new(StandardError)
  # Raised when two CSV rows resolve to the same image key.
  DupImageName = Class.new(StandardError)

  EXPECTED_CSV_METADATA = %w[imageid datetime moon tempf].freeze
  # Matches the per-site metadata file names, e.g. "zooniversedata.csv",
  # "zooniverse_final.txt", "zooniversedata_final.csv".
  CSV_METADATA_FILE_NAME = 'zooniverse(_final|data)(_final)?\.(txt|csv)'.freeze

  # Reads AWS credentials from the environment. Toggle @demo / @sample to
  # restrict which images end up in the manifest.
  def initialize
    aws_config = {
      access_key_id: ENV['AWS_ACCESS_KEY'],
      secret_access_key: ENV['AWS_SECRET_KEY']
    }
    AWS.config(aws_config)
    @csv_image_metadata = {}
    @demo = true        # restrict to the four demo sites (see #subject_image?)
    @sample = false     # when true, stop after @sample_size images
    @sample_size = 200
  end

  # Writes penguin_files.json: a JSON array of one hash per subject image,
  # each carrying the image path plus its CSV metadata.
  def create_json_file
    load_csv_image_metadata
    penguin_image_paths = construct_image_paths
    File.open('penguin_files.json', 'w') do |out|
      out.puts JSON.generate(penguin_image_paths)
    end
  end

  private

  # Drops the "project_data/penguin/" prefix from an S3 object key.
  def strip_prefix_s3_object_key(s3_obj_key)
    s3_obj_key.match(%r{project_data/penguin/(.+)})[1]
  end

  # Walks the bucket and pairs every subject image with its CSV metadata.
  # Honours @sample/@sample_size to cap the number of images collected.
  def construct_image_paths
    [].tap do |images|
      subject_count = 0
      s3_bucket_objects.each do |obj|
        next unless subject_image?(obj)
        csv_metadata_key = strip_prefix_s3_object_key(obj.key)
        # fetch raises a clear KeyError when an image has no CSV metadata row
        # (previously this surfaced as a confusing NoMethodError on nil);
        # merge (not merge!) avoids mutating the stored metadata hash.
        metadata = @csv_image_metadata.fetch(csv_metadata_key)
        images << metadata.merge(path: csv_metadata_key)
        subject_count += 1
        break if @sample && subject_count == @sample_size
      end
    end
  end

  # Finds every metadata CSV in the bucket, validates its header row and
  # loads its rows into @csv_image_metadata keyed by "<site prefix>/<file>".
  def load_csv_image_metadata
    s3_bucket_objects.each do |obj|
      next unless obj.key.match(/#{CSV_METADATA_FILE_NAME}/i)
      csv_file_data = CSV.parse(obj.read)
      metadata_csv_file_name = strip_prefix_s3_object_key(obj.key)
      header_row = csv_file_data.shift
      check_expected_metadata(header_row, metadata_csv_file_name)
      read_csv_file_rows(csv_file_data, metadata_csv_file_name)
    end
  end

  def s3
    @s3 ||= AWS::S3.new
  end

  def zooniverse_data_bucket
    @zoo_data_bucket ||= s3.buckets['zooniverse-data']
  end

  def s3_bucket_objects
    zooniverse_data_bucket.objects.with_prefix('project_data/penguin')
  end

  # Rejects any header row that contains columns outside the expected set.
  def check_expected_metadata(first_row, s3_file_path)
    unless (first_row - EXPECTED_CSV_METADATA).empty?
      raise InvalidPenguinMetadata, "Unexpected first row of metadata file: #{s3_file_path}"
    end
  end

  # Parses CSV data rows, normalising blank / "NA" cells to nil, and stores
  # each row's metadata under its prefixed image key.
  def read_csv_file_rows(csv_file_data, metadata_csv_file_name)
    csv_file_data.each do |row|
      # Some rows are missing the leading folder-index column; pad with nil so
      # the destructure below lines up when the first field is the image name.
      row.unshift(nil) if row.length != 5 && row.first.match(/.+\.JPG/i)
      image_folder_index, image_file_name, image_datetime, moon, tempf =
        row.map! { |val| val && (val.empty? || val.match(/na/i)) ? nil : val }
      image_file_name_key = construct_image_file_name_key(metadata_csv_file_name, image_file_name)
      # BUG FIX: duplicate detection must use the prefixed key the hash is
      # actually indexed by; checking the bare file name never matched.
      check_duplicate_data(image_file_name_key, row)
      @csv_image_metadata[image_file_name_key] = {
        index: image_folder_index,
        timestamp: image_datetime,
        lunar_phase: moon,
        temperature_f: tempf
      }
    end
  end

  # Raises DupImageName if a record is already stored under image_key.
  def check_duplicate_data(image_key, new_data)
    return unless @csv_image_metadata.key?(image_key)
    raise DupImageName, "Existing record: #{image_key} - #{@csv_image_metadata[image_key]}, New Record: #{image_key} - #{new_data}"
  end

  # "FORT/zooniversedata.csv" + "img.jpg" => "FORT/img.jpg"
  def construct_image_file_name_key(metadata_csv_file_name, image_file_name)
    metadata_prefix_name = metadata_csv_file_name.match(/(.+)\/#{CSV_METADATA_FILE_NAME}/i)[1]
    metadata_prefix_name + "/#{image_file_name}"
  end

  # In demo mode only images from the four named sites count as subjects.
  # BUG FIX: escape the dot before "jpg" so keys like "abcXjpg" don't match.
  def subject_image?(s3_obj)
    match_key = @demo ? '.+(CUVEa2013|FORT|YALOa2013|MAIVb2012a).+\.jpg' : '.+\.jpg'
    s3_obj.key.match(/#{match_key}/i)
  end
end
PenguinJsonMetdata.new.create_json_file
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment