Created
June 2, 2014 15:23
-
-
Save camallen/ed512ee92d2e2d61320c to your computer and use it in GitHub Desktop.
Example project JSON manifest script
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby | |
# Create a JSON file of all the penguin subject images in the s3 bucket | |
# combined with the per site/filename provided metadata in CSV files. | |
require 'aws-sdk' | |
require 'json' | |
require 'csv' | |
require 'active_support' | |
require 'zooniverse_data' | |
require 'pry' | |
# Builds a JSON manifest of the penguin subject images stored in S3, pairing
# each image with the metadata row found for it in its folder's CSV file.
class PenguinJsonMetdata
  InvalidPenguinMetadata = Class.new(StandardError)
  DupImageName = Class.new(StandardError)

  # Column names allowed in the header row of a metadata CSV file.
  EXPECTED_CSV_METADATA = ["imageid", "datetime", "moon", "tempf"].freeze

  # Regexp source matching a metadata CSV file name. Single-quoted on purpose:
  # in a double-quoted string Ruby drops the backslash of `\.`, which turns the
  # intended literal dot into "any character".
  CSV_METADATA_FILE_NAME = 'zooniverse(_final|data)(_final)?\.(txt|csv)'.freeze

  # Key prefix of every penguin object in the bucket. The trailing slash keeps
  # the listing in step with strip_prefix_s3_object_key, which needs it.
  S3_KEY_PREFIX = 'project_data/penguin/'.freeze

  # Keys treated as subject images: anything ending in ".jpg" (any case).
  SUBJECT_IMAGE_PATTERN = /.+\.jpg\z/i

  # Same as SUBJECT_IMAGE_PATTERN but restricted to the demo site names.
  DEMO_SUBJECT_IMAGE_PATTERN = /.+(CUVEa2013|FORT|YALOa2013|MAIVb2012a).+\.jpg\z/i

  # Configures the AWS client from the environment and sets the run options.
  #
  # options - optional Hash; the defaults reproduce the previous hard-coded values:
  #           :demo        - only include images of the demo sites (default true)
  #           :sample      - stop after :sample_size images (default false)
  #           :sample_size - number of images kept when sampling (default 200)
  def initialize(options = {})
    aws_config = { access_key_id: ENV['AWS_ACCESS_KEY'], secret_access_key: ENV['AWS_SECRET_KEY'] }
    AWS.config(aws_config)
    @csv_image_metadata = {}
    @demo = options.fetch(:demo, true)
    @sample = options.fetch(:sample, false)
    @sample_size = options.fetch(:sample_size, 200)
  end

  # Loads every metadata CSV, then writes the image manifest as JSON.
  #
  # output_path - file to write (defaults to 'penguin_files.json').
  def create_json_file(output_path = 'penguin_files.json')
    load_csv_image_metadata
    penguin_image_paths = construct_image_paths
    File.open(output_path, 'w') do |out|
      out.puts JSON.generate(penguin_image_paths)
    end
  end

  private

  # Returns the part of an S3 key that follows S3_KEY_PREFIX.
  # Raises ArgumentError when the key does not contain the prefix.
  def strip_prefix_s3_object_key(s3_obj_key)
    stripped = s3_obj_key[%r{#{Regexp.escape(S3_KEY_PREFIX)}(.+)}, 1]
    raise ArgumentError, "S3 key is not below #{S3_KEY_PREFIX}: #{s3_obj_key}" unless stripped
    stripped
  end

  # Returns one Hash per subject image: its CSV metadata plus a :path entry.
  # Raises InvalidPenguinMetadata when an image has no metadata row, instead
  # of failing with a NoMethodError on nil.
  def construct_image_paths
    images = []
    s3_bucket_objects.each do |obj|
      next unless subject_image?(obj)
      image_key = strip_prefix_s3_object_key(obj.key)
      metadata = @csv_image_metadata.fetch(image_key) do
        raise InvalidPenguinMetadata, "No CSV metadata found for image: #{image_key}"
      end
      # Non-mutating merge: the loaded metadata table is left untouched.
      images << metadata.merge({ path: image_key })
      break if @sample && images.length >= @sample_size
    end
    images
  end

  # Reads every metadata CSV in the bucket into @csv_image_metadata.
  def load_csv_image_metadata
    s3_bucket_objects.each do |obj|
      next unless obj.key =~ /#{CSV_METADATA_FILE_NAME}/i
      csv_file_data = CSV.parse(obj.read)
      metadata_csv_file_name = strip_prefix_s3_object_key(obj.key)
      header_row = csv_file_data.shift
      check_expected_metadata(header_row, metadata_csv_file_name)
      read_csv_file_rows(csv_file_data, metadata_csv_file_name)
    end
  end

  def s3
    @s3 ||= AWS::S3.new
  end

  def zooniverse_data_bucket
    @zoo_data_bucket ||= s3.buckets['zooniverse-data']
  end

  # All objects stored below the penguin project prefix.
  def s3_bucket_objects
    zooniverse_data_bucket.objects.with_prefix(S3_KEY_PREFIX)
  end

  # Raises InvalidPenguinMetadata when the header row is missing (empty file)
  # or contains a column name outside EXPECTED_CSV_METADATA.
  def check_expected_metadata(first_row, s3_file_path)
    return if first_row && (first_row - EXPECTED_CSV_METADATA).empty?
    raise InvalidPenguinMetadata, "Unexpected first row of metadata file: #{s3_file_path}"
  end

  # Stores one metadata Hash per CSV row, keyed by "<csv folder>/<image name>".
  # Raises DupImageName when two rows resolve to the same key.
  def read_csv_file_rows(csv_file_data, metadata_csv_file_name)
    csv_file_data.each do |row|
      # Rows without the leading index column start with the image name; pad
      # them so the columns line up with the five-column layout.
      cells = row.length != 5 && row.first.to_s =~ /.+\.JPG/i ? [nil] + row : row
      values = cells.map { |val| missing_value?(val) ? nil : val }
      image_folder_index, image_file_name, image_datetime, moon, tempf = values
      # Blank rows and rows without an image name cannot be tied to an image.
      next if image_file_name.nil?
      image_file_name_key = construct_image_file_name_key(metadata_csv_file_name, image_file_name)
      # Check against the key the record is stored under; checking the bare
      # file name never matches a stored key.
      check_duplicate_data(image_file_name_key, values)
      @csv_image_metadata[image_file_name_key] = { index: image_folder_index, timestamp: image_datetime, lunar_phase: moon, temperature_f: tempf }
    end
  end

  # True for nil, empty strings and the "NA" placeholder (any case). The
  # pattern is anchored so values that merely contain "na" are kept.
  def missing_value?(val)
    val.nil? || val.empty? || !(val =~ /\A\s*na\s*\z/i).nil?
  end

  # Raises DupImageName when a record is already stored under image_file_name.
  def check_duplicate_data(image_file_name, new_data)
    return unless @csv_image_metadata.key?(image_file_name)
    raise DupImageName, "Exisiting record: #{image_file_name} - #{@csv_image_metadata[image_file_name]}, New Record: #{image_file_name} - #{new_data}"
  end

  # Joins the folder of the metadata CSV with the image file name. A CSV with
  # no folder component yields the bare image file name.
  def construct_image_file_name_key(metadata_csv_file_name, image_file_name)
    metadata_prefix_name = metadata_csv_file_name[/(.+)\/#{CSV_METADATA_FILE_NAME}/i, 1]
    metadata_prefix_name ? "#{metadata_prefix_name}/#{image_file_name}" : image_file_name
  end

  # True when the S3 object is a subject image (restricted to the demo sites
  # when @demo is set).
  def subject_image?(s3_obj)
    pattern = @demo ? DEMO_SUBJECT_IMAGE_PATTERN : SUBJECT_IMAGE_PATTERN
    !(s3_obj.key =~ pattern).nil?
  end
end
# Build the manifest only when this file is run as a script, so requiring it
# (e.g. from a console or another script) does not trigger S3 access.
PenguinJsonMetdata.new.create_json_file if __FILE__ == $PROGRAM_NAME
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment