Created
April 23, 2009 15:41
-
-
Save judy-zz/100559 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby | |
# Take the "combined.csv" file, and reorganize and annotate it into the
# finished data file that the group wants.
require 'date' | |
require 'logger' | |
require 'rubygems' | |
require 'fastercsv' | |
require 'progressbar' | |
require 'active_support' | |
INPUT_FILE = 'output/combined.csv' | |
OUTPUT_FILE = 'output/background_questionnaires.csv' | |
LOG_FILE = 'transform.log' | |
# Wrap FasterCSV::Row#[] so that every lookup records the column being read
# in the global $current_column. The file's warn helper puts that global in
# each log message, so a warning can be traced back to the column involved.
class FasterCSV
  class Row
    # Remember which column is being read, then delegate to the original #[].
    #
    # index_or_header - the header name or numeric index being looked up.
    # extra           - any further arguments (the original #[] also accepts
    #                   an optional minimum index); forwarded unchanged so the
    #                   wrapper does not narrow the original signature.
    #
    # Returns whatever the original #[] returns for the lookup.
    def fetch_with_remember_column(index_or_header, *extra)
      $current_column = index_or_header
      fetch_without_remember_column(index_or_header, *extra)
    end

    # Order matters: keep a handle on the original #[] before replacing it.
    alias_method :fetch_without_remember_column, :[]
    alias_method :[], :fetch_with_remember_column
  end
end
# Set up logging: remove any log file left over from a previous run, then
# open a fresh logger that records messages of WARN severity and above.
File.delete(LOG_FILE) if File.exist?(LOG_FILE)
$log = Logger.new(LOG_FILE).tap { |logger| logger.level = Logger::WARN }
# Write a warning to the global log, prefixed with the current values of
# $input_line_number and $current_column so the message can be located in
# the input file.
#
# msg - the warning text.
def warn(msg)
  $log.warn(format('%s:%s: %s', $input_line_number, $current_column, msg))
end
# Return as a three-element array the number of years, months, and days
# between the two given dates, or nil if either date is nil.
#
# Time is counted from the "from" date, in the order of years, months, and
# days. This order is significant since leap days pass unnoticed when they
# are "absorbed" by a year or month step. For example, 2007 to 2008 is a
# year and February 15th to March 15th is a month, whether or not a leap
# day falls in between. A leap day is observed when it falls within the
# days portion: February 20th to March 3rd is 12 days in a leap year and
# 11 days otherwise.
#
# from_date - the earlier date (anything responding to year, month, day).
# to_date   - the later date (same requirements).
#
# Returns [years, months, days], or nil when either argument is nil.
def age(from_date, to_date)
  return nil if from_date.nil? || to_date.nil?

  from = from_date
  to = to_date

  age_years = to.year - from.year
  age_months = to.month - from.month

  # Carry a year when the "to" month comes earlier in the calendar.
  if age_months < 0
    age_years -= 1
    age_months += 12
  end

  if from.day > to.day
    # Carry a month, and possibly a year again.
    age_months -= 1
    if age_months < 0
      age_years -= 1
      age_months += 12
    end

    # Number of days in the month before the "to" month: step back one day
    # from the first of the "to" month and read the day number.
    days_in_previous_month = (Date.new(to.year, to.month, 1) - 1).day

    # Days left in that previous month after the "from" day. When the "from"
    # day does not exist in that month (e.g. the 31st against February) the
    # remainder is clamped to zero so the day count can never go negative.
    remainder = [days_in_previous_month - from.day, 0].max
    age_days = to.day + remainder
  else
    age_days = to.day - from.day
  end

  [age_years, age_months, age_days]
end
# Return the span between the two dates as a whole number of months, as
# computed from the years/months/days triple that age returns.
#
# A days component greater than 15 rounds the result up by one month.
# Returns nil when age yields no result.
def chronological_time(from_date, to_date)
  years, months, days = age(from_date, to_date)
  return nil if years.nil?

  whole_months = (years * 12) + months
  # More than halfway through the month counts as a full extra month.
  days > 15 ? whole_months + 1 : whole_months
end
# Turn a date string from the csv file into a Date object.
#
# string - the text to parse; may be nil when the field is absent.
#
# Returns the parsed Date, or nil when the value is nil or cannot be parsed
# as a date. The second argument to Date.parse (true) asks it to expand
# two-digit years.
def date_from_string(string)
  return nil if string.nil?

  Date.parse(string, true)
rescue ArgumentError, TypeError
  # ArgumentError: unparseable text; TypeError: a non-string value.
  nil
end
# Turn a date/time string from the csv file into a DateTime object.
#
# string - the text to parse; may be nil when the field is absent.
#
# Returns the parsed DateTime, or nil when the value is nil or cannot be
# parsed. The second argument to DateTime.parse (true) asks it to expand
# two-digit years.
def datetime_from_string(string)
  return nil if string.nil?

  DateTime.parse(string, true)
rescue ArgumentError, TypeError
  # ArgumentError: unparseable text; TypeError: a non-string value.
  nil
end
# Convert the given country string into its numeric country code.
#
# The match is exact (case-sensitive). A blank string stays blank. Any other
# value triggers a warning and is returned unchanged.
def codify_country(country)
  country_codes = {
    "UnitedStatesMainland" => 1,
    "PuertoRico"           => 2,
    "Cuba"                 => 3,
    "Mexico"               => 4,
    "Other"                => 5,
    ''                     => ''
  }

  country_codes.fetch(country) do
    warn "Unrecognized country \"#{country}\""
    country
  end
end
# Codify the given state string into its numeric state code.
#
# Matching ignores case. Each code lists every spelling accepted for it. A
# blank string stays blank; anything else triggers a warning and is returned
# unchanged.
def codify_state(state)
  spellings_by_code = [
    [1,  ["fl", "fr", "florida", "florida ."]],
    [2,  ["nm", "new mexico"]],
    [3,  ["pa", "pennsylvania", "lancaster,pa"]],
    [4,  ["ma", "massachusetts"]],
    [5,  ["ny", "new york"]],
    [6,  ["co", "colorado"]],
    [7,  ["tx", "texas"]],
    [8,  ["ca", "california"]],
    [9,  ["il", "illinois"]],
    [10, ["ga", "georgia"]],
    [11, ["nj", "ns", "new jersey"]],
    [12, ["ct", "connecticut"]],
    ['', ['']]
  ]

  wanted = state.downcase
  hit = spellings_by_code.find { |_code, spellings| spellings.include?(wanted) }
  return hit.first if hit

  warn "Unrecognized state \"#{state}\""
  state
end
# Codify how a length of stay is expressed: 1 for "frombirth", 2 for a
# counted unit (weeks, months, or years). Matching ignores case. A blank
# string stays blank; anything else triggers a warning and is returned
# unchanged.
def codify_time_in_location(string)
  unit = string.downcase

  return 1 if unit == "frombirth"
  return 2 if ["weeks", "months", "years"].include?(unit)
  return '' if unit == ''

  warn "Unrecognized duration \"#{string}\""
  string
end
# Codify a yes/no answer: 1 for yes/true, 2 for no/false. Matching ignores
# case. Blank, "n/a", "na" and "dont_know" all become blank; anything else
# triggers a warning and is returned unchanged.
def codify_boolean(b)
  answers = {
    "yes" => 1, "true" => 1,
    "no" => 2, "false" => 2,
    "" => '', "n/a" => '', "na" => '', "dont_know" => ''
  }

  answers.fetch(b.downcase) do
    warn "Unrecognized boolean \"#{b}\""
    b
  end
end
# Codify a caregiver relation given in English or Spanish. Matching ignores
# case. A blank string stays blank; anything else triggers a warning and is
# returned unchanged.
def codify_relation(relation)
  names_by_code = [
    [1, ["friend", "amigo", "amiga"]],
    [2, ["neighbor", "vecino"]],
    [3, ["grandmother", "abuela"]],
    [4, ["aunt", "tia"]],
    [5, ["cousin", "primo", "prima"]],
    [6, ["family member", "varios miembros"]],
    [7, ["daycare", "cuidado de ninos"]],
    [8, ["other"]],
    ['', [""]]
  ]

  wanted = relation.downcase
  names_by_code.each do |code, names|
    return code if names.include?(wanted)
  end

  warn "Unrecognized relation \"#{relation}\""
  relation
end
# Codify an extended-family relation. The code is the 1-based position of
# the relation in the list below. Matching ignores case. A blank string
# stays blank; anything else triggers a warning and is returned unchanged.
def codify_extended_relation(relation)
  known = [
    "grandmother maternal",
    "grandmother paternal",
    "mother's sister",
    "mother's aunt",
    "mother's cousin",
    "father's sister",
    "father's aunt",
    "father's cousin",
    "close friend",
    "other"
  ]

  wanted = relation.downcase
  return '' if wanted == ""

  position = known.index(wanted)
  return position + 1 if position

  warn "Unrecognized extended relation \"#{relation}\""
  relation
end
# Codify a Spanish/English language mix on a scale from 1 (all Spanish) to
# 5 (all English). Matching ignores case. Blank and "dont_know" become
# blank; anything else triggers a warning and is returned unchanged.
def codify_language_mix(mix)
  scale = {
    "all_spanish"               => 1,
    "more_spanish_than_english" => 2,
    "more_spanish"              => 2,
    "equal_spanish_and_english" => 3,
    "equal"                     => 3,
    "more_english_than_spanish" => 4,
    "more_english"              => 4,
    "all_english"               => 5,
    ""                          => '',
    "dont_know"                 => ''
  }

  scale.fetch(mix.downcase) do
    warn "Unrecognized language mix \"#{mix}\""
    mix
  end
end
# Codify the name of an education program. Matching ignores case and accepts
# both spaced and underscored spellings where listed. Blank and "na" become
# blank; anything else triggers a warning and is returned unchanged.
def codify_ed_program(ed)
  programs = {
    "head start" => 1, "head_start"     => 1,
    "even start" => 2, "even_start"     => 2,
    "vpk"        => 3,
    "k2"         => 4, "kindergarten_2" => 4,
    "k4"         => 5, "kindergarten_4" => 5,
    "k5"         => 6, "kindergarten_5" => 6,
    "other"      => 7,
    ""           => '', "na"            => ''
  }

  programs.fetch(ed.downcase) do
    warn "Unrecognized ed program \"#{ed}\""
    ed
  end
end
# Codify how many ears are affected: 1 for one ear, 2 for two/both ears.
# Matching ignores case. A blank string stays blank; anything else triggers
# a warning and is returned unchanged.
def codify_ears(ears)
  answer = ears.downcase

  if ["one ear", "one_ear"].include?(answer)
    1
  elsif ["two ears", "two_ears", "both ears"].include?(answer)
    2
  elsif answer == ""
    ''
  else
    warn "Unrecognized ears \"#{ears}\""
    ears
  end
end
# Codify which ear is affected: 1 left, 2 right, 3 both, 4 don't know.
# Matching ignores case. A blank string stays blank; anything else triggers
# a warning and is returned unchanged.
def codify_ear(ear)
  sides = {
    "left ear"   => 1, "left"      => 1,
    "right ear"  => 2, "right"     => 2,
    "both ears"  => 3, "both"      => 3,
    "don't know" => 4, "dont-know" => 4,
    ""           => ''
  }

  sides.fetch(ear.downcase) do
    warn "Unrecognized ear \"#{ear}\""
    ear
  end
end
# Codify the highest level of schooling into a band from 0 to 10. Numeric
# grades are grouped (1-6, 7-8, 9-11); named levels follow. Matching ignores
# case. A blank string stays blank; anything else triggers a warning and is
# returned unchanged.
def codify_schooling(ed)
  bands = [
    [0,  ["0"]],
    [1,  ["1", "2", "3", "4", "5", "6"]],
    [2,  ["7", "8"]],
    [3,  ["9", "10", "11"]],
    [4,  ["ged"]],
    [5,  ["high_school", "high school"]],
    [6,  ["some_college", "some college"]],
    [7,  ["associate"]],
    [8,  ["bachelors"]],
    [9,  ["masters"]],
    [10, ["phd", "md"]],
    ['', [""]]
  ]

  level = ed.downcase
  band = bands.find { |_code, levels| levels.include?(level) }
  return band.first if band

  warn "Unrecognized schooling \"#{ed}\""
  ed
end
# Convert a schooling level into a number of years of schooling. Numeric
# grades "0" through "11" count as that many years; named levels map to the
# fixed year counts listed below. Matching ignores case. A blank string
# stays blank; anything else triggers a warning and is returned unchanged.
def codify_school_years(ed)
  level = ed.downcase

  # Grades 0-11 are their own year count.
  grades = (0..11).map { |grade| grade.to_s }
  return level.to_i if grades.include?(level)

  years_by_level = {
    "ged"          => 12,
    "high_school"  => 12,
    "high school"  => 12,
    "some_college" => 13,
    "some college" => 13,
    "associate"    => 14,
    "bachelors"    => 16,
    "masters"      => 18,
    "phd"          => 21,
    "md"           => 21,
    ""             => ''
  }

  years_by_level.fetch(level) do
    warn "Unrecognized school years \"#{ed}\""
    ed
  end
end
# Codify which generation was the first to move: 1 you, 2 parent,
# 3 grandparent, 4 great-grandparent. Matching ignores case.
#
# A blank answer stays blank without a warning, in line with the other
# codify_* helpers in this file. Any other value triggers a warning and is
# returned unchanged.
def codify_first_to_move(ftm)
  case ftm.downcase
  when "you"              then 1
  when "parent"           then 2
  when "grandparent"      then 3
  when "greatgrandparent" then 4
  when ""                 then ''
  else
    warn "Unrecognized first_to_move \"#{ftm}\""
    ftm
  end
end
# Codify whether the answer refers to the father (1) or the stepfather (2).
# Matching ignores case. A blank string stays blank; anything else triggers
# a warning and is returned unchanged.
def codify_stepfather(step)
  role = step.downcase

  return 1 if role == "father"
  return 2 if role == "stepfather"
  return '' if role == ""

  warn "Unrecognized stepfather \"#{step}\""
  step
end
# Codify an ethnicity answer into its numeric code. Matching ignores case.
# A blank string stays blank; anything else triggers a warning and is
# returned unchanged.
def codify_ethnicity(eth)
  ethnicities = {
    "hispanic/latino"  => 1,
    "white"            => 2,
    "african american" => 3,
    "other"            => 4,
    ""                 => ''
  }

  ethnicities.fetch(eth.downcase) do
    warn "Unrecognized ethnicity \"#{eth}\""
    eth
  end
end
# Convert a contact-frequency answer into a number.
# NOTE(review): the values look like approximate contacts per year
# (364 = 7 x 52, 208 = 4 x 52, 52, 12, 6) -- confirm with the group.
#
# Matching ignores case. "1 time / mo" is accepted as another spelling of
# "1 day / mo", matching the spellings codify_involvement accepts for the
# same answers. Blank, "refused" and "na" become blank; anything else
# triggers a warning and is returned unchanged.
def codify_viewing_frequency(view)
  case view.downcase
  when "5-7 days / wk"                   then 364
  when "2-4 days / wk"                   then 208
  when "1 day / wk", "few times a month" then 52
  when "1 day / mo", "1 time / mo"       then 12
  when "few times a year"                then 6
  when "not involved"                    then 0
  when "deceased"                        then 0
  when "livesinhomecountry"              then 0
  when "", "refused", "na"               then ''
  else
    warn "Unrecognized viewing frequency \"#{view}\""
    view
  end
end
# Convert a contact-frequency answer into an involvement score from 7
# (5-7 days a week) down to 1 (lives in home country). Matching ignores
# case.
#
# Blank, "refused" and "na" become blank, matching the answers that
# codify_viewing_frequency treats as blank. Anything else triggers a
# warning and is returned unchanged.
#
# NOTE(review): "1 day / mo" and "few times a year" both score 4. This is
# kept as found; confirm with the group that the shared score is intended.
def codify_involvement(inv)
  case inv.downcase
  when "5-7 days / wk"                   then 7
  when "2-4 days / wk"                   then 6
  when "1 day / wk", "few times a month" then 5
  when "1 day / mo", "1 time / mo"       then 4
  when "few times a year"                then 4
  when "not involved"                    then 3
  when "deceased"                        then 2
  when "livesinhomecountry"              then 1
  when "", "refused", "na"               then ''
  else
    warn "Unrecognized involvement \"#{inv}\""
    inv
  end
end
# Count the lines of the input file up front; the total is handed to the
# progress bar as the amount of work to do.
num_lines = 0
File.foreach(INPUT_FILE) { num_lines += 1 }

# Create the progress bar, sized by the line count.
progress_bar = ProgressBar.new("Processing", num_lines)
# Loop through each line of the input file. | |
FasterCSV.open(OUTPUT_FILE, 'w') do |output| | |
# Write headers for output file. | |
headers = [ | |
%w{ . language }, | |
%w{ . filename }, | |
%w{ . _FACILITY }, | |
%w{ . _SUBJECT }, | |
%w{ . _FILLED_IN }, | |
%w{ 1 birth_date }, | |
%w{ . chronological_age }, | |
%w{ 2a birth_place }, | |
%w{ 2b birth_place_city }, | |
%w{ 2c birth_place_state }, | |
%w{ 3 time_in_us }, | |
%w{ . time_in_us_count }, | |
%w{ . chronological_time_in_us }, | |
%w{ 4 times_returned_to_home_country }, | |
%w{ 5a time_in_home_country }, | |
%w{ 5b time_in_home_country_count }, | |
%w{ 6a to_one_yes }, | |
%w{ 6b to_one_relation }, | |
%w{ 6c to_one_relation_text }, | |
%w{ 6d to_one_language_to_child }, | |
%w{ 6e to_one_daycare_months }, | |
%w{ 6f to_one_daycare_country }, | |
%w{ 6g to_one_daycare_city }, | |
%w{ 6h to_one_daycare_state }, | |
%w{ 6i to_one_yes_2 }, | |
%w{ 6j to_one_relation_2 }, | |
%w{ 6k to_one_relation_text_2 }, | |
%w{ 6l to_one_language_to_child_2 }, | |
%w{ 6m to_one_daycare_months_2 }, | |
%w{ 6n to_one_daycare_country_2 }, | |
%w{ 6o to_one_daycare_city_2 }, | |
%w{ 6p to_one_daycare_state_2 }, | |
] | |
[ %w{ 7 two }, | |
%w{ 8 three }, | |
%w{ 9 four }, | |
%w{ 10 five }, | |
%w{ 11 six }, | |
].each do |pair| | |
num, year = pair | |
headers.concat([ | |
["#{num}a", "to_#{year}_yes" ], | |
["#{num}b", "to_#{year}_relation" ], | |
["#{num}c", "to_#{year}_relation_text" ], | |
["#{num}d", "to_#{year}_language_to_child" ], | |
["#{num}e", "to_#{year}_language_to_caregiver" ], | |
["#{num}f", "to_#{year}_daycare_months" ], | |
["#{num}g", "to_#{year}_daycare_country" ], | |
["#{num}h", "to_#{year}_daycare_city" ], | |
["#{num}i", "to_#{year}_daycare_state" ], | |
["#{num}j", "to_#{year}_yes_2" ], | |
["#{num}k", "to_#{year}_relation_2" ], | |
["#{num}l", "to_#{year}_relation_text_2" ], | |
["#{num}m", "to_#{year}_language_to_child_2" ], | |
["#{num}n", "to_#{year}_language_to_caregiver_2" ], | |
["#{num}o", "to_#{year}_daycare_months_2" ], | |
["#{num}p", "to_#{year}_daycare_country_2" ], | |
["#{num}q", "to_#{year}_daycare_city_2" ], | |
["#{num}r", "to_#{year}_daycare_state_2" ], | |
]) | |
end | |
headers.concat([ | |
%w{ 12a lived_with_other }, | |
%w{ 12b lived_with_whom }, | |
%w{ 12c lived_with_whom_other_text }, | |
%w{ 12d to_one_daycare_country6 }, | |
%w{ 12e live_with_time }, | |
%w{ 12f lived_with_language_to_child }, | |
%w{ 12g lived_with_language_to_person }, | |
%w{ 13a early_head_start_attended }, | |
%w{ 13b early_head_start_ages_attended_start_years }, | |
%w{ 13c early_head_start_ages_attended_start_months }, | |
%w{ . early_head_start_ages_attended_start }, | |
%w{ 13d early_head_start_ages_attended_end_years }, | |
%w{ 13e early_head_start_ages_attended_end_months }, | |
%w{ . early_head_start_ages_attended_end }, | |
%w{ 13f early_head_start_language_teachers_to_child }, | |
%w{ 13g early_head_start_language_child_to_teachers }, | |
%w{ 13h early_head_start_language_assistant_to_child }, | |
%w{ 13i early_head_start_language_child_to_assistant }, | |
%w{ 13j early_head_start_language_children_to_child }, | |
%w{ 13k early_head_start_language_child_to_children }, | |
%w{ 13l head_start_attended }, | |
%w{ 13m head_start_ages_attended_start_years }, | |
%w{ 13n head_start_ages_attended_start_months }, | |
%w{ . head_start_ages_attended_start }, | |
%w{ 13o head_start_ages_attended_end_years }, | |
%w{ 13p head_start_ages_attended_end_months }, | |
%w{ . head_start_ages_attended_end }, | |
%w{ 13q head_start_language_teachers_to_child }, | |
%w{ 13r head_start_language_child_to_teachers }, | |
%w{ 13s head_start_language_assistant_to_child }, | |
%w{ 13t head_start_language_child_to_assistant }, | |
%w{ 13u head_start_language_children_to_child }, | |
%w{ 13v head_start_language_child_to_children }, | |
%w{ 13w even_start_attended }, | |
%w{ 13x even_start_ages_attended_start_years }, | |
%w{ 13y even_start_ages_attended_start_months }, | |
%w{ . even_start_ages_attended_start }, | |
%w{ 13z even_start_ages_attended_end_years }, | |
%w{ 13aa even_start_ages_attended_end_months }, | |
%w{ . even_start_ages_attended_end }, | |
%w{ 13ab even_start_language_teachers_to_child }, | |
%w{ 13ac even_start_language_child_to_teachers }, | |
%w{ 13ad even_start_language_assistant_to_child }, | |
%w{ 13ae even_start_language_child_to_assistant }, | |
%w{ 13af even_start_language_children_to_child }, | |
%w{ 13ag even_start_language_child_to_children }, | |
%w{ 13ah vpk_attended }, | |
%w{ 13ai vpk_start_ages_attended_start_years }, | |
%w{ 13aj vpk_start_ages_attended_start_months }, | |
%w{ . vpk_start_ages_attended_start }, | |
%w{ 13ak vpk_start_ages_attended_end_years }, | |
%w{ 13al vpk_start_ages_attended_end_months }, | |
%w{ . vpk_start_ages_attended_end }, | |
%w{ 13am vpk_language_teachers_to_child }, | |
%w{ 13an vpk_language_child_to_teachers }, | |
%w{ 13ao vpk_language_assistant_to_child }, | |
%w{ 13ap vpk_language_child_to_assistant }, | |
%w{ 13aq vpk_language_children_to_child }, | |
%w{ 13ar vpk_language_child_to_children }, | |
%w{ 13as other_ed_attended }, | |
%w{ 13at other_program_name }, | |
%w{ 13av other_start_ages_attended_start_years }, | |
%w{ 13ax other_start_ages_attended_start_months }, | |
%w{ . other_start_ages_attended_start }, | |
%w{ 13ay other_start_ages_attended_end_years }, | |
%w{ 13az other_start_ages_attended_end_months }, | |
%w{ . other_start_ages_attended_end }, | |
%w{ 13ba other_ed_language_teachers_to_child }, | |
%w{ 13bb other_ed_language_child_to_teachers }, | |
%w{ 13bc other_ed_language_assistant_to_child }, | |
%w{ 13bd other_ed_language_child_to_assistant }, | |
%w{ 13be other_ed_language_children_to_child }, | |
%w{ 13bf other_ed_language_child_to_children }, | |
%w{ 14a current_ed_program_1 }, | |
%w{ 14b current_ed_program_other_text1 }, | |
%w{ 14c current_ed_program_description_1 }, | |
%w{ 14d to_one_daycare_country4 }, | |
%w{ 14e current_ed_program_age_begun_1 }, | |
%w{ 14g current_ed_program_day_per_week_1 }, | |
%w{ 14h current_ed_program_hours_per_day_1 }, | |
%w{ 14i current_ed_program_language_teachers_to_child_1 }, | |
%w{ 14j current_ed_program_language_child_to_teachers_1 }, | |
%w{ 14k current_ed_program_language_assistant_to_child_1 }, | |
%w{ 14l current_ed_program_language_child_to_assistant_1 }, | |
%w{ 14m current_ed_program_language_children_to_child_1 }, | |
%w{ 14n current_ed_program_language_child_to_children_1 }, | |
%w{ 14o current_ed_program_2 }, | |
%w{ 14p current_ed_program_other_text2 }, | |
%w{ 14q current_ed_program_description_2 }, | |
%w{ 14r to_one_daycare_country5 }, | |
%w{ 14s current_ed_program_age_begun_2 }, | |
%w{ 14u current_ed_program_day_per_week_2 }, | |
%w{ 14v current_ed_program_hours_per_day_2 }, | |
%w{ 14w current_ed_program_language_teachers_to_child_2 }, | |
%w{ 14x current_ed_program_language_child_to_teachers_2 }, | |
%w{ 14y current_ed_program_language_assistant_to_child_2 }, | |
%w{ 14z current_ed_program_language_child_to_assistant_2 }, | |
%w{ 14aa current_ed_program_language_children_to_child_2 }, | |
%w{ 14ab current_ed_program_language_child_to_children_2 }, | |
%w{ 14ac current_ed_program_3 }, | |
%w{ 14ad current_ed_program_other_text3 }, | |
%w{ 14ae current_ed_program_description_3 }, | |
%w{ 14af to_one_daycare_country3 }, | |
%w{ 14ag current_ed_program_age_begun_3 }, | |
%w{ 14ai current_ed_program_day_per_week_3 }, | |
%w{ 14aj current_ed_program_hours_per_day_3 }, | |
%w{ 14ak current_ed_program_language_teachers_to_child_3 }, | |
%w{ 14al current_ed_program_language_child_to_teachers_3 }, | |
%w{ 14am current_ed_program_language_assistant_to_child_3 }, | |
%w{ 14an current_ed_program_language_child_to_assistant_3 }, | |
%w{ 14ao current_ed_program_language_children_to_child_3 }, | |
%w{ 14ap current_ed_program_language_child_to_children_3 }, | |
%w{ 15a has_trouble_hearing }, | |
%w{ 15b has_trouble_hearing_always }, | |
%w{ 15c has_trouble_hearing_infection }, | |
%w{ 15d has_trouble_hearing_noisy }, | |
%w{ 15e has_trouble_hearing_ears }, | |
%w{ 16a ear_infection }, | |
%w{ 16b ear_infection_ear }, | |
%w{ 16c ear_infection_number }, | |
%w{ 16d ear_infection_under_1_year }, | |
%w{ 16e ear_infection_1_2 }, | |
%w{ 16f ear_infection_2_3 }, | |
%w{ 16g ear_infection_3_4 }, | |
%w{ 16h ear_infection_4_5 }, | |
%w{ 16i ear_infection_5_6 }, | |
%w{ 16j ear_infection_has_had_tubes }, | |
%w{ 17 difficulty_understanding }, | |
%w{ 18a difficulty_understanding_others }, | |
%w{ 18b difficulty_understanding_father }, | |
%w{ 18c difficulty_understanding_grandfather }, | |
%w{ 18d difficulty_understanding_grandmother }, | |
%w{ 18e difficulty_understanding_brother }, | |
%w{ 18f difficulty_understanding_sister }, | |
%w{ 18g difficulty_understanding_teacher }, | |
%w{ 18h difficulty_understanding_relative }, | |
%w{ 18i difficulty_understanding_relative_text }, | |
%w{ 18j difficulty_understanding_other }, | |
%w{ 18k difficulty_understanding_other_text }, | |
%w{ 19a think_speech_problem }, | |
%w{ 19b think_speech_problem_text2 }, | |
%w{ 19c think_speech_problem_age_problem }, | |
%w{ 19d think_speech_problem_aware }, | |
%w{ 19e think_speech_problem_swaps_sounds }, | |
%w{ 20a think_language_problem }, | |
%w{ 20b think_speech_problem_text }, | |
%w{ 20c think_language_problem_age }, | |
%w{ 21a has_received_therapy }, | |
%w{ 21b has_received_therapy_time }, | |
%w{ 21c has_received_therapy_time_count }, | |
%w{ 21d has_received_therapy_agency }, | |
%w{ 22 parent_birth_date }, | |
%w{ . parent_age_months }, | |
%w{ 23a parent_birth_place }, | |
%w{ 23b parent_birth_place_city }, | |
%w{ 23c parent_birth_place_stateprovince }, | |
%w{ 24a parent_time_in_us }, | |
%w{ 24b parent_time_in_us_count }, | |
%w{ . parent_time_in_us_months }, | |
%w{ 25 parent_times_returned_to_home_country }, | |
%w{ 26a parent_time_in_home_country }, | |
%w{ 26b parent_time_in_home_country_count }, | |
%w{ . parent_time_in_home_country_months }, | |
%w{ 27a parent_work_outside_home }, | |
%w{ 27b parent_job_title }, | |
%w{ 27c parent_job_responsibilities }, | |
%w{ 27d parent_job_hours }, | |
%w{ 28a parent_schooling }, | |
%w{ . parent_years_of_schooling }, | |
%w{ 28b parent_schooling_name }, | |
%w{ 28c parent_schooling_units }, | |
%w{ 28d parent_schooling_time }, | |
%w{ 29 parent_first_to_move }, | |
%w{ . father_or_stepfather }, | |
%w{ 30a father_birth_date_options }, | |
%w{ 30b father_birth_date }, | |
%w{ . father_age }, | |
%w{ 31a father_ethnicity }, | |
%w{ 31b father_ethnicity_other_text }, | |
%w{ 32a father_birth_place_na }, | |
%w{ 32b father_birth_place }, | |
%w{ 32c father_birth_place_city }, | |
%w{ 32d father_birth_place_state }, | |
%w{ 33a father_time_in_us_na }, | |
%w{ 33b father_time_in_us }, | |
%w{ 33c father_time_in_us_count }, | |
%w{ . father_time_in_us_months }, | |
%w{ 34a father_job_outside_home }, | |
%w{ 34b father_job_title }, | |
%w{ 34c father_job_responsibilities }, | |
%w{ 34d father_job_hours }, | |
%w{ 35a father_schooling }, | |
%w{ . father_schooling_name }, | |
%w{ 35b father_schooling_units }, | |
%w{ 35c father_schooling_time }, | |
%w{ 36 father_lives_with_parent }, | |
%w{ 37a father_viewing_frequency }, | |
%w{ . father_involvement }, | |
%w{ 38a parent_speaks_english }, | |
%w{ 38b parent_speaks_spanish }, | |
%w{ 38c parent_speaks_other }, | |
%w{ 38d parent_speaks_spanish_puertorican }, | |
%w{ 38e parent_speaks_spanish_mexican }, | |
%w{ 38f parent_speaks_spanish_cuban }, | |
%w{ 38g parent_speaks_spanish_other }, | |
%w{ 38h parent_speaks_spanish_other_text }, | |
]) | |
output << headers.map { |h| h[0] } | |
output << headers.map { |h| h[1] } | |
# Process each line of the input file. | |
$input_line_number = 0 | |
FasterCSV.foreach(INPUT_FILE, :headers => true, :return_headers => true) do |input| | |
$input_line_number += 1 | |
# Increment the progress bar. | |
progress_bar.inc | |
# Just skip the header row. | |
next if input.header_row? | |
# The array where we're storing the current row of output values. | |
row = [] | |
row << input['language'] | |
row << input['filename'] | |
row << input['_FACILITY'] | |
row << input['_SUBJECT'] | |
row << datetime_from_string(input['_FILLED_IN']) | |
row << date_from_string(input['birth_date']) | |
# Calculate chronological age. | |
chronological_age = chronological_time(date_from_string(input['birth_date']), date_from_string(input['_FILLED_IN'])) | |
row << chronological_age | |
# Codify birth place. | |
row << codify_country(input['birth_place']) | |
row << input['birth_place_city'] | |
# Codify birth place state. | |
row << codify_state(input['birth_place_state']) | |
row << codify_time_in_location(input['time_in_us']) | |
row << input['time_in_us_count'] | |
if codify_time_in_location(input['time_in_us']) == 1 # from birth | |
chronological_time_in_us = chronological_age | |
else | |
chronological_time_in_us = input['time_in_us_count'] | |
end | |
row << chronological_time_in_us | |
row << input['times_returned_to_home_country'] | |
# TODO: Clarify the amount of time spent in home country. | |
row << input['time_in_home_country'] | |
row << input['time_in_home_country_count'] | |
row << codify_boolean(input['to_one_yes']) | |
row << codify_relation(input['to_one_relation']) | |
row << input['to_one_relation_text'] | |
row << codify_language_mix(input['to_one_language_to_child']) | |
row << input['to_one_daycare_months'] | |
row << codify_country(input['to_one_daycare_country']) | |
row << input['to_one_daycare_city'] | |
row << codify_state(input['to_one_daycare_state']) | |
row << codify_boolean(input['to_one_yes_2']) | |
row << codify_relation(input['to_one_relation_2']) | |
row << input['to_one_relation_text_2'] | |
row << codify_language_mix(input['to_one_language_to_child_2']) | |
row << input['to_one_daycare_months_2'] | |
row << codify_country(input['to_one_daycare_country_2']) | |
row << input['to_one_daycare_city_2'] | |
row << codify_state(input['to_one_daycare_state_2']) | |
# The year blocks below also include the "to_<year>_language_to_caregiver"
# columns, which year one lacks; that is why year one is handled separately above.
%w{two three four five six}.each do |year| | |
row << codify_boolean(input["to_#{year}_yes"]) | |
row << codify_relation(input["to_#{year}_relation"]) | |
row << input["to_#{year}_relation_text"] | |
row << codify_language_mix(input["to_#{year}_language_to_child"]) | |
row << codify_language_mix(input["to_#{year}_language_to_caregiver"]) | |
row << input["to_#{year}_daycare_months"] | |
row << codify_country(input["to_#{year}_daycare_country"]) | |
row << input["to_#{year}_daycare_city"] | |
row << codify_state(input["to_#{year}_daycare_state"]) | |
row << codify_boolean(input["to_#{year}_yes_2"]) | |
row << codify_relation(input["to_#{year}_relation_2"]) | |
row << input["to_#{year}_relation_text_2"] | |
row << codify_language_mix(input["to_#{year}_language_to_child_2"]) | |
row << codify_language_mix(input["to_#{year}_language_to_caregiver_2"]) | |
row << input["to_#{year}_daycare_months_2"] | |
row << codify_country(input["to_#{year}_daycare_country_2"]) | |
row << input["to_#{year}_daycare_city_2"] | |
row << codify_state(input["to_#{year}_daycare_state_2"]) | |
end | |
row << codify_boolean(input["lived_with_other"]) | |
row << codify_extended_relation(input["lived_with_whom"]) | |
row << input["lived_with_whom_other_text"] | |
row << codify_country(input["to_one_daycare_country6"]) | |
row << input["live_with_time"] | |
row << codify_language_mix(input["lived_with_language_to_child"]) | |
row << codify_language_mix(input["lived_with_language_to_person"]) | |
row << codify_boolean(input["early_head_start_attended"]) | |
row << input["early_head_start_ages_attended_start_years"] | |
row << input["early_head_start_ages_attended_start_months"] | |
row << (input["early_head_start_ages_attended_start_years"].to_i * 12) + input["early_head_start_ages_attended_start_months"].to_i | |
row << input["early_head_start_ages_attended_end_years"] | |
row << input["early_head_start_ages_attended_end_months"] | |
row << (input["early_head_start_ages_attended_end_years"].to_i * 12) + input["early_head_start_ages_attended_end_months"].to_i | |
row << codify_language_mix(input["early_head_start_language_teachers_to_child"]) | |
row << codify_language_mix(input["early_head_start_language_child_to_teachers"]) | |
row << codify_language_mix(input["early_head_start_language_assistant_to_child"]) | |
row << codify_language_mix(input["early_head_start_language_child_to_assistant"]) | |
row << codify_language_mix(input["early_head_start_language_children_to_child"]) | |
row << codify_language_mix(input["early_head_start_language_child_to_children"]) | |
row << codify_boolean(input["head_start_attended"]) | |
row << input["head_start_ages_attended_start_years"] | |
row << input["head_start_ages_attended_start_months"] | |
row << (input["head_start_ages_attended_start_years"].to_i * 12) + input["head_start_ages_attended_start_months"].to_i | |
row << input["head_start_ages_attended_end_years"] | |
row << input["head_start_ages_attended_end_months"] | |
row << (input["head_start_ages_attended_end_years"].to_i * 12) + input["head_start_ages_attended_end_months"].to_i | |
row << codify_language_mix(input["head_start_language_teachers_to_child"]) | |
row << codify_language_mix(input["head_start_language_child_to_teachers"]) | |
row << codify_language_mix(input["head_start_language_assistant_to_child"]) | |
row << codify_language_mix(input["head_start_language_child_to_assistant"]) | |
row << codify_language_mix(input["head_start_language_children_to_child"]) | |
row << codify_language_mix(input["head_start_language_child_to_children"]) | |
row << codify_boolean(input["even_start_attended"]) | |
row << input["even_start_ages_attended_start_years"] | |
row << input["even_start_ages_attended_start_months"] | |
row << (input["even_start_ages_attended_start_years"].to_i * 12) + input["even_start_ages_attended_start_months"].to_i | |
row << input["even_start_ages_attended_end_years"] | |
row << input["even_start_ages_attended_end_months"] | |
row << (input["even_start_ages_attended_end_years"].to_i * 12) + input["even_start_ages_attended_end_months"].to_i | |
row << codify_language_mix(input["even_start_language_teachers_to_child"]) | |
row << codify_language_mix(input["even_start_language_child_to_teachers"]) | |
row << codify_language_mix(input["even_start_language_assistant_to_child"]) | |
row << codify_language_mix(input["even_start_language_child_to_assistant"]) | |
row << codify_language_mix(input["even_start_language_children_to_child"]) | |
row << codify_language_mix(input["even_start_language_child_to_children"]) | |
row << codify_boolean(input["vpk_attended"]) | |
row << input["vpk_start_ages_attended_start_years"] | |
row << input["vpk_start_ages_attended_start_months"] | |
row << (input["vpk_start_ages_attended_start_years"].to_i * 12) + input["vpk_start_ages_attended_start_months"].to_i | |
row << input["vpk_start_ages_attended_end_years"] | |
row << input["vpk_start_ages_attended_end_months"] | |
row << (input["vpk_start_ages_attended_end_years"].to_i * 12) + input["vpk_start_ages_attended_end_months"].to_i | |
row << codify_language_mix(input["vpk_language_teachers_to_child"]) | |
row << codify_language_mix(input["vpk_language_child_to_teachers"]) | |
row << codify_language_mix(input["vpk_language_assistant_to_child"]) | |
row << codify_language_mix(input["vpk_language_child_to_assistant"]) | |
row << codify_language_mix(input["vpk_language_children_to_child"]) | |
row << codify_language_mix(input["vpk_language_child_to_children"]) | |
# "Other" education program: attendance flag, program name, start and end
# ages (raw years and months, then the combined age in months), followed by
# the six language-mix answers.
row << codify_boolean(input["other_ed_attended"])
row << input["other_program_name"]
row << input["other_start_ages_attended_start_years"]
row << input["other_start_ages_attended_start_months"]
# Combined start age in months, built from this program's own years and
# months columns (not the VPK program's months).
row << (input["other_start_ages_attended_start_years"].to_i * 12) + input["other_start_ages_attended_start_months"].to_i
row << input["other_start_ages_attended_end_years"]
row << input["other_start_ages_attended_end_months"]
# Combined end age in months, likewise from this program's own columns.
row << (input["other_start_ages_attended_end_years"].to_i * 12) + input["other_start_ages_attended_end_months"].to_i
row << codify_language_mix(input["other_ed_language_teachers_to_child"])
row << codify_language_mix(input["other_ed_language_child_to_teachers"])
row << codify_language_mix(input["other_ed_language_assistant_to_child"])
row << codify_language_mix(input["other_ed_language_child_to_assistant"])
row << codify_language_mix(input["other_ed_language_children_to_child"])
row << codify_language_mix(input["other_ed_language_child_to_children"])
row << codify_ed_program(input["current_ed_program_1"]) | |
row << input["current_ed_program_other_text1"] | |
row << input["current_ed_program_description_1"] | |
row << codify_country(input["to_one_daycare_country4"]) | |
row << (input["current_ed_program_age_begun_years_1"].to_i * 12) + input["current_ed_program_age_begun_months_1"].to_i | |
row << input["current_ed_program_day_per_week_1"] | |
row << input["current_ed_program_hours_per_day_1"] | |
row << codify_language_mix(input["current_ed_program_language_teachers_to_child_1"]) | |
row << codify_language_mix(input["current_ed_program_language_child_to_teachers_1"]) | |
row << codify_language_mix(input["current_ed_program_language_assistant_to_child_1"]) | |
row << codify_language_mix(input["current_ed_program_language_child_to_assistant_1"]) | |
row << codify_language_mix(input["current_ed_program_language_children_to_child_1"]) | |
row << codify_language_mix(input["current_ed_program_language_child_to_children_1"]) | |
row << codify_ed_program(input["current_ed_program_2"]) | |
row << input["current_ed_program_other_text2"] | |
row << input["current_ed_program_description_2"] | |
row << codify_country(input["to_one_daycare_country5"]) | |
row << (input["current_ed_program_age_begun_years_2"].to_i * 12) + input["current_ed_program_age_begun_months_2"].to_i | |
row << input["current_ed_program_day_per_week_2"] | |
row << input["current_ed_program_hours_per_day_2"] | |
row << codify_language_mix(input["current_ed_program_language_teachers_to_child_2"]) | |
row << codify_language_mix(input["current_ed_program_language_child_to_teachers_2"]) | |
row << codify_language_mix(input["current_ed_program_language_assistant_to_child_2"]) | |
row << codify_language_mix(input["current_ed_program_language_child_to_assistant_2"]) | |
row << codify_language_mix(input["current_ed_program_language_children_to_child_2"]) | |
row << codify_language_mix(input["current_ed_program_language_child_to_children_2"]) | |
row << codify_ed_program(input["current_ed_program_3"]) | |
row << input["current_ed_program_other_text3"] | |
row << input["current_ed_program_description_3"] | |
row << codify_country(input["to_one_daycare_country3"]) | |
row << (input["current_ed_program_age_begun_years_3"].to_i * 12) + input["current_ed_program_age_begun_months_3"].to_i | |
row << input["current_ed_program_day_per_week_3"] | |
row << input["current_ed_program_hours_per_day_3"] | |
row << codify_language_mix(input["current_ed_program_language_teachers_to_child_3"]) | |
row << codify_language_mix(input["current_ed_program_language_child_to_teachers_3"]) | |
row << codify_language_mix(input["current_ed_program_language_assistant_to_child_3"]) | |
row << codify_language_mix(input["current_ed_program_language_child_to_assistant_3"]) | |
row << codify_language_mix(input["current_ed_program_language_children_to_child_3"]) | |
row << codify_language_mix(input["current_ed_program_language_child_to_children_3"]) | |
row << codify_boolean(input["has_trouble_hearing"]) | |
row << codify_boolean(input["has_trouble_hearing_always"]) | |
row << codify_boolean(input["has_trouble_hearing_infection"]) | |
row << codify_boolean(input["has_trouble_hearing_noisy"]) | |
row << codify_ears(input["has_trouble_hearing_ears"]) | |
row << codify_boolean(input["ear_infection"]) | |
row << codify_ear(input["ear_infection_ear"]) | |
row << input["ear_infection_number"] | |
row << codify_boolean(input["ear_infection_under_1_year"]) | |
row << codify_boolean(input["ear_infection_1_2"]) | |
row << codify_boolean(input["ear_infection_2_3"]) | |
row << codify_boolean(input["ear_infection_3_4"]) | |
row << codify_boolean(input["ear_infection_4_5"]) | |
row << codify_boolean(input["ear_infection_5_6"]) | |
row << codify_boolean(input["ear_infection_has_had_tubes"]) | |
row << codify_boolean(input["difficulty_understanding"]) | |
row << codify_boolean(input["difficulty_understanding_others"]) | |
row << codify_boolean(input["difficulty_understanding_father"]) | |
row << codify_boolean(input["difficulty_understanding_grandfather"]) | |
row << codify_boolean(input["difficulty_understanding_grandmother"]) | |
row << codify_boolean(input["difficulty_understanding_brother"]) | |
row << codify_boolean(input["difficulty_understanding_sister"]) | |
row << codify_boolean(input["difficulty_understanding_teacher"]) | |
row << codify_boolean(input["difficulty_understanding_relative"]) | |
row << input["difficulty_understanding_relative_text"] | |
row << codify_boolean(input["difficulty_understanding_other"]) | |
row << input["difficulty_understanding_other_text"] | |
row << codify_boolean(input["think_speech_problem"]) | |
row << input["think_speech_problem_text2"] | |
row << input["think_speech_problem_age_problem"] | |
row << input["think_speech_problem_aware"] | |
row << input["think_speech_problem_swaps_sounds"] | |
row << codify_boolean(input["think_language_problem"]) | |
row << input["think_speech_problem_text"] | |
row << input["think_language_problem_age"] | |
row << codify_boolean(input["has_received_therapy"]) | |
row << input["has_received_therapy_time"] | |
row << input["has_received_therapy_time_count"] | |
row << input["has_received_therapy_agency"] | |
chronological_age = chronological_time(date_from_string(input['parent_birth_date']), date_from_string(input['_FILLED_IN'])) | |
row << input['parent_birth_date'] | |
row << chronological_age | |
row << codify_country(input['parent_birth_place']) | |
row << input['parent_birth_place_city'] | |
row << codify_state(input['parent_birth_place_stateprovince']) | |
row << input['parent_time_in_us'] | |
row << input['parent_time_in_us_count'] | |
# Append a normalized version of the parent's time in the US, chosen by the
# (case-insensitive) unit in parent_time_in_us: years are multiplied by 12,
# months pass through unchanged, weeks are integer-divided by 4, "frombirth"
# uses the parent's chronological age, and a blank unit yields a blank cell.
# Any other unit is logged and the raw count is passed through.
case input['parent_time_in_us'].downcase
when "frombirth"
  row << chronological_age
when "years"
  row << (input['parent_time_in_us_count'].to_i * 12)
when "months"
  row << input['parent_time_in_us_count']
when "weeks"
  row << (input['parent_time_in_us_count'].to_i / 4)
when ""
  row << ""
else
  warn "Unrecognized parent_time_in_us \"#{input['parent_time_in_us']}\""
  row << input['parent_time_in_us_count']
end
row << input['parent_times_returned_to_home_country'] | |
row << input['parent_time_in_home_country'] | |
row << input['parent_time_in_home_country_count'] | |
# Append a normalized version of the parent's time in the home country,
# chosen by the (case-insensitive) unit in parent_time_in_home_country:
# years * 12, months unchanged, weeks / 4 and days / 30 (both integer
# division), "frombirth" uses the parent's chronological age, and a blank
# unit yields a blank cell. Any other unit is logged and the raw count is
# passed through.
case input['parent_time_in_home_country'].downcase
when "frombirth"
  row << chronological_age
when "years"
  row << (input['parent_time_in_home_country_count'].to_i * 12)
when "months"
  row << input['parent_time_in_home_country_count']
when "weeks"
  row << (input['parent_time_in_home_country_count'].to_i / 4)
when "days"
  row << (input['parent_time_in_home_country_count'].to_i / 30)
when ""
  row << ""
else
  warn "Unrecognized parent_time_in_home_country \"#{input['parent_time_in_home_country']}\""
  row << input['parent_time_in_home_country_count']
end
row << codify_boolean(input['parent_work_outside_home']) | |
row << input['parent_job_title'] | |
row << input['parent_job_responsibilities'] | |
row << input['parent_job_hours'] | |
row << codify_schooling(input['parent_schooling']) | |
row << codify_school_years(input['parent_schooling']) | |
row << input['parent_schooling_name'] | |
row << input['parent_schooling_units'] | |
row << input['parent_schooling_time'] | |
row << codify_first_to_move(input['parent_first_to_move']) | |
row << codify_stepfather(input['father_or_stepfather']) | |
row << input['father_birth_date_options'] | |
row << input['father_birth_date'] | |
father_chronological_age = chronological_time(date_from_string(input['father_birth_date']), date_from_string(input['_FILLED_IN'])) | |
row << father_chronological_age | |
row << codify_ethnicity(input['father_ethnicity']) | |
row << input['father_ethnicity_other_text'] | |
row << input['father_birth_place_na'] | |
row << codify_country(input['father_birth_place']) | |
row << input['father_birth_place_city'] | |
row << codify_state(input['father_birth_place_state']) | |
row << codify_boolean(input['father_time_in_us_na']) | |
row << input['father_time_in_us'] | |
row << input['father_time_in_us_count'] | |
# Append a normalized version of the father's time in the US, chosen by the
# (case-insensitive) unit in father_time_in_us: years pass through unchanged,
# months are integer-divided by 12, weeks by 52, and "frombirth" uses the
# father's chronological age. A blank unit, "doesnotliveinus" or "unknown"
# yields a blank cell. Any other unit is logged and the raw count is passed
# through.
case input['father_time_in_us'].downcase
when "frombirth"
  row << father_chronological_age
when "years"
  row << input['father_time_in_us_count']
when "months"
  row << (input['father_time_in_us_count'].to_i / 12)
when "weeks"
  row << (input['father_time_in_us_count'].to_i / 52)
when "", "doesnotliveinus", "unknown"
  row << ""
else
  warn "Unrecognized father_time_in_us \"#{input['father_time_in_us']}\""
  row << input['father_time_in_us_count']
end
row << codify_boolean(input['father_job_outside_home']) | |
row << input['father_job_title'] | |
row << input['father_job_responsibilities'] | |
row << input['father_job_hours'] | |
row << codify_schooling(input['father_schooling']) | |
row << input['father_schooling_name'] | |
row << input['father_schooling_units'] | |
row << codify_school_years(input['father_schooling']) | |
row << codify_boolean(input['father_lives_with_parent']) | |
row << codify_viewing_frequency(input['father_viewing_frequency']) | |
row << codify_involvement(input['father_viewing_frequency']) | |
row << codify_boolean(input['parent_speaks_english']) | |
row << codify_boolean(input['parent_speaks_spanish']) | |
row << codify_boolean(input['parent_speaks_other']) | |
row << codify_boolean(input['parent_speaks_spanish_puertorican']) | |
row << codify_boolean(input['parent_speaks_spanish_mexican']) | |
row << codify_boolean(input['parent_speaks_spanish_cuban']) | |
row << codify_boolean(input['parent_speaks_spanish_other']) | |
row << input['parent_speaks_spanish_other_text'] | |
output << row | |
end | |
end | |
progress_bar.finish |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment