Created
May 13, 2014 16:33
-
-
Save malev/8598692a40023a1ef422 to your computer and use it in GitHub Desktop.
Clean Uruguay names & gender dataset
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# encoding: UTF-8
require 'csv'

# Path of the input CSV: the first command-line argument when one is given,
# otherwise the default dataset file name (a relative path).
filename = ARGV.fetch(0, 'nombre_nacim_por_anio_y_sexo.csv')
# A given name together with the number of male and female occurrences
# recorded for it.
#
# Two Name objects are equal when both their formatted name and their
# translated gender match; they are ordered by name only.
#
# NOTE(review): because #== also compares gender, merging equal objects can
# never produce a Name whose male and female counts are both non-zero --
# confirm that one output row per (name, gender) pair is intended.
class Name
  # Year that #year_appearing subtracts the stored year from.
  REFERENCE_YEAR = 2014

  attr_reader :name, :gender, :male_count, :female_count, :year

  # Returns true when +name+ is usable as a name: neither nil/false nor a
  # blank (empty or whitespace-only) string.
  def self.valid?(name)
    !!name && !name.to_s.strip.empty?
  end

  # name   - raw name string; it is stripped and capitalized (see #format).
  # gender - raw gender label; see #translate for the recognised values.
  # freq   - occurrence count; converted with #to_i and credited to the
  #          counter matching the translated gender (to neither counter when
  #          the gender is 'unknown').
  # year   - stored as-is and used by #year_appearing.
  def initialize(name, gender, freq, year = 1960)
    @year = year
    @name = format(name)
    @gender = translate(gender)
    count = freq.to_i
    @male_count = male? ? count : 0
    @female_count = female? ? count : 0
  end

  # Returns a stripped copy of +cad+ with the first character upper-cased and
  # the rest lower-cased. An empty or blank string yields "" instead of
  # raising. (This method shadows Kernel#format inside the class.)
  def format(cad)
    cad.strip.capitalize
  end

  # Maps the Spanish labels 'femenino' / 'masculino' (ignoring case and
  # surrounding whitespace) to 'female' / 'male'; anything else, including
  # nil, becomes 'unknown'.
  def translate(str)
    case str.to_s.strip.downcase
    when 'femenino' then 'female'
    when 'masculino' then 'male'
    else 'unknown'
    end
  end

  def female?
    gender == 'female'
  end

  def male?
    gender == 'male'
  end

  # Equal when +other+ is a Name with the same name and gender; any other
  # object (nil included) is simply not equal.
  def ==(other)
    other.is_a?(Name) && @name == other.name && @gender == other.gender
  end

  # Adds the counts of +other+ to this object's counts.
  def merge(other)
    @male_count += other.male_count
    @female_count += other.female_count
  end

  # Orders by name only; returns nil when +other+ is not a Name.
  def <=>(other)
    other.is_a?(Name) ? @name <=> other.name : nil
  end

  # Human-readable summary: name, gender and the total recorded count.
  def to_s
    "<Name: #{@name} | #{@gender} | #{male_count + female_count}>"
  end

  # One output row. Column order:
  # Name,years.appearing,count.male,count.female,prob.gender,obs.male,est.male,upper,lower
  # The last four columns are always left empty (nil).
  def to_row
    [@name, year_appearing, male_count, female_count, prob_gender, nil, nil, nil, nil]
  end

  # REFERENCE_YEAR minus the stored year.
  def year_appearing
    REFERENCE_YEAR - year
  end

  # 'male' or 'female' according to the larger count, 'unknown' on a tie.
  def prob_gender
    case male_count <=> female_count
    when 1 then 'male'
    when -1 then 'female'
    else 'unknown'
    end
  end
end
# Read the input CSV and collapse rows that describe the same name and gender
# into a single Name, summing their counts.
#
# Names are indexed by [name, gender] -- the same pair Name#== compares -- so
# each lookup is a hash access instead of a scan over every name seen so far.
# Hash insertion order keeps the names in first-seen order.
#
# NOTE(review): no header handling is configured here, so a header row, if
# the file has one, would be treated as a name -- confirm against the data.
names_by_key = {}
CSV.foreach(filename, "r:windows-1252") do |row|
  # Columns used: row[1] = gender, row[2] = name, row[3] = count.
  next unless Name.valid?(row[2])

  new_name = Name.new(row[2], row[1], row[3])
  key = [new_name.name, new_name.gender]
  if (old_name = names_by_key[key])
    old_name.merge(new_name)
  else
    names_by_key[key] = new_name
  end
end
names = names_by_key.values

# Write one row per merged name, ordered by Name#<=>.
CSV.open("output_monte.csv", "w:iso-8859-1") do |csv|
  names.sort.each { |name| csv << name.to_row }
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment