Skip to content

Instantly share code, notes, and snippets.

@Protonk
Last active December 12, 2015 06:59
Show Gist options
  • Save Protonk/4733427 to your computer and use it in GitHub Desktop.
Save Protonk/4733427 to your computer and use it in GitHub Desktop.
Grab baby names from the SSA, convert to a single dataframe while preserving years. Compute probability of name being female/male in toto. Return a data frame without years.
#####
#
# Download Baby Names and flatten into single data file
#
#####
# libraries required for the last step.
library(plyr)
library(reshape2)
## Setup temp files
# Not strictly necessary, but we don't need to retain the zip
names.tmpdir <- tempdir()
temp <- tempfile()
# download and unzip
# See http://www.ssa.gov/oact/babynames/limits.html for info
download.file('http://www.ssa.gov/oact/babynames/names.zip', temp)
unzip(temp, exdir = names.tmpdir)
# get vector of path names
yr.name.files <- list.files(path = names.tmpdir, pattern = "[^.]*\\.txt", full.names = TRUE)
# Read a csv from a file, adding the year from the file to
# a new column
readWrap <- function(filepath) {
yr.out <- read.csv(filepath,
col.names = c("Name", "Sex", "Count"),
header = FALSE, as.is = TRUE)
yr.out[, "Year"] <- gsub("yob([0-9]{4})\\.txt", "\\1", basename(filepath))
return(yr.out)
}
# Combine all years into a single dataframe
us.names.df <- do.call(rbind, lapply(yr.name.files, readWrap))
# Condense names to single row
matchSexes <- function(x) {
# melt and cast are two broad data handling patterns
# think of them as the two steps in constructing a
# pivot table
x.melt <- melt(x, id.vars = c("Name", "Sex"), measure.vars = "Count")
x.out <- dcast(x.melt, Name ~ Sex, sum)
x.out[, "Name"] <- as.character(x.out[, "Name"])
# Faster/safer than unique(x[, "Year"])
# no easy way to extract from the passed argument
x.out[, "Year"] <- x[, "Year"][1]
return(x.out)
}
# this will take a while. You're looping over 100+ years
# comprising ~2 million rows
us.names.df <- ddply(us.names.df, "Year", function(x) matchSexes(x))
## Cleanup from import
# not required but still
unlink(c(names.tmpdir, temp))
rm(temp, yr.name.files, names.tmpdir, readWrap)
######
#
# Compute incidence and ambiguity of gender in names
#
######
# plyr is pretty central to this
library(plyr)
# structure will look like this:
# Name F M Year
# 1 Aaron 0 102 1880
# 2 Ab 0 5 1880
# 3 Abbie 71 0 1880
# 4 Abbott 0 5 1880
# 5 Abby 6 0 1880
# 6 Abe 0 50 1880
# Each name is now associated w/ a *single* row
# so we only need to look up one "key" as it were
# Handling years now, so we convert it to numeric
us.names.df[, "Year"] <- as.numeric(us.names.df[, "Year"])
# Nate's year logic from https://gist.github.com/natematias/4743564
# used for consistency
nateMod <- function(x, penalty = 100) {
out <- x
# hahahaha That's awesome. This is
# actually handy http://rmazing.wordpress.com/2013/01/30/the-magic-empty-bracket/
out[] <- 1
# unless I'm reading it wrong, there's a
# bug in the ruby script. Should be 1 + ...
# 1 - ... weights old/new years more than 1960-1980
# Post 1980 names shouldn't be penalized b/c we want to
# capture m/f dynamics in new 1st gen american names
# out[x > 1980] <- 1 + (1960 - x[x > 1980])/penalty
out[x < 1960] <- 1 + (x[x < 1960] - 1980)/penalty
return(out)
}
us.names.df[, "YearModifier"] <- nateMod(us.names.df[, "Year"])
# Compute the bare proportion of female names
# ddply not necessary, since we're not changing the mapping
us.names.df[, "PropF"] <- with(us.names.df, F/(F + M))
# Male is just 1 - PropF
us.names.df[, "PropM"] <- 1 - us.names.df[, "PropF"]
## in practical terms very few names in a given year are ambiguous
## roughly 10% are not 1 or 0 and > 40% of those are 0-0.1 or 0.8-1
## There is still some value to retaining this information
## Sums up the proportion of female (male) names
# populate name column quickly
us.final.df <- data.frame(Name = sort(unique(us.names.df[, "Name"])), stringsAsFactors = FALSE)
# MUCH faster than ddply or tapply
us.final.df[, "SumPropF"] <- with(us.names.df, rowsum(PropF*YearModifier, Name))
us.final.df[, "SumPropM"] <- with(us.names.df, rowsum(PropM*YearModifier, Name))
# Counts appearances by years (used to normalize the result)
us.final.df <- merge(us.final.df, count(us.names.df, "Name"), by = "Name")
# This is a bit of spaghetti code, but it's our normalization
us.final.df[, "ImputedProbF"] <- us.final.df[, "SumPropF"] / us.final.df[, "freq"]
us.final.df[, "ImputedProbM"] <- us.final.df[, "SumPropM"] / us.final.df[, "freq"]
# I don't know if a boolean is faster to read from a csv in Ruby
# but this allows us to reject names which have *no* other sex
# occurrences
us.final.df[, "AnyFemale"] <- us.final.df[, "ImputedProbF"] > 0
us.final.df[, "AnyMale"] <- us.final.df[, "ImputedProbM"] > 0
# cleanup the final df
us.final.df <- us.final.df[, c("Name", "freq",
"AnyFemale", "ImputedProbF",
"AnyMale", "ImputedProbM")]
names(us.final.df)[2] <- "YearsAppearing"
# Output structure looks like:
# Name YearsAppearing AnyFemale ImputedProbF AnyMale ImputedProbM
# 1 Aaban 4 FALSE 0 TRUE 1
# 2 Aabha 1 TRUE 1 FALSE 0
# 3 Aabid 1 FALSE 0 TRUE 1
# 4 Aabriella 1 TRUE 1 FALSE 0
# 5 Aadam 20 FALSE 0 TRUE 1
# 6 Aadan 6 FALSE 0 TRUE 1
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment