Protonk · December 12, 2015 06:59
diff --git a/babynames.R b/babynames.R
 #####
 #
 # Download Baby Names and flatten into single data file
 #
 #####

 # libraries required for the last step. 
 library(plyr)
 library(reshape2)

 ## Setup temp files
 # Not strictly necessary, but we don't need to retain the zip

 names.tmpdir <- tempdir()
 temp <- tempfile()

 # download and unzip
 # See http://www.ssa.gov/oact/babynames/limits.html for info 
 download.file('http://www.ssa.gov/oact/babynames/names.zip', temp)
 unzip(temp, exdir = names.tmpdir)

 # get vector of path names
 yr.name.files <- list.files(path = names.tmpdir, pattern = "[^.]*\\.txt", full.names = TRUE)

 # Read a csv from a file, adding the year from the file to 
 # a new column
 readWrap <- function(filepath) {
  
  yr.out <- read.csv(filepath, 
                     col.names = c("Name", "Sex", "Count"), 
                     header = FALSE, as.is = TRUE)
  yr.out[, "Year"] <- gsub("yob([0-9]{4})\\.txt", "\\1", basename(filepath))
  return(yr.out)
 }

 # Combine all years into a single dataframe

 us.names.df <- do.call(rbind, lapply(yr.name.files, readWrap))

 # Condense names to single row

 matchSexes <- function(x) {
  # melt and cast are two broad data handling patterns
  # think of them as the two steps in constructing a
  # pivot table
  x.melt <- melt(x, id.vars = c("Name", "Sex"), measure.vars = "Count")
  x.out <- dcast(x.melt, Name ~ Sex, sum)
  x.out[, "Name"] <- as.character(x.out[, "Name"])
  # Faster/safer than unique(x[, "Year"])
  # no easy way to extract from the passed argument
  x.out[, "Year"] <- x[, "Year"][1]
  return(x.out)
 }

 # this will take a while. You're looping over 100+ years 
 # comprising ~2 million rows

 us.names.df <- ddply(us.names.df, "Year", function(x) matchSexes(x))


 ## Cleanup from import
 # not required but still

 unlink(c(names.tmpdir, temp))
 rm(temp, yr.name.files, names.tmpdir, readWrap)
diff --git a/gender.prob.R b/gender.prob.R
 ######
 #
 # Compute incidence and ambiguity of gender in names
 #
 ######

 # plyr is pretty central to this

 library(plyr)


 # structure will look like this:

 #     Name  F   M Year
 # 1  Aaron  0 102 1880
 # 2     Ab  0   5 1880
 # 3  Abbie 71   0 1880
 # 4 Abbott  0   5 1880
 # 5   Abby  6   0 1880
 # 6    Abe  0  50 1880

 # Each name is now associated w/ a *single* row
 # so we only need to look up one "key" as it were

 # Handling years now, so we convert it to numeric 

 us.names.df[, "Year"] <- as.numeric(us.names.df[, "Year"])

 # Nate's year logic from https://gist.github.com/natematias/4743564
 # used for consistency

 nateMod <- function(x, penalty = 100) {
  out <- x
  # hahahaha That's awesome. This is
  # actually handy http://rmazing.wordpress.com/2013/01/30/the-magic-empty-bracket/
  out[] <- 1
  # unless I'm reading it wrong, there's a 
  # bug in the ruby script. Should be 1 + ...
  # 1 - ... weights old/new years more than 1960-1980
  
  # Post 1980 names shouldn't be penalized b/c we want to
  # capture m/f dynamics in new 1st gen american names
  
  # out[x > 1980] <- 1 + (1960 - x[x > 1980])/penalty
  out[x < 1960] <- 1 + (x[x < 1960] - 1980)/penalty
  return(out)
 }

 us.names.df[, "YearModifier"] <- nateMod(us.names.df[, "Year"])

 # Compute the bare proportion of female names
 # ddply not necessary, since we're not changing the mapping

 us.names.df[, "PropF"] <- with(us.names.df, F/(F + M))

 # Male is just 1 - PropF

 us.names.df[, "PropM"] <- 1 - us.names.df[, "PropF"]

 ## in practical terms very few names in a given year are ambiguous
 ## roughly 10% are not 1 or 0 and > 40% of those are 0-0.1 or 0.8-1
 ## There is still some value to retaining this information

 ## Sums up the proportion of female (male) names 

 # populate name column quickly 
 us.final.df <- data.frame(Name = sort(unique(us.names.df[, "Name"])), stringsAsFactors = FALSE)

 # MUCH faster than ddply or tapply
 us.final.df[, "SumPropF"] <- with(us.names.df, rowsum(PropF*YearModifier, Name))
 us.final.df[, "SumPropM"] <- with(us.names.df, rowsum(PropM*YearModifier, Name))


 # Counts appearances by years (used to normalize the result)
 us.final.df <- merge(us.final.df, count(us.names.df, "Name"), by = "Name")
 
 # This is a bit of spaghetti code, but it's our normalization  
 us.final.df[, "ImputedProbF"] <- us.final.df[, "SumPropF"] / us.final.df[, "freq"]
 us.final.df[, "ImputedProbM"] <- us.final.df[, "SumPropM"] / us.final.df[, "freq"]

 # I don't know if a boolean is faster to read from a csv in Ruby
 # but this allows us to reject names which have *no* other sex
 # occurrences 
 us.final.df[, "AnyFemale"] <- us.final.df[, "ImputedProbF"] > 0
 us.final.df[, "AnyMale"] <- us.final.df[, "ImputedProbM"] > 0

 # cleanup the final df
 us.final.df <- us.final.df[, c("Name", "freq", 
                               "AnyFemale", "ImputedProbF",
                               "AnyMale", "ImputedProbM")]

 names(us.final.df)[2] <- "YearsAppearing"

 # Output structure looks like:

 #        Name YearsAppearing AnyFemale ImputedProbF AnyMale ImputedProbM
 # 1     Aaban              4     FALSE            0    TRUE            1
 # 2     Aabha              1      TRUE            1   FALSE            0
 # 3     Aabid              1     FALSE            0    TRUE            1
 # 4 Aabriella              1      TRUE            1   FALSE            0
 # 5     Aadam             20     FALSE            0    TRUE            1
 # 6     Aadan              6     FALSE            0    TRUE            1
	#####
	#
	# Download Baby Names and flatten into single data file
	#
	#####

	# libraries required for the last step.
	library(plyr)
	library(reshape2)

	## Setup temp files
	# Not strictly necessary, but we don't need to retain the zip

	names.tmpdir <- tempdir()
	temp <- tempfile()

	# download and unzip
	# See http://www.ssa.gov/oact/babynames/limits.html for info
	download.file('http://www.ssa.gov/oact/babynames/names.zip', temp)
	unzip(temp, exdir = names.tmpdir)

	# get vector of path names
	yr.name.files <- list.files(path = names.tmpdir, pattern = "[^.]*\\.txt", full.names = TRUE)

	# Read a csv from a file, adding the year from the file to
	# a new column
	readWrap <- function(filepath) {

	yr.out <- read.csv(filepath,
	col.names = c("Name", "Sex", "Count"),
	header = FALSE, as.is = TRUE)
	yr.out[, "Year"] <- gsub("yob([0-9]{4})\\.txt", "\\1", basename(filepath))
	return(yr.out)
	}

	# Combine all years into a single dataframe

	us.names.df <- do.call(rbind, lapply(yr.name.files, readWrap))

	# Condense names to single row

	matchSexes <- function(x) {
	# melt and cast are two broad data handling patterns
	# think of them as the two steps in constructing a
	# pivot table
	x.melt <- melt(x, id.vars = c("Name", "Sex"), measure.vars = "Count")
	x.out <- dcast(x.melt, Name ~ Sex, sum)
	x.out[, "Name"] <- as.character(x.out[, "Name"])
	# Faster/safer than unique(x[, "Year"])
	# no easy way to extract from the passed argument
	x.out[, "Year"] <- x[, "Year"][1]
	return(x.out)
	}

	# this will take a while. You're looping over 100+ years
	# comprising ~2 million rows

	us.names.df <- ddply(us.names.df, "Year", function(x) matchSexes(x))


	## Cleanup from import
	# not required but still

	unlink(c(names.tmpdir, temp))
	rm(temp, yr.name.files, names.tmpdir, readWrap)
	######
	#
	# Compute incidence and ambiguity of gender in names
	#
	######

	# plyr is pretty central to this

	library(plyr)


	# structure will look like this:

	# Name F M Year
	# 1 Aaron 0 102 1880
	# 2 Ab 0 5 1880
	# 3 Abbie 71 0 1880
	# 4 Abbott 0 5 1880
	# 5 Abby 6 0 1880
	# 6 Abe 0 50 1880

	# Each name is now associated w/ a single row
	# so we only need to look up one "key" as it were

	# Handling years now, so we convert it to numeric

	us.names.df[, "Year"] <- as.numeric(us.names.df[, "Year"])

	# Nate's year logic from https://gist.github.com/natematias/4743564
	# used for consistency

	nateMod <- function(x, penalty = 100) {
	out <- x
	# hahahaha That's awesome. This is
	# actually handy http://rmazing.wordpress.com/2013/01/30/the-magic-empty-bracket/
	out[] <- 1
	# unless I'm reading it wrong, there's a
	# bug in the ruby script. Should be 1 + ...
	# 1 - ... weights old/new years more than 1960-1980

	# Post 1980 names shouldn't be penalized b/c we want to
	# capture m/f dynamics in new 1st gen american names

	# out[x > 1980] <- 1 + (1960 - x[x > 1980])/penalty
	out[x < 1960] <- 1 + (x[x < 1960] - 1980)/penalty
	return(out)
	}

	us.names.df[, "YearModifier"] <- nateMod(us.names.df[, "Year"])

	# Compute the bare proportion of female names
	# ddply not necessary, since we're not changing the mapping

	us.names.df[, "PropF"] <- with(us.names.df, F/(F + M))

	# Male is just 1 - PropF

	us.names.df[, "PropM"] <- 1 - us.names.df[, "PropF"]

	## in practical terms very few names in a given year are ambiguous
	## roughly 10% are not 1 or 0 and > 40% of those are 0-0.1 or 0.8-1
	## There is still some value to retaining this information

	## Sums up the proportion of female (male) names

	# populate name column quickly
	us.final.df <- data.frame(Name = sort(unique(us.names.df[, "Name"])), stringsAsFactors = FALSE)

	# MUCH faster than ddply or tapply
	us.final.df[, "SumPropF"] <- with(us.names.df, rowsum(PropF*YearModifier, Name))
	us.final.df[, "SumPropM"] <- with(us.names.df, rowsum(PropM*YearModifier, Name))


	# Counts appearances by years (used to normalize the result)
	us.final.df <- merge(us.final.df, count(us.names.df, "Name"), by = "Name")

	# This is a bit of spaghetti code, but it's our normalization
	us.final.df[, "ImputedProbF"] <- us.final.df[, "SumPropF"] / us.final.df[, "freq"]
	us.final.df[, "ImputedProbM"] <- us.final.df[, "SumPropM"] / us.final.df[, "freq"]

	# I don't know if a boolean is faster to read from a csv in Ruby
	# but this allows us to reject names which have no other sex
	# occurrences
	us.final.df[, "AnyFemale"] <- us.final.df[, "ImputedProbF"] > 0
	us.final.df[, "AnyMale"] <- us.final.df[, "ImputedProbM"] > 0

	# cleanup the final df
	us.final.df <- us.final.df[, c("Name", "freq",
	"AnyFemale", "ImputedProbF",
	"AnyMale", "ImputedProbM")]

	names(us.final.df)[2] <- "YearsAppearing"

	# Output structure looks like:

	# Name YearsAppearing AnyFemale ImputedProbF AnyMale ImputedProbM
	# 1 Aaban 4 FALSE 0 TRUE 1
	# 2 Aabha 1 TRUE 1 FALSE 0
	# 3 Aabid 1 FALSE 0 TRUE 1
	# 4 Aabriella 1 TRUE 1 FALSE 0
	# 5 Aadam 20 FALSE 0 TRUE 1
	# 6 Aadan 6 FALSE 0 TRUE 1