Created
March 23, 2022 20:04
-
-
Save tysonwepprich/c7b3debc29c46d0780d081fcebc8e84e to your computer and use it in GitHub Desktop.
Babynames Elo rating system
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Install these packages if you don't have them | |
library(dplyr) | |
library(tidyr) | |
library(ggplot2) | |
library(elo) | |
library(ggrepel) | |
'%!in%' <- function(x,y)!('%in%'(x,y)) | |
# babynames <- read.csv("data/national.csv", header = TRUE, stringsAsFactors = FALSE) # From SS admin download | |
babynames <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-03-22/babynames.csv') | |
# Choose filter for your set of names here | |
babyset <- babynames %>% | |
filter(year > 2000 & year <= 2018) %>% # choose which years to use (1880-2018) | |
group_by(name) %>% | |
summarise(numb = sum(n), | |
propF = sum(n[sex == "F"])/numb) %>% | |
filter(propF >= 0 & propF <= .8) %>% # choose sex (proportion females), here excluding names 80% or more female | |
arrange(-numb) %>% | |
mutate(usrank = row_number()) %>% | |
filter(usrank <= 400 & usrank >= 300) # choose rank (1 most popular, will change with the other filters you choose) | |
# Show your set of names to rank | |
print(sort(babyset$name)) | |
# how should pairs be presented? | |
# random seems inefficient, but start there | |
# could sort by unasked pairs, or rearrange by Elo after so many rounds | |
# bump out names with lots of losses (instead of button)? | |
# Sets up the one vs one matches | |
allpairs <- expand.grid(team.Home = unique(babyset$name), team.Visitor = unique(babyset$name)) %>% | |
filter(team.Home != team.Visitor) %>% | |
mutate_if(is.factor, as.character) %>% | |
mutate(qorder = sample(nrow(.), size = nrow(.), replace = FALSE), | |
asked = 0) %>% | |
arrange(qorder) | |
outdf <- data.frame(team.Home = NA, team.Visitor = NA, match = NA, wins.A = NA, query = NA, stringsAsFactors = FALSE) | |
graveyard <- data.frame(name = NA, match = NA, stringsAsFactors = FALSE) | |
# Run this "while" loop to make choices pop up in the console | |
# Press 0 to exit loop when you're sick of it and want to see results | |
a <- 1 | |
match <- 1 | |
while(a > 0){ | |
choice1 <- allpairs[which(allpairs$asked == 0), 1][1] | |
choice2 <- allpairs[which(allpairs$asked == 0), 2][1] | |
query <- menu(c(choice1, choice2, "Tie", "Never #1", "Never #2", "Remove both"), title="Pick the better name (press 0 to stop and rank)") | |
wins.A <- case_when(query %in% c(1, 5) ~ 1, | |
query %in% c(2, 4) ~ 0, | |
query %in% c(3, 6) ~ 0.5) | |
outdf[match, ] <- data.frame(team.Home = choice1, team.Visitor = choice2, | |
match = match, wins.A = wins.A, answer = query, stringsAsFactors = FALSE) | |
# update allpairs | |
allpairs$asked[which(allpairs$asked == 0)][1] <- 1 | |
if(query == 4){ | |
allpairs$asked[which(allpairs$team.Home == choice1 | allpairs$team.Visitor == choice1)] <- 1 | |
graveyard[(nrow(graveyard)+1), ] <- data.frame(name = choice1, match = match, stringsAsFactors = FALSE) | |
} | |
if(query == 5){ | |
allpairs$asked[which(allpairs$team.Home == choice2 | allpairs$team.Visitor == choice2)] <- 1 | |
graveyard[(nrow(graveyard)+1), ] <- data.frame(name = choice2, match = match, stringsAsFactors = FALSE) | |
} | |
if(query == 6){ | |
allpairs$asked[which(allpairs$team.Home == choice1 | allpairs$team.Visitor == choice1 | allpairs$team.Home == choice2 | allpairs$team.Visitor == choice2)] <- 1 | |
graveyard[(nrow(graveyard)+1):(nrow(graveyard)+2), ] <- data.frame(name = c(choice1, choice2), match = c(match, match), stringsAsFactors = FALSE) | |
} | |
a <- query | |
match <- match + 1 | |
print(paste(length(which(allpairs$asked == 0)), "matches to go")) | |
} | |
# Now get results | |
matchups <- outdf %>% filter(complete.cases(.)) | |
e <- elo.run(wins.A ~ team.Home + team.Visitor, data = matchups, k = 50) | |
summary(e) | |
sort(rank.teams(e)[-which(names(rank.teams(e)) %in% graveyard$name)]) # YOUR BABY'S NAME, SPOILER | |
head(as.matrix(e)) | |
str(as.data.frame(e)) | |
# final.elos(e) | |
# matplot(as.matrix(e), type = "l") | |
# Set up data to plot | |
dat <- as.data.frame(as.matrix(e)) %>% | |
mutate(match = row_number()) %>% | |
pivot_longer(-match, names_to = "name", values_to = "elo") | |
maxmatch <- max(dat$match) | |
minelo <- min(dat$elo, na.rm = TRUE) | |
labeldat <- matchups[, c("team.Home", "match")] %>% | |
bind_rows(matchups[, c("team.Visitor", "match")]) %>% | |
rowwise() %>% | |
mutate(name = ifelse(is.na(team.Home), team.Visitor, team.Home)) %>% | |
left_join(dat) %>% | |
group_by(name) %>% | |
summarise(endscore = elo[order(-match)][1], | |
xmatch = ifelse(name[1] %in% graveyard$name, max(match, na.rm = TRUE), maxmatch)) | |
labeldat$endscore[which(labeldat$name %in% graveyard$name)] <- rep(seq(minelo-110, minelo-10, by = 10), nrow(labeldat)) | |
linedat <- dat %>% | |
filter(name %!in% graveyard$name) | |
theme_set(theme_bw(base_size = 14)) | |
# Plots score over time (matches) and shows graveyard of rejected names along bottom. | |
ggplot(linedat, aes(x = match, y = elo, group = name, color = name)) + | |
geom_line() + | |
scale_x_continuous(limits = c(0, maxmatch * 1.1)) + | |
annotate(geom="text", x = maxmatch/2, y = 1400, label = "GRAVEYARD") + | |
ylab("Elo rating") + xlab("Matchup") + | |
geom_text_repel(data = labeldat, aes(x = xmatch, y = endscore, label = name), inherit.aes = FALSE, nudge_x = maxmatch * 0.01) + | |
theme(panel.grid.major = element_blank(), | |
panel.grid.minor = element_blank(), | |
legend.position = "none") + | |
ggtitle("Elo rating progression for 100 names in head-to-head matchups") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment