Skip to content

Instantly share code, notes, and snippets.

@alexhanna
Last active December 14, 2015 18:48
Show Gist options
  • Save alexhanna/5131764 to your computer and use it in GitHub Desktop.
Save alexhanna/5131764 to your computer and use it in GitHub Desktop.
RuPaul's drag race
library(RCurl)
library(ggplot2)
library(survival)
## plot Kaplan-Meier curve
plotKM <- function(df, metric) {
## make age categorical
df$Age[df$Age < 25] <- 1
df$Age[df$Age >= 25 & df$Age < 31] <- 2
df$Age[df$Age >= 31] <- 3
t.surv <- with(df, Surv(Start, End, Out))
t.fit <- survfit(t.surv ~ df[[ metric ]])
## a lot of this coming from here: http://www.ceb-institute.org/bbs/wp-content/uploads/2011/09/handout_ggplot2.pdf
## create strata vector
t.strata <- NULL
for(f.i in 1:length(t.fit$strata)) {
# add vector for one strata according to number of rows of strata
t.strata <- c(t.strata, rep(names(t.fit$strata)[f.i], t.fit$strata[f.i]))
}
## create a data frame from fit data
t.survframe <- data.frame( time = t.fit$time, strata = t.strata, surv = t.fit$surv, censor = t.fit$n.censor )
## plot
p <- ggplot(t.survframe)
p <- p + geom_step(aes(time, surv, color = strata), direction="hv")
## use points for censors
p <- p + geom_point(aes(time, surv, color = strata), data = subset(t.survframe, censor == 1))
if (metric == "Age") {
p <- p + scale_colour_discrete(
name="Age",
labels=c("< 25", "25 - 30", "> 30"))
} else if (metric == "Wins") {
p <- p + scale_colour_discrete(name = metric, labels = c("0", "1", "2", "3", "4"))
} else if (metric == "Lipsyncs") {
p <- p + scale_colour_discrete(name = metric, labels = c("0", "1", "2", "3"))
}
ggsave(p, filename = paste("km-", metric, ".png", sep = ""), width = 10, height = 8)
}
plotKM(df, "Age")
plotKM(df, "Wins")
plotKM(df, "Lipsyncs")
myCsv <- getURL("https://docs.google.com/spreadsheet/pub?key=0Avriupr1jL4kdEVRcklVX3J0bkJibEswbVdiZ0pnM0E&output=csv")
raw <- read.csv(textConnection(myCsv))
currentSeason <- 5
## replace NAs with 0
raw[is.na(raw)] <- 0
raw$CompLeft <- 0
## control for the number of competitors left
for (i in 1:currentSeason) {
df.sub <- raw[which(raw$Season == i), ]
cl <- max(df.sub$Place) - df.sub$End
raw[which(raw$Season == i), ]$CompLeft <- cl
## fix for final round
if (i < currentSeason) {
raw[which(raw$Season == i & raw$Place <= 3),]$CompLeft <- 3
}
}
## Calculate a lipsync variable such that it only counts for the person who stayed in.
## Outs are not counted for those who ended in the top three of each season.
raw$LipsyncWithoutOut <- raw$Lipsyncs
raw[which(raw$Out == 1 & raw$Place >= 3),]$LipsyncWithoutOut <- raw[which(raw$Out == 1 & raw$Place >= 3),]$Lipsyncs - 1
## Willam was disqualified and therefore did not leave with a lipsync
raw[which(raw$Name == "Willam" & raw$Out == 1),]$LipsyncWithoutOut <- subset(raw, Name == "Willam" & Out == 1)$Lipsyncs
## restrict the data the model is trained on to first four seasons
df <- raw[which(raw$Season < 5),]
## Survival object
t.surv <- with(df, Surv(Start, End, Out))
## time invariant models
t.cox1 <- coxph(t.surv ~ (Age + PlusSize + PuertoRico), df)
## time variant models. need to use robust to account for temporal dependence
t.cox2 <- coxph(t.surv ~ (Age + PlusSize + PuertoRico + Wins + Highs + Lows + Lipsyncs) + cluster(ID), df)
t.cox3 <- coxph(t.surv ~ (Age + PlusSize + PuertoRico + Wins + Highs + Lows + LipsyncWithoutOut) + cluster(ID), df)
## Prediction
currentEpisode <- 6
## use the fifth season to predict outcomes
dp <- raw[which(raw$Season == 5),]
dp[is.na(dp)] <- 0
# do the lipsync correction
dp$LipsyncWithoutOut <- dp$Lipsyncs
dp[which(dp$Out == 1 & dp$Place > 3),]$LipsyncWithoutOut <- dp[which(dp$Out == 1 & dp$Place > 3),]$Lipsyncs - 1
## generate the relative risk with preferred model and merge it with the existing data
t.predict <- predict(t.cox2, dp, "risk", se.fit = T)
dp$relRisk <- t.predict$fit
dp$seRelRisk <- t.predict$se.fit
## clean up the data frame and display the ordered relative risks
predictions <- dp[which(dp$End == currentEpisode & dp$Out == 0),]
ordered <- predictions[with(predictions, order(relRisk)),]
ordered <- data.frame(Name = ordered$Name, relRisk = ordered$relRisk, se.relRisk = ordered$seRelRisk)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment