Last active
December 14, 2015 18:48
-
-
Save alexhanna/5131764 to your computer and use it in GitHub Desktop.
RuPaul's drag race
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(RCurl) | |
library(ggplot2) | |
library(survival) | |
## plot Kaplan-Meier curve | |
plotKM <- function(df, metric) { | |
## make age categorical | |
df$Age[df$Age < 25] <- 1 | |
df$Age[df$Age >= 25 & df$Age < 31] <- 2 | |
df$Age[df$Age >= 31] <- 3 | |
t.surv <- with(df, Surv(Start, End, Out)) | |
t.fit <- survfit(t.surv ~ df[[ metric ]]) | |
## a lot of this coming from here: http://www.ceb-institute.org/bbs/wp-content/uploads/2011/09/handout_ggplot2.pdf | |
## create strata vector | |
t.strata <- NULL | |
for(f.i in 1:length(t.fit$strata)) { | |
# add vector for one strata according to number of rows of strata | |
t.strata <- c(t.strata, rep(names(t.fit$strata)[f.i], t.fit$strata[f.i])) | |
} | |
## create a data frame from fit data | |
t.survframe <- data.frame( time = t.fit$time, strata = t.strata, surv = t.fit$surv, censor = t.fit$n.censor ) | |
## plot | |
p <- ggplot(t.survframe) | |
p <- p + geom_step(aes(time, surv, color = strata), direction="hv") | |
## use points for censors | |
p <- p + geom_point(aes(time, surv, color = strata), data = subset(t.survframe, censor == 1)) | |
if (metric == "Age") { | |
p <- p + scale_colour_discrete( | |
name="Age", | |
labels=c("< 25", "25 - 30", "> 30")) | |
} else if (metric == "Wins") { | |
p <- p + scale_colour_discrete(name = metric, labels = c("0", "1", "2", "3", "4")) | |
} else if (metric == "Lipsyncs") { | |
p <- p + scale_colour_discrete(name = metric, labels = c("0", "1", "2", "3")) | |
} | |
ggsave(p, filename = paste("km-", metric, ".png", sep = ""), width = 10, height = 8) | |
} | |
plotKM(df, "Age") | |
plotKM(df, "Wins") | |
plotKM(df, "Lipsyncs") | |
myCsv <- getURL("https://docs.google.com/spreadsheet/pub?key=0Avriupr1jL4kdEVRcklVX3J0bkJibEswbVdiZ0pnM0E&output=csv") | |
raw <- read.csv(textConnection(myCsv)) | |
currentSeason <- 5 | |
## replace NAs with 0 | |
raw[is.na(raw)] <- 0 | |
raw$CompLeft <- 0 | |
## control for the number of competitors left | |
for (i in 1:currentSeason) { | |
df.sub <- raw[which(raw$Season == i), ] | |
cl <- max(df.sub$Place) - df.sub$End | |
raw[which(raw$Season == i), ]$CompLeft <- cl | |
## fix for final round | |
if (i < currentSeason) { | |
raw[which(raw$Season == i & raw$Place <= 3),]$CompLeft <- 3 | |
} | |
} | |
## Calculate a lipsync variable such that it only counts for the person who stayed in. | |
## Outs are not counted for those who ended in the top three of each season. | |
raw$LipsyncWithoutOut <- raw$Lipsyncs | |
raw[which(raw$Out == 1 & raw$Place >= 3),]$LipsyncWithoutOut <- raw[which(raw$Out == 1 & raw$Place >= 3),]$Lipsyncs - 1 | |
## Willam was disqualified and therefore did not leave with a lipsync | |
raw[which(raw$Name == "Willam" & raw$Out == 1),]$LipsyncWithoutOut <- subset(raw, Name == "Willam" & Out == 1)$Lipsyncs | |
## restrict the data the model is trained on to first four seasons | |
df <- raw[which(raw$Season < 5),] | |
## Survival object | |
t.surv <- with(df, Surv(Start, End, Out)) | |
## time invariant models | |
t.cox1 <- coxph(t.surv ~ (Age + PlusSize + PuertoRico), df) | |
## time variant models. need to use robust to account for temporal dependence | |
t.cox2 <- coxph(t.surv ~ (Age + PlusSize + PuertoRico + Wins + Highs + Lows + Lipsyncs) + cluster(ID), df) | |
t.cox3 <- coxph(t.surv ~ (Age + PlusSize + PuertoRico + Wins + Highs + Lows + LipsyncWithoutOut) + cluster(ID), df) | |
## Prediction | |
currentEpisode <- 6 | |
## use the fifth season to predict outcomes | |
dp <- raw[which(raw$Season == 5),] | |
dp[is.na(dp)] <- 0 | |
# do the lipsync correction | |
dp$LipsyncWithoutOut <- dp$Lipsyncs | |
dp[which(dp$Out == 1 & dp$Place > 3),]$LipsyncWithoutOut <- dp[which(dp$Out == 1 & dp$Place > 3),]$Lipsyncs - 1 | |
## generate the relative risk with preferred model and merge it with the existing data | |
t.predict <- predict(t.cox2, dp, "risk", se.fit = T) | |
dp$relRisk <- t.predict$fit | |
dp$seRelRisk <- t.predict$se.fit | |
## clean up the data frame and display the ordered relative risks | |
predictions <- dp[which(dp$End == currentEpisode & dp$Out == 0),] | |
ordered <- predictions[with(predictions, order(relRisk)),] | |
ordered <- data.frame(Name = ordered$Name, relRisk = ordered$relRisk, se.relRisk = ordered$seRelRisk) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment