alexhanna · December 14, 2015 18:48
diff --git a/rupaul.R b/rupaul.R
 library(RCurl)
 library(ggplot2)
 library(survival)

 ## plot Kaplan-Meier curve
 plotKM <- function(df, metric) {
 	## make age categorical
 	df$Age[df$Age < 25] <- 1
 	df$Age[df$Age >= 25 & df$Age < 31] <- 2
 	df$Age[df$Age >= 31] <- 3

 	t.surv  <- with(df, Surv(Start, End, Out))
 	t.fit   <- survfit(t.surv ~ df[[ metric ]])

 	## a lot of this coming from here: http://www.ceb-institute.org/bbs/wp-content/uploads/2011/09/handout_ggplot2.pdf
 	## create strata vector
 	t.strata <- NULL
 	for(f.i in 1:length(t.fit$strata)) {
 		# add vector for one strata according to number of rows of strata
 		t.strata <- c(t.strata, rep(names(t.fit$strata)[f.i], t.fit$strata[f.i]))
 	}

 	## create a data frame from fit data
 	t.survframe <- data.frame( time = t.fit$time, strata = t.strata, surv = t.fit$surv, censor = t.fit$n.censor )

 	## plot 
 	p <- ggplot(t.survframe)
 	p <- p + geom_step(aes(time, surv, color = strata), direction="hv")
 	
 	## use points for censors
 	p <- p + geom_point(aes(time, surv, color = strata), data = subset(t.survframe, censor == 1))

 	if (metric == "Age") {
 		p <- p + scale_colour_discrete(
                       name="Age",
                       labels=c("< 25", "25 - 30", "> 30"))
 	} else if (metric == "Wins") {
 		p <- p + scale_colour_discrete(name = metric, labels = c("0", "1", "2", "3", "4"))
 	} else if (metric == "Lipsyncs") {
 		p <- p + scale_colour_discrete(name = metric, labels = c("0", "1", "2", "3"))
 	}

 	ggsave(p, filename = paste("km-", metric, ".png", sep = ""), width = 10, height = 8)
 }

 plotKM(df, "Age")
 plotKM(df, "Wins")
 plotKM(df, "Lipsyncs")

 myCsv <- getURL("https://docs.google.com/spreadsheet/pub?key=0Avriupr1jL4kdEVRcklVX3J0bkJibEswbVdiZ0pnM0E&output=csv")
 raw   <- read.csv(textConnection(myCsv))

 currentSeason <- 5

 ## replace NAs with 0
 raw[is.na(raw)] <- 0

 raw$CompLeft <- 0

 ## control for the number of competitors left
 for (i in 1:currentSeason) {
 	df.sub <- raw[which(raw$Season == i), ]
 	cl     <- max(df.sub$Place) - df.sub$End
 	raw[which(raw$Season == i), ]$CompLeft <- cl

 	## fix for final round
 	if (i < currentSeason) {
 		raw[which(raw$Season == i & raw$Place <= 3),]$CompLeft <- 3
 	}
 }

 ## Calculate a lipsync variable such that it only counts for the person who stayed in.
 ## Outs are not counted for those who ended in the top three of each season.
 raw$LipsyncWithoutOut <- raw$Lipsyncs
 raw[which(raw$Out == 1 & raw$Place >= 3),]$LipsyncWithoutOut <- raw[which(raw$Out == 1 & raw$Place >= 3),]$Lipsyncs - 1

 ## Willam was disqualified and therefore did not leave with a lipsync
 raw[which(raw$Name == "Willam" & raw$Out == 1),]$LipsyncWithoutOut <- subset(raw, Name == "Willam" & Out == 1)$Lipsyncs

 ## restrict the data the model is trained on to first four seasons
 df <- raw[which(raw$Season < 5),]

 ## Survival object
 t.surv <- with(df, Surv(Start, End, Out))

 ## time invariant models
 t.cox1  <- coxph(t.surv ~ (Age + PlusSize + PuertoRico), df)

 ## time variant models. need to use robust to account for temporal dependence
 t.cox2  <- coxph(t.surv ~ (Age + PlusSize + PuertoRico + Wins + Highs + Lows + Lipsyncs) + cluster(ID), df) 
 t.cox3  <- coxph(t.surv ~ (Age + PlusSize + PuertoRico + Wins + Highs + Lows + LipsyncWithoutOut) + cluster(ID), df) 

 ## Prediction
 currentEpisode <- 6

 ## use the fifth season to predict outcomes
 dp <- raw[which(raw$Season == 5),]
 dp[is.na(dp)] <- 0

 # do the lipsync correction
 dp$LipsyncWithoutOut <- dp$Lipsyncs
 dp[which(dp$Out == 1 & dp$Place > 3),]$LipsyncWithoutOut <- dp[which(dp$Out == 1 & dp$Place > 3),]$Lipsyncs - 1

 ## generate the relative risk with preferred model and merge it with the existing data
 t.predict    <- predict(t.cox2, dp, "risk", se.fit = T)
 dp$relRisk   <- t.predict$fit
 dp$seRelRisk <- t.predict$se.fit

 ## clean up the data frame and display the ordered relative risks
 predictions <- dp[which(dp$End == currentEpisode & dp$Out == 0),]
 ordered     <- predictions[with(predictions, order(relRisk)),]
 ordered     <- data.frame(Name = ordered$Name, relRisk = ordered$relRisk, se.relRisk = ordered$seRelRisk)
	library(RCurl)
	library(ggplot2)
	library(survival)

	## plot Kaplan-Meier curve
	plotKM <- function(df, metric) {
	## make age categorical
	df$Age[df$Age < 25] <- 1
	df$Age[df$Age >= 25 & df$Age < 31] <- 2
	df$Age[df$Age >= 31] <- 3

	t.surv <- with(df, Surv(Start, End, Out))
	t.fit <- survfit(t.surv ~ df[[ metric ]])

	## a lot of this coming from here: http://www.ceb-institute.org/bbs/wp-content/uploads/2011/09/handout_ggplot2.pdf
	## create strata vector
	t.strata <- NULL
	for(f.i in 1:length(t.fit$strata)) {
	# add vector for one strata according to number of rows of strata
	t.strata <- c(t.strata, rep(names(t.fit$strata)[f.i], t.fit$strata[f.i]))
	}

	## create a data frame from fit data
	t.survframe <- data.frame( time = t.fit$time, strata = t.strata, surv = t.fit$surv, censor = t.fit$n.censor )

	## plot
	p <- ggplot(t.survframe)
	p <- p + geom_step(aes(time, surv, color = strata), direction="hv")

	## use points for censors
	p <- p + geom_point(aes(time, surv, color = strata), data = subset(t.survframe, censor == 1))

	if (metric == "Age") {
	p <- p + scale_colour_discrete(
	name="Age",
	labels=c("< 25", "25 - 30", "> 30"))
	} else if (metric == "Wins") {
	p <- p + scale_colour_discrete(name = metric, labels = c("0", "1", "2", "3", "4"))
	} else if (metric == "Lipsyncs") {
	p <- p + scale_colour_discrete(name = metric, labels = c("0", "1", "2", "3"))
	}

	ggsave(p, filename = paste("km-", metric, ".png", sep = ""), width = 10, height = 8)
	}

	plotKM(df, "Age")
	plotKM(df, "Wins")
	plotKM(df, "Lipsyncs")

	myCsv <- getURL("https://docs.google.com/spreadsheet/pub?key=0Avriupr1jL4kdEVRcklVX3J0bkJibEswbVdiZ0pnM0E&output=csv")
	raw <- read.csv(textConnection(myCsv))

	currentSeason <- 5

	## replace NAs with 0
	raw[is.na(raw)] <- 0

	raw$CompLeft <- 0

	## control for the number of competitors left
	for (i in 1:currentSeason) {
	df.sub <- raw[which(raw$Season == i), ]
	cl <- max(df.sub$Place) - df.sub$End
	raw[which(raw$Season == i), ]$CompLeft <- cl

	## fix for final round
	if (i < currentSeason) {
	raw[which(raw$Season == i & raw$Place <= 3),]$CompLeft <- 3
	}
	}

	## Calculate a lipsync variable such that it only counts for the person who stayed in.
	## Outs are not counted for those who ended in the top three of each season.
	raw$LipsyncWithoutOut <- raw$Lipsyncs
	raw[which(raw$Out == 1 & raw$Place >= 3),]$LipsyncWithoutOut <- raw[which(raw$Out == 1 & raw$Place >= 3),]$Lipsyncs - 1

	## Willam was disqualified and therefore did not leave with a lipsync
	raw[which(raw$Name == "Willam" & raw$Out == 1),]$LipsyncWithoutOut <- subset(raw, Name == "Willam" & Out == 1)$Lipsyncs

	## restrict the data the model is trained on to first four seasons
	df <- raw[which(raw$Season < 5),]

	## Survival object
	t.surv <- with(df, Surv(Start, End, Out))

	## time invariant models
	t.cox1 <- coxph(t.surv ~ (Age + PlusSize + PuertoRico), df)

	## time variant models. need to use robust to account for temporal dependence
	t.cox2 <- coxph(t.surv ~ (Age + PlusSize + PuertoRico + Wins + Highs + Lows + Lipsyncs) + cluster(ID), df)
	t.cox3 <- coxph(t.surv ~ (Age + PlusSize + PuertoRico + Wins + Highs + Lows + LipsyncWithoutOut) + cluster(ID), df)

	## Prediction
	currentEpisode <- 6

	## use the fifth season to predict outcomes
	dp <- raw[which(raw$Season == 5),]
	dp[is.na(dp)] <- 0

	# do the lipsync correction
	dp$LipsyncWithoutOut <- dp$Lipsyncs
	dp[which(dp$Out == 1 & dp$Place > 3),]$LipsyncWithoutOut <- dp[which(dp$Out == 1 & dp$Place > 3),]$Lipsyncs - 1

	## generate the relative risk with preferred model and merge it with the existing data
	t.predict <- predict(t.cox2, dp, "risk", se.fit = T)
	dp$relRisk <- t.predict$fit
	dp$seRelRisk <- t.predict$se.fit

	## clean up the data frame and display the ordered relative risks
	predictions <- dp[which(dp$End == currentEpisode & dp$Out == 0),]
	ordered <- predictions[with(predictions, order(relRisk)),]
	ordered <- data.frame(Name = ordered$Name, relRisk = ordered$relRisk, se.relRisk = ordered$seRelRisk)