Skip to content

Instantly share code, notes, and snippets.

@jwaage
Created January 25, 2016 11:09
Show Gist options
  • Save jwaage/5a5da5b4405ebcc3984a to your computer and use it in GitHub Desktop.
Save jwaage/5a5da5b4405ebcc3984a to your computer and use it in GitHub Desktop.
Survival analysis and kaplan meyer plots
# Welcome to the COPSAC ggplot Kaplan-Meier template. It builds nice and very customizable KM curves as shown below. Please run the two functions to load into memory, it works like a SAS macro.
# Load dependencies
library(survival)
library(ggplot2)
library(cowplot)
library(reshape2)
# Run these functions to load into memory
# Define custom function to create a survival data.frame. Credit http://www.ceb-institute.org/bbs/wp-content/uploads/2011/09/handout_ggplot2.pdf
createSurvivalFrame <- function(f.survfit){
# initialise frame variable
f.frame <- NULL
# check if more then one strata
if(length(names(f.survfit$strata)) == 0){
# create data.frame with data from survfit
f.frame <- data.frame(time=f.survfit$time, n.risk=f.survfit$n.risk, n.event=f.survfit$n.event, n.censor = f.survfit$n.censor, surv=f.survfit$surv, upper=f.survfit$upper, lower=f.survfit$lower)
# create first two rows (start at 1)
f.start <- data.frame(time=c(0, f.frame$time[1]), n.risk=c(f.survfit$n, f.survfit$n), n.event=c(0,0), n.censor=c(0,0), surv=c(1,1), upper=c(1,1), lower=c(1,1))
# add first row to dataset
f.frame <- rbind(f.start, f.frame)
# remove temporary data
rm(f.start)
}
else {
# create vector for strata identification
f.strata <- NULL
print(f.survfit$strata)
for(f.i in 1:length(f.survfit$strata)){
# add vector for one strata according to number of rows of strata
f.strata <- c(f.strata, rep(names(f.survfit$strata)[f.i], f.survfit$strata[f.i]))
}
# create data.frame with data from survfit (create column for strata)
f.frame <- data.frame(time=f.survfit$time, n.risk=f.survfit$n.risk, n.event=f.survfit$n.event, n.censor = f.survfit$n.censor, surv=f.survfit$surv, upper=f.survfit$upper, lower=f.survfit$lower, strata=factor(f.strata))
# remove temporary data
rm(f.strata)
# create first two rows (start at 1) for each strata
for(f.i in 1:length(f.survfit$strata)){
# take only subset for this strata from data
f.subset <- subset(f.frame, strata==names(f.survfit$strata)[f.i])
# create first two rows (time: 0, time of first event)
f.start <- data.frame(time=c(0, f.subset$time[1]), n.risk=rep(f.survfit[f.i]$n, 2), n.event=c(0,0), n.censor=c(0,0), surv=c(1,1), upper=c(1,1), lower=c(1,1), strata=rep(names(f.survfit$strata)[f.i], 2))
# add first two rows to dataset
f.frame <- rbind(f.start, f.frame)
# remove temporary data
rm(f.start, f.subset)
}
# reorder data
f.frame <- f.frame[order(f.frame$strata, f.frame$time), ]
# rename row.names
rownames(f.frame) <- NULL
}
# return frame
return(f.frame)
}
# Credit http://www.ceb-institute.org/bbs/wp-content/uploads/2011/09/handout_ggplot2.pdf
qplot_survival <- function(f.frame, f.CI="default", f.shape=3){
# use different plotting commands dependig whether or not strata's are given
if("strata" %in% names(f.frame) == FALSE){
# confidence intervals are drawn if not specified otherwise
if(f.CI=="default" | f.CI==TRUE ){
# create plot with 4 layers (first 3 layers only events, last layer only censored)
# hint: censoring data for multiple censoring events at timepoint are overplotted
# (unlike in plot.survfit in survival package)
ggplot(data=f.frame) + geom_step(aes(x=time, y=surv), direction="hv") + geom_step(aes(x=time, y=upper), directions="hv", linetype=2) + geom_step(aes(x=time,y=lower), direction="hv", linetype=2) + geom_point(data=subset(f.frame, n.censor==1), aes(x=time, y=surv), shape=f.shape)
}
else {
# create plot without confidence intervalls
ggplot(data=f.frame) + geom_step(aes(x=time, y=surv), direction="hv") +
geom_point(data=subset(f.frame, n.censor==1), aes(x=time, y=surv), shape=f.shape)
}
}
else {
if(f.CI=="default" | f.CI==FALSE){
# without CI
ggplot(data=f.frame, aes(group=strata, colour=strata)) + geom_step(aes(x=time, y=surv),
direction="hv") + geom_point(data=subset(f.frame, n.censor==1), aes(x=time, y=surv), shape=f.shape)
}
else {
# with CI (hint: use alpha for CI)
ggplot(data=f.frame, aes(colour=strata, group=strata)) + geom_step(aes(x=time, y=surv), direction="hv") + geom_step(aes(x=time, y=upper), directions="hv", linetype=2, alpha=0.5) +
geom_step(aes(x=time,y=lower), direction="hv", linetype=2, alpha=0.5) +
geom_point(data=subset(f.frame, n.censor==1), aes(x=time, y=surv), shape=f.shape)
}
}
}
#
## Now for a short example
#
## Example Data, lung cancer study from the survival package. Read about it with ?lung
#
#data(lung)
## Variable description:
## time - survival time in days
## status - 1=censored, 2=dead
## ph.ecog - Performance Score, ability to perform normal daily activities. Takes values 0-3 in this dataset. We will remove lvl 3 (1 obs), and model as factor.
#
#lunge <- lung[lung$ph.ecog != 3, c("time","status","ph.ecog")] # Select interesting variables, remove PS 3, save in a new dataframe called "lunge"
#lunge <- lunge[complete.cases(lunge),] # Remove missing data
#
## Research question - does performance score influence survival time?
#
## Fit a cox regession. Here you could add other variables to the model to adjust for potential confounders.
#fit <- coxph(Surv(time, status == 2) ~ factor(ph.ecog), data=lunge) # Fit the model
#summary(fit) # Hazard Ratios can be seen as "exp(coef)", PS 0 is the reference level. Model p value at the bottom.
#
## Extract HR, CI, p as text, can be used for titles in plots. Here we'll use PS=2 as an example
#resdf <- signif(coef(summary(fit)),3)
#cidf <- round(exp(confint(fit)),2)
#HRtext <- paste0("HR ", resdf[2,2], " [", cidf[2,1], "-",cidf[2,2],"], p = ", resdf[2,5])
#
## Fit a "survfit" (Kaplan-Meier) object. Only use what you want on graph as right-hand side factor (ie. only Performance Score)
#sf <- survfit(Surv(time, status == 2) ~ factor(ph.ecog), data=lunge)
#
## Use "macro" function to create plot-data
#sdf <- createSurvivalFrame(sf)
#levels(sdf$strata) # Inspect group levels
#levels(sdf$strata) <- c("2", "1", "0") # Rename group levels
#
## Optional - flip y axis to change survival rate to cumulative event rate
#sdf_flip <- sdf
#sdf_flip[,c("surv", "upper", "lower")] <- 1 - sdf_flip[,c("surv", "upper", "lower")]
#
## Plot with the above template. Can be modified with ggplot commands too, eg.
#qplot_survival(sdf) # Survival rate
#qplot_survival(sdf_flip) # Cumulative event rate
#qplot_survival(sdf_flip) + xlab("Time since diagnosis, days")
#qplot_survival(sdf_flip, f.CI=TRUE) + xlab("Time since diagnosis, days") # Add confidence interval
#
## Make a plot manually. Distinguish Performance Score strata by color.
#ggplot(sdf_flip, aes(x=time, y=surv, color=strata)) + # Define and map data
#geom_step(size=0.9) + # Adjust line thickness
#xlab("Survival time, days") + # Set x-axis label
#ylab("Cumulative risk of death") + # Set y axis label
#scale_color_manual(name = "Performance Score", values=c("darkblue", "blue", "lightblue")) + # Set legend name and colors
#theme_cowplot() + # Set base theme
#theme(legend.position=c(1,0), legend.justification=c(1,0)) + # Set legend position and "anchor point"(x,y)
#ggtitle("Lung cancer mortality") # Set a title of the plot
#
## Alternative theme (standard ggplot2)
#ggplot(sdf_flip, aes(x=time, y=surv, color=strata)) + # Define and map data
#geom_step(size=0.5) + # Adjust line thickness
#xlab("Survival time, days") + # Set x-axis label
#ylab("Cumulative risk of death") + # Set y axis label
#scale_color_manual(name = "Performance Score", values=c("red", "blue", "green")) + # Set legend name and colors
#theme_grey() + # Set base theme
#theme(legend.position=c(1,0), legend.justification=c(1,0)) + # Set legend position and "anchor point"(x,y)
#ggtitle("Lung cancer mortality") # Set a title of the plot
#
## Alternative, black and white lines with "black & white" theme. Distinguish Performance Score by line type
#ggplot(sdf_flip, aes(x=time, y=surv, linetype=strata)) + # Define and map data
#geom_step(size=0.2) + # Adjust line thickness
#xlab("Survival time, days") + # Set x-axis label
#ylab("Cumulative risk of death") + # Set y axis label
#scale_linetype_manual(name = "Performance Score", values=c("solid", "dotted", "dashed")) + # Set legend name
#theme_bw() + # Set base theme
#theme(legend.position=c(1,0), legend.justification=c(1,0), legend.background=element_blank(), legend.direction="horizontal") + # Set legend position and "anchor point"(x,y), change to horizontal
#ggtitle("Lung cancer mortality") # Set a title of the plot
#
#
#
## Make extra plot with n at risk for each group in a lower subplot
## Vector of time points, define manually
#timepoints <- c(0, 250, 500, 750, 1000)
#
## Split dataframe into groups
#nlist <- split(sdf_flip, list(sdf_flip$strata))
#getmax <- function(data, time) data[which.max(data$time[data$time <= time]),"n.risk"]
#timenleft <- as.data.frame(lapply(timepoints, function(x) sapply(nlist, getmax, x) ))
#names(timenleft) <- timepoints
#timenleft$group <- rownames(timenleft)
#plotdat <- melt(timenleft)
#names(plotdat) <- c("group", "time", "n")
#
#KMplot <- ggplot(sdf_flip, aes(x=time, y=surv, linetype=strata)) + # Define and map data
#geom_step(size=0.2) + # Adjust line thickness
#xlab("Survival time, days") + # Set x-axis label
#ylab("Cumulative risk of death") +
#scale_linetype_manual(name = "Performance Score", values=c("solid", "dotted", "dashed")) + # Set legend name
#theme(legend.position=c(1,0), legend.justification=c(1,0), legend.background=element_blank(), legend.direction="horizontal") + # Set legend position and "anchor point"(x,y), change to horizontal
#ggtitle(HRtext) # Set a title of the plot
#
#nleftplot <- ggplot(plotdat, aes(x=as.numeric(time), y=factor(group), label=n)) + geom_text() + theme_cowplot() + xlab(NULL) + ylab("Performance\nScore") + theme(axis.ticks.x=element_blank(), axis.line.x=element_blank(), axis.text.x=element_blank())
#
#plot_grid(KMplot, nleftplot, nrow=2, rel_heights=c(0.7,0.3))
#
##
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment