Last active
January 21, 2020 13:24
-
-
Save ShixiangWang/75ae36de5d1b42c3d4de79986d03e16b to your computer and use it in GitHub Desktop.
two R functions used to separate continuous variable into two group for survival analysis, then plot the result.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(survival) | |
library(survminer) | |
groupSurvival <- function(df, event="OS_IND", time="OS", var=NULL, time.limit=NULL, interval=c("open","close"), | |
method=c("quartile", "mean", "median", "percent", "custom"), percent=NULL, | |
step=20, custom_fun=NULL, group1="High", group2="Low"){ | |
#'@param df a data.frame at leaset including three column which refer to survival info and variable used to | |
#' set group. | |
#'@param event The status indicator, normally 0=alive, 1=dead. Other choices are TRUE/FALSE (TRUE = death) or 1/2 (2=death). | |
#'For interval censored data, the status indicator is 0=right censored, 1=event at time, 2=left censored, 3=interval censored. | |
#' Although unusual, the event indicator can be omitted, in which case all subjects are assumed to have an event. | |
#'@param time for right censored data, this is the follow up time. For interval data, the first argument is the starting time for the interval. | |
#'@param var a column name of df data.frame which specify the numeric column used to group. | |
#'@param time.limit a number, if any time bigger than it, the corresponding row will be ruled out from df data.frame | |
#'@param interval specify if the border should be included. | |
#'@param method a mehod used to group, you can use one of "quartile", "mean", "median", "percent", "custom". | |
#'@param percent if you choose "pecent" method, you must specify what percent you want. | |
#'@param step a number to define a step length for finding p values and time cutoff. | |
#'@param custom_fun a function object. If you choose "custom" method, you must define a function which use | |
#'values of 'var' column as input and return a list with two logical vectors responding group1 and group2, respectively. | |
#'@param group1 a character define the group name which respond to bigger values in quartile, mean, median | |
#',percent methods and the first logical vector in custom method. 'High' is default. | |
#'@param group2 a character define the group name which respond to smaller values in quartile, mean, median | |
#',percent methods and the second logical vector in custom method. 'Low' is default. | |
if(is.null(var)){ | |
stop("Please set your variable name for building groups!") | |
} | |
if(! "data.frame" %in% class(df)){ | |
stop("Be care about you input data. The df argument should be a data.frame!") | |
} | |
method=match.arg(method) | |
if(method == "percent" & is.null(percent)){ | |
stop("You choose 'percent' method but not percent values defined in argument. Please check!") | |
} | |
df1 <- df[, c(event, time, var)] | |
if(!is.null(time.limit)){ | |
df1 <- df[df[,time]<=time.limit,] | |
} | |
if (method == "custom"){ | |
if(is.null(custom_fun)){ | |
stop("You should specify a function to group 'var' column.") | |
} | |
if(!inherits(custom_fun, "function")){ | |
stop("custom_fun should be a function type.") | |
} | |
th <- custom_fun(as.numeric(df1[, var])) | |
if(typeof(th) != "list" | length(th)!=2){ | |
stop("You should store your results for grouping in a list with two logical elements!") | |
} | |
if(any(th[[1]] & th[[2]])){ | |
stop("You are mapping one record to two different groups, please check your function!") | |
} | |
df1$group <- NA | |
df1$group[th[[1]]] <- group1 | |
df1$group[th[[2]]] <- group2 | |
df2 <- subset(df1, group %in% c(group1, group2)) | |
#df2$group[df2[,var]>th2] = "High" | |
}else{ | |
sm <- summary(df1[, var]) | |
# choose threshold by method | |
if (method == "quartile"){ | |
th1 <- as.numeric(sm[2]) | |
th2 <- as.numeric(sm[5]) | |
} | |
if (method == "mean"){ | |
th1 <- th2 <- as.numeric(sm[4]) | |
} | |
if (method == "median"){ | |
th1 <- th2 <- as.numeric(sm[3]) | |
} | |
if (method == "percent"){ | |
th1 <- as.numeric(quantile(as.numeric(df1[, var]) ,percent)) | |
th2 <- as.numeric(quantile(as.numeric(df1[, var]) ,1-percent)) | |
} | |
# do groupping | |
interval <- match.arg(interval) | |
if (interval == "open"){ | |
df2 <- df1[df[, var]>th2 | df[, var] < th1 , ] | |
df2$group <- NA | |
df2$group[df2[,var]>th2] <- group1 | |
df2$group[df2[,var]<th1] <- group2 | |
}else{ | |
df2 <- df1[df[, var]>=th2 | df[, var] <= th1 , ] | |
df2$group <- NA | |
df2$group[df2[,var]>=th2] <- group1 | |
df2$group[df2[,var]<=th1] <- group2 | |
} | |
} | |
colnames(df2)[colnames(df2) == time] <- "time" | |
colnames(df2)[colnames(df2) == event] <- "event" | |
sfit <- survfit(Surv(time, event) ~ group, data=df2) | |
res <- list() | |
res$data <- df2 | |
search_cutoff <- function(mat, step=20){ | |
# mat <- os_mat | |
days <- max(na.omit(mat$time)) | |
cutoff_list <- {} | |
pval_list <- {} | |
cutoff <- list() | |
while(days > 300){ | |
if(length(table(mat$group))!=2 | sum(table(mat$group)) < 10){ | |
break | |
} | |
mat <- mat[mat$time<=days,] | |
sdf <- survdiff(Surv(time, event)~group,data = mat) | |
p.val <- 1 - pchisq(sdf$chisq, length(sdf$n) - 1) | |
cutoff_list <- c(cutoff_list, days) | |
pval_list <- c(pval_list, p.val) | |
days <- days - step | |
} | |
min_pval <- min(pval_list, na.rm = TRUE) | |
min_cutoff <- cutoff_list[which(pval_list==min_pval)] | |
cutoff$cutoff_list <- cutoff_list | |
cutoff$pval_list <- pval_list | |
cutoff$min_pval <- min_pval | |
cutoff$min_cutoff <- min_cutoff | |
return(cutoff) | |
} | |
cutoff <- search_cutoff(df2, step=step) | |
res$cutoff <- cutoff | |
return(res) | |
} | |
plot_surv <- function(os_mat, cutoff=NULL, pval=TRUE, ...){ | |
if(is.null(cutoff)){ | |
cutoff <- max(na.omit(os_mat$time)) | |
} | |
fit <- survfit(Surv(time, event) ~ group, data = os_mat[os_mat$time<=cutoff,]) | |
ggsurv <- ggsurvplot( | |
fit, # survfit object with calculated statistics. | |
data = os_mat[os_mat$time<=cutoff,], # data used to fit survival curves. | |
risk.table = FALSE, # show risk table. | |
pval = pval, # show p-value of log-rank test. | |
conf.int = FALSE, # show confidence intervals for | |
# point estimates of survival curves. | |
palette = c("red", "blue"), | |
# palette = c("#E7B800", "#2E9FDF"), | |
# xlim = c(0,5000), # present narrower X axis, but not affect | |
# survival estimates. | |
xlab = "", # customize X axis label. | |
ylab = "", | |
# break.time.by = 100, # break X axis in time intervals by 500. | |
# ggtheme = theme_light(), # customize plot and risk table with a theme. | |
risk.table.y.text.col = T,# colour risk table text annotations. | |
risk.table.height = 0.25, # the height of the risk table | |
risk.table.y.text = FALSE,# show bars instead of names in text annotations | |
# in legend of risk table. | |
ncensor.plot = FALSE, # plot the number of censored subjects at time t | |
ncensor.plot.height = 0.25, | |
# conf.int.style = "step", # customize style of confidence intervals | |
# surv.median.line = "hv", # add the median survival pointer. | |
#legend.labs = | |
# c("High expression", "Low expression"), # change legend labels. | |
#legend.title = "", | |
size = 0.5, | |
censor.size = 2, | |
... | |
) | |
ggsurv <- ggpar( | |
ggsurv, | |
font.caption = c(6, "plain", "black"), | |
font.tickslab = c(8, "plain", "black"), | |
font.legend = c(6, "plain", "black"), | |
ggtheme = theme_survminer()+theme(legend.background=element_rect(fill=NA), legend.key.height = unit(3,"mm"), | |
line = element_line(size=.1), legend.position =c(0.8, 0.9)) | |
) | |
ggsurv$plot | |
} | |
# defineGroups <- function(x){ | |
# res <- list() | |
# res[[1]] <- x > 0.4 | |
# res[[2]] <- x > -0.1 & x < 0.1 | |
# return(res) | |
# } | |
defineGroups <- function(x){ | |
res <- list() | |
res[[1]] <- x > 0.4 | |
res[[2]] <- x == 0 | |
return(res) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
谢谢啊!您的代码很有用!