Skip to content

Instantly share code, notes, and snippets.

@epijim
Last active September 9, 2022 09:38
Show Gist options
  • Save epijim/339f5f37b1b9ec314ec78b005d8bb9ac to your computer and use it in GitHub Desktop.
Save epijim/339f5f37b1b9ec314ec78b005d8bb9ac to your computer and use it in GitHub Desktop.
score_cindex <- function(
zip_path, # zip file path
truth # in memory R object
){
library("pec") # pec functions don't work without the lib loaded :(
files_present <- zip::zip_list(zip_path)$filename
# Check files present
if (!"test_features.parquet" %in% files_present) stop("test_features.parquet missing from zip file")
if (!"training_features.parquet" %in% files_present) stop("training_features.parquet missing from zip file")
# Read files
# save to temp dir
zip::unzip(zip_path,exdir = tempdir())
# get data
test_features <- arrow::read_parquet(file.path(tempdir(),
"test_features.parquet"))
train_features <- arrow::read_parquet(file.path(tempdir(),
"training_features.parquet"))
# Checks on data
error_col_names <- function(expected_names, data){
observed_names <- sort(names(data))
!identical(observed_names,sort(expected_names))
}
if (error_col_names(
c(paste0("feature",1:10),"died","days_to_event"),
train_features
)) stop("training_features.parquet has wrong columns")
if (error_col_names(
c(paste0("feature",1:10),"patientid"),
test_features
)) stop("test_features.parquet has wrong columns")
if (nrow(train_features) > 100000) stop("training features has more than 10,000 rows")
# impute
# Ideally teams do not rely on this... but if they have, add median and mode
mode_stat <- function(x) {
ux <- unique(x)
ux[which.max(tabulate(match(x, ux)))]
}
train_features <- train_features %>%
dplyr::mutate_if(
is.numeric,
function(x) ifelse(is.na(x), median(x, na.rm = T), x)
) %>%
dplyr::mutate_if(
is.character,
function(x) ifelse(is.na(x), mode_stat(x), x)
)
test_features <- test_features %>%
dplyr::mutate_if(
is.numeric,
function(x) ifelse(is.na(x), median(x, na.rm = T), x)
) %>%
dplyr::mutate_if(
is.character,
function(x) ifelse(is.na(x), mode_stat(x), x)
)
# check no missing
if (sum(is.na(train_features)) != 0) stop("Missing data detected in train")
if (sum(is.na(test_features)) != 0) stop("Missing data detected in test")
# Fit model
fitted_model <- survival::coxph(
survival::Surv(days_to_event, died) ~
feature1 + feature2 + feature3 + feature4 + feature5 +
feature6 + feature7 + feature8 + feature9 + feature10,
data = train_features,
x = TRUE,y = TRUE
)
# Score model
result <- pec::cindex(
list("Fitted model" = fitted_model),
formula = Surv(days_to_event, died) ~
feature1 + feature2 + feature3 + feature4 + feature5 +
feature6 + feature7 + feature8 + feature9 + feature10,
data = test_features %>%
mutate(patientid = as.character(patientid)) %>%
left_join(
truth,
by = "patientid"
),
cens.model = "marginal"
)
# results
tibble(
`c-index` = result$AppCindex[[1]],
`pairs` = result$Pairs[[1]],
`concordant` = paste0(result$Concordant,
" (",round(100*result$Concordant[[1]]/result$Pairs[[1]]),"%)")
)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment