Last active
September 9, 2022 09:38
-
-
Save epijim/339f5f37b1b9ec314ec78b005d8bb9ac to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
score_cindex <- function( | |
zip_path, # zip file path | |
truth # in memory R object | |
){ | |
library("pec") # pec functions don't work without the lib loaded :( | |
files_present <- zip::zip_list(zip_path)$filename | |
# Check files present | |
if (!"test_features.parquet" %in% files_present) stop("test_features.parquet missing from zip file") | |
if (!"training_features.parquet" %in% files_present) stop("training_features.parquet missing from zip file") | |
# Read files | |
# save to temp dir | |
zip::unzip(zip_path,exdir = tempdir()) | |
# get data | |
test_features <- arrow::read_parquet(file.path(tempdir(), | |
"test_features.parquet")) | |
train_features <- arrow::read_parquet(file.path(tempdir(), | |
"training_features.parquet")) | |
# Checks on data | |
error_col_names <- function(expected_names, data){ | |
observed_names <- sort(names(data)) | |
!identical(observed_names,sort(expected_names)) | |
} | |
if (error_col_names( | |
c(paste0("feature",1:10),"died","days_to_event"), | |
train_features | |
)) stop("training_features.parquet has wrong columns") | |
if (error_col_names( | |
c(paste0("feature",1:10),"patientid"), | |
test_features | |
)) stop("test_features.parquet has wrong columns") | |
if (nrow(train_features) > 100000) stop("training features has more than 10,000 rows") | |
# impute | |
# Ideally teams do not rely on this... but if they have, add median and mode | |
mode_stat <- function(x) { | |
ux <- unique(x) | |
ux[which.max(tabulate(match(x, ux)))] | |
} | |
train_features <- train_features %>% | |
dplyr::mutate_if( | |
is.numeric, | |
function(x) ifelse(is.na(x), median(x, na.rm = T), x) | |
) %>% | |
dplyr::mutate_if( | |
is.character, | |
function(x) ifelse(is.na(x), mode_stat(x), x) | |
) | |
test_features <- test_features %>% | |
dplyr::mutate_if( | |
is.numeric, | |
function(x) ifelse(is.na(x), median(x, na.rm = T), x) | |
) %>% | |
dplyr::mutate_if( | |
is.character, | |
function(x) ifelse(is.na(x), mode_stat(x), x) | |
) | |
# check no missing | |
if (sum(is.na(train_features)) != 0) stop("Missing data detected in train") | |
if (sum(is.na(test_features)) != 0) stop("Missing data detected in test") | |
# Fit model | |
fitted_model <- survival::coxph( | |
survival::Surv(days_to_event, died) ~ | |
feature1 + feature2 + feature3 + feature4 + feature5 + | |
feature6 + feature7 + feature8 + feature9 + feature10, | |
data = train_features, | |
x = TRUE,y = TRUE | |
) | |
# Score model | |
result <- pec::cindex( | |
list("Fitted model" = fitted_model), | |
formula = Surv(days_to_event, died) ~ | |
feature1 + feature2 + feature3 + feature4 + feature5 + | |
feature6 + feature7 + feature8 + feature9 + feature10, | |
data = test_features %>% | |
mutate(patientid = as.character(patientid)) %>% | |
left_join( | |
truth, | |
by = "patientid" | |
), | |
cens.model = "marginal" | |
) | |
# results | |
tibble( | |
`c-index` = result$AppCindex[[1]], | |
`pairs` = result$Pairs[[1]], | |
`concordant` = paste0(result$Concordant, | |
" (",round(100*result$Concordant[[1]]/result$Pairs[[1]]),"%)") | |
) | |
} | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment