epijim · September 9, 2022 09:38
diff --git a/score_cindex.R b/score_cindex.R
 score_cindex <- function(
    zip_path, # zip file path
    truth # in memory R object
 ){

  library("pec") # pec functions don't work without the lib loaded :(
  
  files_present <- zip::zip_list(zip_path)$filename
  
  # Check files present
  if (!"test_features.parquet" %in% files_present) stop("test_features.parquet missing from zip file")
  if (!"training_features.parquet" %in% files_present) stop("training_features.parquet missing from zip file")
  
  # Read files
  # save to temp dir
  zip::unzip(zip_path,exdir = tempdir())
  # get data
  test_features <- arrow::read_parquet(file.path(tempdir(),
                                                 "test_features.parquet"))
  train_features <- arrow::read_parquet(file.path(tempdir(),
                                                  "training_features.parquet"))
  
  
  # Checks on data
  error_col_names <- function(expected_names, data){
    observed_names <- sort(names(data))
    !identical(observed_names,sort(expected_names))
  }
  
  if (error_col_names(
    c(paste0("feature",1:10),"died","days_to_event"),
    train_features
  )) stop("training_features.parquet has wrong columns")
  
  if (error_col_names(
    c(paste0("feature",1:10),"patientid"),
    test_features
  )) stop("test_features.parquet has wrong columns")
  
  if (nrow(train_features) > 100000) stop("training features has more than 10,000 rows")
  
  # impute
  # Ideally teams do not rely on this... but if they have, add median and mode
  mode_stat <- function(x) {
    ux <- unique(x)
    ux[which.max(tabulate(match(x, ux)))]
  }
  
  train_features <- train_features %>% 
    dplyr::mutate_if(
      is.numeric, 
      function(x) ifelse(is.na(x), median(x, na.rm = T), x)
      ) %>% 
    dplyr::mutate_if(
      is.character, 
      function(x) ifelse(is.na(x), mode_stat(x), x)
    )
  
  test_features <- test_features %>% 
    dplyr::mutate_if(
      is.numeric,
      function(x) ifelse(is.na(x), median(x, na.rm = T), x)
      )  %>% 
    dplyr::mutate_if(
      is.character, 
      function(x) ifelse(is.na(x), mode_stat(x), x)
    )
  
  # check no missing
  if (sum(is.na(train_features)) != 0) stop("Missing data detected in train")
  if (sum(is.na(test_features)) != 0) stop("Missing data detected in test")
  
  # Fit model
  fitted_model <- survival::coxph(
    survival::Surv(days_to_event, died) ~ 
      feature1 + feature2 + feature3 + feature4 + feature5 +
      feature6 + feature7 + feature8 + feature9 + feature10,
    data = train_features,
    x = TRUE,y = TRUE
  )  
  
  # Score model
  result <- pec::cindex(
    list("Fitted model" = fitted_model),
    formula = Surv(days_to_event, died) ~ 
      feature1 + feature2 + feature3 + feature4 + feature5 +
      feature6 + feature7 + feature8 + feature9 + feature10,
    data = test_features %>%
      mutate(patientid = as.character(patientid)) %>%
      left_join(
        truth, 
        by = "patientid"
      ),
    cens.model = "marginal"
  )
  
  # results
  tibble(
    `c-index` = result$AppCindex[[1]],
    `pairs` = result$Pairs[[1]],
    `concordant` = paste0(result$Concordant,
                          " (",round(100*result$Concordant[[1]]/result$Pairs[[1]]),"%)")
  )
  
 }
	score_cindex <- function(
	zip_path, # zip file path
	truth # in memory R object
	){

	library("pec") # pec functions don't work without the lib loaded :(

	files_present <- zip::zip_list(zip_path)$filename

	# Check files present
	if (!"test_features.parquet" %in% files_present) stop("test_features.parquet missing from zip file")
	if (!"training_features.parquet" %in% files_present) stop("training_features.parquet missing from zip file")

	# Read files
	# save to temp dir
	zip::unzip(zip_path,exdir = tempdir())
	# get data
	test_features <- arrow::read_parquet(file.path(tempdir(),
	"test_features.parquet"))
	train_features <- arrow::read_parquet(file.path(tempdir(),
	"training_features.parquet"))


	# Checks on data
	error_col_names <- function(expected_names, data){
	observed_names <- sort(names(data))
	!identical(observed_names,sort(expected_names))
	}

	if (error_col_names(
	c(paste0("feature",1:10),"died","days_to_event"),
	train_features
	)) stop("training_features.parquet has wrong columns")

	if (error_col_names(
	c(paste0("feature",1:10),"patientid"),
	test_features
	)) stop("test_features.parquet has wrong columns")

	if (nrow(train_features) > 100000) stop("training features has more than 10,000 rows")

	# impute
	# Ideally teams do not rely on this... but if they have, add median and mode
	mode_stat <- function(x) {
	ux <- unique(x)
	ux[which.max(tabulate(match(x, ux)))]
	}

	train_features <- train_features %>%
	dplyr::mutate_if(
	is.numeric,
	function(x) ifelse(is.na(x), median(x, na.rm = T), x)
	) %>%
	dplyr::mutate_if(
	is.character,
	function(x) ifelse(is.na(x), mode_stat(x), x)
	)

	test_features <- test_features %>%
	dplyr::mutate_if(
	is.numeric,
	function(x) ifelse(is.na(x), median(x, na.rm = T), x)
	) %>%
	dplyr::mutate_if(
	is.character,
	function(x) ifelse(is.na(x), mode_stat(x), x)
	)

	# check no missing
	if (sum(is.na(train_features)) != 0) stop("Missing data detected in train")
	if (sum(is.na(test_features)) != 0) stop("Missing data detected in test")

	# Fit model
	fitted_model <- survival::coxph(
	survival::Surv(days_to_event, died) ~
	feature1 + feature2 + feature3 + feature4 + feature5 +
	feature6 + feature7 + feature8 + feature9 + feature10,
	data = train_features,
	x = TRUE,y = TRUE
	)

	# Score model
	result <- pec::cindex(
	list("Fitted model" = fitted_model),
	formula = Surv(days_to_event, died) ~
	feature1 + feature2 + feature3 + feature4 + feature5 +
	feature6 + feature7 + feature8 + feature9 + feature10,
	data = test_features %>%
	mutate(patientid = as.character(patientid)) %>%
	left_join(
	truth,
	by = "patientid"
	),
	cens.model = "marginal"
	)

	# results
	tibble(
	`c-index` = result$AppCindex[[1]],
	`pairs` = result$Pairs[[1]],
	`concordant` = paste0(result$Concordant,
	" (",round(100*result$Concordant[[1]]/result$Pairs[[1]]),"%)")
	)

	}