Last active
January 7, 2024 17:54
-
-
Save mathzero/25b29dff128622820401ca232587cfae to your computer and use it in GitHub Desktop.
Search algorithm to drop NAs from data set with minimum data loss
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Main function to find the 'optimal' combination of rows and columns to drop in order to | |
# maximise the number of remaining data points in a data set | |
# Search space rapidly becomes enormous as data size grows, so this random search solution will | |
# only ever be a rough approximation of the optimal data set. | |
optimiseDataset <- function(data, max_iterations = 10000) { | |
best_solution <- data | |
dims=dim(data[complete.cases(data),]) | |
orig_dimension <- dims[1]*dims[2] | |
best_dimension <- orig_dimension | |
# loop | |
progress <- txtProgressBar(min = 0, max = max_iterations, style = 3) | |
for (i in 1:max_iterations) { | |
setTxtProgressBar(progress, i) | |
set.seed(i) | |
proportions <- missing_data_proportions(data) | |
dropped_data <- random_drop(data, proportions$row_missings, proportions$col_missings) | |
dims=dim(as.matrix(dropped_data[complete.cases(dropped_data),])) | |
complete_case_dimension <- dims[1]*dims[2] | |
if (complete_case_dimension > best_dimension) { | |
best_solution <- dropped_data | |
best_dimension <- complete_case_dimension | |
} | |
} | |
print(paste0("Original data has ",orig_dimension," complete cases")) | |
print(paste0("Optimal solution has ",best_dimension," complete cases")) | |
# get complete cases version of data set | |
complete_cases=best_solution[complete.cases(best_solution),] | |
# get plot | |
h1=plotKeptDropped(data = data,complete_cases = complete_cases) | |
# return results | |
return(list(best_dimension=best_dimension, | |
best_solution=best_solution, | |
complete_cases=complete_cases, | |
heatmap=h1)) | |
} | |
# function to plot the kept/dropped columns using ComplexHeatmap | |
# Takes iunputs from the optimiseDataset function above | |
plotKeptDropped <- function(data,complete_cases){ | |
# create boolean matrix of NAs | |
dat_na=is.na(data) | |
# reorder data by NA counts in rows and columns | |
dat <- data[order(rowSums(dat_na)),order(colSums(dat_na))] | |
# get boolean inclusion vectors for rows and columns | |
rowvect=rownames(data)%in%rownames(complete_cases) | |
colvect=colnames(data)%in%colnames(complete_cases) | |
# create new create boolean matrix of NAs in reordered data set | |
dat_na_new=is.na(dat) | |
# function to colour plot | |
col_fun = circlize::colorRamp2(c(0,1), c( "grey40","white")) | |
# create plot | |
h1 <- ComplexHeatmap::Heatmap(matrix = dat_na_new, | |
col = col_fun,show_heatmap_legend = F, | |
row_split = (ifelse(rowvect,"Row included","Row removed")), | |
column_split = (ifelse(colvect,"Column included","Column removed")), | |
cluster_rows = F, | |
cluster_columns = F, | |
row_names_side = "left", | |
column_names_side = "top") | |
return(h1) | |
} | |
# Function to compute proportion of missing data for rows and columns | |
missing_data_proportions <- function(data) { | |
row_missings <- apply(data, 1, function(x) mean(is.na(x))) | |
col_missings <- apply(data, 2, function(x) mean(is.na(x))) | |
# Replace NA in proportions with a small value | |
row_missings[is.na(row_missings)] <- 0.01 | |
col_missings[is.na(col_missings)] <- 0.01 | |
list(row_missings = row_missings, col_missings = col_missings) | |
} | |
# Function to randomly drop rows and columns based on missing data proportions | |
random_drop <- function(data, row_missings, col_missings) { | |
row_dropcount <- sample(1:(nrow(data) - 1), size = 1, replace = F) | |
col_dropcount <- sample(1:(ncol(data) - 1), size = 1, replace = F) | |
rows_to_drop <- sample(1:nrow(data), size = row_dropcount, prob = row_missings, replace = TRUE) | |
cols_to_drop <- sample(1:ncol(data), size = col_dropcount, prob = col_missings, replace = TRUE) | |
data[-rows_to_drop, -cols_to_drop, drop = FALSE] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment