Created
March 28, 2020 01:15
-
-
Save ha0ye/197de5797df1d4e2b14a8bff4fb99c95 to your computer and use it in GitHub Desktop.
pairwise overlap calculations
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
calc3 <- function(sets) | |
{ | |
sets <- check_sets(sets) | |
set_lengths <- vapply(sets, length, 0) | |
set_order <- order(set_lengths) | |
sets <- sets[set_order] | |
set_lengths <- set_lengths[set_order] | |
n_sets <- length(sets) | |
set_names <- names(sets) | |
n_overlaps <- choose(n = n_sets, k = 2) | |
symbols <- unique(do.call(c, sets)) | |
occ_mat <- vector("list", n_sets) | |
for (j in seq_len(n_sets)) | |
{ | |
occ_mat[[j]] <- symbols %in% sets[[j]] | |
} | |
vec_num_shared <- integer(length = n_overlaps) | |
vec_overlap <- numeric(length = n_overlaps) | |
vec_jaccard <- numeric(length = n_overlaps) | |
overlaps_index <- 1 | |
for (i in seq_len(n_sets - 1)) | |
{ | |
for (j in seq(i + 1, n_sets)) | |
{ | |
num_union <- sum(occ_mat[[i]] | occ_mat[[j]]) | |
num_shared <- sum(occ_mat[[i]] & occ_mat[[j]]) | |
overlap <- num_shared / set_lengths[i] | |
jaccard <- num_shared / num_union | |
vec_num_shared[overlaps_index] <- num_shared | |
vec_overlap[overlaps_index] <- overlap | |
vec_jaccard[overlaps_index] <- jaccard | |
overlaps_index <- overlaps_index + 1 | |
} | |
} | |
idx_df <- expand.grid(idx2 = seq(n_sets), idx1 = seq(n_sets)) | |
idx_df <- idx_df[idx_df$idx2 > idx_df$idx1, ] | |
result <- data.frame(name1 = set_names[idx_df[,2]], | |
name2 = set_names[idx_df[,1]], | |
num_shared = vec_num_shared, | |
overlap = vec_overlap, | |
jaccard = vec_jaccard, | |
stringsAsFactors = FALSE) | |
return(result) | |
} | |
check_sets <- function(sets) | |
{ | |
# Ensure that all sets are unique character vectors | |
sets_are_vectors <- vapply(sets, is.vector, logical(1)) | |
if (any(!sets_are_vectors)) { | |
stop("Sets must be vectors") | |
} | |
sets_are_atomic <- vapply(sets, is.atomic, logical(1)) | |
if (any(!sets_are_atomic)) { | |
stop("Sets must be atomic vectors, i.e. not lists") | |
} | |
sets <- lapply(sets, as.character) | |
is_unique <- function(x) length(unique(x)) == length(x) | |
sets_are_unique <- vapply(sets, is_unique, logical(1)) | |
if (any(!sets_are_unique)) { | |
stop("Sets must be unique, i.e. no duplicated elements") | |
} | |
invisible(sets) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment