|
library(data.table) |
|
library(tidyverse) |
|
library(tidygraph) |
|
library(ggraph) |
|
library(ggpubr) |
|
|
|
load("~/Downloads/pfam_all_objects.RData") |
|
# Read PFAM SSN ids and cluster sizes |
|
file <- fread("~/PFAM_components/cl_name_rep_pfam_ids_20805_cons.txt", header = FALSE, verbose = TRUE) |
|
f_size <- fread("~/PFAM_components/class_count.tsv", header = FALSE, verbose = TRUE) |
|
|
|
# Calculate mean and median |
|
f <- file %>% group_by(V1) %>% summarise(m = median(V4), m1 = mean(V4)) |
|
|
|
# Check PFAM clusters size distribution |
|
f_size %>% |
|
gghistogram(x = "V2", |
|
add = "none", rug = FALSE, |
|
bins = nclass.Sturges(.$V2), |
|
fill = "#4A636A", color = "#4A636A", alpha = 0.8) + |
|
theme_light() + |
|
scale_x_sqrt(labels = scales::comma, breaks = scales::trans_breaks("sqrt", function(x) x^2)) + |
|
scale_y_sqrt(labels = scales::comma, breaks = scales::trans_breaks("sqrt", function(x) x^2)) + |
|
ylab("Counts") + |
|
xlab("Number of ORF clusters") + |
|
theme(legend.position = "none") |
|
|
|
|
|
# Check where is the threshold we are going to use |
|
# We are using 2*MAD |
|
|
|
estimate_mode <- function(x) { |
|
d <- density(x) |
|
d$x[which.max(d$y)] |
|
} |
|
|
|
get_comp_weights <- function(X, G){ |
|
l <- G %>% |
|
activate(nodes) %>% |
|
filter(component == X) %>% |
|
activate(edges) %>% |
|
as_tibble() %>% |
|
select(weight) %>% |
|
summarise(mean = mean(weight), median = median(weight), n = n()) %>% |
|
mutate(comp = X) |
|
return(l) |
|
} |
|
|
|
use_louvain <- function(X, G) { |
|
dc <- G %>% |
|
activate(nodes) %>% |
|
filter(component == X) %>% |
|
mutate(group = paste(X,as.factor(group_louvain()), sep = "_")) |
|
return(dc) |
|
} |
|
|
|
gghistogram(f, x = "m", |
|
add = "none", rug = FALSE, |
|
bins = nclass.scott(f$m), |
|
color = "#4A636A", fill = "#4A636A", alpha = 0.8) + |
|
geom_vline(xintercept=median(f$m), color="red", linetype="dashed", size = 1) + |
|
geom_vline(xintercept=estimate_mode(f$m), color="#D6B82C", linetype="dashed", size = 1) + |
|
geom_vline(xintercept=median(f$m)-(2*mad(f$m, constant = 1)), color="#D6782C", linetype="dashed", size = 1) + |
|
theme_light() + |
|
scale_x_continuous(labels = scales::percent) + |
|
scale_y_continuous(labels = scales::comma) + |
|
ylab("Counts") + |
|
xlab("Average similarity") |
|
|
|
#cut_t <- median(f$m)-(2*mad(f$m, constant = 1)) |
|
cut_t <- estimate_mode(f$m) |
|
# Read GU and EU SSN |
|
eu_ssn <- fread("~/PFAM_components/ssnEU3.out", header = FALSE) |
|
eu_ids <- fread("~/PFAM_components/eu_ids_para_cons.tsv", header = FALSE) |
|
gu_ssn <- fread("~/PFAM_components/ssnGU3.out", header = FALSE) |
|
gu_ids <- fread("~/PFAM_components/gu_ids_para_cons.tsv", header = FALSE) |
|
kwp_ssn <- fread("~/PFAM_components/ssnKWP3.out", header = FALSE) |
|
kwp_ids <- fread("~/PFAM_components/kwp_ids_para_cons.tsv", header = FALSE) |
|
|
|
|
|
|
|
# Create the EU SSN |
|
g_eu <- eu_ssn %>% |
|
select(V1,V2,V4) %>% |
|
rename(node1 = V1, node2 = V2, weight = V4) %>% |
|
tbl_df() %>% |
|
igraph::graph_from_data_frame(directed = FALSE) %>% |
|
igraph::simplify() %>% |
|
as_tbl_graph() %>% |
|
activate(nodes) %>% |
|
left_join(eu_ids %>% rename(name = V2, cl_name = V1) %>% |
|
mutate(name = name - 1, name = as.character(name)), by = "name") |
|
|
|
# Filter edges |
|
g_c_eu <- g_eu %>% |
|
activate(edges) %>% |
|
#filter(weight >= estimate_mode(f$m)) %>% |
|
filter(weight >= cut_t) %>% |
|
activate(nodes) %>% |
|
mutate(component = group_components()) |
|
|
|
# Count components |
|
eu_d_c <- g_c_eu %>% |
|
as.tibble() %>% |
|
distinct(component) %>% |
|
.$component %>% |
|
length |
|
|
|
g_c_eu %>% |
|
as_tibble() %>% |
|
group_by(component) %>% |
|
count() %>% |
|
arrange(desc(n)) |
|
|
|
# we will decompose clusters with more than 10K using louvain |
|
g_c_eu_10k <- g_c_eu %>% |
|
as_tibble() %>% |
|
group_by(component) %>% |
|
count() %>% |
|
arrange(desc(n)) %>% |
|
filter(n > 9500) %>% |
|
.$component |
|
|
|
|
|
if (length(g_c_eu_10k) > 0) { |
|
g_c_eu_10k_l <- pbmcapply::pbmclapply(g_c_eu_10k, use_louvain, G = g_c_eu) |
|
if (length(g_c_eu_10k) > 1) { |
|
g_c_eu <- g_c_eu %>% |
|
graph_join(do.call(bind_graphs, g_c_eu_10k_l)) %>% |
|
activate(nodes) %>% |
|
mutate(component = ifelse(is.na(group), component, group)) %>% |
|
select(-group) |
|
}else{ |
|
g_c_eu <- g_c_eu %>% |
|
graph_join(g_c_eu_10k_l[[1]]) %>% |
|
activate(nodes) %>% |
|
mutate(component = ifelse(is.na(group), component, group)) %>% |
|
select(-group) |
|
} |
|
} |
|
|
|
|
|
# We plot component size |
|
g_c_eu %>% |
|
as_tibble() %>% |
|
group_by(component) %>% |
|
count() %>% |
|
ggpubr::gghistogram(x = "n", |
|
add = "none", rug = FALSE, |
|
bins = 15, |
|
color = "#CE442B", fill = "#CE442B", |
|
palette = c("#CE442B", "#CE442B"), alpha = 0.8) + |
|
theme_light() + |
|
#scale_x_sqrt(labels = scales::comma, breaks = scales::trans_breaks("sqrt", function(x) x^2)) + |
|
scale_x_log10(labels = scales::comma, breaks = c(1, 10, 100, 1000, 10000, 50000)) + #, breaks = scales::trans_breaks("sqrt", function(x) x^2)) + |
|
ylab("Counts") + |
|
xlab("Component size") + |
|
theme(legend.position = "none") + |
|
scale_y_sqrt(labels = scales::comma, breaks = scales::trans_breaks("sqrt", function(x) x^2)) |
|
|
|
# Select components with size > 1 |
|
g_c_eu_gt1 <- g_c_eu %>% |
|
as_tibble() %>% |
|
group_by(component) %>% |
|
count() %>% |
|
filter(n > 1) %>% |
|
.$component |
|
|
|
# calculate similarities in each component |
|
g_c_eu_sim <- bind_rows(pbmcapply::pbmclapply(g_c_eu_gt1, get_comp_weights, G = g_c_eu, mc.cores = 4)) |
|
|
|
# plot distribution |
|
gghistogram(g_c_eu_sim, x = "median", |
|
add = "none", rug = FALSE, |
|
bins = nclass.Sturges(g_c_eu_sim$median), |
|
color = "#CE442B", fill = "#CE442B", |
|
palette = c("#CE442B", "#CE442B"), alpha = 0.8) + |
|
#geom_vline(xintercept=summary(f$m1)[[2]], color="red", linetype="dashed") + |
|
#geom_vline(xintercept=estimate_mode(f$m1), color="#D6B82C", linetype="dashed", size = 1) + |
|
#geom_vline(xintercept=median(f$m1)-(2*mad(f$m1)), color="#D6B82C", linetype="dashed", size = 1) + |
|
theme_light() + |
|
scale_x_continuous(labels = scales::percent) + |
|
scale_y_continuous(labels = scales::comma) + |
|
ylab("Counts") + |
|
xlab("Median similarity") |
|
|
|
# get real cluster names |
|
g_c_eu_all <- g_c_eu %>% |
|
as.tibble() %>% |
|
right_join(eu_ids %>% rename(name = V2, cl_name = V1) %>% |
|
mutate(name = name - 1, name = as.character(name))) %>% |
|
arrange(component) |
|
|
|
# Add extra components |
|
g_c_eu_n <- g_c_eu_all %>% filter(!is.na(component)) %>% .$component %>% length |
|
eu_d_comp <- seq(1, dim(eu_ids)[1] - g_c_eu_n, by = 1) + eu_d_c |
|
g_c_eu_all$component <- c(g_c_eu_all %>% filter(!is.na(component)) %>% .$component, eu_d_comp) |
|
g_c_eu_all <- g_c_eu_all %>% |
|
mutate(component = paste("eu_comp",component, sep = "_")) |
|
# Save to a file |
|
write_tsv(g_c_eu_all, path = "~/PFAM_components/eu_cluster_components.tsv") |
|
|
|
|
|
|
|
# Create the gu SSN |
|
g_gu <- gu_ssn %>% |
|
select(V1,V2,V4) %>% |
|
rename(node1 = V1, node2 = V2, weight = V4) %>% |
|
tbl_df() %>% |
|
igraph::graph_from_data_frame(directed = FALSE) %>% |
|
igraph::simplify() %>% |
|
as_tbl_graph() %>% |
|
activate(nodes) %>% |
|
left_join(gu_ids %>% rename(name = V2, cl_name = V1) %>% |
|
mutate(name = name -1, name = as.character(name)), by = "name") |
|
|
|
# Filter edges |
|
g_c_gu <- g_gu %>% |
|
activate(edges) %>% |
|
#filter(weight >= estimate_mode(f$m)) %>% |
|
filter(weight >= cut_t) %>% |
|
activate(nodes) %>% |
|
mutate(component = group_components()) |
|
|
|
# Count components |
|
gu_d_c <- g_c_gu %>% |
|
as.tibble() %>% |
|
distinct(component) %>% |
|
.$component %>% |
|
length |
|
|
|
# we will decompose clusters with more than 10K using louvain |
|
g_c_gu_10k <- g_c_gu %>% |
|
as_tibble() %>% |
|
group_by(component) %>% |
|
count() %>% |
|
arrange(desc(n)) %>% |
|
filter(n > 9500) %>% |
|
.$component |
|
|
|
|
|
if (length(g_c_gu_10k) > 0) { |
|
g_c_gu_10k_l <- pbmcapply::pbmclapply(g_c_gu_10k, use_louvain, G = g_c_gu) |
|
if (length(g_c_gu_10k) > 1) { |
|
g_c_gu <- g_c_gu %>% |
|
graph_join(do.call(bind_graphs, g_c_gu_10k_l)) %>% |
|
activate(nodes) %>% |
|
mutate(component = ifelse(is.na(group), component, group)) %>% |
|
select(-group) |
|
}else{ |
|
g_c_gu <- g_c_gu %>% |
|
graph_join(g_c_gu_10k_l[[1]]) %>% |
|
activate(nodes) %>% |
|
mutate(component = ifelse(is.na(group), component, group)) %>% |
|
select(-group) |
|
} |
|
} |
|
|
|
|
|
# We plot component size |
|
g_c_gu %>% |
|
as_tibble() %>% |
|
group_by(component) %>% |
|
count() %>% |
|
ggpubr::gghistogram(x = "n", |
|
add = "none", rug = FALSE, |
|
bins = 15, |
|
color = "#E6A54C", fill = "#E6A54C", |
|
palette = c("#E6A54C", "#E6A54C"), alpha = 0.8) + |
|
theme_light() + |
|
scale_x_log10(labels = scales::comma, breaks = c(1, 10, 100, 1000, 10000, 50000)) + #, breaks = scales::trans_breaks("sqrt", function(x) x^2)) + |
|
ylab("Counts") + |
|
xlab("Component size") + |
|
theme(legend.position = "none") + |
|
scale_y_sqrt(labels = scales::comma, breaks = scales::trans_breaks("sqrt", function(x) x^2)) |
|
|
|
# Select components with size > 1 |
|
g_c_gu_gt1 <- g_c_gu %>% |
|
as_tibble() %>% |
|
group_by(component) %>% |
|
count() %>% |
|
filter(n > 1) %>% |
|
.$component |
|
|
|
# calculate similarities in each component |
|
g_c_gu_sim <- bind_rows(pbmcapply::pbmclapply(g_c_gu_gt1, get_comp_weights, G = g_c_gu, mc.cores = 64)) |
|
|
|
# plot distribution |
|
gghistogram(g_c_gu_sim, x = "median", |
|
add = "none", rug = FALSE, |
|
bins = nclass.Sturges(g_c_gu_sim$median), |
|
color = "#E6A54C", fill = "#E6A54C", |
|
palette = c("#E6A54C", "#E6A54C"), alpha = 0.8) + |
|
#geom_vline(xintercept=summary(f$m1)[[2]], color="red", linetype="dashed") + |
|
#geom_vline(xintercept=estimate_mode(f$m1), color="#D6B82C", linetype="dashed", size = 1) + |
|
#geom_vline(xintercept=median(f$m1)-(2*mad(f$m1)), color="#D6B82C", linetype="dashed", size = 1) + |
|
theme_light() + |
|
scale_x_continuous(labels = scales::percent, breaks = seq(0.3,1, by = 0.1)) + |
|
scale_y_continuous(labels = scales::comma) + |
|
ylab("Counts") + |
|
xlab("Median similarity") |
|
|
|
# get real cluster names |
|
g_c_gu_all <- g_c_gu %>% |
|
as.tibble() %>% |
|
right_join(gu_ids %>% rename(name = V2, cl_name = V1) %>% |
|
mutate(name = name -1, name = as.character(name))) %>% |
|
arrange(component) |
|
|
|
# Add extra components |
|
g_c_gu_n <- g_c_gu_all %>% filter(!is.na(component)) %>% .$component %>% length |
|
gu_d_comp <- seq(1, dim(gu_ids)[1] - g_c_gu_n, by = 1) + gu_d_c |
|
g_c_gu_all$component <- c(g_c_gu_all %>% filter(!is.na(component)) %>% .$component, gu_d_comp) |
|
g_c_gu_all <- g_c_gu_all %>% |
|
mutate(component = paste("gu_comp",component, sep = "_")) |
|
# Save to a file |
|
write_tsv(g_c_gu_all, path = "~/PFAM_components/gu_cluster_components.tsv") |
|
|
|
|
|
# Create the kwp SSN |
|
g_kwp <- kwp_ssn %>% |
|
select(V1,V2,V4) %>% |
|
rename(node1 = V1, node2 = V2, weight = V4) %>% |
|
tbl_df() %>% |
|
igraph::graph_from_data_frame(directed = FALSE) %>% |
|
igraph::simplify() %>% |
|
as_tbl_graph() %>% |
|
activate(nodes) %>% |
|
left_join(kwp_ids %>% rename(name = V2, cl_name = V1) %>% |
|
mutate(name = name -1, name = as.character(name)), by = "name") |
|
|
|
# Filter edges |
|
g_c_kwp <- g_kwp %>% |
|
activate(edges) %>% |
|
#filter(weight >= estimate_mode(f$m)) %>% |
|
filter(weight >= cut_t) %>% |
|
activate(nodes) %>% |
|
mutate(component = group_components()) |
|
|
|
# Count components |
|
kwp_d_c <- g_c_kwp %>% |
|
as.tibble() %>% |
|
distinct(component) %>% |
|
.$component %>% |
|
length |
|
|
|
# we will decompose clusters with more than 10K using louvain |
|
g_c_kwp_10k <- g_c_kwp %>% |
|
as_tibble() %>% |
|
group_by(component) %>% |
|
count() %>% |
|
arrange(desc(n)) %>% |
|
filter(n > 9500) %>% |
|
.$component |
|
|
|
if (length(g_c_kwp_10k) > 0) { |
|
g_c_kwp_10k_l <- pbmcapply::pbmclapply(g_c_kwp_10k, use_louvain, G = g_c_kwp) |
|
if (length(g_c_kwp_10k) > 1) { |
|
g_c_kwp <- g_c_kwp %>% |
|
graph_join(do.call(bind_graphs, g_c_kwp_10k_l)) %>% |
|
activate(nodes) %>% |
|
mutate(component = ifelse(is.na(group), component, group)) %>% |
|
select(-group) |
|
}else{ |
|
g_c_kwp <- g_c_kwp %>% |
|
graph_join(g_c_kwp_10k_l[[1]]) %>% |
|
activate(nodes) %>% |
|
mutate(component = ifelse(is.na(group), component, group)) %>% |
|
select(-group) |
|
} |
|
} |
|
|
|
# We plot component size |
|
g_c_kwp %>% |
|
as_tibble() %>% |
|
group_by(component) %>% |
|
count() %>% |
|
ggpubr::gghistogram(x = "n", |
|
add = "none", rug = FALSE, |
|
bins = 15, |
|
color = "#2A363B", fill = "#2A363B", |
|
palette = c("#2A363B", "#2A363B"), alpha = 0.8) + |
|
theme_light() + |
|
scale_x_log10(labels = scales::comma, breaks = c(1, 10, 100, 1000, 10000, 50000)) + #, breaks = scales::trans_breaks("sqrt", function(x) x^2)) + |
|
ylab("Counts") + |
|
xlab("Component size") + |
|
theme(legend.position = "none") + |
|
scale_y_sqrt(labels = scales::comma, breaks = scales::trans_breaks("sqrt", function(x) x^2)) |
|
|
|
# Select components with size > 1 |
|
g_c_kwp_gt1 <- g_c_kwp %>% |
|
as_tibble() %>% |
|
group_by(component) %>% |
|
count() %>% |
|
filter(n > 1) %>% |
|
.$component |
|
|
|
# calculate similarities in each component |
|
g_c_kwp_sim <- bind_rows(pbmcapply::pbmclapply(g_c_kwp_gt1, get_comp_weights, G = g_c_kwp, mc.cores = 64)) |
|
|
|
# plot distribution |
|
gghistogram(g_c_kwp_sim, x = "median", |
|
add = "none", rug = FALSE, |
|
bins = nclass.Sturges(g_c_kwp_sim$median), |
|
color = "#2A363B", fill = "#2A363B", |
|
palette = c("#2A363B", "#2A363B"), alpha = 0.8) + |
|
#geom_vline(xintercept=summary(f$m1)[[2]], color="red", linetype="dashed") + |
|
#geom_vline(xintercept=estimate_mode(f$m1), color="#D6B82C", linetype="dashed", size = 1) + |
|
#geom_vline(xintercept=median(f$m1)-(2*mad(f$m1)), color="#D6B82C", linetype="dashed", size = 1) + |
|
theme_light() + |
|
scale_x_continuous(labels = scales::percent, breaks = seq(0.3,1, by = 0.1)) + |
|
scale_y_continuous(labels = scales::comma) + |
|
ylab("Counts") + |
|
xlab("Median similarity") |
|
|
|
# get real cluster names |
|
g_c_kwp_all <- g_c_kwp %>% |
|
as.tibble() %>% |
|
right_join(kwp_ids %>% rename(name = V2, cl_name = V1) %>% |
|
mutate(name = name -1, name = as.character(name))) %>% |
|
arrange(component) |
|
|
|
# Add extra components |
|
g_c_kwp_n <- g_c_kwp_all %>% filter(!is.na(component)) %>% .$component %>% length |
|
kwp_d_comp <- seq(1, dim(kwp_ids)[1] - g_c_kwp_n, by = 1) + kwp_d_c |
|
g_c_kwp_all$component <- c(g_c_kwp_all %>% filter(!is.na(component)) %>% .$component, kwp_d_comp) |
|
g_c_kwp_all <- g_c_kwp_all %>% |
|
mutate(component = paste("kwp_comp",component, sep = "_")) |
|
# Save to a file |
|
write_tsv(g_c_kwp_all, path = "~/PFAM_components/kwp_cluster_components.tsv") |