Exploring PFAM similarities using consensus sequences

We want to know how the different clusters created by MMseqs2 aggregate together at the domain architecture level.

  1. Get all domain architectures
  2. Get clusters that belong to the domain architecture
  3. Get consensus sequences
  4. Calculate SSN per domain architecture

We need the files:

  • cl_new_rep.tsv
  • cl_new_rep.fasta
  • cl_name_rep_pfam.tsv
  • knowns_duf.tsv

We will rename the domain architectures to something easier to handle:

cut -f3 cl_name_rep_pfam.tsv | sort -u | awk '{print $1"\t"NR}' > pfam_classes.tsv

And combine:

join -1 3 -2 1 <(sort -k3,3 cl_name_rep_pfam.tsv) \
               <(sort -k1,1 pfam_classes.tsv) > cl_name_rep_pfam_class.tsv
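A quick, optional sanity check: since pfam_classes.tsv was built from column 3 of cl_name_rep_pfam.tsv, every key should be present in both files and the join should not drop any rows, so the two files should have the same number of lines.

wc -l cl_name_rep_pfam.tsv cl_name_rep_pfam_class.tsv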

We need to get the consensus sequences from the ffindex file we created for the Knowns with PFAM:

sed -e 's/\x0//g' /bioinf/projects/megx/UNKNOWNS/2017_11/classification/results/ffindex_data/marine_hmp_db_03112017_k_cons.ffdata > cl_new_cons.fasta
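ffindex stores its records NUL-terminated, which is why the NUL bytes have to be stripped. A quick way to confirm none remain (a sketch):

# Count leftover NUL bytes in the extracted FASTA; should print 0
tr -cd '\0' < cl_new_cons.fasta | wc -c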

We also need to take the domain architectures as components for the knowns:

awk '{print $2"\tk_comp_"$4}' cl_name_rep_pfam_class.tsv > k_cluster_components.tsv
LC_ALL=C grep -F -f <(cut -f1 knowns_duf.tsv) cl_new_rep.tsv > knowns_duf_clname.tsv
LC_ALL=C grep -w -v -F -f <(cut -f 1 knowns_duf_clname.tsv) \
    k_cluster_components.tsv > tmp && mv tmp k_cluster_components.tsv
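Note that grep -F matches substrings (and even with -w a cluster name could match in the wrong column), so an exact-key alternative with awk may be safer (a sketch, assuming the cluster name sits in the first column of both files):

# Keep only rows whose first field is NOT among the DUF cluster names
awk 'NR==FNR{k[$1]; next} !($1 in k)' \
    <(cut -f1 knowns_duf_clname.tsv) k_cluster_components.tsv > tmp \
    && mv tmp k_cluster_components.tsv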

We remove the clusters with DUF domains because they are already included in the GUs, but we still use them to calculate the SSN.

Create some folders

mkdir -p pfam_files pfam_ssn

Get all domain architectures

cd pfam_files

Create a file for each domain architecture with the representative names:

awk '{print $3 > $4".txt"}' ../cl_name_rep_pfam_class.tsv

Count the clusters per domain architecture:

wc -l *txt | sort -nk1,1 | awk '{print $2"\t"$1}' | sed -e 's/\.txt//' > ../class_count.tsv
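One caveat with the per-architecture split above: awk keeps every output file open, which can exhaust file descriptors when there are many thousands of architectures. A variant that closes each file after writing (a sketch):

# Append and close on every line to stay under the open-file limit
awk '{f = $4".txt"; print $3 >> f; close(f)}' ../cl_name_rep_pfam_class.tsv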

Create the SSN

Script: para_pfam.sh (the consensus-sequence version is attached at the end of this gist).

The Parasail aligner has to be modified to report similarities instead of identities: change line 1491 in apps/parasail_aligner.cpp from:

int matches = parasail_result_get_matches(result);

to

int matches = parasail_result_get_similar(result);
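If you prefer not to edit the file by hand, the same change can be applied with sed before compiling (a sketch; it assumes parasail_result_get_matches(result) appears only where the aligner reports the match count):

sed -i 's/parasail_result_get_matches(result)/parasail_result_get_similar(result)/' \
    apps/parasail_aligner.cpp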

Then compile as usual and run the script:

cd pfam_files
parallel -j 18 --progress ../para_pfam.sh {} ::: *txt
cd ..
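For long runs it can help to keep a job log so that failed or interrupted architectures can be retried without redoing the finished ones; --joblog and --resume are standard GNU parallel options:

parallel -j 18 --joblog para_pfam.joblog --resume --progress ../para_pfam.sh {} ::: *txt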

Combine similarity values

cat pfam_ssn/*id > cl_name_rep_pfam_ids_20805_cons.txt

Create GU & EU SSN

parasail_aligner -a sg_stats_scan_sse2_128_16 \
                 -f /bioinf/projects/megx/UNKNOWNS/2017_11/classification/FASTA/eu_repres.fasta \
                 -g ssnEU.out -t 256 -v -E
parasail_aligner -a sg_stats_scan_sse2_128_16 \
                 -f /bioinf/projects/megx/UNKNOWNS/2017_11/classification/FASTA/gu_repres.fasta \
                 -g ssnGU.out -t 256 -v -E

Get the cluster ids

Parasail converts the FASTA headers to integers; the order is defined by the order of the sequences in the FASTA file (0-indexed).

grep '>' /bioinf/projects/megx/UNKNOWNS/2017_11/classification/FASTA/gu_repres.fasta \
          | awk '$0~/^>/{print $1"\t"NR}' \
          | tr -d '>' > gu_ids_order.tsv
grep '>' /bioinf/projects/megx/UNKNOWNS/2017_11/classification/FASTA/eu_repres.fasta \
          | awk '$0~/^>/{print $1"\t"NR}' \
          | tr -d '>' > eu_ids_order.tsv
join -1 1 -2 2 <(sort -k1,1 gu_ids_order.tsv) <(sort -k2,2 cl_new_rep.tsv) \
          | tr ' ' $'\t' \
          | sort -k2,2n > gu_ids_para.tsv
join -1 1 -2 2 <(sort -k1,1 eu_ids_order.tsv) <(sort -k2,2 cl_new_rep.tsv) \
          | tr ' ' $'\t' \
          | sort -k2,2n > eu_ids_para.tsv
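To sanity-check the mapping, the largest node id referenced in an SSN must be smaller than the number of sequences in the corresponding FASTA file (a sketch; it assumes the parasail graph output is comma-separated with the two node ids in the first two fields, as in para_pfam.sh):

# Highest node id referenced in the SSN
awk -F',' '{if ($1 > m) m = $1; if ($2 > m) m = $2} END{print m}' ssnGU.out
# Number of sequences parasail saw
grep -c '>' /bioinf/projects/megx/UNKNOWNS/2017_11/classification/FASTA/gu_repres.fasta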

Analyse data

Script: the R analysis code is attached at the end of this gist.

Exploring PFAM similarities using representative sequences

We want to know how the different clusters created by MMseqs2 aggregate together at the domain architecture level.

  1. Get all domain architectures
  2. Get clusters that belong to the domain architecture
  3. Get representative/consensus sequences
  4. Calculate SSN per domain architecture

We need the files:

  • cl_new_rep.tsv
  • cl_new_rep.fasta
  • cl_name_rep_pfam.tsv: /bioinf/projects/megx/UNKNOWNS/2017_11/cluster_validation/compositional/cl_name_rep_pfam.tsv.gz

We will create cl_new_rep.tsv with:

awk 'NR>1{print $1"\t"$2}' /bioinf/projects/megx/UNKNOWNS/2017_11/cluster_validation/compositional/cl_validation_res_size_fix.tsv > cl_new_rep.tsv

And we will get the fasta files for the new representatives:

seqtk subseq <(zcat /bioinf/projects/megx/UNKNOWNS/2017_11/DATA/ORFs/ORFs_fasta/TARA_OSD_GOS_malaspina_hmpI-II.fasta.gz) <(cut -f2 cl_new_rep.tsv) > cl_new_rep.fasta
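A quick check that the extraction worked: the number of sequences in the new FASTA should match the number of representative ids.

grep -c '>' cl_new_rep.fasta
wc -l < cl_new_rep.tsv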

We will rename the domain architectures to something easier to handle:

cut -f3 cl_name_rep_pfam.tsv | sort -u | awk '{print $1"\t"NR}' > pfam_classes.tsv

And combine:

join -1 3 -2 1 <(sort -k3,3 cl_name_rep_pfam.tsv) \
               <(sort -k1,1 pfam_classes.tsv) > cl_name_rep_pfam_class.tsv

Create some folders

mkdir -p pfam_files pfam_ssn

Get all domain architectures

cd pfam_files

Create a file for each domain architecture with the representative names:

awk '{print $3 > $4".txt"}' ../cl_name_rep_pfam_class.tsv

Count the clusters per domain architecture:

wc -l *txt | sort -nk1,1 | awk '{print $2"\t"$1}' | sed -e 's/\.txt//' > ../class_count.tsv

Create the SSN

Script: para_pfam.sh (the representative-sequence version is attached at the end of this gist).

The Parasail aligner has to be modified to report similarities instead of identities: change line 1491 in apps/parasail_aligner.cpp from:

int matches = parasail_result_get_matches(result);

to

int matches = parasail_result_get_similar(result);

Then compile as usual and run the script:

cd pfam_files
parallel -j 18 --progress ../para_pfam.sh {} ::: *txt
cd ..

Combine similarity values

cat pfam_ssn/*id > cl_name_rep_pfam_ids.txt

Get the Knowns with PFAM component names

We need to remove the KWP clusters from the file. First we take knowns_duf.tsv and retrieve the corresponding cluster names:

awk '{print $2"\tk_comp_"$4}' cl_name_rep_pfam_class.tsv > k_cluster_components.tsv
LC_ALL=C grep -F -f <(cut -f1 knowns_duf.tsv) cl_new_rep.tsv > knowns_duf_clname.tsv

Then we filter them out:

LC_ALL=C grep -w -v -F -f <(cut -f 1 knowns_duf_clname.tsv) \
    k_cluster_components.tsv > tmp && mv tmp k_cluster_components.tsv

Create GU & EU SSN

parasail_aligner -a sg_stats_scan_sse2_128_16 \
                 -f /bioinf/projects/megx/UNKNOWNS/2017_11/classification/FASTA/eu_repres.fasta \
                 -g ssnEU.out -t 256 -v -E
parasail_aligner -a sg_stats_scan_sse2_128_16 \
                 -f /bioinf/projects/megx/UNKNOWNS/2017_11/classification/FASTA/gu_repres.fasta \
                 -g ssnGU.out -t 256 -v -E

Get the cluster ids

Parasail converts the FASTA headers to integers; the order is defined by the order of the sequences in the FASTA file (0-indexed).

grep '>' /bioinf/projects/megx/UNKNOWNS/2017_11/classification/FASTA/gu_repres.fasta \
          | awk '$0~/^>/{print $1"\t"NR}' \
          | tr -d '>' > gu_ids_order.tsv
grep '>' /bioinf/projects/megx/UNKNOWNS/2017_11/classification/FASTA/eu_repres.fasta \
          | awk '$0~/^>/{print $1"\t"NR}' \
          | tr -d '>' > eu_ids_order.tsv
join -1 1 -2 2 <(sort -k1,1 gu_ids_order.tsv) <(sort -k2,2 cl_new_rep.tsv) \
          | tr ' ' $'\t' \
          | sort -k2,2n > gu_ids_para.tsv
join -1 1 -2 2 <(sort -k1,1 eu_ids_order.tsv) <(sort -k2,2 cl_new_rep.tsv) \
          | tr ' ' $'\t' \
          | sort -k2,2n > eu_ids_para.tsv

Analyse data

Script: the R analysis code is attached at the end of this gist.

Script to combine all the component tables into a single file (R):

library(data.table)
library(tidyverse)

# Read the per-category component tables
gu_comp <- data.table::fread(input = "~/PFAM_components/gu_cluster_components.tsv", header = TRUE, sep = "\t") %>%
  dplyr::select(cl_name, component)
eu_comp <- data.table::fread(input = "~/PFAM_components/eu_cluster_components.tsv", header = TRUE, sep = "\t") %>%
  dplyr::select(cl_name, component)
kwp_comp <- data.table::fread(input = "~/PFAM_components/kwp_cluster_components.tsv", header = TRUE, sep = "\t") %>%
  dplyr::select(cl_name, component)
k_comp <- data.table::fread(input = "~/PFAM_components/k_cluster_components.tsv", header = FALSE, sep = "\t") %>%
  rename(cl_name = V1, component = V2)

# Combine and export
all_comp <- bind_rows(gu_comp, eu_comp, kwp_comp, k_comp) %>%
  rename(clstr_name = cl_name)
write_tsv(all_comp, path = "~/PFAM_components/all_cluster_components.tsv")
para_pfam.sh, representative-sequence version:

#!/bin/bash
NAM=$(basename "$1" .txt)
FA=/scratch/antonio/cl_rep_pfam/pfam_ssn/${NAM}.fasta
SSN=/scratch/antonio/cl_rep_pfam/pfam_ssn/${NAM}.ssn
ID=/scratch/antonio/cl_rep_pfam/pfam_ssn/${NAM}.id
STATS=/scratch/antonio/cl_rep_pfam/pfam_ssn/${NAM}.stats
PARASAIL_BIN=/home/afernand/opt/parasail-2.0.5_bm_mod/bin/parasail_aligner

# only align architectures with more than one cluster
if [ "$(awk 'END{print NR}' "${1}")" -gt 1 ]; then
  # extract fasta sequences
  seqtk subseq /scratch/antonio/cl_rep_pfam/cl_new_rep.fasta /scratch/antonio/cl_rep_pfam/pfam_files/${NAM}.txt > "${FA}"
  # align sequences
  ${PARASAIL_BIN} -a sg_stats_scan_sse2_128_16 -i 5 -l 80 -s 20 -E -f "${FA}" -g "${SSN}" -t 4 > /dev/null 2>&1
  # reformat the CSV output: architecture, node1, node2, similarity
  mawk -vN=${NAM} 'BEGIN{FS=","}{print N" "$1" "$2" "$4}' "${SSN}" > "${ID}"
  # summary statistics of the similarities
  datamash -t " " min 4 mean 4 median 4 max 4 < "${ID}" | awk -vN=${NAM} '{print N,$0}' > "${STATS}"
fi
para_pfam.sh, consensus-sequence version:

#!/bin/bash
NAM=$(basename "$1" .txt)
FA=/scratch/antonio/cl_rep_pfam/pfam_ssn/${NAM}.fasta
SSN=/scratch/antonio/cl_rep_pfam/pfam_ssn/${NAM}.ssn
ID=/scratch/antonio/cl_rep_pfam/pfam_ssn/${NAM}.id
STATS=/scratch/antonio/cl_rep_pfam/pfam_ssn/${NAM}.stats
PARASAIL_BIN=/home/afernand/opt/parasail-2.0.5_bm_mod/bin/parasail_aligner

# only align architectures with more than one cluster
if [ "$(awk 'END{print NR}' "${1}")" -gt 1 ]; then
  # extract fasta sequences
  seqtk subseq /scratch/antonio/cl_rep_pfam/cl_new_cons.fasta /scratch/antonio/cl_rep_pfam/pfam_files/${NAM}.txt > "${FA}"
  # align sequences
  ${PARASAIL_BIN} -a sg_stats_scan_sse2_128_16 -i 5 -l 80 -s 20 -E -f "${FA}" -g "${SSN}" -t 4 > /dev/null 2>&1
  # reformat the CSV output: architecture, node1, node2, similarity
  mawk -vN=${NAM} 'BEGIN{FS=","}{print N" "$1" "$2" "$4}' "${SSN}" > "${ID}"
  # summary statistics of the similarities
  datamash -t " " min 4 mean 4 median 4 max 4 < "${ID}" | awk -vN=${NAM} '{print N,$0}' > "${STATS}"
fi
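The .stats files produced by either version of the script can be gathered into one table afterwards (a sketch; the output file name is arbitrary):

cat /scratch/antonio/cl_rep_pfam/pfam_ssn/*.stats > pfam_ssn_stats.tsv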
Analysis script (R):

library(data.table)
library(tidyverse)
library(tidygraph)
library(ggraph)
library(ggpubr)

load("~/Downloads/pfam_all_objects.RData")

# Read PFAM SSN ids and cluster sizes
file <- fread("~/PFAM_components/cl_name_rep_pfam_ids_20805_cons.txt", header = FALSE, verbose = TRUE)
f_size <- fread("~/PFAM_components/class_count.tsv", header = FALSE, verbose = TRUE)

# Calculate mean and median similarity per domain architecture
f <- file %>%
  group_by(V1) %>%
  summarise(m = median(V4), m1 = mean(V4))

# Check PFAM clusters size distribution
f_size %>%
  gghistogram(x = "V2",
              add = "none", rug = FALSE,
              bins = nclass.Sturges(.$V2),
              fill = "#4A636A", color = "#4A636A", alpha = 0.8) +
  theme_light() +
  scale_x_sqrt(labels = scales::comma, breaks = scales::trans_breaks("sqrt", function(x) x^2)) +
  scale_y_sqrt(labels = scales::comma, breaks = scales::trans_breaks("sqrt", function(x) x^2)) +
  ylab("Counts") +
  xlab("Number of ORF clusters") +
  theme(legend.position = "none")

# Check where the threshold we are going to use lies
# (candidates: 2*MAD below the median, or the mode of the distribution)
estimate_mode <- function(x) {
  d <- density(x)
  d$x[which.max(d$y)]
}

get_comp_weights <- function(X, G) {
  l <- G %>%
    activate(nodes) %>%
    filter(component == X) %>%
    activate(edges) %>%
    as_tibble() %>%
    select(weight) %>%
    summarise(mean = mean(weight), median = median(weight), n = n()) %>%
    mutate(comp = X)
  return(l)
}

use_louvain <- function(X, G) {
  dc <- G %>%
    activate(nodes) %>%
    filter(component == X) %>%
    mutate(group = paste(X, as.factor(group_louvain()), sep = "_"))
  return(dc)
}

gghistogram(f, x = "m",
            add = "none", rug = FALSE,
            bins = nclass.scott(f$m),
            color = "#4A636A", fill = "#4A636A", alpha = 0.8) +
  geom_vline(xintercept = median(f$m), color = "red", linetype = "dashed", size = 1) +
  geom_vline(xintercept = estimate_mode(f$m), color = "#D6B82C", linetype = "dashed", size = 1) +
  geom_vline(xintercept = median(f$m) - (2 * mad(f$m, constant = 1)), color = "#D6782C", linetype = "dashed", size = 1) +
  theme_light() +
  scale_x_continuous(labels = scales::percent) +
  scale_y_continuous(labels = scales::comma) +
  ylab("Counts") +
  xlab("Average similarity")

#cut_t <- median(f$m) - (2 * mad(f$m, constant = 1))
cut_t <- estimate_mode(f$m)
# Read the GU, EU and KWP SSNs
eu_ssn <- fread("~/PFAM_components/ssnEU3.out", header = FALSE)
eu_ids <- fread("~/PFAM_components/eu_ids_para_cons.tsv", header = FALSE)
gu_ssn <- fread("~/PFAM_components/ssnGU3.out", header = FALSE)
gu_ids <- fread("~/PFAM_components/gu_ids_para_cons.tsv", header = FALSE)
kwp_ssn <- fread("~/PFAM_components/ssnKWP3.out", header = FALSE)
kwp_ids <- fread("~/PFAM_components/kwp_ids_para_cons.tsv", header = FALSE)

# Create the EU SSN
g_eu <- eu_ssn %>%
  select(V1, V2, V4) %>%
  rename(node1 = V1, node2 = V2, weight = V4) %>%
  tbl_df() %>%
  igraph::graph_from_data_frame(directed = FALSE) %>%
  igraph::simplify() %>%
  as_tbl_graph() %>%
  activate(nodes) %>%
  left_join(eu_ids %>% rename(name = V2, cl_name = V1) %>%
              mutate(name = name - 1, name = as.character(name)), by = "name")

# Filter edges
g_c_eu <- g_eu %>%
  activate(edges) %>%
  #filter(weight >= estimate_mode(f$m)) %>%
  filter(weight >= cut_t) %>%
  activate(nodes) %>%
  mutate(component = group_components())

# Count components
eu_d_c <- g_c_eu %>%
  as_tibble() %>%
  distinct(component) %>%
  .$component %>%
  length()

g_c_eu %>%
  as_tibble() %>%
  group_by(component) %>%
  count() %>%
  arrange(desc(n))

# We will decompose components with more than 10K nodes using Louvain
g_c_eu_10k <- g_c_eu %>%
  as_tibble() %>%
  group_by(component) %>%
  count() %>%
  arrange(desc(n)) %>%
  filter(n > 9500) %>%
  .$component

if (length(g_c_eu_10k) > 0) {
  g_c_eu_10k_l <- pbmcapply::pbmclapply(g_c_eu_10k, use_louvain, G = g_c_eu)
  if (length(g_c_eu_10k) > 1) {
    g_c_eu <- g_c_eu %>%
      graph_join(do.call(bind_graphs, g_c_eu_10k_l)) %>%
      activate(nodes) %>%
      mutate(component = ifelse(is.na(group), component, group)) %>%
      select(-group)
  } else {
    g_c_eu <- g_c_eu %>%
      graph_join(g_c_eu_10k_l[[1]]) %>%
      activate(nodes) %>%
      mutate(component = ifelse(is.na(group), component, group)) %>%
      select(-group)
  }
}

# We plot component size
g_c_eu %>%
  as_tibble() %>%
  group_by(component) %>%
  count() %>%
  ggpubr::gghistogram(x = "n",
                      add = "none", rug = FALSE,
                      bins = 15,
                      color = "#CE442B", fill = "#CE442B",
                      palette = c("#CE442B", "#CE442B"), alpha = 0.8) +
  theme_light() +
  #scale_x_sqrt(labels = scales::comma, breaks = scales::trans_breaks("sqrt", function(x) x^2)) +
  scale_x_log10(labels = scales::comma, breaks = c(1, 10, 100, 1000, 10000, 50000)) +
  ylab("Counts") +
  xlab("Component size") +
  theme(legend.position = "none") +
  scale_y_sqrt(labels = scales::comma, breaks = scales::trans_breaks("sqrt", function(x) x^2))

# Select components with size > 1
g_c_eu_gt1 <- g_c_eu %>%
  as_tibble() %>%
  group_by(component) %>%
  count() %>%
  filter(n > 1) %>%
  .$component

# Calculate similarities in each component
g_c_eu_sim <- bind_rows(pbmcapply::pbmclapply(g_c_eu_gt1, get_comp_weights, G = g_c_eu, mc.cores = 4))

# Plot distribution
gghistogram(g_c_eu_sim, x = "median",
            add = "none", rug = FALSE,
            bins = nclass.Sturges(g_c_eu_sim$median),
            color = "#CE442B", fill = "#CE442B",
            palette = c("#CE442B", "#CE442B"), alpha = 0.8) +
  #geom_vline(xintercept = summary(f$m1)[[2]], color = "red", linetype = "dashed") +
  #geom_vline(xintercept = estimate_mode(f$m1), color = "#D6B82C", linetype = "dashed", size = 1) +
  #geom_vline(xintercept = median(f$m1) - (2 * mad(f$m1)), color = "#D6B82C", linetype = "dashed", size = 1) +
  theme_light() +
  scale_x_continuous(labels = scales::percent) +
  scale_y_continuous(labels = scales::comma) +
  ylab("Counts") +
  xlab("Median similarity")

# Get real cluster names
g_c_eu_all <- g_c_eu %>%
  as_tibble() %>%
  right_join(eu_ids %>% rename(name = V2, cl_name = V1) %>%
               mutate(name = name - 1, name = as.character(name))) %>%
  arrange(component)

# Add extra components for clusters absent from the filtered SSN
g_c_eu_n <- g_c_eu_all %>% filter(!is.na(component)) %>% .$component %>% length()
eu_d_comp <- seq(1, dim(eu_ids)[1] - g_c_eu_n, by = 1) + eu_d_c
g_c_eu_all$component <- c(g_c_eu_all %>% filter(!is.na(component)) %>% .$component, eu_d_comp)
g_c_eu_all <- g_c_eu_all %>%
  mutate(component = paste("eu_comp", component, sep = "_"))

# Save to a file
write_tsv(g_c_eu_all, path = "~/PFAM_components/eu_cluster_components.tsv")
# Create the GU SSN
g_gu <- gu_ssn %>%
  select(V1, V2, V4) %>%
  rename(node1 = V1, node2 = V2, weight = V4) %>%
  tbl_df() %>%
  igraph::graph_from_data_frame(directed = FALSE) %>%
  igraph::simplify() %>%
  as_tbl_graph() %>%
  activate(nodes) %>%
  left_join(gu_ids %>% rename(name = V2, cl_name = V1) %>%
              mutate(name = name - 1, name = as.character(name)), by = "name")

# Filter edges
g_c_gu <- g_gu %>%
  activate(edges) %>%
  #filter(weight >= estimate_mode(f$m)) %>%
  filter(weight >= cut_t) %>%
  activate(nodes) %>%
  mutate(component = group_components())

# Count components
gu_d_c <- g_c_gu %>%
  as_tibble() %>%
  distinct(component) %>%
  .$component %>%
  length()

# We will decompose components with more than 10K nodes using Louvain
g_c_gu_10k <- g_c_gu %>%
  as_tibble() %>%
  group_by(component) %>%
  count() %>%
  arrange(desc(n)) %>%
  filter(n > 9500) %>%
  .$component

if (length(g_c_gu_10k) > 0) {
  g_c_gu_10k_l <- pbmcapply::pbmclapply(g_c_gu_10k, use_louvain, G = g_c_gu)
  if (length(g_c_gu_10k) > 1) {
    g_c_gu <- g_c_gu %>%
      graph_join(do.call(bind_graphs, g_c_gu_10k_l)) %>%
      activate(nodes) %>%
      mutate(component = ifelse(is.na(group), component, group)) %>%
      select(-group)
  } else {
    g_c_gu <- g_c_gu %>%
      graph_join(g_c_gu_10k_l[[1]]) %>%
      activate(nodes) %>%
      mutate(component = ifelse(is.na(group), component, group)) %>%
      select(-group)
  }
}

# We plot component size
g_c_gu %>%
  as_tibble() %>%
  group_by(component) %>%
  count() %>%
  ggpubr::gghistogram(x = "n",
                      add = "none", rug = FALSE,
                      bins = 15,
                      color = "#E6A54C", fill = "#E6A54C",
                      palette = c("#E6A54C", "#E6A54C"), alpha = 0.8) +
  theme_light() +
  scale_x_log10(labels = scales::comma, breaks = c(1, 10, 100, 1000, 10000, 50000)) +
  ylab("Counts") +
  xlab("Component size") +
  theme(legend.position = "none") +
  scale_y_sqrt(labels = scales::comma, breaks = scales::trans_breaks("sqrt", function(x) x^2))

# Select components with size > 1
g_c_gu_gt1 <- g_c_gu %>%
  as_tibble() %>%
  group_by(component) %>%
  count() %>%
  filter(n > 1) %>%
  .$component

# Calculate similarities in each component
g_c_gu_sim <- bind_rows(pbmcapply::pbmclapply(g_c_gu_gt1, get_comp_weights, G = g_c_gu, mc.cores = 64))

# Plot distribution
gghistogram(g_c_gu_sim, x = "median",
            add = "none", rug = FALSE,
            bins = nclass.Sturges(g_c_gu_sim$median),
            color = "#E6A54C", fill = "#E6A54C",
            palette = c("#E6A54C", "#E6A54C"), alpha = 0.8) +
  #geom_vline(xintercept = summary(f$m1)[[2]], color = "red", linetype = "dashed") +
  #geom_vline(xintercept = estimate_mode(f$m1), color = "#D6B82C", linetype = "dashed", size = 1) +
  #geom_vline(xintercept = median(f$m1) - (2 * mad(f$m1)), color = "#D6B82C", linetype = "dashed", size = 1) +
  theme_light() +
  scale_x_continuous(labels = scales::percent, breaks = seq(0.3, 1, by = 0.1)) +
  scale_y_continuous(labels = scales::comma) +
  ylab("Counts") +
  xlab("Median similarity")

# Get real cluster names
g_c_gu_all <- g_c_gu %>%
  as_tibble() %>%
  right_join(gu_ids %>% rename(name = V2, cl_name = V1) %>%
               mutate(name = name - 1, name = as.character(name))) %>%
  arrange(component)

# Add extra components for clusters absent from the filtered SSN
g_c_gu_n <- g_c_gu_all %>% filter(!is.na(component)) %>% .$component %>% length()
gu_d_comp <- seq(1, dim(gu_ids)[1] - g_c_gu_n, by = 1) + gu_d_c
g_c_gu_all$component <- c(g_c_gu_all %>% filter(!is.na(component)) %>% .$component, gu_d_comp)
g_c_gu_all <- g_c_gu_all %>%
  mutate(component = paste("gu_comp", component, sep = "_"))

# Save to a file
write_tsv(g_c_gu_all, path = "~/PFAM_components/gu_cluster_components.tsv")
# Create the KWP SSN
g_kwp <- kwp_ssn %>%
  select(V1, V2, V4) %>%
  rename(node1 = V1, node2 = V2, weight = V4) %>%
  tbl_df() %>%
  igraph::graph_from_data_frame(directed = FALSE) %>%
  igraph::simplify() %>%
  as_tbl_graph() %>%
  activate(nodes) %>%
  left_join(kwp_ids %>% rename(name = V2, cl_name = V1) %>%
              mutate(name = name - 1, name = as.character(name)), by = "name")

# Filter edges
g_c_kwp <- g_kwp %>%
  activate(edges) %>%
  #filter(weight >= estimate_mode(f$m)) %>%
  filter(weight >= cut_t) %>%
  activate(nodes) %>%
  mutate(component = group_components())

# Count components
kwp_d_c <- g_c_kwp %>%
  as_tibble() %>%
  distinct(component) %>%
  .$component %>%
  length()

# We will decompose components with more than 10K nodes using Louvain
g_c_kwp_10k <- g_c_kwp %>%
  as_tibble() %>%
  group_by(component) %>%
  count() %>%
  arrange(desc(n)) %>%
  filter(n > 9500) %>%
  .$component

if (length(g_c_kwp_10k) > 0) {
  g_c_kwp_10k_l <- pbmcapply::pbmclapply(g_c_kwp_10k, use_louvain, G = g_c_kwp)
  if (length(g_c_kwp_10k) > 1) {
    g_c_kwp <- g_c_kwp %>%
      graph_join(do.call(bind_graphs, g_c_kwp_10k_l)) %>%
      activate(nodes) %>%
      mutate(component = ifelse(is.na(group), component, group)) %>%
      select(-group)
  } else {
    g_c_kwp <- g_c_kwp %>%
      graph_join(g_c_kwp_10k_l[[1]]) %>%
      activate(nodes) %>%
      mutate(component = ifelse(is.na(group), component, group)) %>%
      select(-group)
  }
}

# We plot component size
g_c_kwp %>%
  as_tibble() %>%
  group_by(component) %>%
  count() %>%
  ggpubr::gghistogram(x = "n",
                      add = "none", rug = FALSE,
                      bins = 15,
                      color = "#2A363B", fill = "#2A363B",
                      palette = c("#2A363B", "#2A363B"), alpha = 0.8) +
  theme_light() +
  scale_x_log10(labels = scales::comma, breaks = c(1, 10, 100, 1000, 10000, 50000)) +
  ylab("Counts") +
  xlab("Component size") +
  theme(legend.position = "none") +
  scale_y_sqrt(labels = scales::comma, breaks = scales::trans_breaks("sqrt", function(x) x^2))

# Select components with size > 1
g_c_kwp_gt1 <- g_c_kwp %>%
  as_tibble() %>%
  group_by(component) %>%
  count() %>%
  filter(n > 1) %>%
  .$component

# Calculate similarities in each component
g_c_kwp_sim <- bind_rows(pbmcapply::pbmclapply(g_c_kwp_gt1, get_comp_weights, G = g_c_kwp, mc.cores = 64))

# Plot distribution
gghistogram(g_c_kwp_sim, x = "median",
            add = "none", rug = FALSE,
            bins = nclass.Sturges(g_c_kwp_sim$median),
            color = "#2A363B", fill = "#2A363B",
            palette = c("#2A363B", "#2A363B"), alpha = 0.8) +
  #geom_vline(xintercept = summary(f$m1)[[2]], color = "red", linetype = "dashed") +
  #geom_vline(xintercept = estimate_mode(f$m1), color = "#D6B82C", linetype = "dashed", size = 1) +
  #geom_vline(xintercept = median(f$m1) - (2 * mad(f$m1)), color = "#D6B82C", linetype = "dashed", size = 1) +
  theme_light() +
  scale_x_continuous(labels = scales::percent, breaks = seq(0.3, 1, by = 0.1)) +
  scale_y_continuous(labels = scales::comma) +
  ylab("Counts") +
  xlab("Median similarity")

# Get real cluster names
g_c_kwp_all <- g_c_kwp %>%
  as_tibble() %>%
  right_join(kwp_ids %>% rename(name = V2, cl_name = V1) %>%
               mutate(name = name - 1, name = as.character(name))) %>%
  arrange(component)

# Add extra components for clusters absent from the filtered SSN
g_c_kwp_n <- g_c_kwp_all %>% filter(!is.na(component)) %>% .$component %>% length()
kwp_d_comp <- seq(1, dim(kwp_ids)[1] - g_c_kwp_n, by = 1) + kwp_d_c
g_c_kwp_all$component <- c(g_c_kwp_all %>% filter(!is.na(component)) %>% .$component, kwp_d_comp)
g_c_kwp_all <- g_c_kwp_all %>%
  mutate(component = paste("kwp_comp", component, sep = "_"))

# Save to a file
write_tsv(g_c_kwp_all, path = "~/PFAM_components/kwp_cluster_components.tsv")
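Once the three unknown-category tables are written, a quick consistency check before running the combining script shown earlier (a sketch; write_tsv adds a header line to each file):

wc -l ~/PFAM_components/eu_cluster_components.tsv \
      ~/PFAM_components/gu_cluster_components.tsv \
      ~/PFAM_components/kwp_cluster_components.tsv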