Create needed folders
mkdir ft_files gb_files gb_files_filt gb_files_slice gb_files_ptt gb_files_geno
Get the assembly file from NCBI
We will screen the RefSeq genomes for genomic unknowns. We will use the non-redundant set of genomes identified by MASH+t-SNE+PAM clustering.
cd /bioinf/home/afernand/SANDBOX/unk_vs_refseq/
mkdir genomes
rsync -Pauvz /bioinf/projects/megx/UNKNOWNS/chiara/NETWORK/RefSeq82.ref.gen/proteomes/downloads/ genomes/
We want to know how the different clusters created by MMSEQS2 aggregate together at the domain architecture level.
We need the files:
| #include <igraph/igraph.h> | |
| #include <math.h> | |
| int compare (const void * a, const void * b) /* three-way comparator: reads *a and *b as double; the signature matches a qsort()/bsearch() callback */ | |
| { | |
| if (*(double*)a > *(double*)b) return 1; /* *a is greater -> positive result */ | |
| else if (*(double*)a < *(double*)b) return -1; /* *a is smaller -> negative result */ | |
| else return 0; /* neither greater nor smaller: equal values (also the outcome when either value is NaN) */ | |
| } |
| #!/bin/bash -l | |
| #$ -cwd | |
| #$ -j y | |
| #$ -t 1-10:1 | |
| #$ -tc 25 | |
| #$ -N hmm_unk | |
| #$ -pe threaded 2 | |
| #$ -V | |
| # Where the models are |
| mmseqs concatdbs /PROCESSING/OSD/UNKNOWNS/MMSEQ_MPI/unkdb_update_hmp/unkprot_db \ | |
| /PROCESSING/OSD/UNKNOWNS/MMSEQ_MPI/unkdb_update_hmp/HMPSOAP_db \ | |
| /PROCESSING/OSD/UNKNOWNS/MMSEQ_MPI/unkdb_update_hmp/unkprot_db.withNewSequences # concatenate unkprot_db and HMPSOAP_db into unkprot_db.withNewSequences (last argument is the output DB) | |
| mmseqs concatdbs /PROCESSING/OSD/UNKNOWNS/MMSEQ_MPI/unkdb_update_hmp/unkprot_db_h \ | |
| /PROCESSING/OSD/UNKNOWNS/MMSEQ_MPI/unkdb_update_hmp/HMPSOAP_db_h \ | |
| /PROCESSING/OSD/UNKNOWNS/MMSEQ_MPI/unkdb_update_hmp/unkprot_db.withNewSequences_h # same concatenation for the matching *_h databases (presumably the sequence headers - confirm) | |
| mawk '{print NR"\t"$1}' unkprot_db.withNewSequences_h | sed 's/\x0//g' > unkprot_db.withNewSequences.lookup # write "<line number><TAB><first field>" for every line of the _h file, strip NUL bytes, save as the .lookup file | |
| big_dada <- function(X){ | |
| cat("Processing:", X, "\n") | |
| cat("Derep F:", X, "\n") | |
| derepF <- derepFastq(filtpathF[[X]]) | |
| cat("Inferring F:", X, "\n") | |
| ddF <- dada(derepF, err=errF, multithread=TRUE) | |
| cat("Derep R:", X, "\n") | |
| derepR <- derepFastq(filtpathR[[X]]) | |
| cat("Inferring R:", X, "\n") | |
| ddR <- dada(derepR, err=errR, multithread=TRUE) |
| # UCLUST workflow --------------------------------------------------------- | |
| # Attach the packages used by this section, then read the tab-separated | |
| # input file into `uclust_df`. | |
| library(tidyverse) | |
| library(cowplot) | |
| library(ggpubr) | |
| # NOTE(review): setwd() ties the script to one machine's directory layout; | |
| # kept because the relative path below depends on it. | |
| setwd("~/Downloads/alma_original_test") | |
| # First row of the file supplies the column names. Spell out TRUE: `T` is an | |
| # ordinary variable that can be reassigned, TRUE is a reserved word. | |
| uclust_df <- read_tsv("16s.insilico_mock.tsv", col_names = TRUE) | |
| uclust_16S_summary <- uclust_df%>% dplyr::rename(otu_name = `#OTU ID`) %>% |
| #' ## Storing R Objects in a SQLite Database | |
| #' Two packages are used here. The first is ```RSQLite```, which will be used to create and manage an in-memory SQLite database. The second is ```igraph```, which I will use to create and visualize a random network. Some of the work I do is on network simulation. I often don't know the metrics I need from a simulated network when it's created, so I want to be able to store the networks that are created so that I can go back later and analyze them. | |
| library(RSQLite) | |
| library(igraph) | |
| #' Create a database in memory. | |
| con <- dbConnect(SQLite(), ":memory:") | |
| #' The table has two columns, an *id* column and a column called *graph* which is a **blob** type. This type just stores binary data. |