Create needed folders
mkdir ft_files gb_files gb_files_filt gb_files_slice gb_files_ptt gb_files_geno
Get the assembly file from NCBI
We will screen the RefSeq genomes for genomic unknowns, using the non-redundant set of genomes identified by MASH + t-SNE + PAM clustering.
cd /bioinf/home/afernand/SANDBOX/unk_vs_refseq/
mkdir genomes
rsync -Pauvz /bioinf/projects/megx/UNKNOWNS/chiara/NETWORK/RefSeq82.ref.gen/proteomes/downloads/ genomes/
We want to know how the different clusters created by MMseqs2 aggregate at the domain architecture level.
We need the files:
#include <igraph/igraph.h>
#include <math.h>

/* qsort()-style comparator for doubles: 1 if a > b, -1 if a < b, 0 otherwise */
int compare (const void * a, const void * b)
{
  if (*(double*)a > *(double*)b) return 1;
  else if (*(double*)a < *(double*)b) return -1;
  else return 0;
}
#!/bin/bash -l
# SGE array-job header: run from the submission directory and merge stderr into stdout
#$ -cwd
#$ -j y
# tasks 1-10 (step 1), with at most 25 tasks running concurrently
#$ -t 1-10:1
#$ -tc 25
#$ -N hmm_unk
# request 2 slots in the threaded parallel environment and export the current environment
#$ -pe threaded 2
#$ -V
# Where the models are
# Concatenate the existing unknowns DB with the new HMP SOAP DB (sequence DBs first, then the header DBs)
mmseqs concatdbs /PROCESSING/OSD/UNKNOWNS/MMSEQ_MPI/unkdb_update_hmp/unkprot_db \
/PROCESSING/OSD/UNKNOWNS/MMSEQ_MPI/unkdb_update_hmp/HMPSOAP_db \
/PROCESSING/OSD/UNKNOWNS/MMSEQ_MPI/unkdb_update_hmp/unkprot_db.withNewSequences

mmseqs concatdbs /PROCESSING/OSD/UNKNOWNS/MMSEQ_MPI/unkdb_update_hmp/unkprot_db_h \
/PROCESSING/OSD/UNKNOWNS/MMSEQ_MPI/unkdb_update_hmp/HMPSOAP_db_h \
/PROCESSING/OSD/UNKNOWNS/MMSEQ_MPI/unkdb_update_hmp/unkprot_db.withNewSequences_h

# Rebuild the .lookup file (numeric index -> sequence identifier) for the concatenated database
mawk '{print NR"\t"$1}' unkprot_db.withNewSequences_h | sed 's/\x0//g' > unkprot_db.withNewSequences.lookup
# per-sample DADA2 worker: dereplicate and denoise the forward and reverse reads
big_dada <- function(X){
  cat("Processing:", X, "\n")
  cat("Derep F:", X, "\n")
  derepF <- derepFastq(filtpathF[[X]])
  cat("Inferring F:", X, "\n")
  ddF <- dada(derepF, err=errF, multithread=TRUE)
  cat("Derep R:", X, "\n")
  derepR <- derepFastq(filtpathR[[X]])
  cat("Inferring R:", X, "\n")
  ddR <- dada(derepR, err=errR, multithread=TRUE)
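  # NOTE: the body of big_dada() is truncated at this point in these notes. The lines
  # below are an assumed completion following the standard DADA2 per-sample pattern
  # (mergePairs() and the sample.names vector are assumptions, not the original code).
  merger <- mergePairs(ddF, derepF, ddR, derepR)
  merger
}

# assumed usage: run the worker once per sample and keep one merger object per sample
mergers <- sapply(sample.names, big_dada, simplify = FALSE)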
# UCLUST workflow ---------------------------------------------------------
library(tidyverse)
library(cowplot)
library(ggpubr)
setwd("~/Downloads/alma_original_test")
uclust_df <- read_tsv("16s.insilico_mock.tsv", col_names = TRUE)
uclust_16S_summary <- uclust_df %>% dplyr::rename(otu_name = `#OTU ID`) %>%
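# The pipeline above is truncated at the end of this excerpt. Purely as an illustration
# (not the original code; the layout of 16s.insilico_mock.tsv is an assumption, i.e. one
# row per OTU and one count column per sample), a typical continuation reshapes the table
# to long format and tallies reads and OTUs per sample:
uclust_16S_long <- uclust_df %>%
  dplyr::rename(otu_name = `#OTU ID`) %>%
  tidyr::pivot_longer(-otu_name, names_to = "sample", values_to = "count")

uclust_16S_per_sample <- uclust_16S_long %>%
  dplyr::group_by(sample) %>%
  dplyr::summarise(n_otus = sum(count > 0), n_reads = sum(count))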
#' ## Storing R Objects in a SQLite Database
#' We are using two packages. The first is ```RSQLite```, which will be used to create and manage an in-memory SQLite database. The second is ```igraph```, which I will use to create and visualize a random network. Some of the work I do is on network simulation, and I often don't know which metrics I will need from a simulated network when it is created, so I want to be able to store the networks as they are created and go back later to analyze them.
library(RSQLite)
library(igraph)
#' Create a database in memory.
con <- dbConnect(SQLite(), ":memory:")
#' The table has two columns: an *id* column and a column called *graph*, which is of **blob** type. A blob simply stores binary data.
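#' The statement that creates this table is not included in this excerpt; the following is a minimal sketch of what it could look like, together with one way to store a network as a blob. The table name ```graphs```, the random graph, and the serialization step are assumptions made for illustration, not code from the original.
# assumed table definition matching the description above: an integer id plus a blob column
dbExecute(con, "CREATE TABLE graphs (id INTEGER PRIMARY KEY, graph BLOB)")

# assumed usage: serialize an igraph object into a raw vector and bind it as a blob
g <- sample_gnp(25, 0.2)                   # a small random network
g_raw <- serialize(g, connection = NULL)   # raw vector suitable for a BLOB column
dbExecute(con, "INSERT INTO graphs (id, graph) VALUES (:id, :graph)",
          params = list(id = 1L, graph = list(g_raw)))
# the stored object can later be restored with unserialize() on the retrieved raw vector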