Last active
June 15, 2023 21:49
-
-
Save genomewalker/c671b8864c33fc0b9c5ccce68f40dc9d to your computer and use it in GitHub Desktop.
Combine taxdb data
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(tidyverse) | |
# Read in the data | |
setwd("/maps/projects/lundbeck/scratch/taxDB/v6/metadata/src_files") | |
ncbi_assm_stats <- list.files(".", pattern = "genome_metadata.txt", full.names = TRUE) | |
ncbi_assm_stats <- map_dfr(ncbi_assm_stats, function(X) { | |
read_tsv(X, col_names = TRUE) | |
}) %>% | |
select(-filename) %>% | |
rename( | |
L50 = ctg_N50, | |
N50 = ctg_L50, | |
N90 = ctg_L90, | |
L90 = ctg_N90 | |
) | |
kraken2_select_files <- list.files(".", pattern = "kraken2_select_accessions.txt", full.names = TRUE) | |
kraken2_select_acc <- map_dfr(kraken2_select_files, function(X) { | |
read_tsv(X, col_names = FALSE) %>% | |
select(strain = X1) %>% | |
mutate(accession = gsub("S__", "", strain)) | |
}) | |
fungi <- read_tsv("fungi-assembly-derep-genomes_results.tsv") %>% | |
select(taxonomy, accession, representative, file = src) %>% | |
mutate( | |
library = "fungi", | |
source = "NCBI_nuclear", | |
file = basename(file) | |
) | |
invertebrate <- read_tsv("invertebrate-assembly-derep-genomes_results.tsv") %>% | |
select(taxonomy, accession, representative, file = src) %>% | |
mutate( | |
library = "invertebrate", | |
source = "NCBI_nuclear", | |
file = basename(file) | |
) | |
organelle <- read_tsv("organelle-derep-genomes_results.tsv") %>% | |
select(taxonomy, accession, representative, file = src) %>% | |
mutate( | |
library = "organelle", | |
source = "NCBI_organelle", | |
file = basename(file) | |
) | |
organelle_post <- read_tsv("norway_taxid_accession-refined.tsv") %>% | |
select(taxonomy = taxon, accession, representative, file = src) %>% | |
mutate( | |
library = "organelle", | |
source = "PhyloNorway_organelle", | |
file = basename(file) | |
) | |
plant <- read_tsv("plant-assembly-derep-genomes_results.tsv") %>% | |
select(taxonomy, accession, representative, file = src) %>% | |
mutate( | |
library = "plant", | |
source = "NCBI_nuclear", | |
file = basename(file) | |
) | |
protozoa <- read_tsv("protozoa-assembly-derep-genomes_results.tsv") %>% | |
select(taxonomy, accession, representative, file = src) %>% | |
mutate( | |
library = "protozoa", | |
source = "NCBI_nuclear", | |
file = basename(file) | |
) | |
vertebrate_mammalian <- read_tsv("vertebrate_mammalian-assembly-derep-genomes_results.tsv") %>% | |
select(taxonomy, accession, representative, file = src) %>% | |
mutate( | |
library = "vertebrate_mammalian", | |
source = "NCBI_nuclear", | |
file = basename(file) | |
) | |
vertebrate_other <- read_tsv("vertebrate_other-assembly-derep-genomes_results.tsv") %>% | |
select(taxonomy, accession, representative, file = src) %>% | |
mutate( | |
library = "vertebrate_other", | |
source = "NCBI_nuclear", | |
file = basename(file) | |
) | |
gtdb <- read_tsv("gtdb_reps_tax.txt", col_names = c("accession", "taxonomy")) %>% | |
mutate( | |
library = NA, | |
source = "GTDB", | |
file = paste0(accession, "_genomic.fna.gz") | |
) | |
imgvr <- read_tsv("imgvr-derep-genomes_results.tsv") %>% | |
select(taxonomy, accession, representative, file = src) %>% | |
mutate( | |
library = NA, | |
source = "IMGVR", | |
file = basename(file) | |
) | |
euk_post <- read_tsv("euk_custom_post_derep_taxonomy.txt", col_names = c("accession", "taxonomy")) %>% | |
mutate( | |
library = NA, | |
file = paste0(accession, ".fa.gz") | |
) %>% | |
mutate(source = case_when( | |
(grepl("TARA", accession) | grepl("TOSAG", accession)) ~ "TARA_euk", | |
grepl("MMETSP", accession) ~ "MMETSP", | |
TRUE ~ "Phylonorway_nuclear" | |
)) | |
acc2taxid <- read_tsv("/maps/projects/lundbeck/scratch/taxDB/v6/taxonomy/acc2taxid.map.gz") %>% | |
select(accession, taxid) | |
q %>% | |
write_tsv("taxdb-genome_stats-broad-v6.tsv.gz") | |
# drep_db <- "/maps/projects/lundbeck/scratch/antonio/projects/taxDB/20220926/TaxDB_v6/results/invertebrate/derep_genomes/derep.db" | |
# library(DBI) | |
# library(RSQLite) | |
# library(dbplyr) | |
# con <- dbConnect(SQLite(), drep_db) | |
# tbl(con, "stats") %>% | |
# collect() %>% | |
# separate( | |
# col = taxonomy, | |
# sep = ";", | |
# into = c( | |
# "domain", | |
# "lineage", | |
# "kingdom", | |
# "phylum", | |
# "class", | |
# "order", | |
# "family", | |
# "genus", | |
# "species" | |
# ) | |
# ) %>% | |
# filter(species == "s__Heliconius melpomene") %>% View() | |
# dbListTables(con) | |
tax_d <- fungi %>% | |
bind_rows(invertebrate) %>% | |
bind_rows(organelle) %>% | |
bind_rows(organelle_post) %>% | |
bind_rows(plant) %>% | |
bind_rows(protozoa) %>% | |
bind_rows(vertebrate_mammalian) %>% | |
bind_rows(vertebrate_other) %>% | |
bind_rows(gtdb) %>% | |
bind_rows(imgvr) %>% | |
bind_rows(euk_post) %>% | |
filter(accession %in% kraken2_select_acc$accession) | |
tax_d %>% | |
separate( | |
col = taxonomy, | |
sep = ";", | |
into = c( | |
"domain", | |
"lineage", | |
"kingdom", | |
"phylum", | |
"class", | |
"order", | |
"family", | |
"genus", | |
"species" | |
) | |
) %>% | |
group_by(domain, species) %>% | |
distinct() %>% | |
group_by(domain) %>% | |
count(sort = T) | |
tax_d %>% | |
group_by(source) %>% | |
count(sort = T) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment