Skip to content

Instantly share code, notes, and snippets.

@genomewalker
Last active June 15, 2023 21:49
Show Gist options
  • Save genomewalker/c671b8864c33fc0b9c5ccce68f40dc9d to your computer and use it in GitHub Desktop.
Save genomewalker/c671b8864c33fc0b9c5ccce68f40dc9d to your computer and use it in GitHub Desktop.
Combine taxdb data
library(tidyverse)
# Read in the data
setwd("/maps/projects/lundbeck/scratch/taxDB/v6/metadata/src_files")
ncbi_assm_stats <- list.files(".", pattern = "genome_metadata.txt", full.names = TRUE)
ncbi_assm_stats <- map_dfr(ncbi_assm_stats, function(X) {
read_tsv(X, col_names = TRUE)
}) %>%
select(-filename) %>%
rename(
L50 = ctg_N50,
N50 = ctg_L50,
N90 = ctg_L90,
L90 = ctg_N90
)
kraken2_select_files <- list.files(".", pattern = "kraken2_select_accessions.txt", full.names = TRUE)
kraken2_select_acc <- map_dfr(kraken2_select_files, function(X) {
read_tsv(X, col_names = FALSE) %>%
select(strain = X1) %>%
mutate(accession = gsub("S__", "", strain))
})
fungi <- read_tsv("fungi-assembly-derep-genomes_results.tsv") %>%
select(taxonomy, accession, representative, file = src) %>%
mutate(
library = "fungi",
source = "NCBI_nuclear",
file = basename(file)
)
invertebrate <- read_tsv("invertebrate-assembly-derep-genomes_results.tsv") %>%
select(taxonomy, accession, representative, file = src) %>%
mutate(
library = "invertebrate",
source = "NCBI_nuclear",
file = basename(file)
)
organelle <- read_tsv("organelle-derep-genomes_results.tsv") %>%
select(taxonomy, accession, representative, file = src) %>%
mutate(
library = "organelle",
source = "NCBI_organelle",
file = basename(file)
)
organelle_post <- read_tsv("norway_taxid_accession-refined.tsv") %>%
select(taxonomy = taxon, accession, representative, file = src) %>%
mutate(
library = "organelle",
source = "PhyloNorway_organelle",
file = basename(file)
)
plant <- read_tsv("plant-assembly-derep-genomes_results.tsv") %>%
select(taxonomy, accession, representative, file = src) %>%
mutate(
library = "plant",
source = "NCBI_nuclear",
file = basename(file)
)
protozoa <- read_tsv("protozoa-assembly-derep-genomes_results.tsv") %>%
select(taxonomy, accession, representative, file = src) %>%
mutate(
library = "protozoa",
source = "NCBI_nuclear",
file = basename(file)
)
vertebrate_mammalian <- read_tsv("vertebrate_mammalian-assembly-derep-genomes_results.tsv") %>%
select(taxonomy, accession, representative, file = src) %>%
mutate(
library = "vertebrate_mammalian",
source = "NCBI_nuclear",
file = basename(file)
)
vertebrate_other <- read_tsv("vertebrate_other-assembly-derep-genomes_results.tsv") %>%
select(taxonomy, accession, representative, file = src) %>%
mutate(
library = "vertebrate_other",
source = "NCBI_nuclear",
file = basename(file)
)
gtdb <- read_tsv("gtdb_reps_tax.txt", col_names = c("accession", "taxonomy")) %>%
mutate(
library = NA,
source = "GTDB",
file = paste0(accession, "_genomic.fna.gz")
)
imgvr <- read_tsv("imgvr-derep-genomes_results.tsv") %>%
select(taxonomy, accession, representative, file = src) %>%
mutate(
library = NA,
source = "IMGVR",
file = basename(file)
)
euk_post <- read_tsv("euk_custom_post_derep_taxonomy.txt", col_names = c("accession", "taxonomy")) %>%
mutate(
library = NA,
file = paste0(accession, ".fa.gz")
) %>%
mutate(source = case_when(
(grepl("TARA", accession) | grepl("TOSAG", accession)) ~ "TARA_euk",
grepl("MMETSP", accession) ~ "MMETSP",
TRUE ~ "Phylonorway_nuclear"
))
acc2taxid <- read_tsv("/maps/projects/lundbeck/scratch/taxDB/v6/taxonomy/acc2taxid.map.gz") %>%
select(accession, taxid)
q %>%
write_tsv("taxdb-genome_stats-broad-v6.tsv.gz")
# drep_db <- "/maps/projects/lundbeck/scratch/antonio/projects/taxDB/20220926/TaxDB_v6/results/invertebrate/derep_genomes/derep.db"
# library(DBI)
# library(RSQLite)
# library(dbplyr)
# con <- dbConnect(SQLite(), drep_db)
# tbl(con, "stats") %>%
# collect() %>%
# separate(
# col = taxonomy,
# sep = ";",
# into = c(
# "domain",
# "lineage",
# "kingdom",
# "phylum",
# "class",
# "order",
# "family",
# "genus",
# "species"
# )
# ) %>%
# filter(species == "s__Heliconius melpomene") %>% View()
# dbListTables(con)
tax_d <- fungi %>%
bind_rows(invertebrate) %>%
bind_rows(organelle) %>%
bind_rows(organelle_post) %>%
bind_rows(plant) %>%
bind_rows(protozoa) %>%
bind_rows(vertebrate_mammalian) %>%
bind_rows(vertebrate_other) %>%
bind_rows(gtdb) %>%
bind_rows(imgvr) %>%
bind_rows(euk_post) %>%
filter(accession %in% kraken2_select_acc$accession)
tax_d %>%
separate(
col = taxonomy,
sep = ";",
into = c(
"domain",
"lineage",
"kingdom",
"phylum",
"class",
"order",
"family",
"genus",
"species"
)
) %>%
group_by(domain, species) %>%
distinct() %>%
group_by(domain) %>%
count(sort = T)
tax_d %>%
group_by(source) %>%
count(sort = T)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment