Last active
January 17, 2020 21:14
-
-
Save ramongallego/1bb1301081d79a9519b9a06f55b3c758 to your computer and use it in GitHub Desktop.
An R script to query genbank using org name and locus.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(tidyverse) | |
library(insect) | |
library(rentrez) | |
set_entrez_key (YOUR_KEY_HERE) | |
library(here) | |
worlds.taxonomy <- read_rds(here("data", "all.taxonomy.rds")) | |
# make your own taxonomy db using the command | |
# worlds.taxonomy <- taxonomy(), then save it for later use | |
############# | |
############# | |
# The easiest way is to specify everything in the query, so you don't have to wrangle with the results too much | |
## Example 1: 16S from Chordata mitochondrial genomes | |
query <- "Chordata[ORGN]+AND+16S[TITL]" | |
Full.search <- searchGB(query,taxIDs = T, sequences = T, prompt=F ) | |
Fwd_primer <- "GYAATCACTTGTCTTTTAAATGAAGACC" | |
Rev_primer <- "GGATTGCGCTGTTATCCCTA" | |
Trimmed_fragments <- virtualPCR(Full.search, up = Fwd_primer, down = Rev_primer, | |
trimprimers = TRUE,minamplen = 250, maxamplen = 400, cores = "autodetect") | |
## Now let's assume that you found new sequences that you want to add - maybe full mt genomes that | |
## might have been missed in the first query | |
new.query <- "Chordata[ORGN]+AND+13000:18000[SLEN]+mitochondrion[TITL]" | |
search_mits <- searchGB(new.query,taxIDs = T, sequences = T, prompt=F) | |
Trimmed_mtos <- virtualPCR(search_mits, up = Fwd_primer, down = Rev_primer, | |
trimprimers = TRUE,minamplen = 250, maxamplen = 400, cores = "autodetect") | |
## How to add both lists? | |
#### Case 1: Everything went fine (Nothing is repeated in both lists?) | |
All.seqs <- append(Trimmed_fragments,Trimmed_mtos) | |
### Case 2: we have to subset, pluck and so on. | |
### Remove sequences without taxID | |
Seq.names.16S <- tibble(seq.name = names(Trimmed_fragments), | |
Search = "16S" ) | |
Seq.names.16S %>% | |
separate(seq.name, into = c("Accession", "taxID"), sep = "\\|", remove = F) %>% | |
left_join(worlds.taxonomy %>% | |
mutate(taxID = as.character(taxID))) %>% | |
filter(is.na(name)) %>% | |
pull (seq.name) -> to.throw # Keep the names of the sequences to remove | |
Seq.names.16S %>% | |
filter (! seq.name %in% to.throw ) %>% | |
pull (seq.name) -> to.keep | |
## Now select only the seqs we want | |
Trimmed_fragments [to.keep] -> seqs.to.keep | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment