Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save padpadpadpad/d2fe494b8392d0c84b40fe158caca834 to your computer and use it in GitHub Desktop.
Save padpadpadpad/d2fe494b8392d0c84b40fe158caca834 to your computer and use it in GitHub Desktop.
## ---- setup, echo = FALSE------------------------------------------------
knitr::opts_knit$set(root.dir = normalizePath('../..'))
## ---- load_packages, message = FALSE, warning = FALSE--------------------
# load packages into R
library(rBLAST)
library(taxize)
library(dplyr)
library(tidyr)
library(purrr)
## ---- list_files, eval = FALSE-------------------------------------------
## # list files in the data folder
## seq_files <- list.files('raw_data', pattern = '.seq')
##
## # store file path
## seq_file_path <- 'where_your_files_are_saved'
##
## # check they been found/we are in the right place
## seq_files
##
## # open a single file
## read.table(file.path(seq_file_path,seq_file[1]), blank.lines.skip = TRUE, as.is = TRUE)
##
## ---- list_files_true, echo = FALSE--------------------------------------
# list files in the data folder
seq_files <- list.files('data/20171008_sanger/seq_files', pattern = '.seq', full.names = FALSE)
seq_file_path <- 'data/20171008_sanger/seq_files'
# check they been found/we are in the right place
seq_files
# open a single file
read.table(file.path(seq_file_path,seq_file[1]), blank.lines.skip = TRUE, as.is = TRUE)
## ---- download_database, eval = FALSE------------------------------------
## # download file
## download.file(url = "ftp://ftp.ncbi.nlm.nih.gov/blast/db/16SMicrobial.tar.gz", destfile = "16SMicrobial.tar.gz", mode = 'wb')
##
## # extract files from zipped file
## untar("16SMicrobial.tar.gz", exdir = "blast_16SMicrobial_db")
##
## # load database into R
## bl <- blast(db = "blast_16SMicrobial_db/16SMicrobial")
##
## ---- load_database, echo = FALSE----------------------------------------
# load database into R
bl <- blast(db = "data/20171008_sanger/Blast_16SMicrobial_db/16SMicrobial")
## ---- rBLAST_all---------------------------------------------------------
# function for a single local BLAST search for a single file
rBLAST_all <- function(seq_file, database, keep = 0.95){
# read in and create single sequence line
test <- read.table(seq_file, blank.lines.skip = TRUE, as.is = TRUE)
seq1 <- paste(test$V1, collapse = '')
# make a string set necessary for rBLAST and name file
seq2 <- Biostrings::BStringSet(seq1)
names(seq2) <- basename(seq_file)
# use rBLAST and add sequence to subsequent dataframe
seq_pred <- predict(database, seq2) %>%
dplyr::filter(., Perc.Ident > keep*100) %>%
dplyr::mutate_at(., c('QueryID', 'SubjectID'), as.character) %>%
dplyr::mutate(seq = seq1)
# return dataframe
return(seq_pred)
}
## ---- single_run---------------------------------------------------------
# run on the first file
test <- rBLAST_all(file.path(seq_file_path, seq_files[1]), database = bl)
# look at names of columnes
names(test)
# look at first 6 rows of data, seq column removed (LONG!)
head(select(test, -seq))
## ---- run_rBlast_all-----------------------------------------------------
# update our files to have the full names
seq_files = paste(seq_file_path, seq_files, sep = '/')
# run rBlast all
seq_IDs <- purrr::map_df(seq_files, rBLAST_all, database = bl, keep = 0.97) %>%
# create column for nrow
mutate(., num = 1:n())
# check number of rows
nrow(seq_IDs)
## ---- GenbankID_2_UID, warning = FALSE-----------------------------------
# reduce number of columns so we drop all of the gumph
seq_IDs <- select(seq_IDs, num, QueryID, SubjectID, Perc.Ident)
# work out the uid from the genbank id - given as SubjectID
# approach - nest each row in a list and call the API separately - return NA if it does not work
seq_IDs_nest <- seq_IDs %>%
nest(., SubjectID) %>%
mutate(., uid = map(data, possibly(genbank2uid, otherwise = NA, quiet = TRUE)))
# unnest these columns to get the dataframes out. There shall be warnings but they are ok as the correct uids are preserved
seq_IDs <- seq_IDs_nest %>%
unnest(data, uid)
head(seq_IDs)
## ---- UID_taxonomy, warning = FALSE--------------------------------------
# now nest uids of each row and run the get_taxon_summary API call on each list element
seq_IDs_nest <- seq_IDs %>%
nest(., uid) %>%
mutate(., tax_info = map(data, possibly(ncbi_get_taxon_summary, otherwise = NA, quiet = TRUE)))
# unnest this dataframe for final dataframe of what everything is
seq_IDs <- seq_IDs_nest %>%
unnest(tax_info) %>%
select(., -data)
head(seq_IDs)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment