Created
October 8, 2017 14:27
-
-
Save padpadpadpad/d2fe494b8392d0c84b40fe158caca834 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## ---- setup, echo = FALSE------------------------------------------------ | |
knitr::opts_knit$set(root.dir = normalizePath('../..')) | |
## ---- load_packages, message = FALSE, warning = FALSE-------------------- | |
# load packages into R | |
library(rBLAST) | |
library(taxize) | |
library(dplyr) | |
library(tidyr) | |
library(purrr) | |
## ---- list_files, eval = FALSE------------------------------------------- | |
## # list files in the data folder | |
## seq_files <- list.files('raw_data', pattern = '.seq') | |
## | |
## # store file path | |
## seq_file_path <- 'where_your_files_are_saved' | |
## | |
## # check they been found/we are in the right place | |
## seq_files | |
## | |
## # open a single file | |
## read.table(file.path(seq_file_path,seq_file[1]), blank.lines.skip = TRUE, as.is = TRUE) | |
## | |
## ---- list_files_true, echo = FALSE-------------------------------------- | |
# list files in the data folder | |
seq_files <- list.files('data/20171008_sanger/seq_files', pattern = '.seq', full.names = FALSE) | |
seq_file_path <- 'data/20171008_sanger/seq_files' | |
# check they been found/we are in the right place | |
seq_files | |
# open a single file | |
read.table(file.path(seq_file_path,seq_file[1]), blank.lines.skip = TRUE, as.is = TRUE) | |
## ---- download_database, eval = FALSE------------------------------------ | |
## # download file | |
## download.file(url = "ftp://ftp.ncbi.nlm.nih.gov/blast/db/16SMicrobial.tar.gz", destfile = "16SMicrobial.tar.gz", mode = 'wb') | |
## | |
## # extract files from zipped file | |
## untar("16SMicrobial.tar.gz", exdir = "blast_16SMicrobial_db") | |
## | |
## # load database into R | |
## bl <- blast(db = "blast_16SMicrobial_db/16SMicrobial") | |
## | |
## ---- load_database, echo = FALSE---------------------------------------- | |
# load database into R | |
bl <- blast(db = "data/20171008_sanger/Blast_16SMicrobial_db/16SMicrobial") | |
## ---- rBLAST_all--------------------------------------------------------- | |
# function for a single local BLAST search for a single file | |
rBLAST_all <- function(seq_file, database, keep = 0.95){ | |
# read in and create single sequence line | |
test <- read.table(seq_file, blank.lines.skip = TRUE, as.is = TRUE) | |
seq1 <- paste(test$V1, collapse = '') | |
# make a string set necessary for rBLAST and name file | |
seq2 <- Biostrings::BStringSet(seq1) | |
names(seq2) <- basename(seq_file) | |
# use rBLAST and add sequence to subsequent dataframe | |
seq_pred <- predict(database, seq2) %>% | |
dplyr::filter(., Perc.Ident > keep*100) %>% | |
dplyr::mutate_at(., c('QueryID', 'SubjectID'), as.character) %>% | |
dplyr::mutate(seq = seq1) | |
# return dataframe | |
return(seq_pred) | |
} | |
## ---- single_run--------------------------------------------------------- | |
# run on the first file | |
test <- rBLAST_all(file.path(seq_file_path, seq_files[1]), database = bl) | |
# look at names of columnes | |
names(test) | |
# look at first 6 rows of data, seq column removed (LONG!) | |
head(select(test, -seq)) | |
## ---- run_rBlast_all----------------------------------------------------- | |
# update our files to have the full names | |
seq_files = paste(seq_file_path, seq_files, sep = '/') | |
# run rBlast all | |
seq_IDs <- purrr::map_df(seq_files, rBLAST_all, database = bl, keep = 0.97) %>% | |
# create column for nrow | |
mutate(., num = 1:n()) | |
# check number of rows | |
nrow(seq_IDs) | |
## ---- GenbankID_2_UID, warning = FALSE----------------------------------- | |
# reduce number of columns so we drop all of the gumph | |
seq_IDs <- select(seq_IDs, num, QueryID, SubjectID, Perc.Ident) | |
# work out the uid from the genbank id - given as SubjectID | |
# approach - nest each row in a list and call the API separately - return NA if it does not work | |
seq_IDs_nest <- seq_IDs %>% | |
nest(., SubjectID) %>% | |
mutate(., uid = map(data, possibly(genbank2uid, otherwise = NA, quiet = TRUE))) | |
# unnest these columns to get the dataframes out. There shall be warnings but they are ok as the correct uids are preserved | |
seq_IDs <- seq_IDs_nest %>% | |
unnest(data, uid) | |
head(seq_IDs) | |
## ---- UID_taxonomy, warning = FALSE-------------------------------------- | |
# now nest uids of each row and run the get_taxon_summary API call on each list element | |
seq_IDs_nest <- seq_IDs %>% | |
nest(., uid) %>% | |
mutate(., tax_info = map(data, possibly(ncbi_get_taxon_summary, otherwise = NA, quiet = TRUE))) | |
# unnest this dataframe for final dataframe of what everything is | |
seq_IDs <- seq_IDs_nest %>% | |
unnest(tax_info) %>% | |
select(., -data) | |
head(seq_IDs) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment