padpadpadpad · October 8, 2017 14:27
diff --git a/RStudio__2017-10-08-using-blast-on-sanger-sequences-in-r.R b/RStudio__2017-10-08-using-blast-on-sanger-sequences-in-r.R
 ## ---- setup, echo = FALSE------------------------------------------------
 knitr::opts_knit$set(root.dir = normalizePath('../..'))

 ## ---- load_packages, message = FALSE, warning = FALSE--------------------
 # load packages into R
 library(rBLAST)
 library(taxize)
 library(dplyr)
 library(tidyr)
 library(purrr)

 ## ---- list_files, eval = FALSE-------------------------------------------
 ## # list files in the data folder
 ## seq_files <- list.files('raw_data', pattern = '.seq')
 ##
 ## # store file path
 ## seq_file_path <- 'where_your_files_are_saved'
 ##
 ## # check they been found/we are in the right place
 ## seq_files
 ##
 ## # open a single file
 ## read.table(file.path(seq_file_path,seq_file[1]), blank.lines.skip = TRUE, as.is = TRUE)
 ##

 ## ---- list_files_true, echo = FALSE--------------------------------------
 # list files in the data folder
 seq_files <- list.files('data/20171008_sanger/seq_files', pattern = '.seq', full.names = FALSE)
 seq_file_path <- 'data/20171008_sanger/seq_files'

 # check they been found/we are in the right place
 seq_files

 # open a single file
 read.table(file.path(seq_file_path,seq_file[1]), blank.lines.skip = TRUE, as.is = TRUE)

 ## ---- download_database, eval = FALSE------------------------------------
 ## # download file
 ## download.file(url = "ftp://ftp.ncbi.nlm.nih.gov/blast/db/16SMicrobial.tar.gz", destfile = "16SMicrobial.tar.gz", mode = 'wb')
 ##
 ## # extract files from zipped file
 ## untar("16SMicrobial.tar.gz", exdir = "blast_16SMicrobial_db")
 ##
 ## # load database into R
 ## bl <- blast(db = "blast_16SMicrobial_db/16SMicrobial")
 ##

 ## ---- load_database, echo = FALSE----------------------------------------
 # load database into R
 bl <- blast(db = "data/20171008_sanger/Blast_16SMicrobial_db/16SMicrobial")

 ## ---- rBLAST_all---------------------------------------------------------
 # function for a single local BLAST search for a single file
 rBLAST_all <- function(seq_file, database, keep = 0.95){

 # read in and create single sequence line
 test <- read.table(seq_file, blank.lines.skip = TRUE, as.is = TRUE)
 seq1 <- paste(test$V1, collapse = '')

 # make a string set necessary for rBLAST and name file
 seq2 <- Biostrings::BStringSet(seq1)
 names(seq2) <- basename(seq_file)

 # use rBLAST and add sequence to subsequent dataframe
 seq_pred <- predict(database, seq2) %>%
 dplyr::filter(., Perc.Ident > keep*100) %>%
 dplyr::mutate_at(., c('QueryID', 'SubjectID'), as.character) %>%
 dplyr::mutate(seq = seq1)

 # return dataframe
 return(seq_pred)
 }

 ## ---- single_run---------------------------------------------------------
 # run on the first file
 test <- rBLAST_all(file.path(seq_file_path, seq_files[1]), database = bl)

 # look at names of columnes
 names(test)

 # look at first 6 rows of data, seq column removed (LONG!)
 head(select(test, -seq))

 ## ---- run_rBlast_all-----------------------------------------------------
 # update our files to have the full names
 seq_files = paste(seq_file_path, seq_files, sep = '/')

 # run rBlast all
 seq_IDs <- purrr::map_df(seq_files, rBLAST_all, database = bl, keep = 0.97) %>%
 # create column for nrow
 mutate(., num = 1:n())

 # check number of rows
 nrow(seq_IDs)

 ## ---- GenbankID_2_UID, warning = FALSE-----------------------------------
 # reduce number of columns so we drop all of the gumph
 seq_IDs <- select(seq_IDs, num, QueryID, SubjectID, Perc.Ident)

 # work out the uid from the genbank id - given as SubjectID
 # approach - nest each row in a list and call the API separately - return NA if it does not work
 seq_IDs_nest <- seq_IDs %>%
 nest(., SubjectID) %>%
 mutate(., uid = map(data, possibly(genbank2uid, otherwise = NA, quiet = TRUE)))

 # unnest these columns to get the dataframes out. There shall be warnings but they are ok as the correct uids are preserved
 seq_IDs <- seq_IDs_nest %>%
 unnest(data, uid)

 head(seq_IDs)


 ## ---- UID_taxonomy, warning = FALSE--------------------------------------
 # now nest uids of each row and run the get_taxon_summary API call on each list element
 seq_IDs_nest <- seq_IDs %>%
 nest(., uid) %>%
 mutate(., tax_info = map(data, possibly(ncbi_get_taxon_summary, otherwise = NA, quiet = TRUE)))

 # unnest this dataframe for final dataframe of what everything is
 seq_IDs <- seq_IDs_nest %>%
 unnest(tax_info) %>%
 select(., -data)

 head(seq_IDs)
	## ---- setup, echo = FALSE------------------------------------------------
	knitr::opts_knit$set(root.dir = normalizePath('../..'))

	## ---- load_packages, message = FALSE, warning = FALSE--------------------
	# load packages into R
	library(rBLAST)
	library(taxize)
	library(dplyr)
	library(tidyr)
	library(purrr)

	## ---- list_files, eval = FALSE-------------------------------------------
	## # list files in the data folder
	## seq_files <- list.files('raw_data', pattern = '.seq')
	##
	## # store file path
	## seq_file_path <- 'where_your_files_are_saved'
	##
	## # check they been found/we are in the right place
	## seq_files
	##
	## # open a single file
	## read.table(file.path(seq_file_path,seq_file[1]), blank.lines.skip = TRUE, as.is = TRUE)
	##

	## ---- list_files_true, echo = FALSE--------------------------------------
	# list files in the data folder
	seq_files <- list.files('data/20171008_sanger/seq_files', pattern = '.seq', full.names = FALSE)
	seq_file_path <- 'data/20171008_sanger/seq_files'

	# check they been found/we are in the right place
	seq_files

	# open a single file
	read.table(file.path(seq_file_path,seq_file[1]), blank.lines.skip = TRUE, as.is = TRUE)

	## ---- download_database, eval = FALSE------------------------------------
	## # download file
	## download.file(url = "ftp://ftp.ncbi.nlm.nih.gov/blast/db/16SMicrobial.tar.gz", destfile = "16SMicrobial.tar.gz", mode = 'wb')
	##
	## # extract files from zipped file
	## untar("16SMicrobial.tar.gz", exdir = "blast_16SMicrobial_db")
	##
	## # load database into R
	## bl <- blast(db = "blast_16SMicrobial_db/16SMicrobial")
	##

	## ---- load_database, echo = FALSE----------------------------------------
	# load database into R
	bl <- blast(db = "data/20171008_sanger/Blast_16SMicrobial_db/16SMicrobial")

	## ---- rBLAST_all---------------------------------------------------------
	# function for a single local BLAST search for a single file
	rBLAST_all <- function(seq_file, database, keep = 0.95){

	# read in and create single sequence line
	test <- read.table(seq_file, blank.lines.skip = TRUE, as.is = TRUE)
	seq1 <- paste(test$V1, collapse = '')

	# make a string set necessary for rBLAST and name file
	seq2 <- Biostrings::BStringSet(seq1)
	names(seq2) <- basename(seq_file)

	# use rBLAST and add sequence to subsequent dataframe
	seq_pred <- predict(database, seq2) %>%
	dplyr::filter(., Perc.Ident > keep*100) %>%
	dplyr::mutate_at(., c('QueryID', 'SubjectID'), as.character) %>%
	dplyr::mutate(seq = seq1)

	# return dataframe
	return(seq_pred)
	}

	## ---- single_run---------------------------------------------------------
	# run on the first file
	test <- rBLAST_all(file.path(seq_file_path, seq_files[1]), database = bl)

	# look at names of columnes
	names(test)

	# look at first 6 rows of data, seq column removed (LONG!)
	head(select(test, -seq))

	## ---- run_rBlast_all-----------------------------------------------------
	# update our files to have the full names
	seq_files = paste(seq_file_path, seq_files, sep = '/')

	# run rBlast all
	seq_IDs <- purrr::map_df(seq_files, rBLAST_all, database = bl, keep = 0.97) %>%
	# create column for nrow
	mutate(., num = 1:n())

	# check number of rows
	nrow(seq_IDs)

	## ---- GenbankID_2_UID, warning = FALSE-----------------------------------
	# reduce number of columns so we drop all of the gumph
	seq_IDs <- select(seq_IDs, num, QueryID, SubjectID, Perc.Ident)

	# work out the uid from the genbank id - given as SubjectID
	# approach - nest each row in a list and call the API separately - return NA if it does not work
	seq_IDs_nest <- seq_IDs %>%
	nest(., SubjectID) %>%
	mutate(., uid = map(data, possibly(genbank2uid, otherwise = NA, quiet = TRUE)))

	# unnest these columns to get the dataframes out. There shall be warnings but they are ok as the correct uids are preserved
	seq_IDs <- seq_IDs_nest %>%
	unnest(data, uid)

	head(seq_IDs)


	## ---- UID_taxonomy, warning = FALSE--------------------------------------
	# now nest uids of each row and run the get_taxon_summary API call on each list element
	seq_IDs_nest <- seq_IDs %>%
	nest(., uid) %>%
	mutate(., tax_info = map(data, possibly(ncbi_get_taxon_summary, otherwise = NA, quiet = TRUE)))

	# unnest this dataframe for final dataframe of what everything is
	seq_IDs <- seq_IDs_nest %>%
	unnest(tax_info) %>%
	select(., -data)

	head(seq_IDs)