telatin · January 22, 2021 18:04
diff --git a/dada-its.R b/dada-its.R
 # Load the packages used by this pipeline and print the version of each
 # Bioconductor package so the run is reproducible.
 library(dada2)
 # The plotting package is called "ggplot2"; library(ggplot) stops the script
 # with "there is no package called 'ggplot'" unless such a package exists.
 library(ggplot2)
 packageVersion("dada2")
 library(ShortRead)
 packageVersion("ShortRead")
 library(Biostrings)
 packageVersion("Biostrings")
 # Directory that is searched for the raw R1/R2 FASTQ files.
 path <- "/Volumes/Informatics/telatin/qibx/ITS/"
 # Build all orientations of a primer so each one can be searched for in reads.
 #
 # prmr: primer sequence given as a single character string.
 # Returns an unnamed character vector of length four, in this order:
 # the primer as given, its complement, its reverse, its reverse complement.
 prmr_funcn <- function(prmr) {
   dna <- DNAString(prmr)
   c(
     prmr,
     toString(complement(dna)),
     toString(reverse(dna)),
     toString(reverseComplement(dna))
   )
 }


 # Count how often a primer occurs across a set of reads.
 #
 # prmr:     primer sequence (character string) used as the search pattern.
 # filt_seq: read sequences, converted to a DNAStringSet before matching.
 # Returns the total number of matches summed over all reads. Matching uses
 # fixed = FALSE, so IUPAC ambiguity codes are treated as wildcards.
 vcnt_funcn <- function(prmr, filt_seq) {
   reads <- DNAStringSet(filt_seq)
   sum(vcountPattern(prmr, reads, fixed = FALSE))
 }

 # Primer pair used for the amplicon; define your primer sequences here.
 ITS1f <- "CTTGGTCATTTAGAGGAAGTAA"
 # NOTE(review): this sequence looks like the published ITS2 primer rather
 # than ITS4 -- confirm that the variable name matches the primer used.
 ITS4 <- "GCTGCGTTCTTCATCGATGC"

 # Expand each primer into its four orientations (as given, complement,
 # reverse, reverse complement) and pool them into one search set.
 ITS1f.form <- prmr_funcn(ITS1f)
 ITS4.form <- prmr_funcn(ITS4)
 primer.form <- c(ITS1f.form, ITS4.form)

 # Locate the raw paired-end reads. Forward/reverse pairing relies on both
 # listings sorting into the same sample order.
 Mock.fnFs <- sort(list.files(path, pattern = "R1.fastq.gz", full.names = TRUE))
 Mock.fnRs <- sort(list.files(path, pattern = "R2.fastq.gz", full.names = TRUE))
 # Fail early with a clear message instead of an obscure downstream error.
 if (length(Mock.fnFs) == 0L) {
   stop("No R1.fastq.gz files found in ", path, call. = FALSE)
 }
 if (length(Mock.fnFs) != length(Mock.fnRs)) {
   stop("Different numbers of R1 and R2 files found in ", path, call. = FALSE)
 }
 # Sample name = file name up to the first underscore. vapply() always yields
 # a character vector, whereas sapply() can return a list.
 mock.names <- vapply(strsplit(basename(Mock.fnFs), "_"), `[`, character(1), 1)
 mock.names  ## Lists the names of all the sequence files

 # Remove Ns: maxN = 0 drops every read pair that contains an ambiguous base.
 # The filtered copies are written to the "filtN" sub-directory of `path`.
 Mock.fnFs.filtN <- file.path(path, "filtN", paste0(mock.names, "_R1_filtN.fastq.gz"))
 Mock.fnRs.filtN <- file.path(path, "filtN", paste0(mock.names, "_R2_filtN.fastq.gz"))
 filterAndTrim(Mock.fnFs, Mock.fnFs.filtN, Mock.fnRs, Mock.fnRs.filtN, maxN = 0,
               multithread = TRUE)

 # Read the N-filtered FASTQ files of the first sample with readFastq() from
 # the ShortRead package and keep the read sequences as character vectors.
 # The N-filtered files (not the raw inputs) must be used here: with
 # fixed = FALSE an N in a read matches any primer base and inflates the counts.

 Mock.R1.filtN <- as.character(sread(readFastq(Mock.fnFs.filtN[1])))
 Mock.R2.filtN <- as.character(sread(readFastq(Mock.fnRs.filtN[1])))

 # The output of vcnt_funcn shows how often each primer orientation occurs in
 # the reads, i.e. whether primers are still present in the FASTQ files.

 # Checking and counting the presence of the primer set in the forward read
 sapply(primer.form, vcnt_funcn, filt_seq = Mock.R1.filtN)
 # Checking and counting the presence of the primer set in the reverse read
 sapply(primer.form, vcnt_funcn, filt_seq = Mock.R2.filtN)

 # Directory that receives the primer-trimmed reads. Create the directory that
 # is actually written to below; creating "cutadapt_trimmed" instead would
 # leave the real output directory missing.
 path.cutadapt <- "/Volumes/Informatics/telatin/qibx/ITS/cutadapt"
 if (!dir.exists(path.cutadapt)) {
   dir.create(path.cutadapt, recursive = TRUE)
 }
 Mock.fnFs.cutadapt <- file.path(path.cutadapt, paste0(mock.names, "_R1_trimmed.fastq.gz"))
 Mock.fnRs.cutadapt <- file.path(path.cutadapt, paste0(mock.names, "_R2_trimmed.fastq.gz"))
 # -n 2: remove up to two adapters per read; -m 50: drop reads shorter than 50.
 trim.cmd <- "-n 2 -m 50"
 # -g/-a: 5' and 3' adapters of read 1; -G/-A: the same for read 2. The 3'
 # adapters are the reverse complements of the opposite primer.
 primers <- c("-g CTTGGTCATTTAGAGGAAGTAA", "-a GCATCGATGAAGAACGCAGC",
              "-G GCTGCGTTCTTCATCGATGC", "-A TTACTTCCTCTAAATGACCAAG")
 cutadapt.path <- "/Users/telatina/miniconda3/envs/cutadapt/bin/cutadapt"
 # seq_along() is safe when there are no samples (1:length(x) would yield 1, 0).
 for (i in seq_along(mock.names)) {
   # shQuote() keeps paths containing spaces intact on the command line.
   cutadapt.status <- system2(
     cutadapt.path,
     args = c(primers, trim.cmd,
              "-o", shQuote(Mock.fnFs.cutadapt[i]),
              "-p", shQuote(Mock.fnRs.cutadapt[i]),
              shQuote(Mock.fnFs.filtN[i]), shQuote(Mock.fnRs.filtN[i]))
   )
   # Report a failed run instead of silently continuing with missing output.
   if (!identical(as.integer(cutadapt.status), 0L)) {
     warning("cutadapt exited with status ", cutadapt.status,
             " for sample ", mock.names[i], call. = FALSE)
   }
 }

 # Re-read the first sample after primer trimming. Forward and reverse reads
 # must come from the same sample (index 1) so the two counts are comparable;
 # index 2 would inspect another sample and fail when only one sample exists.
 Mock.R1.trimmed <- as.character(sread(readFastq(Mock.fnFs.cutadapt[1])))
 Mock.R2.trimmed <- as.character(sread(readFastq(Mock.fnRs.cutadapt[1])))

 # Checking and counting the presence of the primer set in the forward read
 sapply(primer.form, vcnt_funcn, filt_seq = Mock.R1.trimmed)
 # Checking and counting the presence of the primer set in the reverse read
 sapply(primer.form, vcnt_funcn, filt_seq = Mock.R2.trimmed)

 # Forward and reverse fastq filenames have the format:
 # <sample>_R1_trimmed.fastq.gz and <sample>_R2_trimmed.fastq.gz
 fnFs <- sort(list.files(path.cutadapt, pattern = "R1_trimmed.fastq", full.names = TRUE))
 fnRs <- sort(list.files(path.cutadapt, pattern = "R2_trimmed.fastq", full.names = TRUE))
 # Sample name = file name up to the first underscore; vapply() guarantees a
 # character vector even when no files are found.
 sample.names <- vapply(strsplit(basename(fnFs), "_"), `[`, character(1), 1)
 sample.names
 # head() shows at most two samples; fnFs[1:2] would contain NA (and fail)
 # when only a single sample is present.
 plotQualityProfile(head(fnFs, 2))
 # Quality-filtered reads go to the "filtered" sub-directory.
 filtFs <- file.path(path.cutadapt, "filtered", paste0(sample.names, "_F_filt.fastq.gz"))
 filtRs <- file.path(path.cutadapt, "filtered", paste0(sample.names, "_R_filt.fastq.gz"))
 # No fixed truncation length is set; reads are filtered by expected errors
 # (maxEE), ambiguous bases (maxN) and PhiX matches (rm.phix).
 out <- filterAndTrim(fnFs, filtFs, fnRs, filtRs, maxN = 0, maxEE = c(2, 2),
                      truncQ = 2, rm.phix = TRUE, compress = TRUE, multithread = TRUE)
 # Learn the error model separately for forward and reverse reads.
 errF <- learnErrors(filtFs, multithread = TRUE)
 errR <- learnErrors(filtRs, multithread = TRUE)

 # Plot the learned forward error model for a visual sanity check.
 plotErrors(errF, nominalQ = TRUE)

 # Dereplicate the filtered reads and label each element with its sample name.
 derepFs <- setNames(derepFastq(filtFs, verbose = TRUE), sample.names)
 derepRs <- setNames(derepFastq(filtRs, verbose = TRUE), sample.names)

 # Denoise forward and reverse reads with their respective error models.
 dadaFs <- dada(derep = derepFs, err = errF, multithread = TRUE)
 dadaRs <- dada(derep = derepRs, err = errR, multithread = TRUE)

 # Show the denoising summary of the first sample.
 dadaFs[1]

 # Merge the denoised read pairs and preview the first sample's result.
 mergers <- mergePairs(
   dadaF = dadaFs, derepF = derepFs,
   dadaR = dadaRs, derepR = derepRs,
   verbose = TRUE
 )
 head(mergers[[1]])

 # Build the sample-by-sequence table, then remove chimeric sequences.
 seqtab <- makeSequenceTable(mergers)
 seqtab.nochim <- removeBimeraDenovo(
   seqtab,
   method = "consensus",
   multithread = TRUE,
   verbose = TRUE
 )

 # Number of reads represented by a dada2 object (sum of unique abundances).
 getN <- function(x) sum(getUniques(x))
 # Read counts per sample after each pipeline step. vapply() enforces exactly
 # one number per sample, where sapply() could silently change its return type.
 track <- cbind(
   out,
   vapply(dadaFs, getN, numeric(1)),
   vapply(dadaRs, getN, numeric(1)),
   vapply(mergers, getN, numeric(1)),
   rowSums(seqtab.nochim)
 )
 # If processing a single sample, remove the vapply calls: e.g. replace
 # vapply(dadaFs, getN, numeric(1)) with getN(dadaFs)
 colnames(track) <- c("input", "filtered", "denoisedF", "denoisedR", "merged",
                      "nonchim")
 rownames(track) <- sample.names
 # NOTE(review): `track` is built but not printed or written in the code shown.
 # Reference FASTA handed to assignTaxonomy().
 UNITE <- "~/git/dadaist2/refs/uniref.fa.gz"
 # tryRC = TRUE also tests the reverse complement of each sequence.
 taxa <- assignTaxonomy(
   seqtab.nochim,
   refFasta = UNITE,
   tryRC = TRUE,
   multithread = TRUE
 )

 # Copy without the sequence row names, which keeps the preview readable.
 taxa.print <- taxa
 rownames(taxa.print) <- NULL
 head(taxa.print)
 # NOTE(review): the table written here is the copy without sequence row
 # names, so rows are labelled 1..n -- confirm this is what consumers expect.
 write.table(
   taxa.print,
   file = "~/git/dadaist2/qibx_ITS/taxonomy_R.txt",
   quote = FALSE,
   row.names = TRUE
 )

 # Transpose so that sequences are rows and samples are columns
 # (QIIME has OTUs as rows).
 seqtab.nochim2 <- t(seqtab.nochim)
 # Column headers are the filtered forward-read file names.
 col.names <- basename(filtFs)
 feature_table_header <- "#OTU ID"
 # write.table() emits no header cell for the row-name column, so the label
 # for that column is glued in front of the first sample header with a tab.
 col.names[[1]] <- paste0(feature_table_header, "\t", col.names[[1]])
 write.table(
   seqtab.nochim2,
   file = "~/git/dadaist2/qibx_ITS/table_R.txt",
   sep = "\t",
   quote = FALSE,
   row.names = TRUE,
   col.names = col.names
 )
	# Load the packages used by this pipeline and print the version of each
	# Bioconductor package so the run is reproducible.
	library(dada2)
	# The plotting package is called "ggplot2"; library(ggplot) stops the script
	# with "there is no package called 'ggplot'" unless such a package exists.
	library(ggplot2)
	packageVersion("dada2")
	library(ShortRead)
	packageVersion("ShortRead")
	library(Biostrings)
	packageVersion("Biostrings")
	# Directory that is searched for the raw R1/R2 FASTQ files.
	path <- "/Volumes/Informatics/telatin/qibx/ITS/"
	# Build all orientations of a primer so each one can be searched for in reads.
	#
	# prmr: primer sequence given as a single character string.
	# Returns an unnamed character vector of length four, in this order:
	# the primer as given, its complement, its reverse, its reverse complement.
	prmr_funcn <- function(prmr) {
	  dna <- DNAString(prmr)
	  c(
	    prmr,
	    toString(complement(dna)),
	    toString(reverse(dna)),
	    toString(reverseComplement(dna))
	  )
	}


	# Count how often a primer occurs across a set of reads.
	#
	# prmr:     primer sequence (character string) used as the search pattern.
	# filt_seq: read sequences, converted to a DNAStringSet before matching.
	# Returns the total number of matches summed over all reads. Matching uses
	# fixed = FALSE, so IUPAC ambiguity codes are treated as wildcards.
	vcnt_funcn <- function(prmr, filt_seq) {
	  reads <- DNAStringSet(filt_seq)
	  sum(vcountPattern(prmr, reads, fixed = FALSE))
	}

	# Primer pair used for the amplicon; define your primer sequences here.
	ITS1f <- "CTTGGTCATTTAGAGGAAGTAA"
	# NOTE(review): this sequence looks like the published ITS2 primer rather
	# than ITS4 -- confirm that the variable name matches the primer used.
	ITS4 <- "GCTGCGTTCTTCATCGATGC"

	# Expand each primer into its four orientations (as given, complement,
	# reverse, reverse complement) and pool them into one search set.
	ITS1f.form <- prmr_funcn(ITS1f)
	ITS4.form <- prmr_funcn(ITS4)
	primer.form <- c(ITS1f.form, ITS4.form)

	# Locate the raw paired-end reads. Forward/reverse pairing relies on both
	# listings sorting into the same sample order.
	Mock.fnFs <- sort(list.files(path, pattern = "R1.fastq.gz", full.names = TRUE))
	Mock.fnRs <- sort(list.files(path, pattern = "R2.fastq.gz", full.names = TRUE))
	# Fail early with a clear message instead of an obscure downstream error.
	if (length(Mock.fnFs) == 0L) {
	  stop("No R1.fastq.gz files found in ", path, call. = FALSE)
	}
	if (length(Mock.fnFs) != length(Mock.fnRs)) {
	  stop("Different numbers of R1 and R2 files found in ", path, call. = FALSE)
	}
	# Sample name = file name up to the first underscore. vapply() always yields
	# a character vector, whereas sapply() can return a list.
	mock.names <- vapply(strsplit(basename(Mock.fnFs), "_"), `[`, character(1), 1)
	mock.names ## Lists the names of all the sequence files

	# Remove Ns: maxN = 0 drops every read pair that contains an ambiguous base.
	# The filtered copies are written to the "filtN" sub-directory of `path`.
	Mock.fnFs.filtN <- file.path(path, "filtN", paste0(mock.names, "_R1_filtN.fastq.gz"))
	Mock.fnRs.filtN <- file.path(path, "filtN", paste0(mock.names, "_R2_filtN.fastq.gz"))
	filterAndTrim(Mock.fnFs, Mock.fnFs.filtN, Mock.fnRs, Mock.fnRs.filtN, maxN = 0,
	              multithread = TRUE)

	# Read the N-filtered FASTQ files of the first sample with readFastq() from
	# the ShortRead package and keep the read sequences as character vectors.
	# The N-filtered files (not the raw inputs) must be used here: with
	# fixed = FALSE an N in a read matches any primer base and inflates the counts.

	Mock.R1.filtN <- as.character(sread(readFastq(Mock.fnFs.filtN[1])))
	Mock.R2.filtN <- as.character(sread(readFastq(Mock.fnRs.filtN[1])))

	# The output of vcnt_funcn shows how often each primer orientation occurs in
	# the reads, i.e. whether primers are still present in the FASTQ files.

	# Checking and counting the presence of the primer set in the forward read
	sapply(primer.form, vcnt_funcn, filt_seq = Mock.R1.filtN)
	# Checking and counting the presence of the primer set in the reverse read
	sapply(primer.form, vcnt_funcn, filt_seq = Mock.R2.filtN)

	# Directory that receives the primer-trimmed reads. Create the directory that
	# is actually written to below; creating "cutadapt_trimmed" instead would
	# leave the real output directory missing.
	path.cutadapt <- "/Volumes/Informatics/telatin/qibx/ITS/cutadapt"
	if (!dir.exists(path.cutadapt)) {
	  dir.create(path.cutadapt, recursive = TRUE)
	}
	Mock.fnFs.cutadapt <- file.path(path.cutadapt, paste0(mock.names, "_R1_trimmed.fastq.gz"))
	Mock.fnRs.cutadapt <- file.path(path.cutadapt, paste0(mock.names, "_R2_trimmed.fastq.gz"))
	# -n 2: remove up to two adapters per read; -m 50: drop reads shorter than 50.
	trim.cmd <- "-n 2 -m 50"
	# -g/-a: 5' and 3' adapters of read 1; -G/-A: the same for read 2. The 3'
	# adapters are the reverse complements of the opposite primer.
	primers <- c("-g CTTGGTCATTTAGAGGAAGTAA", "-a GCATCGATGAAGAACGCAGC",
	             "-G GCTGCGTTCTTCATCGATGC", "-A TTACTTCCTCTAAATGACCAAG")
	cutadapt.path <- "/Users/telatina/miniconda3/envs/cutadapt/bin/cutadapt"
	# seq_along() is safe when there are no samples (1:length(x) would yield 1, 0).
	for (i in seq_along(mock.names)) {
	  # shQuote() keeps paths containing spaces intact on the command line.
	  cutadapt.status <- system2(
	    cutadapt.path,
	    args = c(primers, trim.cmd,
	             "-o", shQuote(Mock.fnFs.cutadapt[i]),
	             "-p", shQuote(Mock.fnRs.cutadapt[i]),
	             shQuote(Mock.fnFs.filtN[i]), shQuote(Mock.fnRs.filtN[i]))
	  )
	  # Report a failed run instead of silently continuing with missing output.
	  if (!identical(as.integer(cutadapt.status), 0L)) {
	    warning("cutadapt exited with status ", cutadapt.status,
	            " for sample ", mock.names[i], call. = FALSE)
	  }
	}

	# Re-read the first sample after primer trimming. Forward and reverse reads
	# must come from the same sample (index 1) so the two counts are comparable;
	# index 2 would inspect another sample and fail when only one sample exists.
	Mock.R1.trimmed <- as.character(sread(readFastq(Mock.fnFs.cutadapt[1])))
	Mock.R2.trimmed <- as.character(sread(readFastq(Mock.fnRs.cutadapt[1])))

	# Checking and counting the presence of the primer set in the forward read
	sapply(primer.form, vcnt_funcn, filt_seq = Mock.R1.trimmed)
	# Checking and counting the presence of the primer set in the reverse read
	sapply(primer.form, vcnt_funcn, filt_seq = Mock.R2.trimmed)

	# Forward and reverse fastq filenames have the format:
	# <sample>_R1_trimmed.fastq.gz and <sample>_R2_trimmed.fastq.gz
	fnFs <- sort(list.files(path.cutadapt, pattern = "R1_trimmed.fastq", full.names = TRUE))
	fnRs <- sort(list.files(path.cutadapt, pattern = "R2_trimmed.fastq", full.names = TRUE))
	# Sample name = file name up to the first underscore; vapply() guarantees a
	# character vector even when no files are found.
	sample.names <- vapply(strsplit(basename(fnFs), "_"), `[`, character(1), 1)
	sample.names
	# head() shows at most two samples; fnFs[1:2] would contain NA (and fail)
	# when only a single sample is present.
	plotQualityProfile(head(fnFs, 2))
	# Quality-filtered reads go to the "filtered" sub-directory.
	filtFs <- file.path(path.cutadapt, "filtered", paste0(sample.names, "_F_filt.fastq.gz"))
	filtRs <- file.path(path.cutadapt, "filtered", paste0(sample.names, "_R_filt.fastq.gz"))
	# No fixed truncation length is set; reads are filtered by expected errors
	# (maxEE), ambiguous bases (maxN) and PhiX matches (rm.phix).
	out <- filterAndTrim(fnFs, filtFs, fnRs, filtRs, maxN = 0, maxEE = c(2, 2),
	                     truncQ = 2, rm.phix = TRUE, compress = TRUE, multithread = TRUE)
	# Learn the error model separately for forward and reverse reads.
	errF <- learnErrors(filtFs, multithread = TRUE)
	errR <- learnErrors(filtRs, multithread = TRUE)

	# Plot the learned forward error model for a visual sanity check.
	plotErrors(errF, nominalQ = TRUE)

	# Dereplicate the filtered reads and label each element with its sample name.
	derepFs <- setNames(derepFastq(filtFs, verbose = TRUE), sample.names)
	derepRs <- setNames(derepFastq(filtRs, verbose = TRUE), sample.names)

	# Denoise forward and reverse reads with their respective error models.
	dadaFs <- dada(derep = derepFs, err = errF, multithread = TRUE)
	dadaRs <- dada(derep = derepRs, err = errR, multithread = TRUE)

	# Show the denoising summary of the first sample.
	dadaFs[1]

	# Merge the denoised read pairs and preview the first sample's result.
	mergers <- mergePairs(
	  dadaF = dadaFs, derepF = derepFs,
	  dadaR = dadaRs, derepR = derepRs,
	  verbose = TRUE
	)
	head(mergers[[1]])

	# Build the sample-by-sequence table, then remove chimeric sequences.
	seqtab <- makeSequenceTable(mergers)
	seqtab.nochim <- removeBimeraDenovo(
	  seqtab,
	  method = "consensus",
	  multithread = TRUE,
	  verbose = TRUE
	)

	# Number of reads represented by a dada2 object (sum of unique abundances).
	getN <- function(x) sum(getUniques(x))
	# Read counts per sample after each pipeline step. vapply() enforces exactly
	# one number per sample, where sapply() could silently change its return type.
	track <- cbind(
	  out,
	  vapply(dadaFs, getN, numeric(1)),
	  vapply(dadaRs, getN, numeric(1)),
	  vapply(mergers, getN, numeric(1)),
	  rowSums(seqtab.nochim)
	)
	# If processing a single sample, remove the vapply calls: e.g. replace
	# vapply(dadaFs, getN, numeric(1)) with getN(dadaFs)
	colnames(track) <- c("input", "filtered", "denoisedF", "denoisedR", "merged",
	                     "nonchim")
	rownames(track) <- sample.names
	# NOTE(review): `track` is built but not printed or written in the code shown.
	# Reference FASTA handed to assignTaxonomy().
	UNITE <- "~/git/dadaist2/refs/uniref.fa.gz"
	# tryRC = TRUE also tests the reverse complement of each sequence.
	taxa <- assignTaxonomy(
	  seqtab.nochim,
	  refFasta = UNITE,
	  tryRC = TRUE,
	  multithread = TRUE
	)

	# Copy without the sequence row names, which keeps the preview readable.
	taxa.print <- taxa
	rownames(taxa.print) <- NULL
	head(taxa.print)
	# NOTE(review): the table written here is the copy without sequence row
	# names, so rows are labelled 1..n -- confirm this is what consumers expect.
	write.table(
	  taxa.print,
	  file = "~/git/dadaist2/qibx_ITS/taxonomy_R.txt",
	  quote = FALSE,
	  row.names = TRUE
	)

	# Transpose so that sequences are rows and samples are columns
	# (QIIME has OTUs as rows).
	seqtab.nochim2 <- t(seqtab.nochim)
	# Column headers are the filtered forward-read file names.
	col.names <- basename(filtFs)
	feature_table_header <- "#OTU ID"
	# write.table() emits no header cell for the row-name column, so the label
	# for that column is glued in front of the first sample header with a tab.
	col.names[[1]] <- paste0(feature_table_header, "\t", col.names[[1]])
	write.table(
	  seqtab.nochim2,
	  file = "~/git/dadaist2/qibx_ITS/table_R.txt",
	  sep = "\t",
	  quote = FALSE,
	  row.names = TRUE,
	  col.names = col.names
	)