jelber2 · November 9, 2018 08:34
diff --git a/Running-PBJelly-example-with-indel-correction-with-BBMap-no-Pilon.txt b/Running-PBJelly-example-with-indel-correction-with-BBMap-no-Pilon.txt
 # goes along with http://seqanswers.com/forums/showthread.php?p=220925#post220925
 #
 # assumes you have PBJelly, blasr, tabix, bcftools, samtools installed
 # below I am using a machine with 70 cores on a single node, adjust to the number of cores to your machine
 # The scripts below are obviously not designed for use with a cluster, but can be modified
 #
 #########################
 # STEP 1 Combine the FASTQ files and remove the originals to save space
 #########################
 ## first combine files and delete the originals to save space
 WorkDir=/genetics/elbers/pacbio

 cd ${WorkDir}
 cat ?.subreads.fastq > pacbio-genome-pbjelly.fastq
 rm ?.subreads.fastq

 ## second add fake quality values (from Q0=! to Q30=> in Sanger encoding) for pbjelly to work
 cat pacbio-genome-pbjelly.fastq | tr "!" ">" > pacbio-genome-pbjelly-fqual.fastq

 #########################
 # STEP 2 Create pbjelly-config-assemble.xml
 #########################
 ## use nano to create the file then paste everything between the lines starting with #### and ending with ####
 nano pbjelly-config-assemble.xml
 ####pbjelly-config-assemble.xml####
 <jellyProtocol>
    <reference>/genetics/pacbio/genome.fasta</reference>
    <outputDir>/genetics/pacbio/</outputDir>
    <cluster>
        <command notes="For single node, multi-core machines" >${CMD} ${JOBNAME} 2> ${STDERR} 1> ${STDOUT} &amp;</command>
        <nJobs>70</nJobs>
    </cluster>
    <blasr>-minMatch 8 -minPctIdentity 70 -bestn 1 -nCandidates 10 -maxScore -500 -nproc 1 -noSplitSubreads</blasr>
    <input baseDir="/genetics/pacbio/">
        <job>pacbio-genome-pbjelly-fqual.fastq</job>
    </input>
 </jellyProtocol>
 ####pbjelly-config-assemble.xml####

 #########################
 # STEP 3 Create fake qualities for reference to fill in gaps for
 #########################
 /opt/PBSuite_15.8.24/bin/fakeQuals.py genome.fasta genome.qual

 #########################
 # STEP 4 Summarize the assembly and reads
 #########################
 ## first Summarize the assembly
 /opt/PBSuite_15.8.24/bin/summarizeAssembly.py genome.fasta > genome.fasta.summary

 ## second Summarize the reads
 /opt/PBSuite_15.8.24/bin/readSummary.py pbjelly-config-assemble.xml > pacbio-genome-pbjelly-fqual.fastq.summary

 #########################
 # STEP 5 Setup pbjelly
 #########################
 ## note - this step takes about 1 hour
 ## note2 - pbjelly steps are submitted in the background, but even though it says step is 
 ## finished, it might not be, you need to see if there are still pbjelly processes running
 ## for example "ps aux|grep -v "root"|less -S" to see if you still see PBJellySuite or
 ## blasr or Setup.py or Extraction.py or Support.py or Assemble.py or Out.py
 /opt/PBSuite_15.8.24/bin/Jelly.py setup pbjelly-config-assemble.xml

 #########################
 # STEP 6 Map FASTQ reads to genome.fasta reference
 #########################
 /opt/PBSuite_15.8.24/bin/Jelly.py mapping pbjelly-config-mapping.xml

 #########################
 # STEP 7 Summarize mapping results
 #########################
 /opt/PBSuite_15.8.24/bin/Jelly.py support pbjelly-config-assemble.xml -x "--minMapq=50"

 #########################
 # STEP 8 Extract reads that appear to bridge gaps
 #########################
 /opt/PBSuite_15.8.24/bin/Jelly.py extraction pbjelly-config-assemble.xml

 #########################
 # STEP 9 Assemble reads to fill in gaps
 #########################
 /opt/PBSuite_15.8.24/bin/Jelly.py assembly pbjelly-config-assemble.xml

 #########################
 # STEP 10 Produce output with gaps filled in
 #########################
 /opt/PBSuite_15.8.24/bin/Jelly.py output pbjelly-config-assemble.xml

 #########################
 # STEP 11 Convert het bases to hom and convert lower to uppercase
 #########################
 ## see https://github.com/lh3/seqtk for a link to download seqtk
 /opt/seqtk/seqtk randbase jelly.out.fasta |/opt/seqtk/seqtk seq -U > ../jelly.out.upper.fasta

 #########################
 # STEP 12 Run a variant caller or use Pilon (twice)
 # to correct indels in jelly.out.fasta PacBio filled in Gaps using Illumina reads
 #########################
 ## personally, whether you use Pilon or a Variant Caller (such as callvariants.sh from BBTools), I found that mapping with BBMap with the vslow=t option had better indel correction for simulated data
 ## for real data, using the steps below outperformed Pilon in terms of better RNA-Seq mapping rates
 ##
 ## presumably you can use your 10x reads to correct indels?
 ## calling this paired-end read file as 10x-illumina-quality-controlled-and-trimmed-reads.fq.gz in steps below

 #########################
 # STEP 13 Error correct with BBMap once
 #########################
 cd /genetics/user
 # download latest version of BBTools
 wget https://sourceforge.net/projects/bbmap/files/BBMap_38.25.tar.gz
 tar xzf BBMap_38.25.tar.gz
 cd bbmap
 # compile jni components
 cd jni
 # add to ~/.bashrc     export JAVA_HOME="/usr/lib/jvm/java-1.8.0-openjdk-amd64" or where ever your jvm machine is installed
 # then source ~/.bashrc
 make -f makefile.linux

 cd ${WorkDir}

 # map reads with bbmap 38.25 in vslow mode using jni
 /genetics/user/bbmap/bbmap.sh -Xmx350g -eoom usejni=t pigz=t threads=75 vslow=t ref=jelly.out.upper.fasta in=10x-illumina-quality-controlled-and-trimmed-reads.fq.gz \
 out=10x-illumina-quality-controlled-and-trimmed-reads-mapped-to-jelly.out.upper.fasta-using-bbmap.bam overwrite=t 2> bbmap.log &

 # callvariants 38.25
 /genetics/user/bbmap/callvariants.sh -Xmx350g threads=75 ref=jelly.out.upper.fasta in=10x-illumina-quality-controlled-and-trimmed-reads-mapped-to-jelly.out.upper.fasta-using-bbmap.sorted.bam \
 ploidy=2 vcf=10x-illumina-quality-controlled-and-trimmed-reads-mapped-to-jelly.out.upper.fasta-using-bbmap.vcf minscore=20.0 overwrite=t 2> callvariants.log &

 # use bgzip and tabix to compress then index vcf file
 bgzip -f 10x-illumina-quality-controlled-and-trimmed-reads-mapped-to-jelly.out.upper.fasta-using-bbmap.vcf
 tabix -p vcf 10x-illumina-quality-controlled-and-trimmed-reads-mapped-to-jelly.out.upper.fasta-using-bbmap.vcf.gz

 # use bcftools to get consensus sequence
 bcftools consensus -f jelly.out.upper.fasta 10x-illumina-quality-controlled-and-trimmed-reads-mapped-to-jelly.out.upper.fasta-using-bbmap.vcf.gz -o jelly.out.upper.bbmap-1.fasta 2> bcftools.log &
	# goes along with http://seqanswers.com/forums/showthread.php?p=220925#post220925
	#
	# assumes you have PBJelly, blasr, tabix, bcftools, samtools installed
	# below I am using a machine with 70 cores on a single node, adjust to the number of cores to your machine
	# The scripts below are obviously not designed for use with a cluster, but can be modified
	#
	#########################
	# STEP 1 Combine the FASTQ files and remove the originals to save space
	#########################
	## first combine files and delete the originals to save space
	WorkDir=/genetics/elbers/pacbio

	cd ${WorkDir}
	cat ?.subreads.fastq > pacbio-genome-pbjelly.fastq
	rm ?.subreads.fastq

	## second add fake quality values (from Q0=! to Q30=> in Sanger encoding) for pbjelly to work
	cat pacbio-genome-pbjelly.fastq \| tr "!" ">" > pacbio-genome-pbjelly-fqual.fastq

	#########################
	# STEP 2 Create pbjelly-config-assemble.xml
	#########################
	## use nano to create the file then paste everything between the lines starting with #### and ending with ####
	nano pbjelly-config-assemble.xml
	####pbjelly-config-assemble.xml####
	<jellyProtocol>
	<reference>/genetics/pacbio/genome.fasta</reference>
	<outputDir>/genetics/pacbio/</outputDir>
	<cluster>
	<command notes="For single node, multi-core machines" >${CMD} ${JOBNAME} 2> ${STDERR} 1> ${STDOUT} &</command>
	<nJobs>70</nJobs>
	</cluster>
	<blasr>-minMatch 8 -minPctIdentity 70 -bestn 1 -nCandidates 10 -maxScore -500 -nproc 1 -noSplitSubreads</blasr>
	<input baseDir="/genetics/pacbio/">
	<job>pacbio-genome-pbjelly-fqual.fastq</job>
	</input>
	</jellyProtocol>
	####pbjelly-config-assemble.xml####

	#########################
	# STEP 3 Create fake qualities for reference to fill in gaps for
	#########################
	/opt/PBSuite_15.8.24/bin/fakeQuals.py genome.fasta genome.qual

	#########################
	# STEP 4 Summarize the assembly and reads
	#########################
	## first Summarize the assembly
	/opt/PBSuite_15.8.24/bin/summarizeAssembly.py genome.fasta > genome.fasta.summary

	## second Summarize the reads
	/opt/PBSuite_15.8.24/bin/readSummary.py pbjelly-config-assemble.xml > pacbio-genome-pbjelly-fqual.fastq.summary

	#########################
	# STEP 5 Setup pbjelly
	#########################
	## note - this step takes about 1 hour
	## note2 - pbjelly steps are submitted in the background, but even though it says step is
	## finished, it might not be, you need to see if there are still pbjelly processes running
	## for example "ps aux\|grep -v "root"\|less -S" to see if you still see PBJellySuite or
	## blasr or Setup.py or Extraction.py or Support.py or Assemble.py or Out.py
	/opt/PBSuite_15.8.24/bin/Jelly.py setup pbjelly-config-assemble.xml

	#########################
	# STEP 6 Map FASTQ reads to genome.fasta reference
	#########################
	/opt/PBSuite_15.8.24/bin/Jelly.py mapping pbjelly-config-mapping.xml

	#########################
	# STEP 7 Summarize mapping results
	#########################
	/opt/PBSuite_15.8.24/bin/Jelly.py support pbjelly-config-assemble.xml -x "--minMapq=50"

	#########################
	# STEP 8 Extract reads that appear to bridge gaps
	#########################
	/opt/PBSuite_15.8.24/bin/Jelly.py extraction pbjelly-config-assemble.xml

	#########################
	# STEP 9 Assemble reads to fill in gaps
	#########################
	/opt/PBSuite_15.8.24/bin/Jelly.py assembly pbjelly-config-assemble.xml

	#########################
	# STEP 10 Produce output with gaps filled in
	#########################
	/opt/PBSuite_15.8.24/bin/Jelly.py output pbjelly-config-assemble.xml

	#########################
	# STEP 11 Convert het bases to hom and convert lower to uppercase
	#########################
	## see https://github.com/lh3/seqtk for a link to download seqtk
	/opt/seqtk/seqtk randbase jelly.out.fasta \|/opt/seqtk/seqtk seq -U > ../jelly.out.upper.fasta

	#########################
	# STEP 12 Run a variant caller or use Pilon (twice)
	# to correct indels in jelly.out.fasta PacBio filled in Gaps using Illumina reads
	#########################
	## personally, whether you use Pilon or a Variant Caller (such as callvariants.sh from BBTools), I found that mapping with BBMap with the vslow=t option had better indel correction for simulated data
	## for real data, using the steps below outperformed Pilon in terms of better RNA-Seq mapping rates
	##
	## presumably you can use your 10x reads to correct indels?
	## calling this paired-end read file as 10x-illumina-quality-controlled-and-trimmed-reads.fq.gz in steps below

	#########################
	# STEP 13 Error correct with BBMap once
	#########################
	cd /genetics/user
	# download latest version of BBTools
	wget https://sourceforge.net/projects/bbmap/files/BBMap_38.25.tar.gz
	tar xzf BBMap_38.25.tar.gz
	cd bbmap
	# compile jni components
	cd jni
	# add to ~/.bashrc export JAVA_HOME="/usr/lib/jvm/java-1.8.0-openjdk-amd64" or where ever your jvm machine is installed
	# then source ~/.bashrc
	make -f makefile.linux

	cd ${WorkDir}

	# map reads with bbmap 38.25 in vslow mode using jni
	/genetics/user/bbmap/bbmap.sh -Xmx350g -eoom usejni=t pigz=t threads=75 vslow=t ref=jelly.out.upper.fasta in=10x-illumina-quality-controlled-and-trimmed-reads.fq.gz \
	out=10x-illumina-quality-controlled-and-trimmed-reads-mapped-to-jelly.out.upper.fasta-using-bbmap.bam overwrite=t 2> bbmap.log &

	# callvariants 38.25
	/genetics/user/bbmap/callvariants.sh -Xmx350g threads=75 ref=jelly.out.upper.fasta in=10x-illumina-quality-controlled-and-trimmed-reads-mapped-to-jelly.out.upper.fasta-using-bbmap.sorted.bam \
	ploidy=2 vcf=10x-illumina-quality-controlled-and-trimmed-reads-mapped-to-jelly.out.upper.fasta-using-bbmap.vcf minscore=20.0 overwrite=t 2> callvariants.log &

	# use bgzip and tabix to compress then index vcf file
	bgzip -f 10x-illumina-quality-controlled-and-trimmed-reads-mapped-to-jelly.out.upper.fasta-using-bbmap.vcf
	tabix -p vcf 10x-illumina-quality-controlled-and-trimmed-reads-mapped-to-jelly.out.upper.fasta-using-bbmap.vcf.gz

	# use bcftools to get consensus sequence
	bcftools consensus -f jelly.out.upper.fasta 10x-illumina-quality-controlled-and-trimmed-reads-mapped-to-jelly.out.upper.fasta-using-bbmap.vcf.gz -o jelly.out.upper.bbmap-1.fasta 2> bcftools.log &