Last active
November 9, 2018 08:34
-
-
Save jelber2/730f8d9d05ea5bbc933d5da2c97bedea to your computer and use it in GitHub Desktop.
Running-PBJelly-example-with-indel-correction-with-BBMap-no-Pilon
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# goes along with http://seqanswers.com/forums/showthread.php?p=220925#post220925 | |
# | |
# assumes you have PBJelly, blasr, tabix, bcftools, samtools installed | |
# below I am using a machine with 70 cores on a single node, adjust to the number of cores to your machine | |
# The scripts below are obviously not designed for use with a cluster, but can be modified | |
# | |
######################### | |
# STEP 1 Combine the FASTQ files and remove the originals to save space | |
######################### | |
## first combine files and delete the originals to save space | |
WorkDir=/genetics/elbers/pacbio | |
cd ${WorkDir} | |
cat ?.subreads.fastq > pacbio-genome-pbjelly.fastq | |
rm ?.subreads.fastq | |
## second add fake quality values (from Q0=! to Q30=> in Sanger encoding) for pbjelly to work | |
cat pacbio-genome-pbjelly.fastq | tr "!" ">" > pacbio-genome-pbjelly-fqual.fastq | |
######################### | |
# STEP 2 Create pbjelly-config-assemble.xml | |
######################### | |
## use nano to create the file then paste everything between the lines starting with #### and ending with #### | |
nano pbjelly-config-assemble.xml | |
####pbjelly-config-assemble.xml#### | |
<jellyProtocol> | |
<reference>/genetics/pacbio/genome.fasta</reference> | |
<outputDir>/genetics/pacbio/</outputDir> | |
<cluster> | |
<command notes="For single node, multi-core machines" >${CMD} ${JOBNAME} 2> ${STDERR} 1> ${STDOUT} &</command> | |
<nJobs>70</nJobs> | |
</cluster> | |
<blasr>-minMatch 8 -minPctIdentity 70 -bestn 1 -nCandidates 10 -maxScore -500 -nproc 1 -noSplitSubreads</blasr> | |
<input baseDir="/genetics/pacbio/"> | |
<job>pacbio-genome-pbjelly-fqual.fastq</job> | |
</input> | |
</jellyProtocol> | |
####pbjelly-config-assemble.xml#### | |
######################### | |
# STEP 3 Create fake qualities for reference to fill in gaps for | |
######################### | |
/opt/PBSuite_15.8.24/bin/fakeQuals.py genome.fasta genome.qual | |
######################### | |
# STEP 4 Summarize the assembly and reads | |
######################### | |
## first Summarize the assembly | |
/opt/PBSuite_15.8.24/bin/summarizeAssembly.py genome.fasta > genome.fasta.summary | |
## second Summarize the reads | |
/opt/PBSuite_15.8.24/bin/readSummary.py pbjelly-config-assemble.xml > pacbio-genome-pbjelly-fqual.fastq.summary | |
######################### | |
# STEP 5 Setup pbjelly | |
######################### | |
## note - this step takes about 1 hour | |
## note2 - pbjelly steps are submitted in the background, but even though it says step is | |
## finished, it might not be, you need to see if there are still pbjelly processes running | |
## for example "ps aux|grep -v "root"|less -S" to see if you still see PBJellySuite or | |
## blasr or Setup.py or Extraction.py or Support.py or Assemble.py or Out.py | |
/opt/PBSuite_15.8.24/bin/Jelly.py setup pbjelly-config-assemble.xml | |
######################### | |
# STEP 6 Map FASTQ reads to genome.fasta reference | |
######################### | |
/opt/PBSuite_15.8.24/bin/Jelly.py mapping pbjelly-config-mapping.xml | |
######################### | |
# STEP 7 Summarize mapping results | |
######################### | |
/opt/PBSuite_15.8.24/bin/Jelly.py support pbjelly-config-assemble.xml -x "--minMapq=50" | |
######################### | |
# STEP 8 Extract reads that appear to bridge gaps | |
######################### | |
/opt/PBSuite_15.8.24/bin/Jelly.py extraction pbjelly-config-assemble.xml | |
######################### | |
# STEP 9 Assemble reads to fill in gaps | |
######################### | |
/opt/PBSuite_15.8.24/bin/Jelly.py assembly pbjelly-config-assemble.xml | |
######################### | |
# STEP 10 Produce output with gaps filled in | |
######################### | |
/opt/PBSuite_15.8.24/bin/Jelly.py output pbjelly-config-assemble.xml | |
######################### | |
# STEP 11 Convert het bases to hom and convert lower to uppercase | |
######################### | |
## see https://github.com/lh3/seqtk for a link to download seqtk | |
/opt/seqtk/seqtk randbase jelly.out.fasta |/opt/seqtk/seqtk seq -U > ../jelly.out.upper.fasta | |
######################### | |
# STEP 12 Run a variant caller or use Pilon (twice) | |
# to correct indels in jelly.out.fasta PacBio filled in Gaps using Illumina reads | |
######################### | |
## personally, whether you use Pilon or a Variant Caller (such as callvariants.sh from BBTools), I found that mapping with BBMap with the vslow=t option had better indel correction for simulated data | |
## for real data, using the steps below outperformed Pilon in terms of better RNA-Seq mapping rates | |
## | |
## presumably you can use your 10x reads to correct indels? | |
## calling this paired-end read file as 10x-illumina-quality-controlled-and-trimmed-reads.fq.gz in steps below | |
######################### | |
# STEP 13 Error correct with BBMap once | |
######################### | |
cd /genetics/user | |
# download latest version of BBTools | |
wget https://sourceforge.net/projects/bbmap/files/BBMap_38.25.tar.gz | |
tar xzf BBMap_38.25.tar.gz | |
cd bbmap | |
# compile jni components | |
cd jni | |
# add to ~/.bashrc export JAVA_HOME="/usr/lib/jvm/java-1.8.0-openjdk-amd64" or where ever your jvm machine is installed | |
# then source ~/.bashrc | |
make -f makefile.linux | |
cd ${WorkDir} | |
# map reads with bbmap 38.25 in vslow mode using jni | |
/genetics/user/bbmap/bbmap.sh -Xmx350g -eoom usejni=t pigz=t threads=75 vslow=t ref=jelly.out.upper.fasta in=10x-illumina-quality-controlled-and-trimmed-reads.fq.gz \ | |
out=10x-illumina-quality-controlled-and-trimmed-reads-mapped-to-jelly.out.upper.fasta-using-bbmap.bam overwrite=t 2> bbmap.log & | |
# callvariants 38.25 | |
/genetics/user/bbmap/callvariants.sh -Xmx350g threads=75 ref=jelly.out.upper.fasta in=10x-illumina-quality-controlled-and-trimmed-reads-mapped-to-jelly.out.upper.fasta-using-bbmap.sorted.bam \ | |
ploidy=2 vcf=10x-illumina-quality-controlled-and-trimmed-reads-mapped-to-jelly.out.upper.fasta-using-bbmap.vcf minscore=20.0 overwrite=t 2> callvariants.log & | |
# use bgzip and tabix to compress then index vcf file | |
bgzip -f 10x-illumina-quality-controlled-and-trimmed-reads-mapped-to-jelly.out.upper.fasta-using-bbmap.vcf | |
tabix -p vcf 10x-illumina-quality-controlled-and-trimmed-reads-mapped-to-jelly.out.upper.fasta-using-bbmap.vcf.gz | |
# use bcftools to get consensus sequence | |
bcftools consensus -f jelly.out.upper.fasta 10x-illumina-quality-controlled-and-trimmed-reads-mapped-to-jelly.out.upper.fasta-using-bbmap.vcf.gz -o jelly.out.upper.bbmap-1.fasta 2> bcftools.log & |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment