Last active
November 9, 2018 08:31
-
-
Save jelber2/0c7fc56fa71a7f45d1cef40b5946a393 to your computer and use it in GitHub Desktop.
Runs Pilon twice on a file called genome.pilon-0.fasta
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /bin/bash | |
set -e | |
# installing fasta-splitter.pl | |
## wget http://kirill-kryukov.com/study/tools/fasta-splitter/files/fasta-splitter-0.2.6.zip | |
## unzip fasta-splitter-0.2.6.zip | |
# assumes initial genome to be error-corrected by pilon is called | |
## genome.pilon-0.fasta | |
# set variables | |
# RAM in gigabytes | |
RAM=350 | |
interleavedReadsPath=/genetics/pacbio/pilon-vs-racon/chr20.pbjelly.fastq.gz | |
workDir=/genetics/pacbio/pilon-vs-racon/pilon | |
fastaSplitterPath=/genetics/pacbio/fasta-splitter.pl | |
samtoolsPath=samtools | |
bwaPath=bwa | |
pilonPath=/opt/pilon/pilon-1.22.jar | |
seqtkPath=/opt/seqtk/seqtk | |
# number of cores on computer | |
numThreads=75 | |
# start of script | |
x=0 | |
while [ $x -lt 2 ] | |
do | |
y=$(($x+1)) | |
### a get the contig lengths with ${samtoolsPath} faidx | |
cd ${workDir} | |
${samtoolsPath} faidx genome.pilon-${x}.fasta | |
### b split genome into 39 parts with fasta-splitter.pl | |
perl ${fastaSplitterPath} --n-parts 39 genome.pilon-${x}.fasta --out-dir pilon-${y}/ | |
cd ${workDir}/pilon-${y} | |
### c create a sequence from 01,02,...,39 | |
seq -w 1 39 > samples | |
### d get the contig names | |
while read i;do grep ">" genome.pilon-${x}.part-${i}.fasta|perl -pe "s/>//g" > targetlist${i}; done < samples | |
### e get the contig lengths | |
cut -f 1-2 ../genome.pilon-${x}.fasta.fai > pilon.chromosome.lengths.txt | |
### f get the names of the contigs that are in split files where there are less than 10 contigs | |
cp samples samples-to-split | |
### g write script to make target lists for pilon | |
### The script below breaks up the task into ${numThreads} pieces | |
while read i | |
do | |
lines="$(wc -l targetlist${i}|cut -d " " -f 1)" | |
while read line; | |
do | |
chrlength="$(grep -P "^$line\t" pilon.chromosome.lengths.txt | cut -f 2)" | |
numberofsplits="$(perl -w -e "use POSIX; print ceil(${numThreads}/$lines)")" | |
numberofsplits2=$(($numberofsplits - 1)) | |
splitlength="$(perl -w -e "use POSIX; print ceil($chrlength/$numberofsplits)")" | |
splits=1 | |
while [[ $splits -lt $numberofsplits ]] | |
do | |
if [[ $splits -eq 1 ]] | |
then | |
splitlength3="$(($splitlength + 1))" | |
stopbase=$(($splitlength * ($splits + 1))) | |
echo "${line}:1-${splitlength}" >> targetlist-${i} | |
echo "${line}:${splitlength3}-${stopbase}" >> targetlist-${i} | |
((splits++)) | |
elif [[ $splits -eq $numberofsplits2 ]] | |
then | |
startbase=$((($splitlength * ($splits)) + 1)) | |
echo "${line}:${startbase}-${chrlength}" >> targetlist-${i} | |
((splits++)) | |
else | |
stopbase=$(($splitlength * ($splits + 1))) | |
startbase=$((($splitlength * $splits) + 1)) | |
echo "${line}:${startbase}-${stopbase}" >> targetlist-${i} | |
((splits++)) | |
fi | |
done | |
done < targetlist${i} | |
done < samples-to-split | |
### h rename targetlist-01 to targetlist01 | |
while read i;do | |
mv -f targetlist-${i} targetlist${i} | |
done < samples-to-split | |
### i make a list of the targetlists | |
while read i;do | |
ls targetlist${i} >> targetlists | |
done < samples | |
### j bwa index, bwa mem, sam2bam, samtools sort, samtools index | |
cd ${workDir} | |
${bwaPath} index -a bwtsw genome.pilon-${x}.fasta > bwa-index.log 2>&1 | |
${bwaPath} mem -p -M -t ${numThreads} genome.pilon-${x}.fasta \ | |
${interleavedReadsPath} 2> bwa.log |\ | |
${samtoolsPath} view -@${numThreads} -F 4 -Sb - |${samtoolsPath} sort -@${numThreads} - > reads-mapped-to-genome.pilon-${x}.fasta.bam | |
${samtoolsPath} index -@${numThreads} reads-mapped-to-genome.pilon-${x}.fasta.bam | |
### k run pilon (takes about 1-2 days) | |
cd ${workDir}/pilon-${y} | |
while read i;do | |
java -Xmx${RAM}g -jar ${pilonPath} \ | |
--genome ../genome.pilon-${x}.fasta \ | |
--frags ../reads-mapped-to-genome.pilon-${x}.fasta.bam \ | |
--output ${i}-pilon \ | |
--targets ${i} \ | |
--diploid \ | |
--changes \ | |
--fix bases --threads ${numThreads} > ${i}-pilon.log 2>&1 | |
done < targetlists | |
### l combine output files | |
#### i get rid of extra headers in each output file | |
while read i;do | |
${seqtkPath} seq -l0 targetlist${i}-pilon.fasta | awk '!seen[$1]++'|${seqtkPath} seq -l80 > targetlist${i}-pilon.fasta2 | |
done < samples | |
#### ii combine the output files | |
cat $(find ./ -name "targetlist*-pilon.fasta2" | sort -V) > pilon.fasta | |
#### iii change output file name and remove "_pilon" from end of contig/scaffold names | |
${seqtkPath} randbase pilon.fasta | ${seqtkPath} seq -U > ../genome.pilon-${y}.fasta | |
perl -pi -e "s/_pilon//g" ../genome.pilon-${y}.fasta | |
### m increase loop value | |
((x++)) | |
done |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment