Skip to content

Instantly share code, notes, and snippets.

@jelber2
Last active November 9, 2018 08:31
Show Gist options
  • Save jelber2/0c7fc56fa71a7f45d1cef40b5946a393 to your computer and use it in GitHub Desktop.
Save jelber2/0c7fc56fa71a7f45d1cef40b5946a393 to your computer and use it in GitHub Desktop.
Runs Pilon twice on a file called genome.pilon-0.fasta
#! /bin/bash
set -e
# installing fasta-splitter.pl
## wget http://kirill-kryukov.com/study/tools/fasta-splitter/files/fasta-splitter-0.2.6.zip
## unzip fasta-splitter-0.2.6.zip
# assumes initial genome to be error-corrected by pilon is called
## genome.pilon-0.fasta
# set variables
# RAM in gigabytes
RAM=350
interleavedReadsPath=/genetics/pacbio/pilon-vs-racon/chr20.pbjelly.fastq.gz
workDir=/genetics/pacbio/pilon-vs-racon/pilon
fastaSplitterPath=/genetics/pacbio/fasta-splitter.pl
samtoolsPath=samtools
bwaPath=bwa
pilonPath=/opt/pilon/pilon-1.22.jar
seqtkPath=/opt/seqtk/seqtk
# number of cores on computer
numThreads=75
# start of script
x=0
while [ $x -lt 2 ]
do
y=$(($x+1))
### a get the contig lengths with ${samtoolsPath} faidx
cd ${workDir}
${samtoolsPath} faidx genome.pilon-${x}.fasta
### b split genome into 39 parts with fasta-splitter.pl
perl ${fastaSplitterPath} --n-parts 39 genome.pilon-${x}.fasta --out-dir pilon-${y}/
cd ${workDir}/pilon-${y}
### c create a sequence from 01,02,...,39
seq -w 1 39 > samples
### d get the contig names
while read i;do grep ">" genome.pilon-${x}.part-${i}.fasta|perl -pe "s/>//g" > targetlist${i}; done < samples
### e get the contig lengths
cut -f 1-2 ../genome.pilon-${x}.fasta.fai > pilon.chromosome.lengths.txt
### f get the names of the contigs that are in split files where there are less than 10 contigs
cp samples samples-to-split
### g write script to make target lists for pilon
### The script below breaks up the task into ${numThreads} pieces
while read i
do
lines="$(wc -l targetlist${i}|cut -d " " -f 1)"
while read line;
do
chrlength="$(grep -P "^$line\t" pilon.chromosome.lengths.txt | cut -f 2)"
numberofsplits="$(perl -w -e "use POSIX; print ceil(${numThreads}/$lines)")"
numberofsplits2=$(($numberofsplits - 1))
splitlength="$(perl -w -e "use POSIX; print ceil($chrlength/$numberofsplits)")"
splits=1
while [[ $splits -lt $numberofsplits ]]
do
if [[ $splits -eq 1 ]]
then
splitlength3="$(($splitlength + 1))"
stopbase=$(($splitlength * ($splits + 1)))
echo "${line}:1-${splitlength}" >> targetlist-${i}
echo "${line}:${splitlength3}-${stopbase}" >> targetlist-${i}
((splits++))
elif [[ $splits -eq $numberofsplits2 ]]
then
startbase=$((($splitlength * ($splits)) + 1))
echo "${line}:${startbase}-${chrlength}" >> targetlist-${i}
((splits++))
else
stopbase=$(($splitlength * ($splits + 1)))
startbase=$((($splitlength * $splits) + 1))
echo "${line}:${startbase}-${stopbase}" >> targetlist-${i}
((splits++))
fi
done
done < targetlist${i}
done < samples-to-split
### h rename targetlist-01 to targetlist01
while read i;do
mv -f targetlist-${i} targetlist${i}
done < samples-to-split
### i make a list of the targetlists
while read i;do
ls targetlist${i} >> targetlists
done < samples
### j bwa index, bwa mem, sam2bam, samtools sort, samtools index
cd ${workDir}
${bwaPath} index -a bwtsw genome.pilon-${x}.fasta > bwa-index.log 2>&1
${bwaPath} mem -p -M -t ${numThreads} genome.pilon-${x}.fasta \
${interleavedReadsPath} 2> bwa.log |\
${samtoolsPath} view -@${numThreads} -F 4 -Sb - |${samtoolsPath} sort -@${numThreads} - > reads-mapped-to-genome.pilon-${x}.fasta.bam
${samtoolsPath} index -@${numThreads} reads-mapped-to-genome.pilon-${x}.fasta.bam
### k run pilon (takes about 1-2 days)
cd ${workDir}/pilon-${y}
while read i;do
java -Xmx${RAM}g -jar ${pilonPath} \
--genome ../genome.pilon-${x}.fasta \
--frags ../reads-mapped-to-genome.pilon-${x}.fasta.bam \
--output ${i}-pilon \
--targets ${i} \
--diploid \
--changes \
--fix bases --threads ${numThreads} > ${i}-pilon.log 2>&1
done < targetlists
### l combine output files
#### i get rid of extra headers in each output file
while read i;do
${seqtkPath} seq -l0 targetlist${i}-pilon.fasta | awk '!seen[$1]++'|${seqtkPath} seq -l80 > targetlist${i}-pilon.fasta2
done < samples
#### ii combine the output files
cat $(find ./ -name "targetlist*-pilon.fasta2" | sort -V) > pilon.fasta
#### iii change output file name and remove "_pilon" from end of contig/scaffold names
${seqtkPath} randbase pilon.fasta | ${seqtkPath} seq -U > ../genome.pilon-${y}.fasta
perl -pi -e "s/_pilon//g" ../genome.pilon-${y}.fasta
### m increase loop value
((x++))
done
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment