tkrahn · July 27, 2020 18:25
diff --git a/bwa_hg38_pipeline_release.sh b/bwa_hg38_pipeline_release.sh
 #!/bin/bash

 START=$(date +%s.%N)
 clear
 # setup parameters

 YSEQID=${PWD##*/}
 # YSEQID="1234" # (the above command simply gets the name of the last segment of the current working directory)

 NUM_THREADS=$(getconf _NPROCESSORS_ONLN)
 echo "We can use ${NUM_THREADS} threads."

 BWA='bwa'
 if test -x "/usr/local/bin/bwa-mem2"; then
    echo "We can use bwa-mem2"
    BWA='bwa-mem2'
 fi


 UPS='upload.server.yseq.net'

 # Create a random password
 PASSWD=`date +%s | sha256sum | base64 | head -c 8`
 echo "Random password created: ${PASSWD}"

 # Generate summary file
 cat /genomes/0/script-templates/x_result_summary.txt | sed "s/<kitnumber>/${YSEQID}/g" | sed "s/<passwd>/${PASSWD}/g" >${YSEQID}_result_summary.txt
 echo "${YSEQID}_result_summary.txt created"

 REF="/genomes/0/refseq/hg38/hg38.fa"

 READS_1="fastq/${YSEQID}_R1.fastq.gz"
 READS_2="fastq/${YSEQID}_R2.fastq.gz"
 #SAMFILE="${YSEQID}_bwa-mem_hg38.sam" # no longer needed due to piping
 BAMFILE="${YSEQID}_bwa-mem_hg38.bam"

 BAMFILE_SORTED="${YSEQID}_bwa-mem_hg38_sorted.bam"
 VCF_FILE="${YSEQID}_hg38.vcf"

 cp /genomes/0/tree/trees/yfull/latest_YFull_YTree.json .
 YFULLTREE="latest_YFull_YTree.json"

 cp /genomes/0/script-templates/cladeFinder.py .
 CLADEFINDER="cladeFinder.py"

 #ANN_VCF_FILE="${YSEQID}_hg38_dbSNP150_ann.vcf"
 #ANN_CLINVAR_VCF_FILE="${YSEQID}_hg38_clinvar_ann.vcf"

 #DBSNP="/genomes/0/refseq/hg19/All_chr_20161121.vcf.gz"

 # Read Group(s)
 # READ_GROUPS="\"@RG\tID:1\tSM:${YSEQID}\tPL:ILLUMINA\tPU:UNIT1\tLB:${YSEQID}\""

 if [[ $1 != "-resume" ]]
 then


 	# Pipeline commands:
 	${BWA} mem -t $NUM_THREADS -M $REF $READS_1 $READS_2 | \
 	samtools view -@ $NUM_THREADS -b -t $REF -o $BAMFILE -
 	samtools sort -@ $NUM_THREADS -T /usr/local/geospiza/var/tmp/sorted -o $BAMFILE_SORTED $BAMFILE
 	samtools index $BAMFILE_SORTED
 	samtools idxstats $BAMFILE_SORTED > ${BAMFILE_SORTED}.idxstats.tsv
 fi

 # Update summary file with BAM filesize:
 BAM_FILESIZE=`du -kh "${BAMFILE_SORTED}" | cut -f1`
 echo "Size of ${BAMFILE_SORTED} = ${BAM_FILESIZE}"
 cat ${YSEQID}_result_summary.txt | sed "s/<bamsize>/${BAM_FILESIZE}byte/g" >${YSEQID}_result_summary.txt.tmp
 rm -f ${YSEQID}_result_summary.txt
 mv ${YSEQID}_result_summary.txt.tmp ${YSEQID}_result_summary.txt

 if [[ $1 != "-resume" ]]
 then

 	# Delete no longer needed large files
 	#rm -f $SAMFILE
 	rm -f $BAMFILE # keep $BAMFILE_SORTED
 fi

 # Easy tview file
 echo "#!/bin/bash" > tview_${YSEQID}.sh
 echo "samtools tview ${BAMFILE_SORTED} ${REF}" >> tview_${YSEQID}.sh
 chmod a+x tview_${YSEQID}.sh


 if [[ $1 != "-resume" ]]
 then

 	# Separate chrY & mtDNA BAM files
 	samtools view -@ $NUM_THREADS -b -o "${YSEQID}_bwa-mem_hg38_chrY.bam" $BAMFILE_SORTED chrY &
 	samtools view -@ $NUM_THREADS -b -o "${YSEQID}_bwa-mem_rCRS_chrM.bam" $BAMFILE_SORTED chrM &
 	wait
 	samtools index "${YSEQID}_bwa-mem_hg38_chrY.bam" &
 	samtools index "${YSEQID}_bwa-mem_rCRS_chrM.bam" &
 	wait


 	#rsync -e "ssh" --progress -v ${YSEQID}_bwa-mem_rCRS_chrM.bam ${UPS}:/genomes/${YSEQID}/
 	#rsync -e "ssh" --progress -v ${YSEQID}_bwa-mem_rCRS_chrM.bam.bai ${UPS}:/genomes/${YSEQID}/

 	#rsync -e "ssh" --progress -v ${YSEQID}_bwa-mem_hg*_chrY.bam ${UPS}:/genomes/${YSEQID}/
 	#rsync -e "ssh" --progress -v ${YSEQID}_bwa-mem_hg*_chrY.bam.bai ${UPS}:/genomes/${YSEQID}/

 	# mtDNA allele calling $ FASTA file generation
 	bcftools mpileup -r chrM -Ou -C 50 -f ${REF} ${BAMFILE_SORTED} | bcftools call --threads $NUM_THREADS -O z -v -m -P 0  > chrM_${VCF_FILE}.gz
 	tabix chrM_${VCF_FILE}.gz
 	samtools faidx $REF chrM | bcftools consensus chrM_${VCF_FILE}.gz -o ${YSEQID}_mtDNA.fasta


 	# Annotate all known Y chromosome SNPs from Ybrowse
 	wget http://ybrowse.org/gbrowse2/gff/snps_hg38.vcf.gz
 	wget http://ybrowse.org/gbrowse2/gff/snps_hg38.vcf.gz.tbi

 	bcftools mpileup -r chrY -C 50 --threads $NUM_THREADS -O z -f $REF $BAMFILE_SORTED > chrY_raw_${VCF_FILE}.gz
 	tabix chrY_raw_${VCF_FILE}.gz

 	bcftools merge -m both -O z snps_hg38.vcf.gz chrY_raw_${VCF_FILE}.gz > chrY_merged_${VCF_FILE}.gz
 	tabix chrY_merged_${VCF_FILE}.gz
 	echo "merging completed"

 	bcftools call -O z -m -P 0 chrY_merged_${VCF_FILE}.gz > chrY_called_${VCF_FILE}.gz
 	tabix chrY_called_${VCF_FILE}.gz
 	echo "calling completed"

 	bcftools filter -O z -i '%ID!="."' chrY_called_${VCF_FILE}.gz | bcftools view -O z -s ^HARRY,ALIEN >chrY_cleaned_${VCF_FILE}.gz
 	tabix chrY_cleaned_${VCF_FILE}.gz
 	echo "cleanup completed"

 	bcftools filter -O z -i '(GT=="1/1" && AA==REF) || (GT=="0/0" && AA==ALT)' chrY_cleaned_${VCF_FILE}.gz >chrY_derived_${VCF_FILE}.gz &
 	bcftools filter -O z -i '(GT=="0/0" && AA==REF) || (GT=="1/1" && AA==ALT)' chrY_cleaned_${VCF_FILE}.gz >chrY_ancestral_${VCF_FILE}.gz &
 	wait
 	tabix chrY_derived_${VCF_FILE}.gz &
 	tabix chrY_ancestral_${VCF_FILE}.gz &
 	wait
 	echo "ancestral & derived filters completed"

 	# Find the Y haplogroup
 	bcftools query -f '%ID,' chrY_derived_${VCF_FILE}.gz > ${YSEQID}_positives.txt &
 	bcftools query -f '%ID,' chrY_ancestral_${VCF_FILE}.gz > ${YSEQID}_negatives.txt &
 	wait
 fi


 python ${CLADEFINDER} ${YFULLTREE} ${YSEQID}_positives.txt ${YSEQID}_negatives.txt ${YSEQID}_cladeFinderOutput.csv
 echo "Y haplogroup checked"
 echo "The 5 best hits are:"
 head -n 5 ${YSEQID}_cladeFinderOutput.csv


 YFULLHG="unknown"
 YFULLPATH="unknown"
 OLDIFS=$IFS
 IFS=$'\t'
 LINECOUNTER=$((0))

 [ ! -f ${YSEQID}_cladeFinderOutput.csv ] && { echo "${YSEQID}_cladeFinderOutput.csv file not found"; }
 while read -r haplogroup path purity depth score conflicting_negatives
 do
 	LINECOUNTER=$((LINECOUNTER + 1))
 	if [ $LINECOUNTER -eq $((2)) ]
 	then
 		YFULLHG=$haplogroup
 		YFULLPATH=$path
 	fi		

 done < ${YSEQID}_cladeFinderOutput.csv
 IFS=$OLDIFS


 echo "YFULLHG: ${YFULLHG}"
 echo "YFULLPATH: ${YFULLPATH}"

 # We're reading the first 100k sequences from the first fastq file to determine the average read length
 READLENGTH=$(samtools bam2fq -@ ${NUM_THREADS} ${BAMFILE_SORTED} | head -n 100000 | awk '{if(NR%4==2) {count++; bases += length} } END{print bases/count}')
 echo $READLENGTH

 # Calculating the coverage from the idxstats output
 COVERAGE_HG38=$(awk -v rl="$READLENGTH" '{x+=$2;m+=$3}END{print m*rl/x "x"}' < ${YSEQID}_bwa-mem_hg38_sorted.bam.idxstats.tsv)
 echo $COVERAGE_HG38

 cp /genomes/0/script-templates/getEquivalentAndDownstreamSNPs.py .
 PHYLOEQ_SNPS_FILE=${YSEQID}_PHYLOEQ_SNPS.tsv
 DOWNSTR_SNPS_FILE=${YSEQID}_DOWNSTR_SNPS.tsv
 python3 getEquivalentAndDownstreamSNPs.py ${YFULLTREE} "${YFULLHG}" chrY_cleaned_${VCF_FILE}.gz ${YSEQID}_positives.txt ${YSEQID}_negatives.txt ${PHYLOEQ_SNPS_FILE} ${DOWNSTR_SNPS_FILE} ${YSEQID}_bwa-mem_hg38_sorted.bam.idxstats.tsv

 cp /genomes/0/script-templates/getMTDNADifferences.py .
 MTDNA_SNPS_FILE=${YSEQID}_MTDNA_SNPS.tsv
 HAPLOGREP_JAR=haplogrep-2.1.25.jar
 cp /genomes/0/haplogrep/${HAPLOGREP_JAR} .

 MTHAPLOGROUP=$(python getMTDNADifferences.py -process chrM_${VCF_FILE}.gz ${HAPLOGREP_JAR} ${MTDNA_SNPS_FILE})


 # Update summary file with Y haplogroup:

 # Note: Since the YFull tree uses forward slashes for identical SNPs with different names, we replace the sed separator "/" with "|"
 # This keeps us from escaping every forward slash. But don't confuse it with the pipe character in the same command!

 cat ${YSEQID}_result_summary.txt | \
    sed "s|<MTHAPLOGROUP>|${MTHAPLOGROUP}|g" | \
    sed "s|<YHAPLOGROUP>|${YFULLHG}|g" | \
    sed "s|<YFULLPATH>|${YFULLPATH}|g" | \
    sed "s|<COVERAGE_HG38>|${COVERAGE_HG38}|g" | \
    sed "s|<READLENGTH>|${READLENGTH}|g" | \
    sed "s|<TERMINALSNP>|${YFULLHG}|g" >${YSEQID}_result_summary.txt.tmp


 mv ${YSEQID}_result_summary.txt ${YSEQID}_result_summary.txt.backup

 mv ${YSEQID}_result_summary.txt.tmp ${YSEQID}_result_summary.txt

 bcftools filter -O z -i '(%ID==".") && TYPE="snp" && QUAL>=30 && GT=="1/1" && DP4[2] + DP4[3] >=2 && (DP4[0] + DP4[1]) / DP < 0.1' chrY_called_${VCF_FILE}.gz | bcftools view -O z -s ^HARRY,ALIEN >chrY_novel_SNPs_${VCF_FILE}.gz
 tabix chrY_novel_SNPs_${VCF_FILE}.gz
 zcat chrY_novel_SNPs_${VCF_FILE}.gz | grep -v "##" >chrY_novel_SNPs_${VCF_FILE}.tsv
 echo "novel SNP calling completed"

 # Create Novel SNP template after checking for unreliable ranges and identity within Y Chromosome
 cp /genomes/0/script-templates/identityResolutionTemplateCreator.py .
 python identityResolutionTemplateCreator.py -batch chrY_novel_SNPs_${VCF_FILE}.tsv chrY_novel_SNPs_${YSEQID}_hg38.tsv ${REF}

 bcftools filter -O z -i '(%ID==".") && TYPE="indel" && QUAL>=30 && GT=="1/1" && DP4[2] + DP4[3] >=2 && (DP4[0] + DP4[1]) / DP < 0.1' chrY_called_${VCF_FILE}.gz | bcftools view -O z -s ^HARRY,ALIEN >chrY_INDELs_${VCF_FILE}.gz
 tabix chrY_INDELs_${VCF_FILE}.gz
 echo "INDEL calling completed"



 # Parallel SNP calling by chromosome
 bcftools mpileup -r chr1 -Ou -C 50 -f $REF $BAMFILE_SORTED | bcftools call -O z --threads $NUM_THREADS -v -V indels -m -P 0 > chr1_${VCF_FILE}.gz &
 bcftools mpileup -r chr2 -Ou -C 50 -f $REF $BAMFILE_SORTED | bcftools call -O z --threads $NUM_THREADS -v -V indels -m -P 0 > chr2_${VCF_FILE}.gz &
 bcftools mpileup -r chr3 -Ou -C 50 -f $REF $BAMFILE_SORTED | bcftools call -O z --threads $NUM_THREADS -v -V indels -m -P 0 > chr3_${VCF_FILE}.gz &
 bcftools mpileup -r chr4 -Ou -C 50 -f $REF $BAMFILE_SORTED | bcftools call -O z --threads $NUM_THREADS -v -V indels -m -P 0 > chr4_${VCF_FILE}.gz &
 bcftools mpileup -r chr5 -Ou -C 50 -f $REF $BAMFILE_SORTED | bcftools call -O z --threads $NUM_THREADS -v -V indels -m -P 0 > chr5_${VCF_FILE}.gz &
 bcftools mpileup -r chr6 -Ou -C 50 -f $REF $BAMFILE_SORTED | bcftools call -O z --threads $NUM_THREADS -v -V indels -m -P 0 > chr6_${VCF_FILE}.gz &
 bcftools mpileup -r chr7 -Ou -C 50 -f $REF $BAMFILE_SORTED | bcftools call -O z --threads $NUM_THREADS -v -V indels -m -P 0 > chr7_${VCF_FILE}.gz &
 bcftools mpileup -r chr8 -Ou -C 50 -f $REF $BAMFILE_SORTED | bcftools call -O z --threads $NUM_THREADS -v -V indels -m -P 0 > chr8_${VCF_FILE}.gz &
 bcftools mpileup -r chr9 -Ou -C 50 -f $REF $BAMFILE_SORTED | bcftools call -O z --threads $NUM_THREADS -v -V indels -m -P 0 > chr9_${VCF_FILE}.gz &
 bcftools mpileup -r chr10 -Ou -C 50 -f $REF $BAMFILE_SORTED | bcftools call -O z --threads $NUM_THREADS -v -V indels -m -P 0 > chr10_${VCF_FILE}.gz &
 bcftools mpileup -r chr11 -Ou -C 50 -f $REF $BAMFILE_SORTED | bcftools call -O z --threads $NUM_THREADS -v -V indels -m -P 0 > chr11_${VCF_FILE}.gz &
 bcftools mpileup -r chr12 -Ou -C 50 -f $REF $BAMFILE_SORTED | bcftools call -O z --threads $NUM_THREADS -v -V indels -m -P 0 > chr12_${VCF_FILE}.gz &
 bcftools mpileup -r chr13 -Ou -C 50 -f $REF $BAMFILE_SORTED | bcftools call -O z --threads $NUM_THREADS -v -V indels -m -P 0 > chr13_${VCF_FILE}.gz &
 bcftools mpileup -r chr14 -Ou -C 50 -f $REF $BAMFILE_SORTED | bcftools call -O z --threads $NUM_THREADS -v -V indels -m -P 0 > chr14_${VCF_FILE}.gz &
 bcftools mpileup -r chr15 -Ou -C 50 -f $REF $BAMFILE_SORTED | bcftools call -O z --threads $NUM_THREADS -v -V indels -m -P 0 > chr15_${VCF_FILE}.gz &
 bcftools mpileup -r chr16 -Ou -C 50 -f $REF $BAMFILE_SORTED | bcftools call -O z --threads $NUM_THREADS -v -V indels -m -P 0 > chr16_${VCF_FILE}.gz &
 bcftools mpileup -r chr17 -Ou -C 50 -f $REF $BAMFILE_SORTED | bcftools call -O z --threads $NUM_THREADS -v -V indels -m -P 0 > chr17_${VCF_FILE}.gz &
 bcftools mpileup -r chr18 -Ou -C 50 -f $REF $BAMFILE_SORTED | bcftools call -O z --threads $NUM_THREADS -v -V indels -m -P 0 > chr18_${VCF_FILE}.gz &
 bcftools mpileup -r chr19 -Ou -C 50 -f $REF $BAMFILE_SORTED | bcftools call -O z --threads $NUM_THREADS -v -V indels -m -P 0 > chr19_${VCF_FILE}.gz &
 bcftools mpileup -r chr20 -Ou -C 50 -f $REF $BAMFILE_SORTED | bcftools call -O z --threads $NUM_THREADS -v -V indels -m -P 0 > chr20_${VCF_FILE}.gz &
 bcftools mpileup -r chr21 -Ou -C 50 -f $REF $BAMFILE_SORTED | bcftools call -O z --threads $NUM_THREADS -v -V indels -m -P 0 > chr21_${VCF_FILE}.gz &
 bcftools mpileup -r chr22 -Ou -C 50 -f $REF $BAMFILE_SORTED | bcftools call -O z --threads $NUM_THREADS -v -V indels -m -P 0 > chr22_${VCF_FILE}.gz &
 bcftools mpileup -r chrX -Ou -C 50 -f $REF $BAMFILE_SORTED | bcftools call -O z --threads $NUM_THREADS -v -V indels -m -P 0  > chrX_${VCF_FILE}.gz &
 bcftools mpileup -r chrY -Ou -C 50 -f $REF $BAMFILE_SORTED | bcftools call -O z --threads $NUM_THREADS -v -V indels -m -P 0  > chrY_${VCF_FILE}.gz &
 wait

 # Concatenate all chromosome VCFs to one big file
 bcftools concat -O z chr[1-9]_${VCF_FILE}.gz chr[1-2][0-9]_${VCF_FILE}.gz chr[M,X-Y]_${VCF_FILE}.gz > ${VCF_FILE}.gz
 tabix ${VCF_FILE}.gz

 # Delete no longer needed VCFs
 rm -f chr[0-9]_${VCF_FILE}.gz chr[1-2][0-9]_${VCF_FILE}.gz chrX_${VCF_FILE}.gz



 END=$(date +%s.%N)

 DIFF=$(echo "$END - $START" | bc)
 echo ${DIFF}
	#!/bin/bash

	START=$(date +%s.%N)
	clear
	# setup parameters

	YSEQID=${PWD##*/}
	# YSEQID="1234" # (the above command simply gets the name of the last segment of the current working directory)

	NUM_THREADS=$(getconf _NPROCESSORS_ONLN)
	echo "We can use ${NUM_THREADS} threads."

	BWA='bwa'
	if test -x "/usr/local/bin/bwa-mem2"; then
	echo "We can use bwa-mem2"
	BWA='bwa-mem2'
	fi


	UPS='upload.server.yseq.net'

	# Create a random password
	PASSWD=`date +%s \| sha256sum \| base64 \| head -c 8`
	echo "Random password created: ${PASSWD}"

	# Generate summary file
	cat /genomes/0/script-templates/x_result_summary.txt \| sed "s/<kitnumber>/${YSEQID}/g" \| sed "s/<passwd>/${PASSWD}/g" >${YSEQID}_result_summary.txt
	echo "${YSEQID}_result_summary.txt created"

	REF="/genomes/0/refseq/hg38/hg38.fa"

	READS_1="fastq/${YSEQID}_R1.fastq.gz"
	READS_2="fastq/${YSEQID}_R2.fastq.gz"
	#SAMFILE="${YSEQID}_bwa-mem_hg38.sam" # no longer needed due to piping
	BAMFILE="${YSEQID}_bwa-mem_hg38.bam"

	BAMFILE_SORTED="${YSEQID}_bwa-mem_hg38_sorted.bam"
	VCF_FILE="${YSEQID}_hg38.vcf"

	cp /genomes/0/tree/trees/yfull/latest_YFull_YTree.json .
	YFULLTREE="latest_YFull_YTree.json"

	cp /genomes/0/script-templates/cladeFinder.py .
	CLADEFINDER="cladeFinder.py"

	#ANN_VCF_FILE="${YSEQID}_hg38_dbSNP150_ann.vcf"
	#ANN_CLINVAR_VCF_FILE="${YSEQID}_hg38_clinvar_ann.vcf"

	#DBSNP="/genomes/0/refseq/hg19/All_chr_20161121.vcf.gz"

	# Read Group(s)
	# READ_GROUPS="\"@RG\tID:1\tSM:${YSEQID}\tPL:ILLUMINA\tPU:UNIT1\tLB:${YSEQID}\""

	if [[ $1 != "-resume" ]]
	then


	# Pipeline commands:
	${BWA} mem -t $NUM_THREADS -M $REF $READS_1 $READS_2 \| \
	samtools view -@ $NUM_THREADS -b -t $REF -o $BAMFILE -
	samtools sort -@ $NUM_THREADS -T /usr/local/geospiza/var/tmp/sorted -o $BAMFILE_SORTED $BAMFILE
	samtools index $BAMFILE_SORTED
	samtools idxstats $BAMFILE_SORTED > ${BAMFILE_SORTED}.idxstats.tsv
	fi

	# Update summary file with BAM filesize:
	BAM_FILESIZE=`du -kh "${BAMFILE_SORTED}" \| cut -f1`
	echo "Size of ${BAMFILE_SORTED} = ${BAM_FILESIZE}"
	cat ${YSEQID}_result_summary.txt \| sed "s/<bamsize>/${BAM_FILESIZE}byte/g" >${YSEQID}_result_summary.txt.tmp
	rm -f ${YSEQID}_result_summary.txt
	mv ${YSEQID}_result_summary.txt.tmp ${YSEQID}_result_summary.txt

	if [[ $1 != "-resume" ]]
	then

	# Delete no longer needed large files
	#rm -f $SAMFILE
	rm -f $BAMFILE # keep $BAMFILE_SORTED
	fi

	# Easy tview file
	echo "#!/bin/bash" > tview_${YSEQID}.sh
	echo "samtools tview ${BAMFILE_SORTED} ${REF}" >> tview_${YSEQID}.sh
	chmod a+x tview_${YSEQID}.sh


	if [[ $1 != "-resume" ]]
	then

	# Separate chrY & mtDNA BAM files
	samtools view -@ $NUM_THREADS -b -o "${YSEQID}_bwa-mem_hg38_chrY.bam" $BAMFILE_SORTED chrY &
	samtools view -@ $NUM_THREADS -b -o "${YSEQID}_bwa-mem_rCRS_chrM.bam" $BAMFILE_SORTED chrM &
	wait
	samtools index "${YSEQID}_bwa-mem_hg38_chrY.bam" &
	samtools index "${YSEQID}_bwa-mem_rCRS_chrM.bam" &
	wait


	#rsync -e "ssh" --progress -v ${YSEQID}_bwa-mem_rCRS_chrM.bam ${UPS}:/genomes/${YSEQID}/
	#rsync -e "ssh" --progress -v ${YSEQID}_bwa-mem_rCRS_chrM.bam.bai ${UPS}:/genomes/${YSEQID}/

	#rsync -e "ssh" --progress -v ${YSEQID}_bwa-mem_hg*_chrY.bam ${UPS}:/genomes/${YSEQID}/
	#rsync -e "ssh" --progress -v ${YSEQID}_bwa-mem_hg*_chrY.bam.bai ${UPS}:/genomes/${YSEQID}/

	# mtDNA allele calling $ FASTA file generation
	bcftools mpileup -r chrM -Ou -C 50 -f ${REF} ${BAMFILE_SORTED} \| bcftools call --threads $NUM_THREADS -O z -v -m -P 0 > chrM_${VCF_FILE}.gz
	tabix chrM_${VCF_FILE}.gz
	samtools faidx $REF chrM \| bcftools consensus chrM_${VCF_FILE}.gz -o ${YSEQID}_mtDNA.fasta


	# Annotate all known Y chromosome SNPs from Ybrowse
	wget http://ybrowse.org/gbrowse2/gff/snps_hg38.vcf.gz
	wget http://ybrowse.org/gbrowse2/gff/snps_hg38.vcf.gz.tbi

	bcftools mpileup -r chrY -C 50 --threads $NUM_THREADS -O z -f $REF $BAMFILE_SORTED > chrY_raw_${VCF_FILE}.gz
	tabix chrY_raw_${VCF_FILE}.gz

	bcftools merge -m both -O z snps_hg38.vcf.gz chrY_raw_${VCF_FILE}.gz > chrY_merged_${VCF_FILE}.gz
	tabix chrY_merged_${VCF_FILE}.gz
	echo "merging completed"

	bcftools call -O z -m -P 0 chrY_merged_${VCF_FILE}.gz > chrY_called_${VCF_FILE}.gz
	tabix chrY_called_${VCF_FILE}.gz
	echo "calling completed"

	bcftools filter -O z -i '%ID!="."' chrY_called_${VCF_FILE}.gz \| bcftools view -O z -s ^HARRY,ALIEN >chrY_cleaned_${VCF_FILE}.gz
	tabix chrY_cleaned_${VCF_FILE}.gz
	echo "cleanup completed"

	bcftools filter -O z -i '(GT=="1/1" && AA==REF) \|\| (GT=="0/0" && AA==ALT)' chrY_cleaned_${VCF_FILE}.gz >chrY_derived_${VCF_FILE}.gz &
	bcftools filter -O z -i '(GT=="0/0" && AA==REF) \|\| (GT=="1/1" && AA==ALT)' chrY_cleaned_${VCF_FILE}.gz >chrY_ancestral_${VCF_FILE}.gz &
	wait
	tabix chrY_derived_${VCF_FILE}.gz &
	tabix chrY_ancestral_${VCF_FILE}.gz &
	wait
	echo "ancestral & derived filters completed"

	# Find the Y haplogroup
	bcftools query -f '%ID,' chrY_derived_${VCF_FILE}.gz > ${YSEQID}_positives.txt &
	bcftools query -f '%ID,' chrY_ancestral_${VCF_FILE}.gz > ${YSEQID}_negatives.txt &
	wait
	fi


	python ${CLADEFINDER} ${YFULLTREE} ${YSEQID}_positives.txt ${YSEQID}_negatives.txt ${YSEQID}_cladeFinderOutput.csv
	echo "Y haplogroup checked"
	echo "The 5 best hits are:"
	head -n 5 ${YSEQID}_cladeFinderOutput.csv


	YFULLHG="unknown"
	YFULLPATH="unknown"
	OLDIFS=$IFS
	IFS=$'\t'
	LINECOUNTER=$((0))

	[ ! -f ${YSEQID}_cladeFinderOutput.csv ] && { echo "${YSEQID}_cladeFinderOutput.csv file not found"; }
	while read -r haplogroup path purity depth score conflicting_negatives
	do
	LINECOUNTER=$((LINECOUNTER + 1))
	if [ $LINECOUNTER -eq $((2)) ]
	then
	YFULLHG=$haplogroup
	YFULLPATH=$path
	fi

	done < ${YSEQID}_cladeFinderOutput.csv
	IFS=$OLDIFS


	echo "YFULLHG: ${YFULLHG}"
	echo "YFULLPATH: ${YFULLPATH}"

	# We're reading the first 100k sequences from the first fastq file to determine the average read length
	READLENGTH=$(samtools bam2fq -@ ${NUM_THREADS} ${BAMFILE_SORTED} \| head -n 100000 \| awk '{if(NR%4==2) {count++; bases += length} } END{print bases/count}')
	echo $READLENGTH

	# Calculating the coverage from the idxstats output
	COVERAGE_HG38=$(awk -v rl="$READLENGTH" '{x+=$2;m+=$3}END{print m*rl/x "x"}' < ${YSEQID}_bwa-mem_hg38_sorted.bam.idxstats.tsv)
	echo $COVERAGE_HG38

	cp /genomes/0/script-templates/getEquivalentAndDownstreamSNPs.py .
	PHYLOEQ_SNPS_FILE=${YSEQID}_PHYLOEQ_SNPS.tsv
	DOWNSTR_SNPS_FILE=${YSEQID}_DOWNSTR_SNPS.tsv
	python3 getEquivalentAndDownstreamSNPs.py ${YFULLTREE} "${YFULLHG}" chrY_cleaned_${VCF_FILE}.gz ${YSEQID}_positives.txt ${YSEQID}_negatives.txt ${PHYLOEQ_SNPS_FILE} ${DOWNSTR_SNPS_FILE} ${YSEQID}_bwa-mem_hg38_sorted.bam.idxstats.tsv

	cp /genomes/0/script-templates/getMTDNADifferences.py .
	MTDNA_SNPS_FILE=${YSEQID}_MTDNA_SNPS.tsv
	HAPLOGREP_JAR=haplogrep-2.1.25.jar
	cp /genomes/0/haplogrep/${HAPLOGREP_JAR} .

	MTHAPLOGROUP=$(python getMTDNADifferences.py -process chrM_${VCF_FILE}.gz ${HAPLOGREP_JAR} ${MTDNA_SNPS_FILE})


	# Update summary file with Y haplogroup:

	# Note: Since the YFull tree uses forward slashes for identical SNPs with different names, we replace the sed separator "/" with "\|"
	# This keeps us from escaping every forward slash. But don't confuse it with the pipe character in the same command!

	cat ${YSEQID}_result_summary.txt \| \
	sed "s\|<MTHAPLOGROUP>\|${MTHAPLOGROUP}\|g" \| \
	sed "s\|<YHAPLOGROUP>\|${YFULLHG}\|g" \| \
	sed "s\|<YFULLPATH>\|${YFULLPATH}\|g" \| \
	sed "s\|<COVERAGE_HG38>\|${COVERAGE_HG38}\|g" \| \
	sed "s\|<READLENGTH>\|${READLENGTH}\|g" \| \
	sed "s\|<TERMINALSNP>\|${YFULLHG}\|g" >${YSEQID}_result_summary.txt.tmp


	mv ${YSEQID}_result_summary.txt ${YSEQID}_result_summary.txt.backup

	mv ${YSEQID}_result_summary.txt.tmp ${YSEQID}_result_summary.txt

	bcftools filter -O z -i '(%ID==".") && TYPE="snp" && QUAL>=30 && GT=="1/1" && DP4[2] + DP4[3] >=2 && (DP4[0] + DP4[1]) / DP < 0.1' chrY_called_${VCF_FILE}.gz \| bcftools view -O z -s ^HARRY,ALIEN >chrY_novel_SNPs_${VCF_FILE}.gz
	tabix chrY_novel_SNPs_${VCF_FILE}.gz
	zcat chrY_novel_SNPs_${VCF_FILE}.gz \| grep -v "##" >chrY_novel_SNPs_${VCF_FILE}.tsv
	echo "novel SNP calling completed"

	# Create Novel SNP template after checking for unreliable ranges and identity within Y Chromosome
	cp /genomes/0/script-templates/identityResolutionTemplateCreator.py .
	python identityResolutionTemplateCreator.py -batch chrY_novel_SNPs_${VCF_FILE}.tsv chrY_novel_SNPs_${YSEQID}_hg38.tsv ${REF}

	bcftools filter -O z -i '(%ID==".") && TYPE="indel" && QUAL>=30 && GT=="1/1" && DP4[2] + DP4[3] >=2 && (DP4[0] + DP4[1]) / DP < 0.1' chrY_called_${VCF_FILE}.gz \| bcftools view -O z -s ^HARRY,ALIEN >chrY_INDELs_${VCF_FILE}.gz
	tabix chrY_INDELs_${VCF_FILE}.gz
	echo "INDEL calling completed"



	# Parallel SNP calling by chromosome
	bcftools mpileup -r chr1 -Ou -C 50 -f $REF $BAMFILE_SORTED \| bcftools call -O z --threads $NUM_THREADS -v -V indels -m -P 0 > chr1_${VCF_FILE}.gz &
	bcftools mpileup -r chr2 -Ou -C 50 -f $REF $BAMFILE_SORTED \| bcftools call -O z --threads $NUM_THREADS -v -V indels -m -P 0 > chr2_${VCF_FILE}.gz &
	bcftools mpileup -r chr3 -Ou -C 50 -f $REF $BAMFILE_SORTED \| bcftools call -O z --threads $NUM_THREADS -v -V indels -m -P 0 > chr3_${VCF_FILE}.gz &
	bcftools mpileup -r chr4 -Ou -C 50 -f $REF $BAMFILE_SORTED \| bcftools call -O z --threads $NUM_THREADS -v -V indels -m -P 0 > chr4_${VCF_FILE}.gz &
	bcftools mpileup -r chr5 -Ou -C 50 -f $REF $BAMFILE_SORTED \| bcftools call -O z --threads $NUM_THREADS -v -V indels -m -P 0 > chr5_${VCF_FILE}.gz &
	bcftools mpileup -r chr6 -Ou -C 50 -f $REF $BAMFILE_SORTED \| bcftools call -O z --threads $NUM_THREADS -v -V indels -m -P 0 > chr6_${VCF_FILE}.gz &
	bcftools mpileup -r chr7 -Ou -C 50 -f $REF $BAMFILE_SORTED \| bcftools call -O z --threads $NUM_THREADS -v -V indels -m -P 0 > chr7_${VCF_FILE}.gz &
	bcftools mpileup -r chr8 -Ou -C 50 -f $REF $BAMFILE_SORTED \| bcftools call -O z --threads $NUM_THREADS -v -V indels -m -P 0 > chr8_${VCF_FILE}.gz &
	bcftools mpileup -r chr9 -Ou -C 50 -f $REF $BAMFILE_SORTED \| bcftools call -O z --threads $NUM_THREADS -v -V indels -m -P 0 > chr9_${VCF_FILE}.gz &
	bcftools mpileup -r chr10 -Ou -C 50 -f $REF $BAMFILE_SORTED \| bcftools call -O z --threads $NUM_THREADS -v -V indels -m -P 0 > chr10_${VCF_FILE}.gz &
	bcftools mpileup -r chr11 -Ou -C 50 -f $REF $BAMFILE_SORTED \| bcftools call -O z --threads $NUM_THREADS -v -V indels -m -P 0 > chr11_${VCF_FILE}.gz &
	bcftools mpileup -r chr12 -Ou -C 50 -f $REF $BAMFILE_SORTED \| bcftools call -O z --threads $NUM_THREADS -v -V indels -m -P 0 > chr12_${VCF_FILE}.gz &
	bcftools mpileup -r chr13 -Ou -C 50 -f $REF $BAMFILE_SORTED \| bcftools call -O z --threads $NUM_THREADS -v -V indels -m -P 0 > chr13_${VCF_FILE}.gz &
	bcftools mpileup -r chr14 -Ou -C 50 -f $REF $BAMFILE_SORTED \| bcftools call -O z --threads $NUM_THREADS -v -V indels -m -P 0 > chr14_${VCF_FILE}.gz &
	bcftools mpileup -r chr15 -Ou -C 50 -f $REF $BAMFILE_SORTED \| bcftools call -O z --threads $NUM_THREADS -v -V indels -m -P 0 > chr15_${VCF_FILE}.gz &
	bcftools mpileup -r chr16 -Ou -C 50 -f $REF $BAMFILE_SORTED \| bcftools call -O z --threads $NUM_THREADS -v -V indels -m -P 0 > chr16_${VCF_FILE}.gz &
	bcftools mpileup -r chr17 -Ou -C 50 -f $REF $BAMFILE_SORTED \| bcftools call -O z --threads $NUM_THREADS -v -V indels -m -P 0 > chr17_${VCF_FILE}.gz &
	bcftools mpileup -r chr18 -Ou -C 50 -f $REF $BAMFILE_SORTED \| bcftools call -O z --threads $NUM_THREADS -v -V indels -m -P 0 > chr18_${VCF_FILE}.gz &
	bcftools mpileup -r chr19 -Ou -C 50 -f $REF $BAMFILE_SORTED \| bcftools call -O z --threads $NUM_THREADS -v -V indels -m -P 0 > chr19_${VCF_FILE}.gz &
	bcftools mpileup -r chr20 -Ou -C 50 -f $REF $BAMFILE_SORTED \| bcftools call -O z --threads $NUM_THREADS -v -V indels -m -P 0 > chr20_${VCF_FILE}.gz &
	bcftools mpileup -r chr21 -Ou -C 50 -f $REF $BAMFILE_SORTED \| bcftools call -O z --threads $NUM_THREADS -v -V indels -m -P 0 > chr21_${VCF_FILE}.gz &
	bcftools mpileup -r chr22 -Ou -C 50 -f $REF $BAMFILE_SORTED \| bcftools call -O z --threads $NUM_THREADS -v -V indels -m -P 0 > chr22_${VCF_FILE}.gz &
	bcftools mpileup -r chrX -Ou -C 50 -f $REF $BAMFILE_SORTED \| bcftools call -O z --threads $NUM_THREADS -v -V indels -m -P 0 > chrX_${VCF_FILE}.gz &
	bcftools mpileup -r chrY -Ou -C 50 -f $REF $BAMFILE_SORTED \| bcftools call -O z --threads $NUM_THREADS -v -V indels -m -P 0 > chrY_${VCF_FILE}.gz &
	wait

	# Concatenate all chromosome VCFs to one big file
	bcftools concat -O z chr[1-9]_${VCF_FILE}.gz chr[1-2][0-9]_${VCF_FILE}.gz chr[M,X-Y]_${VCF_FILE}.gz > ${VCF_FILE}.gz
	tabix ${VCF_FILE}.gz

	# Delete no longer needed VCFs
	rm -f chr[0-9]_${VCF_FILE}.gz chr[1-2][0-9]_${VCF_FILE}.gz chrX_${VCF_FILE}.gz



	END=$(date +%s.%N)

	DIFF=$(echo "$END - $START" \| bc)
	echo ${DIFF}