Created
July 27, 2020 18:25
-
-
Save tkrahn/7dfc51c2bb97a6d654378a21ea0a96d4 to your computer and use it in GitHub Desktop.
YSEQ WGS processing pipeline with which we are currently analyzing FastQ files.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash | |
START=$(date +%s.%N) | |
clear | |
# setup parameters | |
YSEQID=${PWD##*/} | |
# YSEQID="1234" # (the above command simply gets the name of the last segment of the current working directory) | |
NUM_THREADS=$(getconf _NPROCESSORS_ONLN) | |
echo "We can use ${NUM_THREADS} threads." | |
BWA='bwa' | |
if test -x "/usr/local/bin/bwa-mem2"; then | |
echo "We can use bwa-mem2" | |
BWA='bwa-mem2' | |
fi | |
UPS='upload.server.yseq.net' | |
# Create a random password | |
PASSWD=`date +%s | sha256sum | base64 | head -c 8` | |
echo "Random password created: ${PASSWD}" | |
# Generate summary file | |
cat /genomes/0/script-templates/x_result_summary.txt | sed "s/<kitnumber>/${YSEQID}/g" | sed "s/<passwd>/${PASSWD}/g" >${YSEQID}_result_summary.txt | |
echo "${YSEQID}_result_summary.txt created" | |
REF="/genomes/0/refseq/hg38/hg38.fa" | |
READS_1="fastq/${YSEQID}_R1.fastq.gz" | |
READS_2="fastq/${YSEQID}_R2.fastq.gz" | |
#SAMFILE="${YSEQID}_bwa-mem_hg38.sam" # no longer needed due to piping | |
BAMFILE="${YSEQID}_bwa-mem_hg38.bam" | |
BAMFILE_SORTED="${YSEQID}_bwa-mem_hg38_sorted.bam" | |
VCF_FILE="${YSEQID}_hg38.vcf" | |
cp /genomes/0/tree/trees/yfull/latest_YFull_YTree.json . | |
YFULLTREE="latest_YFull_YTree.json" | |
cp /genomes/0/script-templates/cladeFinder.py . | |
CLADEFINDER="cladeFinder.py" | |
#ANN_VCF_FILE="${YSEQID}_hg38_dbSNP150_ann.vcf" | |
#ANN_CLINVAR_VCF_FILE="${YSEQID}_hg38_clinvar_ann.vcf" | |
#DBSNP="/genomes/0/refseq/hg19/All_chr_20161121.vcf.gz" | |
# Read Group(s) | |
# READ_GROUPS="\"@RG\tID:1\tSM:${YSEQID}\tPL:ILLUMINA\tPU:UNIT1\tLB:${YSEQID}\"" | |
if [[ $1 != "-resume" ]] | |
then | |
# Pipeline commands: | |
${BWA} mem -t $NUM_THREADS -M $REF $READS_1 $READS_2 | \ | |
samtools view -@ $NUM_THREADS -b -t $REF -o $BAMFILE - | |
samtools sort -@ $NUM_THREADS -T /usr/local/geospiza/var/tmp/sorted -o $BAMFILE_SORTED $BAMFILE | |
samtools index $BAMFILE_SORTED | |
samtools idxstats $BAMFILE_SORTED > ${BAMFILE_SORTED}.idxstats.tsv | |
fi | |
# Update summary file with BAM filesize: | |
BAM_FILESIZE=`du -kh "${BAMFILE_SORTED}" | cut -f1` | |
echo "Size of ${BAMFILE_SORTED} = ${BAM_FILESIZE}" | |
cat ${YSEQID}_result_summary.txt | sed "s/<bamsize>/${BAM_FILESIZE}byte/g" >${YSEQID}_result_summary.txt.tmp | |
rm -f ${YSEQID}_result_summary.txt | |
mv ${YSEQID}_result_summary.txt.tmp ${YSEQID}_result_summary.txt | |
if [[ $1 != "-resume" ]] | |
then | |
# Delete no longer needed large files | |
#rm -f $SAMFILE | |
rm -f $BAMFILE # keep $BAMFILE_SORTED | |
fi | |
# Easy tview file | |
echo "#!/bin/bash" > tview_${YSEQID}.sh | |
echo "samtools tview ${BAMFILE_SORTED} ${REF}" >> tview_${YSEQID}.sh | |
chmod a+x tview_${YSEQID}.sh | |
if [[ $1 != "-resume" ]] | |
then | |
# Separate chrY & mtDNA BAM files | |
samtools view -@ $NUM_THREADS -b -o "${YSEQID}_bwa-mem_hg38_chrY.bam" $BAMFILE_SORTED chrY & | |
samtools view -@ $NUM_THREADS -b -o "${YSEQID}_bwa-mem_rCRS_chrM.bam" $BAMFILE_SORTED chrM & | |
wait | |
samtools index "${YSEQID}_bwa-mem_hg38_chrY.bam" & | |
samtools index "${YSEQID}_bwa-mem_rCRS_chrM.bam" & | |
wait | |
#rsync -e "ssh" --progress -v ${YSEQID}_bwa-mem_rCRS_chrM.bam ${UPS}:/genomes/${YSEQID}/ | |
#rsync -e "ssh" --progress -v ${YSEQID}_bwa-mem_rCRS_chrM.bam.bai ${UPS}:/genomes/${YSEQID}/ | |
#rsync -e "ssh" --progress -v ${YSEQID}_bwa-mem_hg*_chrY.bam ${UPS}:/genomes/${YSEQID}/ | |
#rsync -e "ssh" --progress -v ${YSEQID}_bwa-mem_hg*_chrY.bam.bai ${UPS}:/genomes/${YSEQID}/ | |
# mtDNA allele calling $ FASTA file generation | |
bcftools mpileup -r chrM -Ou -C 50 -f ${REF} ${BAMFILE_SORTED} | bcftools call --threads $NUM_THREADS -O z -v -m -P 0 > chrM_${VCF_FILE}.gz | |
tabix chrM_${VCF_FILE}.gz | |
samtools faidx $REF chrM | bcftools consensus chrM_${VCF_FILE}.gz -o ${YSEQID}_mtDNA.fasta | |
# Annotate all known Y chromosome SNPs from Ybrowse | |
wget http://ybrowse.org/gbrowse2/gff/snps_hg38.vcf.gz | |
wget http://ybrowse.org/gbrowse2/gff/snps_hg38.vcf.gz.tbi | |
bcftools mpileup -r chrY -C 50 --threads $NUM_THREADS -O z -f $REF $BAMFILE_SORTED > chrY_raw_${VCF_FILE}.gz | |
tabix chrY_raw_${VCF_FILE}.gz | |
bcftools merge -m both -O z snps_hg38.vcf.gz chrY_raw_${VCF_FILE}.gz > chrY_merged_${VCF_FILE}.gz | |
tabix chrY_merged_${VCF_FILE}.gz | |
echo "merging completed" | |
bcftools call -O z -m -P 0 chrY_merged_${VCF_FILE}.gz > chrY_called_${VCF_FILE}.gz | |
tabix chrY_called_${VCF_FILE}.gz | |
echo "calling completed" | |
bcftools filter -O z -i '%ID!="."' chrY_called_${VCF_FILE}.gz | bcftools view -O z -s ^HARRY,ALIEN >chrY_cleaned_${VCF_FILE}.gz | |
tabix chrY_cleaned_${VCF_FILE}.gz | |
echo "cleanup completed" | |
bcftools filter -O z -i '(GT=="1/1" && AA==REF) || (GT=="0/0" && AA==ALT)' chrY_cleaned_${VCF_FILE}.gz >chrY_derived_${VCF_FILE}.gz & | |
bcftools filter -O z -i '(GT=="0/0" && AA==REF) || (GT=="1/1" && AA==ALT)' chrY_cleaned_${VCF_FILE}.gz >chrY_ancestral_${VCF_FILE}.gz & | |
wait | |
tabix chrY_derived_${VCF_FILE}.gz & | |
tabix chrY_ancestral_${VCF_FILE}.gz & | |
wait | |
echo "ancestral & derived filters completed" | |
# Find the Y haplogroup | |
bcftools query -f '%ID,' chrY_derived_${VCF_FILE}.gz > ${YSEQID}_positives.txt & | |
bcftools query -f '%ID,' chrY_ancestral_${VCF_FILE}.gz > ${YSEQID}_negatives.txt & | |
wait | |
fi | |
python ${CLADEFINDER} ${YFULLTREE} ${YSEQID}_positives.txt ${YSEQID}_negatives.txt ${YSEQID}_cladeFinderOutput.csv | |
echo "Y haplogroup checked" | |
echo "The 5 best hits are:" | |
head -n 5 ${YSEQID}_cladeFinderOutput.csv | |
YFULLHG="unknown" | |
YFULLPATH="unknown" | |
OLDIFS=$IFS | |
IFS=$'\t' | |
LINECOUNTER=$((0)) | |
[ ! -f ${YSEQID}_cladeFinderOutput.csv ] && { echo "${YSEQID}_cladeFinderOutput.csv file not found"; } | |
while read -r haplogroup path purity depth score conflicting_negatives | |
do | |
LINECOUNTER=$((LINECOUNTER + 1)) | |
if [ $LINECOUNTER -eq $((2)) ] | |
then | |
YFULLHG=$haplogroup | |
YFULLPATH=$path | |
fi | |
done < ${YSEQID}_cladeFinderOutput.csv | |
IFS=$OLDIFS | |
echo "YFULLHG: ${YFULLHG}" | |
echo "YFULLPATH: ${YFULLPATH}" | |
# We're reading the first 100k sequences from the first fastq file to determine the average read length | |
READLENGTH=$(samtools bam2fq -@ ${NUM_THREADS} ${BAMFILE_SORTED} | head -n 100000 | awk '{if(NR%4==2) {count++; bases += length} } END{print bases/count}') | |
echo $READLENGTH | |
# Calculating the coverage from the idxstats output | |
COVERAGE_HG38=$(awk -v rl="$READLENGTH" '{x+=$2;m+=$3}END{print m*rl/x "x"}' < ${YSEQID}_bwa-mem_hg38_sorted.bam.idxstats.tsv) | |
echo $COVERAGE_HG38 | |
cp /genomes/0/script-templates/getEquivalentAndDownstreamSNPs.py . | |
PHYLOEQ_SNPS_FILE=${YSEQID}_PHYLOEQ_SNPS.tsv | |
DOWNSTR_SNPS_FILE=${YSEQID}_DOWNSTR_SNPS.tsv | |
python3 getEquivalentAndDownstreamSNPs.py ${YFULLTREE} "${YFULLHG}" chrY_cleaned_${VCF_FILE}.gz ${YSEQID}_positives.txt ${YSEQID}_negatives.txt ${PHYLOEQ_SNPS_FILE} ${DOWNSTR_SNPS_FILE} ${YSEQID}_bwa-mem_hg38_sorted.bam.idxstats.tsv | |
cp /genomes/0/script-templates/getMTDNADifferences.py . | |
MTDNA_SNPS_FILE=${YSEQID}_MTDNA_SNPS.tsv | |
HAPLOGREP_JAR=haplogrep-2.1.25.jar | |
cp /genomes/0/haplogrep/${HAPLOGREP_JAR} . | |
MTHAPLOGROUP=$(python getMTDNADifferences.py -process chrM_${VCF_FILE}.gz ${HAPLOGREP_JAR} ${MTDNA_SNPS_FILE}) | |
# Update summary file with Y haplogroup: | |
# Note: Since the YFull tree uses forward slashes for identical SNPs with different names, we replace the sed separator "/" with "|" | |
# This keeps us from escaping every forward slash. But don't confuse it with the pipe character in the same command! | |
cat ${YSEQID}_result_summary.txt | \ | |
sed "s|<MTHAPLOGROUP>|${MTHAPLOGROUP}|g" | \ | |
sed "s|<YHAPLOGROUP>|${YFULLHG}|g" | \ | |
sed "s|<YFULLPATH>|${YFULLPATH}|g" | \ | |
sed "s|<COVERAGE_HG38>|${COVERAGE_HG38}|g" | \ | |
sed "s|<READLENGTH>|${READLENGTH}|g" | \ | |
sed "s|<TERMINALSNP>|${YFULLHG}|g" >${YSEQID}_result_summary.txt.tmp | |
mv ${YSEQID}_result_summary.txt ${YSEQID}_result_summary.txt.backup | |
mv ${YSEQID}_result_summary.txt.tmp ${YSEQID}_result_summary.txt | |
bcftools filter -O z -i '(%ID==".") && TYPE="snp" && QUAL>=30 && GT=="1/1" && DP4[2] + DP4[3] >=2 && (DP4[0] + DP4[1]) / DP < 0.1' chrY_called_${VCF_FILE}.gz | bcftools view -O z -s ^HARRY,ALIEN >chrY_novel_SNPs_${VCF_FILE}.gz | |
tabix chrY_novel_SNPs_${VCF_FILE}.gz | |
zcat chrY_novel_SNPs_${VCF_FILE}.gz | grep -v "##" >chrY_novel_SNPs_${VCF_FILE}.tsv | |
echo "novel SNP calling completed" | |
# Create Novel SNP template after checking for unreliable ranges and identity within Y Chromosome | |
cp /genomes/0/script-templates/identityResolutionTemplateCreator.py . | |
python identityResolutionTemplateCreator.py -batch chrY_novel_SNPs_${VCF_FILE}.tsv chrY_novel_SNPs_${YSEQID}_hg38.tsv ${REF} | |
bcftools filter -O z -i '(%ID==".") && TYPE="indel" && QUAL>=30 && GT=="1/1" && DP4[2] + DP4[3] >=2 && (DP4[0] + DP4[1]) / DP < 0.1' chrY_called_${VCF_FILE}.gz | bcftools view -O z -s ^HARRY,ALIEN >chrY_INDELs_${VCF_FILE}.gz | |
tabix chrY_INDELs_${VCF_FILE}.gz | |
echo "INDEL calling completed" | |
# Parallel SNP calling by chromosome | |
bcftools mpileup -r chr1 -Ou -C 50 -f $REF $BAMFILE_SORTED | bcftools call -O z --threads $NUM_THREADS -v -V indels -m -P 0 > chr1_${VCF_FILE}.gz & | |
bcftools mpileup -r chr2 -Ou -C 50 -f $REF $BAMFILE_SORTED | bcftools call -O z --threads $NUM_THREADS -v -V indels -m -P 0 > chr2_${VCF_FILE}.gz & | |
bcftools mpileup -r chr3 -Ou -C 50 -f $REF $BAMFILE_SORTED | bcftools call -O z --threads $NUM_THREADS -v -V indels -m -P 0 > chr3_${VCF_FILE}.gz & | |
bcftools mpileup -r chr4 -Ou -C 50 -f $REF $BAMFILE_SORTED | bcftools call -O z --threads $NUM_THREADS -v -V indels -m -P 0 > chr4_${VCF_FILE}.gz & | |
bcftools mpileup -r chr5 -Ou -C 50 -f $REF $BAMFILE_SORTED | bcftools call -O z --threads $NUM_THREADS -v -V indels -m -P 0 > chr5_${VCF_FILE}.gz & | |
bcftools mpileup -r chr6 -Ou -C 50 -f $REF $BAMFILE_SORTED | bcftools call -O z --threads $NUM_THREADS -v -V indels -m -P 0 > chr6_${VCF_FILE}.gz & | |
bcftools mpileup -r chr7 -Ou -C 50 -f $REF $BAMFILE_SORTED | bcftools call -O z --threads $NUM_THREADS -v -V indels -m -P 0 > chr7_${VCF_FILE}.gz & | |
bcftools mpileup -r chr8 -Ou -C 50 -f $REF $BAMFILE_SORTED | bcftools call -O z --threads $NUM_THREADS -v -V indels -m -P 0 > chr8_${VCF_FILE}.gz & | |
bcftools mpileup -r chr9 -Ou -C 50 -f $REF $BAMFILE_SORTED | bcftools call -O z --threads $NUM_THREADS -v -V indels -m -P 0 > chr9_${VCF_FILE}.gz & | |
bcftools mpileup -r chr10 -Ou -C 50 -f $REF $BAMFILE_SORTED | bcftools call -O z --threads $NUM_THREADS -v -V indels -m -P 0 > chr10_${VCF_FILE}.gz & | |
bcftools mpileup -r chr11 -Ou -C 50 -f $REF $BAMFILE_SORTED | bcftools call -O z --threads $NUM_THREADS -v -V indels -m -P 0 > chr11_${VCF_FILE}.gz & | |
bcftools mpileup -r chr12 -Ou -C 50 -f $REF $BAMFILE_SORTED | bcftools call -O z --threads $NUM_THREADS -v -V indels -m -P 0 > chr12_${VCF_FILE}.gz & | |
bcftools mpileup -r chr13 -Ou -C 50 -f $REF $BAMFILE_SORTED | bcftools call -O z --threads $NUM_THREADS -v -V indels -m -P 0 > chr13_${VCF_FILE}.gz & | |
bcftools mpileup -r chr14 -Ou -C 50 -f $REF $BAMFILE_SORTED | bcftools call -O z --threads $NUM_THREADS -v -V indels -m -P 0 > chr14_${VCF_FILE}.gz & | |
bcftools mpileup -r chr15 -Ou -C 50 -f $REF $BAMFILE_SORTED | bcftools call -O z --threads $NUM_THREADS -v -V indels -m -P 0 > chr15_${VCF_FILE}.gz & | |
bcftools mpileup -r chr16 -Ou -C 50 -f $REF $BAMFILE_SORTED | bcftools call -O z --threads $NUM_THREADS -v -V indels -m -P 0 > chr16_${VCF_FILE}.gz & | |
bcftools mpileup -r chr17 -Ou -C 50 -f $REF $BAMFILE_SORTED | bcftools call -O z --threads $NUM_THREADS -v -V indels -m -P 0 > chr17_${VCF_FILE}.gz & | |
bcftools mpileup -r chr18 -Ou -C 50 -f $REF $BAMFILE_SORTED | bcftools call -O z --threads $NUM_THREADS -v -V indels -m -P 0 > chr18_${VCF_FILE}.gz & | |
bcftools mpileup -r chr19 -Ou -C 50 -f $REF $BAMFILE_SORTED | bcftools call -O z --threads $NUM_THREADS -v -V indels -m -P 0 > chr19_${VCF_FILE}.gz & | |
bcftools mpileup -r chr20 -Ou -C 50 -f $REF $BAMFILE_SORTED | bcftools call -O z --threads $NUM_THREADS -v -V indels -m -P 0 > chr20_${VCF_FILE}.gz & | |
bcftools mpileup -r chr21 -Ou -C 50 -f $REF $BAMFILE_SORTED | bcftools call -O z --threads $NUM_THREADS -v -V indels -m -P 0 > chr21_${VCF_FILE}.gz & | |
bcftools mpileup -r chr22 -Ou -C 50 -f $REF $BAMFILE_SORTED | bcftools call -O z --threads $NUM_THREADS -v -V indels -m -P 0 > chr22_${VCF_FILE}.gz & | |
bcftools mpileup -r chrX -Ou -C 50 -f $REF $BAMFILE_SORTED | bcftools call -O z --threads $NUM_THREADS -v -V indels -m -P 0 > chrX_${VCF_FILE}.gz & | |
bcftools mpileup -r chrY -Ou -C 50 -f $REF $BAMFILE_SORTED | bcftools call -O z --threads $NUM_THREADS -v -V indels -m -P 0 > chrY_${VCF_FILE}.gz & | |
wait | |
# Concatenate all chromosome VCFs to one big file | |
bcftools concat -O z chr[1-9]_${VCF_FILE}.gz chr[1-2][0-9]_${VCF_FILE}.gz chr[M,X-Y]_${VCF_FILE}.gz > ${VCF_FILE}.gz | |
tabix ${VCF_FILE}.gz | |
# Delete no longer needed VCFs | |
rm -f chr[0-9]_${VCF_FILE}.gz chr[1-2][0-9]_${VCF_FILE}.gz chrX_${VCF_FILE}.gz | |
END=$(date +%s.%N) | |
DIFF=$(echo "$END - $START" | bc) | |
echo ${DIFF} | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment