Skip to content

Instantly share code, notes, and snippets.

@gatoravi
Last active May 13, 2017 05:02
Show Gist options
  • Save gatoravi/f774aa39349b1e414d23 to your computer and use it in GitHub Desktop.
Save gatoravi/f774aa39349b1e414d23 to your computer and use it in GitHub Desktop.
A script to generate kmer counts in a region for trios.
#Avinash Ramu, Conrad Lab, WUSTL
#Estimate the kmer statistic in a specified window
#use overlapping windows to capture the kmers at the edges. Make sure the start of the read lies within the window.
#Arguments - a region in the form "chr\tstart\tstop" i.e $1=chr $2=start $3=end
#Updates
#modified - 6/13 made it $6 - ($7 + $8 ) in other words I removed /2
#modified - 6/14 cleanup the code.
#modified - 6/19 add a column for the reference kmer counts
#modified - 7/8 clip the ends of sequences that don't fall within a window.
#! /bin/bash
chr=$1
start=$2
end=$3
#echo $chr"\t"$start"\t"$end
kmer_RD_cutoff=5 #consider a kmer only if it has atleast a total of 5 reads across all three samples.
bam1=~/files/20110915_CEUtrio/WGS/BAMFiles/CEUTrio.HiSeq.WGS.b37_decoy.NA12878.clean.dedup.recal.bam
bam2=~/files/20110915_CEUtrio/WGS/BAMFiles/CEUTrio.HiSeq.WGS.b37_decoy.NA12891.clean.dedup.recal.bam
bam3=~/files/20110915_CEUtrio/WGS/BAMFiles/CEUTrio.HiSeq.WGS.b37_decoy.NA12892.clean.dedup.recal.bam
ref=~/files/20110915_CEUtrio/WGS/ref/hs37d5.fa
hwindow=100 #hundred base pairs on either side of the mutation, so window size = 200
k=10 #length of kmer
n=10 #number of lines to average the kmer statistic over, for example take the average of the statistic in the first k and last k sorted lines (divide by 2*k)
readLength=100 #average length of the read
MQc=0 #mapping quality cutoff
SAMDIR=./SAM
FASTQDIR=./FASTQ
JFbinDIR=./JFBin
JFcountDIR=./JFCount
JoinedDIR=./Joined
MiscDIR=./Temp
RefDIR=./Ref
if [ ! -d "$SAMDIR" ]; then
mkdir $SAMDIR
fi
if [ ! -d "$FASTQDIR" ]; then
mkdir $FASTQDIR
fi
if [ ! -d "$JFbinDIR" ]; then
mkdir $JFbinDIR
fi
if [ ! -d "$JFcountDIR" ]; then
mkdir $JFcountDIR
fi
if [ ! -d "$JoinedDIR" ]; then
mkdir $JoinedDIR
fi
if [ ! -d "$MiscDIR" ]; then
mkdir $MiscDIR
fi
if [ ! -d "$RefDIR" ]; then
mkdir $RefDIR
fi
samop1=$SAMDIR/s1.${start}.sam
samop2=$SAMDIR/s2.${start}.sam
samop3=$SAMDIR/s3.${start}.sam
fastqop1=$FASTQDIR/s1.${start}.fastq
fastqop2=$FASTQDIR/s2.${start}.fastq
fastqop3=$FASTQDIR/s3.${start}.fastq
refSeq=$RefDIR/${start}.seq
JFbin1=$JFbinDIR/s1.${start}.mer
JFbin2=$JFbinDIR/s2.${start}.mer
JFbin3=$JFbinDIR/s3.${start}.mer
JFbinref=$JFbinDIR/ref.${start}.mer
JFcount1=$JFcountDIR/s1.${start}.counts.txt
JFcount2=$JFcountDIR/s2.${start}.counts.txt
JFcount3=$JFcountDIR/s3.${start}.counts.txt
JFcountref=$JFcountDIR/ref.${start}.counts.txt
JFsortedcount1=$JFcountDIR/s1.${start}.sorted.counts.txt
JFsortedcount2=$JFcountDIR/s2.${start}.sorted.counts.txt
JFsortedcount3=$JFcountDIR/s3.${start}.sorted.counts.txt
JFsortedcountref=$JFcountDIR/ref.${start}.sorted.counts.txt
joinedcounts=$JoinedDIR/joined.${start}.txt
tempJoin1=$MiscDIR/temp.${start}.1.txt
tempJoin2=$MiscDIR/temp.${start}.2.txt
tempJoin3=$MiscDIR/temp.${start}.3.txt
joinedStatCounts=$JoinedDIR/joined.${start}.stats.txt
joinedSortedStatCounts=$JoinedDIR/joined.sorted.${start}.stats.txt
locus=$chr":"$start"-"$end
endref=$(($end + $readLength))
samtools view $bam1 $locus > $samop1
samtools view $bam2 $locus > $samop2
samtools view $bam3 $locus > $samop3
if [ ! -s $samop1 ] || [ ! -s $samop2 ] || [ ! -s $samop3 ]; then #some of the samples don't have reads.
rm $samop1 $samop2 $samop3
echo "Exiting ! No BAM !" >&2
exit
fi
awk -v start=$start -v end=$end -v MQc=$MQc '{ seq = $10; if($5 >= MQc && $6 !~ /S/) { if($4 < start) { seq = substr($10, start - $4 +1); } else if($4 + length($10) - 1 > end) { seq = substr($10, 1, end - $4 +1); } print ">"NR":"$4"\n"seq } }' $samop1 > $fastqop1
awk -v start=$start -v end=$end -v MQc=$MQc '{ seq = $10; if($5 >= MQc && $6 !~ /S/) { if($4 < start) { seq = substr($10, start - $4 +1); } else if($4 + length($10) - 1 > end) { seq = substr($10, 1, end - $4 +1); } print ">"NR":"$4"\n"seq } }' $samop2 > $fastqop2
awk -v start=$start -v end=$end -v MQc=$MQc '{ seq = $10; if($5 >= MQc && $6 !~ /S/) { if($4 < start) { seq = substr($10, start - $4 +1); } else if($4 + length($10) - 1 > end) { seq = substr($10, 1, end - $4 +1); } print ">"NR":"$4"\n"seq } }' $samop3 > $fastqop3
locusRef=$chr":"$start"-"$endref
samtools faidx $ref $locusRef > $refSeq
if [ ! -s $refSeq ]; then #no reference fasta.
echo "Exiting ! No Reference Fasta !" >&2
exit
fi
if [ ! -s $fastqop1 ] || [ ! -s $fastqop2 ] || [ ! -s $fastqop3 ]; then #some of the samples don't have FASTQ's
rm $fastqop1 $fastqop2 $fastqop3
echo "Exiting ! No FASTQ !" >&2
exit
fi
#Jellyfish Count
~/bin/jellyfish count -m $k -s 100000 $fastqop1 -o $JFbin1
~/bin/jellyfish count -m $k -s 100000 $fastqop2 -o $JFbin2
~/bin/jellyfish count -m $k -s 100000 $fastqop3 -o $JFbin3
~/bin/jellyfish count -m $k -s 100000 $refSeq -o $JFbinref
#Jellyfish dump
~/bin/jellyfish dump ${JFbin1} -c -o $JFcount1
~/bin/jellyfish dump ${JFbin2} -c -o $JFcount2
~/bin/jellyfish dump ${JFbin3} -c -o $JFcount3
~/bin/jellyfish dump ${JFbinref} -c -o $JFcountref
#sort the counts based on kmer-string
sort $JFcount1 > $JFsortedcount1
sort $JFcount2 > $JFsortedcount2
sort $JFcount3 > $JFsortedcount3
sort $JFcountref > $JFsortedcountref
#Join mom, dad and child counts
join -e 0 -o '0 1.2 2.2' -a1 -a2 $JFsortedcount1 $JFsortedcount2 > $tempJoin1
join -e 0 -o '0 1.2 1.3 2.2' -a1 -a2 $tempJoin1 $JFsortedcount3 > $tempJoin2
join -e 0 -o '0 1.2 1.3 1.4 2.2' -a1 -a2 $tempJoin2 $JFsortedcountref > $joinedcounts
#For each kmer, calculate fraction of counts coming from mom, dad and child. The last column is child_fraction - (mom_fraction + dad_fraction) [ $9=$6-($7+$8)]
awk -v RDC=$kmer_RD_cutoff '{ totalC = $2+$3+$4; if(totalC == 0) { $6 = 0; $7 = 0; $8 = 0; } else { $6=$2/totalC; $7=$3/totalC; $8=$4/totalC; } $9=$6-($7+$8); if(totalC>RDC) print }' $joinedcounts > $joinedStatCounts
sort -gk9 $joinedStatCounts > $joinedSortedStatCounts
#present in reference
awk '$5==1' $joinedSortedStatCounts | head -n $n > $tempJoin1
#not present in reference
awk '$5==0' $joinedSortedStatCounts | tail -n $n > $tempJoin2
cat $tempJoin1 $tempJoin2 > $tempJoin3
#This is the part that calculates the test-statistic of interest.
#without modulus for $NF and check if "in reference" for min and if "not in reference" for max
#the file is sorted by the last column, average first n lines of this column and the last n lines and play around with these values to refine signal.
awk -v s=$start -v n=$n 'BEGIN { sum_min = 0; sum_max = 0; } { if(NR<=n && $5 == 1) { sum_min+=$NF; min_c+=1; } else { sum_max+=$NF; max_c +=1;} } END { if(max_c == 0) { sum_max = 0; max_c = 1 } if(min_c == 0) { sum_min = 0; min_c = 1; } print s"\t"(sum_max/max_c-sum_min/min_c)"\t"sum_min"\t"sum_max"\t"NR; }' $tempJoin3
#cleanup miscellaneous files
#rm $samop1 $samop2 $samop3 $fastqop1 $fastqop2 $fastqop3 ${JFbin1} ${JFbin2} ${JFbin3} ${JFbinref} $joinedcounts $JFcount1 $JFcount2 $JFcount3 $JFcountref $JFsortedcount1 $JFsortedcount2 $JFsortedcount3 $tempJoin1 $tempJoin2 $JFsortedcountref $refSeq $joinedStatCounts
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment