Last active
May 13, 2017 05:02
-
-
Save gatoravi/f774aa39349b1e414d23 to your computer and use it in GitHub Desktop.
A script to generate kmer counts in a region for trios.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#Avinash Ramu, Conrad Lab, WUSTL | |
#Estimate the kmer statistic in a specified window | |
#use overlapping windows to capture the kmers at the edges. Make sure the start of the read lies within the window. | |
#Arguments - a region in the form "chr\tstart\tstop" i.e $1=chr $2=start $3=end | |
#Updates | |
#modified - 6/13 made it $6 - ($7 + $8 ) in other words I removed /2 | |
#modified - 6/14 cleanup the code. | |
#modified - 6/19 add a column for the reference kmer counts | |
#modified - 7/8 clip the ends of sequences that don't fall within a window. | |
#! /bin/bash | |
chr=$1 | |
start=$2 | |
end=$3 | |
#echo $chr"\t"$start"\t"$end | |
kmer_RD_cutoff=5 #consider a kmer only if it has atleast a total of 5 reads across all three samples. | |
bam1=~/files/20110915_CEUtrio/WGS/BAMFiles/CEUTrio.HiSeq.WGS.b37_decoy.NA12878.clean.dedup.recal.bam | |
bam2=~/files/20110915_CEUtrio/WGS/BAMFiles/CEUTrio.HiSeq.WGS.b37_decoy.NA12891.clean.dedup.recal.bam | |
bam3=~/files/20110915_CEUtrio/WGS/BAMFiles/CEUTrio.HiSeq.WGS.b37_decoy.NA12892.clean.dedup.recal.bam | |
ref=~/files/20110915_CEUtrio/WGS/ref/hs37d5.fa | |
hwindow=100 #hundred base pairs on either side of the mutation, so window size = 200 | |
k=10 #length of kmer | |
n=10 #number of lines to average the kmer statistic over, for example take the average of the statistic in the first k and last k sorted lines (divide by 2*k) | |
readLength=100 #average length of the read | |
MQc=0 #mapping quality cutoff | |
SAMDIR=./SAM | |
FASTQDIR=./FASTQ | |
JFbinDIR=./JFBin | |
JFcountDIR=./JFCount | |
JoinedDIR=./Joined | |
MiscDIR=./Temp | |
RefDIR=./Ref | |
if [ ! -d "$SAMDIR" ]; then | |
mkdir $SAMDIR | |
fi | |
if [ ! -d "$FASTQDIR" ]; then | |
mkdir $FASTQDIR | |
fi | |
if [ ! -d "$JFbinDIR" ]; then | |
mkdir $JFbinDIR | |
fi | |
if [ ! -d "$JFcountDIR" ]; then | |
mkdir $JFcountDIR | |
fi | |
if [ ! -d "$JoinedDIR" ]; then | |
mkdir $JoinedDIR | |
fi | |
if [ ! -d "$MiscDIR" ]; then | |
mkdir $MiscDIR | |
fi | |
if [ ! -d "$RefDIR" ]; then | |
mkdir $RefDIR | |
fi | |
samop1=$SAMDIR/s1.${start}.sam | |
samop2=$SAMDIR/s2.${start}.sam | |
samop3=$SAMDIR/s3.${start}.sam | |
fastqop1=$FASTQDIR/s1.${start}.fastq | |
fastqop2=$FASTQDIR/s2.${start}.fastq | |
fastqop3=$FASTQDIR/s3.${start}.fastq | |
refSeq=$RefDIR/${start}.seq | |
JFbin1=$JFbinDIR/s1.${start}.mer | |
JFbin2=$JFbinDIR/s2.${start}.mer | |
JFbin3=$JFbinDIR/s3.${start}.mer | |
JFbinref=$JFbinDIR/ref.${start}.mer | |
JFcount1=$JFcountDIR/s1.${start}.counts.txt | |
JFcount2=$JFcountDIR/s2.${start}.counts.txt | |
JFcount3=$JFcountDIR/s3.${start}.counts.txt | |
JFcountref=$JFcountDIR/ref.${start}.counts.txt | |
JFsortedcount1=$JFcountDIR/s1.${start}.sorted.counts.txt | |
JFsortedcount2=$JFcountDIR/s2.${start}.sorted.counts.txt | |
JFsortedcount3=$JFcountDIR/s3.${start}.sorted.counts.txt | |
JFsortedcountref=$JFcountDIR/ref.${start}.sorted.counts.txt | |
joinedcounts=$JoinedDIR/joined.${start}.txt | |
tempJoin1=$MiscDIR/temp.${start}.1.txt | |
tempJoin2=$MiscDIR/temp.${start}.2.txt | |
tempJoin3=$MiscDIR/temp.${start}.3.txt | |
joinedStatCounts=$JoinedDIR/joined.${start}.stats.txt | |
joinedSortedStatCounts=$JoinedDIR/joined.sorted.${start}.stats.txt | |
locus=$chr":"$start"-"$end | |
endref=$(($end + $readLength)) | |
samtools view $bam1 $locus > $samop1 | |
samtools view $bam2 $locus > $samop2 | |
samtools view $bam3 $locus > $samop3 | |
if [ ! -s $samop1 ] || [ ! -s $samop2 ] || [ ! -s $samop3 ]; then #some of the samples don't have reads. | |
rm $samop1 $samop2 $samop3 | |
echo "Exiting ! No BAM !" >&2 | |
exit | |
fi | |
awk -v start=$start -v end=$end -v MQc=$MQc '{ seq = $10; if($5 >= MQc && $6 !~ /S/) { if($4 < start) { seq = substr($10, start - $4 +1); } else if($4 + length($10) - 1 > end) { seq = substr($10, 1, end - $4 +1); } print ">"NR":"$4"\n"seq } }' $samop1 > $fastqop1 | |
awk -v start=$start -v end=$end -v MQc=$MQc '{ seq = $10; if($5 >= MQc && $6 !~ /S/) { if($4 < start) { seq = substr($10, start - $4 +1); } else if($4 + length($10) - 1 > end) { seq = substr($10, 1, end - $4 +1); } print ">"NR":"$4"\n"seq } }' $samop2 > $fastqop2 | |
awk -v start=$start -v end=$end -v MQc=$MQc '{ seq = $10; if($5 >= MQc && $6 !~ /S/) { if($4 < start) { seq = substr($10, start - $4 +1); } else if($4 + length($10) - 1 > end) { seq = substr($10, 1, end - $4 +1); } print ">"NR":"$4"\n"seq } }' $samop3 > $fastqop3 | |
locusRef=$chr":"$start"-"$endref | |
samtools faidx $ref $locusRef > $refSeq | |
if [ ! -s $refSeq ]; then #no reference fasta. | |
echo "Exiting ! No Reference Fasta !" >&2 | |
exit | |
fi | |
if [ ! -s $fastqop1 ] || [ ! -s $fastqop2 ] || [ ! -s $fastqop3 ]; then #some of the samples don't have FASTQ's | |
rm $fastqop1 $fastqop2 $fastqop3 | |
echo "Exiting ! No FASTQ !" >&2 | |
exit | |
fi | |
#Jellyfish Count | |
~/bin/jellyfish count -m $k -s 100000 $fastqop1 -o $JFbin1 | |
~/bin/jellyfish count -m $k -s 100000 $fastqop2 -o $JFbin2 | |
~/bin/jellyfish count -m $k -s 100000 $fastqop3 -o $JFbin3 | |
~/bin/jellyfish count -m $k -s 100000 $refSeq -o $JFbinref | |
#Jellyfish dump | |
~/bin/jellyfish dump ${JFbin1} -c -o $JFcount1 | |
~/bin/jellyfish dump ${JFbin2} -c -o $JFcount2 | |
~/bin/jellyfish dump ${JFbin3} -c -o $JFcount3 | |
~/bin/jellyfish dump ${JFbinref} -c -o $JFcountref | |
#sort the counts based on kmer-string | |
sort $JFcount1 > $JFsortedcount1 | |
sort $JFcount2 > $JFsortedcount2 | |
sort $JFcount3 > $JFsortedcount3 | |
sort $JFcountref > $JFsortedcountref | |
#Join mom, dad and child counts | |
join -e 0 -o '0 1.2 2.2' -a1 -a2 $JFsortedcount1 $JFsortedcount2 > $tempJoin1 | |
join -e 0 -o '0 1.2 1.3 2.2' -a1 -a2 $tempJoin1 $JFsortedcount3 > $tempJoin2 | |
join -e 0 -o '0 1.2 1.3 1.4 2.2' -a1 -a2 $tempJoin2 $JFsortedcountref > $joinedcounts | |
#For each kmer, calculate fraction of counts coming from mom, dad and child. The last column is child_fraction - (mom_fraction + dad_fraction) [ $9=$6-($7+$8)] | |
awk -v RDC=$kmer_RD_cutoff '{ totalC = $2+$3+$4; if(totalC == 0) { $6 = 0; $7 = 0; $8 = 0; } else { $6=$2/totalC; $7=$3/totalC; $8=$4/totalC; } $9=$6-($7+$8); if(totalC>RDC) print }' $joinedcounts > $joinedStatCounts | |
sort -gk9 $joinedStatCounts > $joinedSortedStatCounts | |
#present in reference | |
awk '$5==1' $joinedSortedStatCounts | head -n $n > $tempJoin1 | |
#not present in reference | |
awk '$5==0' $joinedSortedStatCounts | tail -n $n > $tempJoin2 | |
cat $tempJoin1 $tempJoin2 > $tempJoin3 | |
#This is the part that calculates the test-statistic of interest. | |
#without modulus for $NF and check if "in reference" for min and if "not in reference" for max | |
#the file is sorted by the last column, average first n lines of this column and the last n lines and play around with these values to refine signal. | |
awk -v s=$start -v n=$n 'BEGIN { sum_min = 0; sum_max = 0; } { if(NR<=n && $5 == 1) { sum_min+=$NF; min_c+=1; } else { sum_max+=$NF; max_c +=1;} } END { if(max_c == 0) { sum_max = 0; max_c = 1 } if(min_c == 0) { sum_min = 0; min_c = 1; } print s"\t"(sum_max/max_c-sum_min/min_c)"\t"sum_min"\t"sum_max"\t"NR; }' $tempJoin3 | |
#cleanup miscellaneous files | |
#rm $samop1 $samop2 $samop3 $fastqop1 $fastqop2 $fastqop3 ${JFbin1} ${JFbin2} ${JFbin3} ${JFbinref} $joinedcounts $JFcount1 $JFcount2 $JFcount3 $JFcountref $JFsortedcount1 $JFsortedcount2 $JFsortedcount3 $tempJoin1 $tempJoin2 $JFsortedcountref $refSeq $joinedStatCounts |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment