gatoravi · May 13, 2017 05:02
diff --git a/kMerAnalysis.chr22.null.v6.sh b/kMerAnalysis.chr22.null.v6.sh
 #Avinash Ramu, Conrad Lab, WUSTL
 #Estimate the kmer statistic in a specified window
 #use overlapping windows to capture the kmers at the edges. Make sure the start of the read lies within the window.

 #Arguments - a region in the form "chr\tstart\tstop" i.e $1=chr $2=start $3=end

 #Updates
 #modified - 6/13 made it $6 - ($7 + $8 ) in other words I removed /2
 #modified - 6/14 cleanup the code.
 #modified - 6/19 add a column for the reference kmer counts
 #modified - 7/8 clip the ends of sequences that don't fall within a window.

 #! /bin/bash

 chr=$1
 start=$2
 end=$3
 #echo $chr"\t"$start"\t"$end
 kmer_RD_cutoff=5 #consider a kmer only if it has atleast a total of 5 reads across all three samples.
 bam1=~/files/20110915_CEUtrio/WGS/BAMFiles/CEUTrio.HiSeq.WGS.b37_decoy.NA12878.clean.dedup.recal.bam
 bam2=~/files/20110915_CEUtrio/WGS/BAMFiles/CEUTrio.HiSeq.WGS.b37_decoy.NA12891.clean.dedup.recal.bam
 bam3=~/files/20110915_CEUtrio/WGS/BAMFiles/CEUTrio.HiSeq.WGS.b37_decoy.NA12892.clean.dedup.recal.bam
 ref=~/files/20110915_CEUtrio/WGS/ref/hs37d5.fa
 hwindow=100 #hundred base pairs on either side of the mutation, so window size = 200
 k=10 #length of kmer
 n=10 #number of lines to average the kmer statistic over, for example take the average of the statistic in the first k and last k sorted lines (divide by 2*k)
 readLength=100 #average length of the read
 MQc=0 #mapping quality cutoff

 SAMDIR=./SAM
 FASTQDIR=./FASTQ
 JFbinDIR=./JFBin
 JFcountDIR=./JFCount
 JoinedDIR=./Joined
 MiscDIR=./Temp
 RefDIR=./Ref
    
 if [ ! -d "$SAMDIR" ]; then
    mkdir $SAMDIR
 fi

 if [ ! -d "$FASTQDIR" ]; then
    mkdir $FASTQDIR
 fi

 if [ ! -d "$JFbinDIR" ]; then
    mkdir $JFbinDIR
 fi

 if [ ! -d "$JFcountDIR" ]; then
    mkdir $JFcountDIR
 fi

 if [ ! -d "$JoinedDIR" ]; then
    mkdir $JoinedDIR
 fi

 if [ ! -d "$MiscDIR" ]; then
    mkdir $MiscDIR
 fi

 if [ ! -d "$RefDIR" ]; then
    mkdir $RefDIR
 fi

 samop1=$SAMDIR/s1.${start}.sam
 samop2=$SAMDIR/s2.${start}.sam
 samop3=$SAMDIR/s3.${start}.sam

 fastqop1=$FASTQDIR/s1.${start}.fastq
 fastqop2=$FASTQDIR/s2.${start}.fastq
 fastqop3=$FASTQDIR/s3.${start}.fastq
 refSeq=$RefDIR/${start}.seq

 JFbin1=$JFbinDIR/s1.${start}.mer
 JFbin2=$JFbinDIR/s2.${start}.mer
 JFbin3=$JFbinDIR/s3.${start}.mer
 JFbinref=$JFbinDIR/ref.${start}.mer

 JFcount1=$JFcountDIR/s1.${start}.counts.txt
 JFcount2=$JFcountDIR/s2.${start}.counts.txt
 JFcount3=$JFcountDIR/s3.${start}.counts.txt
 JFcountref=$JFcountDIR/ref.${start}.counts.txt

 JFsortedcount1=$JFcountDIR/s1.${start}.sorted.counts.txt
 JFsortedcount2=$JFcountDIR/s2.${start}.sorted.counts.txt
 JFsortedcount3=$JFcountDIR/s3.${start}.sorted.counts.txt
 JFsortedcountref=$JFcountDIR/ref.${start}.sorted.counts.txt

 joinedcounts=$JoinedDIR/joined.${start}.txt
 tempJoin1=$MiscDIR/temp.${start}.1.txt
 tempJoin2=$MiscDIR/temp.${start}.2.txt
 tempJoin3=$MiscDIR/temp.${start}.3.txt
 joinedStatCounts=$JoinedDIR/joined.${start}.stats.txt
 joinedSortedStatCounts=$JoinedDIR/joined.sorted.${start}.stats.txt

 locus=$chr":"$start"-"$end
 endref=$(($end + $readLength))
 samtools view $bam1 $locus > $samop1
 samtools view $bam2 $locus > $samop2
 samtools view $bam3 $locus > $samop3

 if   [ ! -s $samop1 ] ||  [ ! -s $samop2 ] ||  [ ! -s $samop3 ]; then #some of the samples don't have reads.
    rm $samop1 $samop2 $samop3
    echo "Exiting ! No BAM !" >&2
    exit
 fi

 awk  -v start=$start -v end=$end -v MQc=$MQc '{ seq = $10; if($5 >= MQc && $6 !~ /S/) { if($4 < start) { seq = substr($10, start - $4 +1); }  else if($4 + length($10) - 1 > end) { seq = substr($10, 1, end - $4 +1); } print  ">"NR":"$4"\n"seq } }' $samop1 > $fastqop1
 awk  -v start=$start -v end=$end -v MQc=$MQc '{ seq = $10; if($5 >= MQc && $6 !~ /S/) { if($4 < start) { seq = substr($10, start - $4 +1); }  else if($4 + length($10) - 1 > end) { seq = substr($10, 1, end - $4 +1); } print  ">"NR":"$4"\n"seq } }' $samop2 > $fastqop2
 awk  -v start=$start -v end=$end -v MQc=$MQc '{ seq = $10; if($5 >= MQc && $6 !~ /S/) { if($4 < start) { seq = substr($10, start - $4 +1); }  else if($4 + length($10) - 1 > end) { seq = substr($10, 1, end - $4 +1); } print  ">"NR":"$4"\n"seq } }' $samop3 > $fastqop3
 locusRef=$chr":"$start"-"$endref
 samtools faidx $ref $locusRef > $refSeq

 if   [ ! -s $refSeq ]; then #no reference fasta.
    echo "Exiting ! No Reference Fasta !" >&2
    exit
 fi


 if   [ ! -s $fastqop1 ] ||  [ ! -s $fastqop2 ] ||  [ ! -s $fastqop3 ]; then #some of the samples don't have FASTQ's
    rm $fastqop1 $fastqop2 $fastqop3
    echo "Exiting ! No FASTQ !" >&2
    exit
 fi

 #Jellyfish Count
 ~/bin/jellyfish count -m $k -s 100000  $fastqop1 -o $JFbin1
 ~/bin/jellyfish count -m $k -s 100000  $fastqop2 -o $JFbin2
 ~/bin/jellyfish count -m $k -s 100000  $fastqop3 -o $JFbin3
 ~/bin/jellyfish count -m $k -s 100000  $refSeq -o $JFbinref

 #Jellyfish dump
 ~/bin/jellyfish dump ${JFbin1} -c -o $JFcount1
 ~/bin/jellyfish dump ${JFbin2} -c -o $JFcount2
 ~/bin/jellyfish dump ${JFbin3} -c -o $JFcount3
 ~/bin/jellyfish dump ${JFbinref} -c -o $JFcountref

 #sort the counts based on kmer-string
 sort $JFcount1 > $JFsortedcount1
 sort $JFcount2 > $JFsortedcount2
 sort $JFcount3 > $JFsortedcount3
 sort $JFcountref > $JFsortedcountref

 #Join mom, dad and child counts
 join -e 0  -o '0 1.2 2.2'  -a1 -a2 $JFsortedcount1 $JFsortedcount2 > $tempJoin1
 join -e 0  -o '0 1.2 1.3 2.2'  -a1 -a2 $tempJoin1 $JFsortedcount3 > $tempJoin2
 join -e 0  -o '0 1.2 1.3 1.4 2.2'  -a1 -a2 $tempJoin2 $JFsortedcountref > $joinedcounts

 #For each kmer, calculate fraction of counts coming from mom, dad and child. The last column is child_fraction - (mom_fraction + dad_fraction) [ $9=$6-($7+$8)]
 awk -v RDC=$kmer_RD_cutoff '{ totalC = $2+$3+$4;  if(totalC == 0) { $6 = 0; $7 = 0; $8 = 0; } else { $6=$2/totalC; $7=$3/totalC; $8=$4/totalC; } $9=$6-($7+$8); if(totalC>RDC) print }' $joinedcounts >  $joinedStatCounts
 sort -gk9 $joinedStatCounts > $joinedSortedStatCounts

 #present in reference
 awk '$5==1'  $joinedSortedStatCounts | head -n $n > $tempJoin1 
 #not present in reference
 awk '$5==0'  $joinedSortedStatCounts | tail -n $n > $tempJoin2
 cat $tempJoin1 $tempJoin2 > $tempJoin3

 #This is the part that calculates the test-statistic of interest.
 #without modulus for $NF and check if "in reference" for min and if "not in reference" for max
 #the file is sorted by the last column, average first n lines of this column and the last n lines and play around with these values to refine signal.
 awk -v s=$start -v n=$n 'BEGIN { sum_min = 0; sum_max = 0; } { if(NR<=n && $5 == 1) { sum_min+=$NF; min_c+=1; } else { sum_max+=$NF; max_c +=1;} } END { if(max_c == 0) { sum_max = 0; max_c = 1 }  if(min_c == 0) { sum_min = 0; min_c = 1; } print s"\t"(sum_max/max_c-sum_min/min_c)"\t"sum_min"\t"sum_max"\t"NR; }' $tempJoin3

 #cleanup miscellaneous files
 #rm $samop1 $samop2 $samop3 $fastqop1 $fastqop2 $fastqop3 ${JFbin1} ${JFbin2} ${JFbin3}  ${JFbinref} $joinedcounts $JFcount1 $JFcount2 $JFcount3 $JFcountref $JFsortedcount1 $JFsortedcount2 $JFsortedcount3 $tempJoin1 $tempJoin2  $JFsortedcountref $refSeq $joinedStatCounts
	#Avinash Ramu, Conrad Lab, WUSTL
	#Estimate the kmer statistic in a specified window
	#use overlapping windows to capture the kmers at the edges. Make sure the start of the read lies within the window.

	#Arguments - a region in the form "chr\tstart\tstop" i.e $1=chr $2=start $3=end

	#Updates
	#modified - 6/13 made it $6 - ($7 + $8 ) in other words I removed /2
	#modified - 6/14 cleanup the code.
	#modified - 6/19 add a column for the reference kmer counts
	#modified - 7/8 clip the ends of sequences that don't fall within a window.

	#! /bin/bash

	chr=$1
	start=$2
	end=$3
	#echo $chr"\t"$start"\t"$end
	kmer_RD_cutoff=5 #consider a kmer only if it has atleast a total of 5 reads across all three samples.
	bam1=~/files/20110915_CEUtrio/WGS/BAMFiles/CEUTrio.HiSeq.WGS.b37_decoy.NA12878.clean.dedup.recal.bam
	bam2=~/files/20110915_CEUtrio/WGS/BAMFiles/CEUTrio.HiSeq.WGS.b37_decoy.NA12891.clean.dedup.recal.bam
	bam3=~/files/20110915_CEUtrio/WGS/BAMFiles/CEUTrio.HiSeq.WGS.b37_decoy.NA12892.clean.dedup.recal.bam
	ref=~/files/20110915_CEUtrio/WGS/ref/hs37d5.fa
	hwindow=100 #hundred base pairs on either side of the mutation, so window size = 200
	k=10 #length of kmer
	n=10 #number of lines to average the kmer statistic over, for example take the average of the statistic in the first k and last k sorted lines (divide by 2*k)
	readLength=100 #average length of the read
	MQc=0 #mapping quality cutoff

	SAMDIR=./SAM
	FASTQDIR=./FASTQ
	JFbinDIR=./JFBin
	JFcountDIR=./JFCount
	JoinedDIR=./Joined
	MiscDIR=./Temp
	RefDIR=./Ref

	if [ ! -d "$SAMDIR" ]; then
	mkdir $SAMDIR
	fi

	if [ ! -d "$FASTQDIR" ]; then
	mkdir $FASTQDIR
	fi

	if [ ! -d "$JFbinDIR" ]; then
	mkdir $JFbinDIR
	fi

	if [ ! -d "$JFcountDIR" ]; then
	mkdir $JFcountDIR
	fi

	if [ ! -d "$JoinedDIR" ]; then
	mkdir $JoinedDIR
	fi

	if [ ! -d "$MiscDIR" ]; then
	mkdir $MiscDIR
	fi

	if [ ! -d "$RefDIR" ]; then
	mkdir $RefDIR
	fi

	samop1=$SAMDIR/s1.${start}.sam
	samop2=$SAMDIR/s2.${start}.sam
	samop3=$SAMDIR/s3.${start}.sam

	fastqop1=$FASTQDIR/s1.${start}.fastq
	fastqop2=$FASTQDIR/s2.${start}.fastq
	fastqop3=$FASTQDIR/s3.${start}.fastq
	refSeq=$RefDIR/${start}.seq

	JFbin1=$JFbinDIR/s1.${start}.mer
	JFbin2=$JFbinDIR/s2.${start}.mer
	JFbin3=$JFbinDIR/s3.${start}.mer
	JFbinref=$JFbinDIR/ref.${start}.mer

	JFcount1=$JFcountDIR/s1.${start}.counts.txt
	JFcount2=$JFcountDIR/s2.${start}.counts.txt
	JFcount3=$JFcountDIR/s3.${start}.counts.txt
	JFcountref=$JFcountDIR/ref.${start}.counts.txt

	JFsortedcount1=$JFcountDIR/s1.${start}.sorted.counts.txt
	JFsortedcount2=$JFcountDIR/s2.${start}.sorted.counts.txt
	JFsortedcount3=$JFcountDIR/s3.${start}.sorted.counts.txt
	JFsortedcountref=$JFcountDIR/ref.${start}.sorted.counts.txt

	joinedcounts=$JoinedDIR/joined.${start}.txt
	tempJoin1=$MiscDIR/temp.${start}.1.txt
	tempJoin2=$MiscDIR/temp.${start}.2.txt
	tempJoin3=$MiscDIR/temp.${start}.3.txt
	joinedStatCounts=$JoinedDIR/joined.${start}.stats.txt
	joinedSortedStatCounts=$JoinedDIR/joined.sorted.${start}.stats.txt

	locus=$chr":"$start"-"$end
	endref=$(($end + $readLength))
	samtools view $bam1 $locus > $samop1
	samtools view $bam2 $locus > $samop2
	samtools view $bam3 $locus > $samop3

	if [ ! -s $samop1 ] \|\| [ ! -s $samop2 ] \|\| [ ! -s $samop3 ]; then #some of the samples don't have reads.
	rm $samop1 $samop2 $samop3
	echo "Exiting ! No BAM !" >&2
	exit
	fi

	awk -v start=$start -v end=$end -v MQc=$MQc '{ seq = $10; if($5 >= MQc && $6 !~ /S/) { if($4 < start) { seq = substr($10, start - $4 +1); } else if($4 + length($10) - 1 > end) { seq = substr($10, 1, end - $4 +1); } print ">"NR":"$4"\n"seq } }' $samop1 > $fastqop1
	awk -v start=$start -v end=$end -v MQc=$MQc '{ seq = $10; if($5 >= MQc && $6 !~ /S/) { if($4 < start) { seq = substr($10, start - $4 +1); } else if($4 + length($10) - 1 > end) { seq = substr($10, 1, end - $4 +1); } print ">"NR":"$4"\n"seq } }' $samop2 > $fastqop2
	awk -v start=$start -v end=$end -v MQc=$MQc '{ seq = $10; if($5 >= MQc && $6 !~ /S/) { if($4 < start) { seq = substr($10, start - $4 +1); } else if($4 + length($10) - 1 > end) { seq = substr($10, 1, end - $4 +1); } print ">"NR":"$4"\n"seq } }' $samop3 > $fastqop3
	locusRef=$chr":"$start"-"$endref
	samtools faidx $ref $locusRef > $refSeq

	if [ ! -s $refSeq ]; then #no reference fasta.
	echo "Exiting ! No Reference Fasta !" >&2
	exit
	fi


	if [ ! -s $fastqop1 ] \|\| [ ! -s $fastqop2 ] \|\| [ ! -s $fastqop3 ]; then #some of the samples don't have FASTQ's
	rm $fastqop1 $fastqop2 $fastqop3
	echo "Exiting ! No FASTQ !" >&2
	exit
	fi

	#Jellyfish Count
	~/bin/jellyfish count -m $k -s 100000 $fastqop1 -o $JFbin1
	~/bin/jellyfish count -m $k -s 100000 $fastqop2 -o $JFbin2
	~/bin/jellyfish count -m $k -s 100000 $fastqop3 -o $JFbin3
	~/bin/jellyfish count -m $k -s 100000 $refSeq -o $JFbinref

	#Jellyfish dump
	~/bin/jellyfish dump ${JFbin1} -c -o $JFcount1
	~/bin/jellyfish dump ${JFbin2} -c -o $JFcount2
	~/bin/jellyfish dump ${JFbin3} -c -o $JFcount3
	~/bin/jellyfish dump ${JFbinref} -c -o $JFcountref

	#sort the counts based on kmer-string
	sort $JFcount1 > $JFsortedcount1
	sort $JFcount2 > $JFsortedcount2
	sort $JFcount3 > $JFsortedcount3
	sort $JFcountref > $JFsortedcountref

	#Join mom, dad and child counts
	join -e 0 -o '0 1.2 2.2' -a1 -a2 $JFsortedcount1 $JFsortedcount2 > $tempJoin1
	join -e 0 -o '0 1.2 1.3 2.2' -a1 -a2 $tempJoin1 $JFsortedcount3 > $tempJoin2
	join -e 0 -o '0 1.2 1.3 1.4 2.2' -a1 -a2 $tempJoin2 $JFsortedcountref > $joinedcounts

	#For each kmer, calculate fraction of counts coming from mom, dad and child. The last column is child_fraction - (mom_fraction + dad_fraction) [ $9=$6-($7+$8)]
	awk -v RDC=$kmer_RD_cutoff '{ totalC = $2+$3+$4; if(totalC == 0) { $6 = 0; $7 = 0; $8 = 0; } else { $6=$2/totalC; $7=$3/totalC; $8=$4/totalC; } $9=$6-($7+$8); if(totalC>RDC) print }' $joinedcounts > $joinedStatCounts
	sort -gk9 $joinedStatCounts > $joinedSortedStatCounts

	#present in reference
	awk '$5==1' $joinedSortedStatCounts \| head -n $n > $tempJoin1
	#not present in reference
	awk '$5==0' $joinedSortedStatCounts \| tail -n $n > $tempJoin2
	cat $tempJoin1 $tempJoin2 > $tempJoin3

	#This is the part that calculates the test-statistic of interest.
	#without modulus for $NF and check if "in reference" for min and if "not in reference" for max
	#the file is sorted by the last column, average first n lines of this column and the last n lines and play around with these values to refine signal.
	awk -v s=$start -v n=$n 'BEGIN { sum_min = 0; sum_max = 0; } { if(NR<=n && $5 == 1) { sum_min+=$NF; min_c+=1; } else { sum_max+=$NF; max_c +=1;} } END { if(max_c == 0) { sum_max = 0; max_c = 1 } if(min_c == 0) { sum_min = 0; min_c = 1; } print s"\t"(sum_max/max_c-sum_min/min_c)"\t"sum_min"\t"sum_max"\t"NR; }' $tempJoin3

	#cleanup miscellaneous files
	#rm $samop1 $samop2 $samop3 $fastqop1 $fastqop2 $fastqop3 ${JFbin1} ${JFbin2} ${JFbin3} ${JFbinref} $joinedcounts $JFcount1 $JFcount2 $JFcount3 $JFcountref $JFsortedcount1 $JFsortedcount2 $JFsortedcount3 $tempJoin1 $tempJoin2 $JFsortedcountref $refSeq $joinedStatCounts