drio · July 22, 2010 18:20
diff --git a/gistfile1.sh b/gistfile1.sh
 #!/bin/bash
 #
 set -e
 #set -x

 run()
 {
  echo "`date`: $1"
  echo $1 | /bin/bash
  if [ $? -ne 0 ]
  then
    echo "`date`: exit error: $?"
    echo "When running: $1"
    exit 1
  else
    echo "`date`: ok"
  fi
 }

 usage()
 {
  [ ".$1" != "." ] && echo "ERROR: $1"
  echo "Usage:"
  echo "$0 <root seed> <bam_file> <fasta ref>"
  exit 0
 }

 root=$1; input_bam=$2; ref="$3"
 st_filter="/Users/drio/projects/samtools-svn/misc/samtools.pl"
 dbamfilter="/Users/drio/projects/bfast_related/dnaa/dutil/dbamfilter"
 picard="/Users/drio/projects/picard"

 [ ".$input_bam"  == "." ] && usage "I need a bam"
 [ ".$root" == "." ]       && usage "I need a root seed"
 [ ".$ref" == "." ]        && usage "I need a fasta file for the reference genome"
 [ ! -f "$input_bam" ]     && usage "bam: $input_bam not found."
 [ ! -f "$ref" ]           && usage "ref: $ref not found."
 [ ! -f "$st_filter" ]     && usage "Couldn't find st_filter: $st_filter"
 [ ! -f "$dbamfilter" ]    && usage "Couldn't find dbamfilter: $dbamfilter"
 [ ! -d "$picard" ]        && usage "Couldn't find picard dir: $picard"

 bam_sorted="$root.coor.sort.bam"        # remove low quality aliments
 bam_dups_marked="$root.dups.marked.bam" # dups marked
 bam_hqa="$root.mq.bam"                  # remove low quality aliments
 bam_rd="$root.rd.bam"                   # remove PCR dups
 bam_wi="$root.wi.bam"                   # indels removed

 pile_up_output="pileup.$root.txt"
 snp_file="./snp.file.$root.txt"
 final="filtered.snp.list.$root.txt"

 : << DEBUGXXX
 DEBUGXXX
 # 1. Sort by Coordinate
 run "java -jar -Xmx2g $picard/SortSam.jar \
 TMP_DIR=/tmp \
 INPUT=$input_bam \
 OUTPUT=$bam_sorted \
 SORT_ORDER=coordinate VALIDATION_STRINGENCY=SILENT"

 # 2. Mark dups
 run "java -jar -Xmx6g $picard/MarkDuplicates.jar \
 TMP_DIR=/tmp \
 INPUT=$bam_sorted \
 OUTPUT=$bam_dups_marked \
 METRICS_FILE='./metric_file.picard' \
 VERBOSITY=ERROR \
 VALIDATION_STRINGENCY=SILENT"

 # 3. Keep alignments with mapping quality of 20 or greater 
 # (samtools view q 20 <in.bam> or 'dbamfilter -q 20 <in.bam>)
 run "samtools view -bq 20 $bam_dups_marked > $bam_hqa"

 # 4. Flag PCR duplicates and do not include them when variant calling
 # (samtools view F 1024)
 #run "samtools view -b -F 1024 $bam_hqa > $bam_rd"

 # 5. Varying the ³-r² parameter in ³samtools pileup² was able to give us a
 # nice ROC curve assuming a 1M Illumina Chip as truth.  Plot such a curve and
 # pick the desired "FP/TP" and thus the "-r" parameter.  Other parameters had
 # no effect.
 # XXXXXXXXXXX

 # 6. When searching for SNPs, we removed all alignments with indels since
 # false-snps frequently occurred around indels.  You can remove all reads with
 # indels with "dbamfilter -i <in.bam".
 #run "$dbamfilter $bam_rd > $bam_wi"
 run "$dbamfilter $bam_hqa > $bam_wi"

 # 7. Once we get a list of variants, we only believe variants that were seen 4
 # or more times, with at least one observation on either strand.  The latter
 # requirement is extremely powerful.
 # XXXXXXXXXXX

 # Call snps with samtools 
 #run "samtools pileup -cf $ref $bam_wi > $pile_up_output"
 run "samtools pileup -cf $ref $input_bam > $pile_up_output"

 # 8. Filter indels from the pile_up
 perl $st_filter varFilter $pile_up_output | \
 perl -ne '@a=split(/\t/);if($a[2] ne "*"){print STDERR "$_";}' &> $snp_file

 # 9. Phred-scaled likelihood that the genotype is wrong, which is also
 # called `consensus quality'. 
 # High value --> low prob of error 
 cons_qual='$F[4].to_i > 50'

 # 10. Phred-scaled likelihood that the genotype is identical to the reference, 
 # which is also called `SNP quality'. Suppose the reference base is A and in 
 # alignment we see 17 G and 3 A. We will get a low consensus quality because 
 # it is difficult to distinguish an A/G heterozygote from a G/G homozygote. 
 # We will get a high SNP quality, though, because the evidence of a SNP is very 
 # strong. 
 # High value --> the prob of the genotype being like the ref is pretty low
 snp_qual='$F[5].to_i > 200'
 #snp_qual='1==1'

 # 11. root mean square (RMS) mapping quality
 # map_quality='$F[6].to_i < 30'
 map_quality='1 == 1'

 # 12. reads covering the position 
 reads_covering='$F[7].to_i > 10'

 cat $snp_file | ruby -ane \
 "print \$_ if $reads_covering and $map_quality and $cons_qual and $snp_qual" \
 &> $final

 # 13. Display Homo snps only
 cat $final | ruby -ane 'puts $_ if $F[3] =~ /[ACGT]/' >  homos.txt
	#!/bin/bash
	#
	set -e
	#set -x

	run()
	{
	echo "`date`: $1"
	echo $1 \| /bin/bash
	if [ $? -ne 0 ]
	then
	echo "`date`: exit error: $?"
	echo "When running: $1"
	exit 1
	else
	echo "`date`: ok"
	fi
	}

	usage()
	{
	[ ".$1" != "." ] && echo "ERROR: $1"
	echo "Usage:"
	echo "$0 <root seed> <bam_file> <fasta ref>"
	exit 0
	}

	root=$1; input_bam=$2; ref="$3"
	st_filter="/Users/drio/projects/samtools-svn/misc/samtools.pl"
	dbamfilter="/Users/drio/projects/bfast_related/dnaa/dutil/dbamfilter"
	picard="/Users/drio/projects/picard"

	[ ".$input_bam" == "." ] && usage "I need a bam"
	[ ".$root" == "." ] && usage "I need a root seed"
	[ ".$ref" == "." ] && usage "I need a fasta file for the reference genome"
	[ ! -f "$input_bam" ] && usage "bam: $input_bam not found."
	[ ! -f "$ref" ] && usage "ref: $ref not found."
	[ ! -f "$st_filter" ] && usage "Couldn't find st_filter: $st_filter"
	[ ! -f "$dbamfilter" ] && usage "Couldn't find dbamfilter: $dbamfilter"
	[ ! -d "$picard" ] && usage "Couldn't find picard dir: $picard"

	bam_sorted="$root.coor.sort.bam" # remove low quality aliments
	bam_dups_marked="$root.dups.marked.bam" # dups marked
	bam_hqa="$root.mq.bam" # remove low quality aliments
	bam_rd="$root.rd.bam" # remove PCR dups
	bam_wi="$root.wi.bam" # indels removed

	pile_up_output="pileup.$root.txt"
	snp_file="./snp.file.$root.txt"
	final="filtered.snp.list.$root.txt"

	: << DEBUGXXX
	DEBUGXXX
	# 1. Sort by Coordinate
	run "java -jar -Xmx2g $picard/SortSam.jar \
	TMP_DIR=/tmp \
	INPUT=$input_bam \
	OUTPUT=$bam_sorted \
	SORT_ORDER=coordinate VALIDATION_STRINGENCY=SILENT"

	# 2. Mark dups
	run "java -jar -Xmx6g $picard/MarkDuplicates.jar \
	TMP_DIR=/tmp \
	INPUT=$bam_sorted \
	OUTPUT=$bam_dups_marked \
	METRICS_FILE='./metric_file.picard' \
	VERBOSITY=ERROR \
	VALIDATION_STRINGENCY=SILENT"

	# 3. Keep alignments with mapping quality of 20 or greater
	# (samtools view q 20 <in.bam> or 'dbamfilter -q 20 <in.bam>)
	run "samtools view -bq 20 $bam_dups_marked > $bam_hqa"

	# 4. Flag PCR duplicates and do not include them when variant calling
	# (samtools view F 1024)
	#run "samtools view -b -F 1024 $bam_hqa > $bam_rd"

	# 5. Varying the ³-r² parameter in ³samtools pileup² was able to give us a
	# nice ROC curve assuming a 1M Illumina Chip as truth. Plot such a curve and
	# pick the desired "FP/TP" and thus the "-r" parameter. Other parameters had
	# no effect.
	# XXXXXXXXXXX

	# 6. When searching for SNPs, we removed all alignments with indels since
	# false-snps frequently occurred around indels. You can remove all reads with
	# indels with "dbamfilter -i <in.bam".
	#run "$dbamfilter $bam_rd > $bam_wi"
	run "$dbamfilter $bam_hqa > $bam_wi"

	# 7. Once we get a list of variants, we only believe variants that were seen 4
	# or more times, with at least one observation on either strand. The latter
	# requirement is extremely powerful.
	# XXXXXXXXXXX

	# Call snps with samtools
	#run "samtools pileup -cf $ref $bam_wi > $pile_up_output"
	run "samtools pileup -cf $ref $input_bam > $pile_up_output"

	# 8. Filter indels from the pile_up
	perl $st_filter varFilter $pile_up_output \| \
	perl -ne '@a=split(/\t/);if($a[2] ne "*"){print STDERR "$_";}' &> $snp_file

	# 9. Phred-scaled likelihood that the genotype is wrong, which is also
	# called `consensus quality'.
	# High value --> low prob of error
	cons_qual='$F[4].to_i > 50'

	# 10. Phred-scaled likelihood that the genotype is identical to the reference,
	# which is also called `SNP quality'. Suppose the reference base is A and in
	# alignment we see 17 G and 3 A. We will get a low consensus quality because
	# it is difficult to distinguish an A/G heterozygote from a G/G homozygote.
	# We will get a high SNP quality, though, because the evidence of a SNP is very
	# strong.
	# High value --> the prob of the genotype being like the ref is pretty low
	snp_qual='$F[5].to_i > 200'
	#snp_qual='1==1'

	# 11. root mean square (RMS) mapping quality
	# map_quality='$F[6].to_i < 30'
	map_quality='1 == 1'

	# 12. reads covering the position
	reads_covering='$F[7].to_i > 10'

	cat $snp_file \| ruby -ane \
	"print \$_ if $reads_covering and $map_quality and $cons_qual and $snp_qual" \
	&> $final

	# 13. Display Homo snps only
	cat $final \| ruby -ane 'puts $_ if $F[3] =~ /[ACGT]/' > homos.txt