Skip to content

Instantly share code, notes, and snippets.

@drio
Created July 22, 2010 18:20
Show Gist options
  • Save drio/486366 to your computer and use it in GitHub Desktop.
Save drio/486366 to your computer and use it in GitHub Desktop.
bfast snps
#!/bin/bash
#
set -e
#set -x
run()
{
echo "`date`: $1"
echo $1 | /bin/bash
if [ $? -ne 0 ]
then
echo "`date`: exit error: $?"
echo "When running: $1"
exit 1
else
echo "`date`: ok"
fi
}
usage()
{
[ ".$1" != "." ] && echo "ERROR: $1"
echo "Usage:"
echo "$0 <root seed> <bam_file> <fasta ref>"
exit 0
}
root=$1; input_bam=$2; ref="$3"
st_filter="/Users/drio/projects/samtools-svn/misc/samtools.pl"
dbamfilter="/Users/drio/projects/bfast_related/dnaa/dutil/dbamfilter"
picard="/Users/drio/projects/picard"
[ ".$input_bam" == "." ] && usage "I need a bam"
[ ".$root" == "." ] && usage "I need a root seed"
[ ".$ref" == "." ] && usage "I need a fasta file for the reference genome"
[ ! -f "$input_bam" ] && usage "bam: $input_bam not found."
[ ! -f "$ref" ] && usage "ref: $ref not found."
[ ! -f "$st_filter" ] && usage "Couldn't find st_filter: $st_filter"
[ ! -f "$dbamfilter" ] && usage "Couldn't find dbamfilter: $dbamfilter"
[ ! -d "$picard" ] && usage "Couldn't find picard dir: $picard"
bam_sorted="$root.coor.sort.bam" # remove low quality aliments
bam_dups_marked="$root.dups.marked.bam" # dups marked
bam_hqa="$root.mq.bam" # remove low quality aliments
bam_rd="$root.rd.bam" # remove PCR dups
bam_wi="$root.wi.bam" # indels removed
pile_up_output="pileup.$root.txt"
snp_file="./snp.file.$root.txt"
final="filtered.snp.list.$root.txt"
: << DEBUGXXX
DEBUGXXX
# 1. Sort by Coordinate
run "java -jar -Xmx2g $picard/SortSam.jar \
TMP_DIR=/tmp \
INPUT=$input_bam \
OUTPUT=$bam_sorted \
SORT_ORDER=coordinate VALIDATION_STRINGENCY=SILENT"
# 2. Mark dups
run "java -jar -Xmx6g $picard/MarkDuplicates.jar \
TMP_DIR=/tmp \
INPUT=$bam_sorted \
OUTPUT=$bam_dups_marked \
METRICS_FILE='./metric_file.picard' \
VERBOSITY=ERROR \
VALIDATION_STRINGENCY=SILENT"
# 3. Keep alignments with mapping quality of 20 or greater
# (samtools view q 20 <in.bam> or 'dbamfilter -q 20 <in.bam>)
run "samtools view -bq 20 $bam_dups_marked > $bam_hqa"
# 4. Flag PCR duplicates and do not include them when variant calling
# (samtools view F 1024)
#run "samtools view -b -F 1024 $bam_hqa > $bam_rd"
# 5. Varying the ³-r² parameter in ³samtools pileup² was able to give us a
# nice ROC curve assuming a 1M Illumina Chip as truth. Plot such a curve and
# pick the desired "FP/TP" and thus the "-r" parameter. Other parameters had
# no effect.
# XXXXXXXXXXX
# 6. When searching for SNPs, we removed all alignments with indels since
# false-snps frequently occurred around indels. You can remove all reads with
# indels with "dbamfilter -i <in.bam".
#run "$dbamfilter $bam_rd > $bam_wi"
run "$dbamfilter $bam_hqa > $bam_wi"
# 7. Once we get a list of variants, we only believe variants that were seen 4
# or more times, with at least one observation on either strand. The latter
# requirement is extremely powerful.
# XXXXXXXXXXX
# Call snps with samtools
#run "samtools pileup -cf $ref $bam_wi > $pile_up_output"
run "samtools pileup -cf $ref $input_bam > $pile_up_output"
# 8. Filter indels from the pile_up
perl $st_filter varFilter $pile_up_output | \
perl -ne '@a=split(/\t/);if($a[2] ne "*"){print STDERR "$_";}' &> $snp_file
# 9. Phred-scaled likelihood that the genotype is wrong, which is also
# called `consensus quality'.
# High value --> low prob of error
cons_qual='$F[4].to_i > 50'
# 10. Phred-scaled likelihood that the genotype is identical to the reference,
# which is also called `SNP quality'. Suppose the reference base is A and in
# alignment we see 17 G and 3 A. We will get a low consensus quality because
# it is difficult to distinguish an A/G heterozygote from a G/G homozygote.
# We will get a high SNP quality, though, because the evidence of a SNP is very
# strong.
# High value --> the prob of the genotype being like the ref is pretty low
snp_qual='$F[5].to_i > 200'
#snp_qual='1==1'
# 11. root mean square (RMS) mapping quality
# map_quality='$F[6].to_i < 30'
map_quality='1 == 1'
# 12. reads covering the position
reads_covering='$F[7].to_i > 10'
cat $snp_file | ruby -ane \
"print \$_ if $reads_covering and $map_quality and $cons_qual and $snp_qual" \
&> $final
# 13. Display Homo snps only
cat $final | ruby -ane 'puts $_ if $F[3] =~ /[ACGT]/' > homos.txt
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment