Created
July 22, 2010 18:20
-
-
Save drio/486366 to your computer and use it in GitHub Desktop.
bfast snps
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash | |
# | |
set -e | |
#set -x | |
run() | |
{ | |
echo "`date`: $1" | |
echo $1 | /bin/bash | |
if [ $? -ne 0 ] | |
then | |
echo "`date`: exit error: $?" | |
echo "When running: $1" | |
exit 1 | |
else | |
echo "`date`: ok" | |
fi | |
} | |
usage() | |
{ | |
[ ".$1" != "." ] && echo "ERROR: $1" | |
echo "Usage:" | |
echo "$0 <root seed> <bam_file> <fasta ref>" | |
exit 0 | |
} | |
root=$1; input_bam=$2; ref="$3" | |
st_filter="/Users/drio/projects/samtools-svn/misc/samtools.pl" | |
dbamfilter="/Users/drio/projects/bfast_related/dnaa/dutil/dbamfilter" | |
picard="/Users/drio/projects/picard" | |
[ ".$input_bam" == "." ] && usage "I need a bam" | |
[ ".$root" == "." ] && usage "I need a root seed" | |
[ ".$ref" == "." ] && usage "I need a fasta file for the reference genome" | |
[ ! -f "$input_bam" ] && usage "bam: $input_bam not found." | |
[ ! -f "$ref" ] && usage "ref: $ref not found." | |
[ ! -f "$st_filter" ] && usage "Couldn't find st_filter: $st_filter" | |
[ ! -f "$dbamfilter" ] && usage "Couldn't find dbamfilter: $dbamfilter" | |
[ ! -d "$picard" ] && usage "Couldn't find picard dir: $picard" | |
bam_sorted="$root.coor.sort.bam" # remove low quality aliments | |
bam_dups_marked="$root.dups.marked.bam" # dups marked | |
bam_hqa="$root.mq.bam" # remove low quality aliments | |
bam_rd="$root.rd.bam" # remove PCR dups | |
bam_wi="$root.wi.bam" # indels removed | |
pile_up_output="pileup.$root.txt" | |
snp_file="./snp.file.$root.txt" | |
final="filtered.snp.list.$root.txt" | |
: << DEBUGXXX | |
DEBUGXXX | |
# 1. Sort by Coordinate | |
run "java -jar -Xmx2g $picard/SortSam.jar \ | |
TMP_DIR=/tmp \ | |
INPUT=$input_bam \ | |
OUTPUT=$bam_sorted \ | |
SORT_ORDER=coordinate VALIDATION_STRINGENCY=SILENT" | |
# 2. Mark dups | |
run "java -jar -Xmx6g $picard/MarkDuplicates.jar \ | |
TMP_DIR=/tmp \ | |
INPUT=$bam_sorted \ | |
OUTPUT=$bam_dups_marked \ | |
METRICS_FILE='./metric_file.picard' \ | |
VERBOSITY=ERROR \ | |
VALIDATION_STRINGENCY=SILENT" | |
# 3. Keep alignments with mapping quality of 20 or greater | |
# (samtools view q 20 <in.bam> or 'dbamfilter -q 20 <in.bam>) | |
run "samtools view -bq 20 $bam_dups_marked > $bam_hqa" | |
# 4. Flag PCR duplicates and do not include them when variant calling | |
# (samtools view F 1024) | |
#run "samtools view -b -F 1024 $bam_hqa > $bam_rd" | |
# 5. Varying the ³-r² parameter in ³samtools pileup² was able to give us a | |
# nice ROC curve assuming a 1M Illumina Chip as truth. Plot such a curve and | |
# pick the desired "FP/TP" and thus the "-r" parameter. Other parameters had | |
# no effect. | |
# XXXXXXXXXXX | |
# 6. When searching for SNPs, we removed all alignments with indels since | |
# false-snps frequently occurred around indels. You can remove all reads with | |
# indels with "dbamfilter -i <in.bam". | |
#run "$dbamfilter $bam_rd > $bam_wi" | |
run "$dbamfilter $bam_hqa > $bam_wi" | |
# 7. Once we get a list of variants, we only believe variants that were seen 4 | |
# or more times, with at least one observation on either strand. The latter | |
# requirement is extremely powerful. | |
# XXXXXXXXXXX | |
# Call snps with samtools | |
#run "samtools pileup -cf $ref $bam_wi > $pile_up_output" | |
run "samtools pileup -cf $ref $input_bam > $pile_up_output" | |
# 8. Filter indels from the pile_up | |
perl $st_filter varFilter $pile_up_output | \ | |
perl -ne '@a=split(/\t/);if($a[2] ne "*"){print STDERR "$_";}' &> $snp_file | |
# 9. Phred-scaled likelihood that the genotype is wrong, which is also | |
# called `consensus quality'. | |
# High value --> low prob of error | |
cons_qual='$F[4].to_i > 50' | |
# 10. Phred-scaled likelihood that the genotype is identical to the reference, | |
# which is also called `SNP quality'. Suppose the reference base is A and in | |
# alignment we see 17 G and 3 A. We will get a low consensus quality because | |
# it is difficult to distinguish an A/G heterozygote from a G/G homozygote. | |
# We will get a high SNP quality, though, because the evidence of a SNP is very | |
# strong. | |
# High value --> the prob of the genotype being like the ref is pretty low | |
snp_qual='$F[5].to_i > 200' | |
#snp_qual='1==1' | |
# 11. root mean square (RMS) mapping quality | |
# map_quality='$F[6].to_i < 30' | |
map_quality='1 == 1' | |
# 12. reads covering the position | |
reads_covering='$F[7].to_i > 10' | |
cat $snp_file | ruby -ane \ | |
"print \$_ if $reads_covering and $map_quality and $cons_qual and $snp_qual" \ | |
&> $final | |
# 13. Display Homo snps only | |
cat $final | ruby -ane 'puts $_ if $F[3] =~ /[ACGT]/' > homos.txt |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment