Created
June 15, 2012 13:35
-
-
Save arq5x/2936510 to your computer and use it in GitHub Desktop.
Exome pipeline for Charles
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
########################################## | |
# Step 0. setup a list of sample names. | |
# Assume that each of your gzipped | |
# FASTQ files is named as follows: | |
# sample1.1.fq.gz | |
# sample1.2.fq.gz | |
# sample2.1.fq.gz | |
# sample2.2.fq.gz | |
# ... | |
# sampleN.1.fq.gz | |
# sampleN.2.fq.gz | |
########################################## | |
export SAMPLES="sample1 sample2 sample3" | |
########################################## | |
# Step 1. Align the raw FASTQ with BWA | |
########################################## | |
export EXHOME=/home/arq5x/cphg-home/projects/rs-exome | |
export STEPNAME=bwa-aln | |
export GENOME=/home/arq5x/cphg-home/shared/genomes/hg19/bwa/gatk/hg19_gatk.fa | |
for sample in `echo $SAMPLES` | |
do | |
export QSUB="qsub -q cphg -W group_list=CPHG -V -l select=1:mem=4000m:ncpus=8 -N $STEPNAME" | |
echo "cd $EXHOME; bwa aln -e 1 -t 8 $GENOME fastq/$sample.1.fq.gz > bam/$sample.1.sai" | $QSUB | |
echo "cd $EXHOME; bwa aln -e 1 -t 8 $GENOME fastq/$sample.2.fq.gz > bam/$sample.2.sai" | $QSUB | |
done | |
############################################################ | |
# Step 2. Pair the alignments (assumes PE data). | |
# Keep proper, on-target (i.e. +/- 500 bp of a probe) pairs | |
# using bedtools. | |
# Require mapping quality >= 20 | |
# Add the sample ID as a RG | |
############################################################ | |
export EXHOME=/home/arq5x/cphg-home/projects/rs-exome | |
export STEPNAME=bwa-pair | |
export GENOME=/home/arq5x/cphg-home/shared/genomes/hg19/bwa/gatk/hg19_gatk.fa | |
for sample in `echo $SAMPLES` | |
do | |
# -m ea sends an email when job (e)nds or (a)borts | |
export QSUB="qsub -q cphg -W group_list=CPHG -V -l select=1:mem=8000m:ncpus=3 -N $STEPNAME" | |
echo "cd $EXHOME; bwa sampe -r '@RG\tID:$sample\tSM:$sample' $GENOME bam/$sample.1.sai bam/$sample.2.sai fastq/$sample.1.fq.gz fastq/$sample.2.fq.gz \ | |
| samtools view -q 20 -f 2 -Su - \ | |
| intersectBed -abam stdin -b bed/sureselect.liftedTohg19.slop500.bed \ | |
> bam/$sample.conc.on.bam" | $QSUB | |
done | |
############################################################ | |
# Step 3. Sort the on-target concordants by genome position. | |
############################################################ | |
export EXHOME=/home/arq5x/cphg-home/projects/rs-exome | |
export STEPNAME=possrt | |
for sample in `echo $SAMPLES` | |
do | |
# -m ea sends an email when job (e)nds or (a)borts | |
export QSUB="qsub -q cphg -W group_list=CPHG -V -l select=1:mem=8000m:ncpus=1 -N $STEPNAME" | |
echo "cd $EXHOME; samtools sort -m 4000000000 bam/$sample.conc.on.bam bam/$sample.conc.on.pos" | $QSUB | |
done | |
############################################################ | |
# Step 4. Index the position-sorted BAM files. | |
############################################################ | |
export EXHOME=/home/arq5x/cphg-home/projects/rs-exome | |
export STEPNAME=index | |
for sample in `echo $SAMPLES` | |
do | |
export QSUB="qsub -q cphg -W group_list=CPHG -V -l select=1:mem=2000m:ncpus=1 -N $STEPNAME" | |
echo "cd $EXHOME; samtools index bam/$sample.conc.on.pos.bam" | $QSUB | |
done | |
############################################################ | |
# Step 5. Mark duplicates | |
############################################################ | |
export EXHOME=/home/arq5x/cphg-home/projects/rs-exome | |
export STEPNAME=rs-ex-dupe | |
export GENOME=/home/arq5x/cphg-home/shared/genomes/hg19/bwa/gatk/hg19_gatk.fa | |
export BIN=/home/arq5x/cphg-home/shared/bin/picard-tools-1.60 | |
for sample in `echo $SAMPLES`; | |
do | |
export QSUB="qsub -q cphg -W group_list=CPHG -V -l select=1:mem=8000m:ncpus=2 -N $STEPNAME"; | |
echo "cd $EXHOME; java -Xmx2g -jar $BIN/MarkDuplicates.jar \ | |
INPUT=bam/$sample.conc.on.pos.bam \ | |
OUTPUT=bam/$sample.conc.on.pos.dedup.bam \ | |
TMP_DIR=$EXHOME/bam \ | |
VALIDATION_STRINGENCY=LENIENT \ | |
ASSUME_SORTED=true \ | |
METRICS_FILE=bam/$sample.markdup_metrics" | $QSUB; | |
done | |
############################################################ | |
# Step 6. Index the position-sorted, deduped BAM files. | |
############################################################ | |
export EXHOME=/home/arq5x/cphg-home/projects/rs-exome | |
export STEPNAME=dup-index | |
for sample in `echo $SAMPLES` | |
do | |
export QSUB="qsub -q cphg -W group_list=CPHG -V -l select=1:mem=2000m:ncpus=1 -N $STEPNAME" | |
echo "cd $EXHOME; samtools index bam/$sample.conc.on.pos.dedup.bam" | $QSUB | |
done | |
############################################################ | |
# Step 7. Merge BAM files from each sample into a _single_ bam | |
# file. This makes variant calling more accurate and | |
# facilitates visualization and variant calling. | |
############################################################ | |
export EXHOME=/home/arq5x/cphg-home/projects/rs-exome | |
export STEPNAME=merge | |
export GENOME=/home/arq5x/cphg-home/shared/genomes/hg19/bwa/gatk/hg19_gatk.fa | |
export BIN=/home/arq5x/cphg-home/shared/bin/picard-tools-1.60 | |
export QSUB="qsub -q cphg -W group_list=CPHG -V -l select=1:mem=8000m:ncpus=4 -N $STEPNAME"; | |
echo "cd $EXHOME; java -Xmx2g -jar $BIN/MergeSamFiles.jar \ | |
O=bam/all.conc.on.pos.dedup.bam \ | |
I=bam/sample1.conc.on.pos.dedup.bam \ | |
I=bam/sample2.conc.on.pos.dedup.bam \ | |
I=bam/sample3.conc.on.pos.dedup.bam \ | |
TMP_DIR=$EXHOME/bam \ | |
USE_THREADING=true" | $QSUB; | |
############################################################ | |
# Step 8. Identify realignment targets. These are cases where | |
# the alignments are suspicious owing to INDELs in the region | |
#. This step identifies such regions and the next step | |
# corrects them. Improves accuracy of variant detection. | |
############################################################ | |
export EXHOME=/home/arq5x/cphg-home/projects/rs-exome | |
export STEPNAME=realtgts | |
export GENOME=/home/arq5x/cphg-home/shared/genomes/hg19/bwa/gatk/hg19_gatk.fa | |
export BIN=/home/arq5x/cphg-home/shared/bin/GenomeAnalysisTK-1.4-9-g1f1233b | |
export QSUB="qsub -q cphg -W group_list=CPHG -V -l select=1:mem=8000m:ncpus=3 -N $STEPNAME"; | |
echo "cd $EXHOME; java -Xmx2g -jar $BIN/GenomeAnalysisTK.jar \ | |
-T RealignerTargetCreator \ | |
-R $GENOME \ | |
-I bam/all.conc.on.pos.dedup.bam \ | |
-o bam/all.conc.on.pos.dedup.bam.intervals" | $QSUB; | |
############################################################ | |
# Step 9. Use GATK to realign the BAMs at the suspicious targets. | |
############################################################ | |
export EXHOME=/home/arq5x/cphg-home/projects/rs-exome | |
export STEPNAME=realtgts | |
export GENOME=/home/arq5x/cphg-home/shared/genomes/hg19/bwa/gatk/hg19_gatk.fa | |
export BIN=/home/arq5x/cphg-home/shared/bin/GenomeAnalysisTK-1.4-9-g1f1233b | |
export QSUB="qsub -q cphg -W group_list=CPHG -V -l select=1:mem=16000m:ncpus=4 -N $STEPNAME"; | |
echo "cd $RSHOME; java -Xmx2g -Djava.io.tmpdir=$EXHOME/bam -jar $BIN/GenomeAnalysisTK.jar \ | |
-T IndelRealigner \ | |
--num_threads 1 \ | |
-R $GENOME \ | |
-targetIntervals bam/all.conc.on.pos.dedup.bam.intervals \ | |
-I bam/all.conc.on.pos.dedup.bam \ | |
-o bam/all.conc.on.pos.dedup.realigned.bam" | $QSUB; | |
############################################################ | |
# Step 10. Index the merged, deduped BAM file as well as the | |
# merged, deduped, realigned file. | |
############################################################ | |
export EXHOME=/home/arq5x/cphg-home/projects/rs-exome | |
export STEPNAME=rsex-index | |
export QSUB="qsub -q cphg -W group_list=CPHG -V -l select=1:mem=4000m:ncpus=4 -N $STEPNAME" | |
echo "cd $EXHOME; samtools index bam/all.conc.on.pos.dedup.bam" | $QSUB | |
echo "cd $EXHOME; samtools index bam/all.conc.on.pos.dedup.realigned.bam" | $QSUB | |
############################################################ | |
# Step 11. Call SNPs and INDELs with GATK W/O using BAQ. | |
############################################################ | |
export EXHOME=/home/arq5x/cphg-home/projects/rs-exome | |
export STEPNAME=rsex-varbaq | |
export GENOME=/home/arq5x/cphg-home/shared/genomes/hg19/bwa/gatk/hg19_gatk.fa | |
export BIN=/home/arq5x/cphg-home/shared/bin/GenomeAnalysisTK-1.4-9-g1f1233b | |
export QSUB="qsub -q cphg -W group_list=CPHG -V -l select=1:mem=16000m:ncpus=12 -N $STEPNAME"; | |
echo "cd $EXHOME; java -Xmx2g -jar $BIN/GenomeAnalysisTK.jar \ | |
-T UnifiedGenotyper \ | |
-glm BOTH \ | |
--num_threads 10 \ | |
-baq OFF \ | |
-R $GENOME \ | |
-I bam/all.conc.on.pos.dedup.realigned.bam \ | |
-o varCalling/2012-Feb-01/all.raw.nobaq.vcf" | $QSUB; | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment