Skip to content

Instantly share code, notes, and snippets.

@jerowe
Last active July 29, 2018 04:01
Show Gist options
  • Select an option

  • Save jerowe/74e8a92844b6e721b27e1c42d0442d8a to your computer and use it in GitHub Desktop.

Select an option

Save jerowe/74e8a92844b6e721b27e1c42d0442d8a to your computer and use it in GitHub Desktop.
Variant Detection - Test your dataset
#!/usr/bin/env bash
#SBATCH -p serial
#SBATCH --job-name=metagenomics
#SBATCH --time=00:10:00
# Output and error files
#SBATCH -o job.%J.out
#SBATCH -e job.%J.err
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=6
#SBATCH --mem=10GB
#Total workflow time i
#real 5m52.126s
#user 8m16.341s
#sys 0m25.835s
set -x -e
#module purge all
#srun --mem 10GB --cpus-per-task 4 --pty /bin/bash
module load gencore/1 gencore_variant_detection
cp -rf /scratch/gencore/datasets/variant_detection.tar.gz ./
tar -xvf variant_detection.tar.gz
#JOB bwa_index
#INPUTS GCF_000001405.33_GRCh38.p7_chr20_genomic.fna
#TIME 69.895 sec
bwa index GCF_000001405.33_GRCh38.p7_chr20_genomic.fna
#JOB bwa_mem
#DEPS bwa_index
#INPUTS GCF_000001405.33_GRCh38.p7_chr20_genomic.fna
#INPUTS read_1.fastq
#INPUTS read_2.fastq
#OUTPUTS aligned_reads.sam
#TIME
#real 0m27.728s
#user 0m26.764s
#sys 0m0.896s
bwa mem -M \
-R '@RG\tID:sample_1\tLB:sample_1\tPL:ILLUMINA\tPM:HISEQ\tSM:sample_1' \
GCF_000001405.33_GRCh38.p7_chr20_genomic.fna \
read_1.fastq \
read_2.fastq > aligned_reads.sam
#JOB picard_sortsam
#DEPS bwa_mem
#INPUTS aligned_reads.sam
#OUTPUTS sorted_reads.bam
#TIME
#real 0m10.652s
#user 0m14.259s
#sys 0m3.275s
picard SortSam \
INPUT=aligned_reads.sam \
OUTPUT=sorted_reads.bam \
SORT_ORDER=coordinate
#JOB samtools_flagstat
#DEPS picard_sortsam
#INPUTS aligned_reads.sam
#OUTPUTS ???
#TIME
#real 0m0.836s
#user 0m0.319s
#sys 0m0.031s
samtools flagstat aligned_reads.sam
#JOB picard_collectinsertsize
#DEPS picard_sortsam
#INPUTS sorted_reads.bam
#OUTPUTS insert_metrics.txt
#TIME
#real 0m11.002s
#user 0m15.863s
#sys 0m1.088s
picard CollectInsertSizeMetrics \
INPUT=sorted_reads.bam \
OUTPUT=insert_metrics.txt \
HISTOGRAM_FILE=insert_size_histogram.pdf
#JOB picard_markdups
#DEPS bwa_mem
#INPUTS sorted_reads.bam
#INPUTS metrics.txt **(IS this input or output?)
#OUTPUTS dedup_reads.bam
#TIME
#real 0m17.160s
#user 0m28.817s
#sys 0m1.517s
picard MarkDuplicates \
INPUT=sorted_reads.bam \
OUTPUT=dedup_reads.bam \
METRICS_FILE=metrics.txt
#JOB picard_bamindex
#INPUTS dedup_reads.bam
#TIME
#real 0m3.895s
#user 0m6.709s
#sys 0m0.658s
picard -Xmx2g BuildBamIndex \
I=dedup_reads.bam
#JOB picard_createdict
#INPUTS GCF_000001405.33_GRCh38.p7_chr20_genomic.fna
#OUTPUTS GCF_000001405.33_GRCh38.p7_chr20_genomic.fna.dict
#TIME
#real 0m1.665s
#user 0m2.206s
#sys 0m0.333s
rm -rf GCF_000001405.33_GRCh38.p7_chr20_genomic.fna.dict
picard CreateSequenceDictionary \
R=GCF_000001405.33_GRCh38.p7_chr20_genomic.fna \
O=GCF_000001405.33_GRCh38.p7_chr20_genomic.fna.dict
#JOB samtools_faidx
#INPUTS GCF_000001405.33_GRCh38.p7_chr20_genomic.fna
#OUTPUTS GCF_000001405.33_GRCh38.p7_chr20_genomic.fna.fai
#TIME
#real 0m0.362s
#user 0m0.331s
#sys 0m0.025s
samtools faidx GCF_000001405.33_GRCh38.p7_chr20_genomic.fna
#JOB tabix
#INPUTS 1000G_omni2.5.hg38.vcf.gz
tabix -f -p vcf 1000G_omni2.5.hg38.vcf.gz
#JOB gatk_baserecal
#DEPS samtools_faidx
#DEPS tabix
#DEPS picard_markdups
#DEPS picard_bamindex
#INPUTS GCF_000001405.33_GRCh38.p7_chr20_genomic.fna
#INPUTS GCF_000001405.33_GRCh38.p7_chr20_genomic.fna.fai
#INPUTS 1000G_omni2.5.hg38.vcf.gz
#INPUTS dedup_reads.bam
#OUTPUTS recal_data.table
#TIME
#real 0m52.047s
#user 1m22.863s
#sys 0m5.596s
gatk -T BaseRecalibrator \
-I dedup_reads.bam \
-R GCF_000001405.33_GRCh38.p7_chr20_genomic.fna \
-knownSites 1000G_omni2.5.hg38.vcf.gz \
-o recal_data.table
#JOB gatk_printreads
#DEPS gatk_baserecal
#INPUTS GCF_000001405.33_GRCh38.p7_chr20_genomic.fna
#INPUTS GCF_000001405.33_GRCh38.p7_chr20_genomic.fna.fai
#INPUTS dedup_reads.bam
#OUTPUTS recal_reads.bam
#TIME
#real 0m47.264s
#user 1m17.679s
#sys 0m3.246s
gatk -T PrintReads \
-R GCF_000001405.33_GRCh38.p7_chr20_genomic.fna \
-I dedup_reads.bam \
-BQSR recal_data.table \
-o recal_reads.bam
#JOB gatk_hapcaller
#DEPS gatk_printreads
#INPUTS GCF_000001405.33_GRCh38.p7_chr20_genomic.fna
#INPUTS GCF_000001405.33_GRCh38.p7_chr20_genomic.fna.fai
#INPUTS dedup_reads.bam
#OUTPUTS recal_reads.bam
#TIME
#real 1m22.284s
#user 2m6.945s
#sys 0m2.806s
gatk -T HaplotypeCaller \
-R GCF_000001405.33_GRCh38.p7_chr20_genomic.fna \
-I recal_reads.bam \
-o raw_variants.vcf
#JOB gatk_select_snps
#DEPS gatk_hapcaller
#INPUTS GCF_000001405.33_GRCh38.p7_chr20_genomic.fna
#INPUTS GCF_000001405.33_GRCh38.p7_chr20_genomic.fna.fai
#INPUTS raw_variants.vcf
#OUTPUTS raw_snps.vcf
#TIME
#real 0m6.927s
#user 0m10.721s
#sys 0m0.868s
gatk -T SelectVariants \
-R GCF_000001405.33_GRCh38.p7_chr20_genomic.fna \
-V raw_variants.vcf \
-selectType SNP \
-o raw_snps.vcf
#JOB gatk_select_indels
#DEPS gatk_hapcaller
#INPUTS GCF_000001405.33_GRCh38.p7_chr20_genomic.fna
#INPUTS GCF_000001405.33_GRCh38.p7_chr20_genomic.fna.fai
#INPUTS raw_variants.vcf
#OUTPUTS raw_indels.vcf
#TIME
#real 0m6.447s
#user 0m9.051s
#sys 0m0.827s
gatk -T SelectVariants \
-R GCF_000001405.33_GRCh38.p7_chr20_genomic.fna \
-V raw_variants.vcf \
-selectType INDEL \
-o raw_indels.vcf
#JOB gatk_var_filter_snps
#DEPS gatk_select_snps
#INPUTS GCF_000001405.33_GRCh38.p7_chr20_genomic.fna
#INPUTS GCF_000001405.33_GRCh38.p7_chr20_genomic.fna.fai
#INPUTS raw_snps.vcf
#OUTPUTS filtered_snps.vcf
#--filterExpression \" "QD < 2.0 || FS > 60.0 || MQ < 40.0 || MQRankSum < '-12.5' || ReadPosRankSum < '-8.0' || SOR > 4.0" \" \
#TIME
#real 0m7.004s
#user 0m10.866s
#sys 0m0.993s
gatk -T VariantFiltration \
-R GCF_000001405.33_GRCh38.p7_chr20_genomic.fna \
-V raw_snps.vcf \
-filterName "QD_filter" \
-filter "QD'<'2.0" \
-filterName "FS_filter" \
-filter "FS'>'60.0" \
-filterName "MQ_filter" \
-filter "MQ'<'40.0" \
-filterName "SOR_filter" \
-filter "SOR'>'4.0" \
-o filtered_snps.vcf
#JOB gatk_var_filter_indels
#DEPS gatk_select_indels
#INPUTS GCF_000001405.33_GRCh38.p7_chr20_genomic.fna
#INPUTS GCF_000001405.33_GRCh38.p7_chr20_genomic.fna.fai
#INPUTS raw_indels.vcf
#OUTPUTS filtered_indels.vcf
#--filterExpression 'QD < 2.0 || FS > 200.0 || SOR > 10.0' \
#TIME
#real 0m5.938s
#user 0m7.904s
#sys 0m0.854s
gatk -T VariantFiltration \
-R GCF_000001405.33_GRCh38.p7_chr20_genomic.fna \
-V raw_indels.vcf \
-filterName "QD_filter" \
-filter "QD'<'2.0" \
-filterName "FS_filter" \
-filter "FS'>'200.0" \
-filterName "SOR_filter" \
-filter "SOR'>'10.0" \
-o filtered_indels.vcf
#Get SnpEff database
#wget https://sourceforge.net/projects/snpeff/files/databases/v4_1/snpEff_v4_1_GRCh38.p2.RefSeq.zip
#JOB snpeff
#DEPS gatk_var_filter_snps
#INPUTS filtered_snps_renamed.vcf ****(Where does this come from?)
#OUTPUTS filtered_snps_ann.vcf
#TIME
#real 32m31.022s
#user 63m32.585s
#sys 1m3.405s
sed -i 's/NC_000020.11/20/g' filtered_snps.vcf
/scratch/gencore/software/snpEff/4.1/bin/snpEff \
-v GRCh38.p2.RefSeq \
filtered_snps.vcf > filtered_snps.ann.vcf
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment