Last active
July 29, 2018 04:01
-
-
Save jerowe/74e8a92844b6e721b27e1c42d0442d8a to your computer and use it in GitHub Desktop.
Variant Detection - Test your dataset
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env bash | |
| #SBATCH -p serial | |
| #SBATCH --job-name=metagenomics | |
| #SBATCH --time=00:10:00 | |
| # Output and error files | |
| #SBATCH -o job.%J.out | |
| #SBATCH -e job.%J.err | |
| #SBATCH --nodes=1 | |
| #SBATCH --ntasks=1 | |
| #SBATCH --cpus-per-task=6 | |
| #SBATCH --mem=10GB | |
| #Total workflow time i | |
| #real 5m52.126s | |
| #user 8m16.341s | |
| #sys 0m25.835s | |
| set -x -e | |
| #module purge all | |
| #srun --mem 10GB --cpus-per-task 4 --pty /bin/bash | |
| module load gencore/1 gencore_variant_detection | |
| cp -rf /scratch/gencore/datasets/variant_detection.tar.gz ./ | |
| tar -xvf variant_detection.tar.gz | |
| #JOB bwa_index | |
| #INPUTS GCF_000001405.33_GRCh38.p7_chr20_genomic.fna | |
| #TIME 69.895 sec | |
| bwa index GCF_000001405.33_GRCh38.p7_chr20_genomic.fna | |
| #JOB bwa_mem | |
| #DEPS bwa_index | |
| #INPUTS GCF_000001405.33_GRCh38.p7_chr20_genomic.fna | |
| #INPUTS read_1.fastq | |
| #INPUTS read_2.fastq | |
| #OUTPUTS aligned_reads.sam | |
| #TIME | |
| #real 0m27.728s | |
| #user 0m26.764s | |
| #sys 0m0.896s | |
| bwa mem -M \ | |
| -R '@RG\tID:sample_1\tLB:sample_1\tPL:ILLUMINA\tPM:HISEQ\tSM:sample_1' \ | |
| GCF_000001405.33_GRCh38.p7_chr20_genomic.fna \ | |
| read_1.fastq \ | |
| read_2.fastq > aligned_reads.sam | |
| #JOB picard_sortsam | |
| #DEPS bwa_mem | |
| #INPUTS aligned_reads.sam | |
| #OUTPUTS sorted_reads.bam | |
| #TIME | |
| #real 0m10.652s | |
| #user 0m14.259s | |
| #sys 0m3.275s | |
| picard SortSam \ | |
| INPUT=aligned_reads.sam \ | |
| OUTPUT=sorted_reads.bam \ | |
| SORT_ORDER=coordinate | |
| #JOB samtools_flagstat | |
| #DEPS picard_sortsam | |
| #INPUTS aligned_reads.sam | |
| #OUTPUTS ??? | |
| #TIME | |
| #real 0m0.836s | |
| #user 0m0.319s | |
| #sys 0m0.031s | |
| samtools flagstat aligned_reads.sam | |
| #JOB picard_collectinsertsize | |
| #DEPS picard_sortsam | |
| #INPUTS sorted_reads.bam | |
| #OUTPUTS insert_metrics.txt | |
| #TIME | |
| #real 0m11.002s | |
| #user 0m15.863s | |
| #sys 0m1.088s | |
| picard CollectInsertSizeMetrics \ | |
| INPUT=sorted_reads.bam \ | |
| OUTPUT=insert_metrics.txt \ | |
| HISTOGRAM_FILE=insert_size_histogram.pdf | |
| #JOB picard_markdups | |
| #DEPS bwa_mem | |
| #INPUTS sorted_reads.bam | |
| #INPUTS metrics.txt **(IS this input or output?) | |
| #OUTPUTS dedup_reads.bam | |
| #TIME | |
| #real 0m17.160s | |
| #user 0m28.817s | |
| #sys 0m1.517s | |
| picard MarkDuplicates \ | |
| INPUT=sorted_reads.bam \ | |
| OUTPUT=dedup_reads.bam \ | |
| METRICS_FILE=metrics.txt | |
| #JOB picard_bamindex | |
| #INPUTS dedup_reads.bam | |
| #TIME | |
| #real 0m3.895s | |
| #user 0m6.709s | |
| #sys 0m0.658s | |
| picard -Xmx2g BuildBamIndex \ | |
| I=dedup_reads.bam | |
| #JOB picard_createdict | |
| #INPUTS GCF_000001405.33_GRCh38.p7_chr20_genomic.fna | |
| #OUTPUTS GCF_000001405.33_GRCh38.p7_chr20_genomic.fna.dict | |
| #TIME | |
| #real 0m1.665s | |
| #user 0m2.206s | |
| #sys 0m0.333s | |
| rm -rf GCF_000001405.33_GRCh38.p7_chr20_genomic.fna.dict | |
| picard CreateSequenceDictionary \ | |
| R=GCF_000001405.33_GRCh38.p7_chr20_genomic.fna \ | |
| O=GCF_000001405.33_GRCh38.p7_chr20_genomic.fna.dict | |
| #JOB samtools_faidx | |
| #INPUTS GCF_000001405.33_GRCh38.p7_chr20_genomic.fna | |
| #OUTPUTS GCF_000001405.33_GRCh38.p7_chr20_genomic.fna.fai | |
| #TIME | |
| #real 0m0.362s | |
| #user 0m0.331s | |
| #sys 0m0.025s | |
| samtools faidx GCF_000001405.33_GRCh38.p7_chr20_genomic.fna | |
| #JOB tabix | |
| #INPUTS 1000G_omni2.5.hg38.vcf.gz | |
| tabix -f -p vcf 1000G_omni2.5.hg38.vcf.gz | |
| #JOB gatk_baserecal | |
| #DEPS samtools_faidx | |
| #DEPS tabix | |
| #DEPS picard_markdups | |
| #DEPS picard_bamindex | |
| #INPUTS GCF_000001405.33_GRCh38.p7_chr20_genomic.fna | |
| #INPUTS GCF_000001405.33_GRCh38.p7_chr20_genomic.fna.fai | |
| #INPUTS 1000G_omni2.5.hg38.vcf.gz | |
| #INPUTS dedup_reads.bam | |
| #OUTPUTS recal_data.table | |
| #TIME | |
| #real 0m52.047s | |
| #user 1m22.863s | |
| #sys 0m5.596s | |
| gatk -T BaseRecalibrator \ | |
| -I dedup_reads.bam \ | |
| -R GCF_000001405.33_GRCh38.p7_chr20_genomic.fna \ | |
| -knownSites 1000G_omni2.5.hg38.vcf.gz \ | |
| -o recal_data.table | |
| #JOB gatk_printreads | |
| #DEPS gatk_baserecal | |
| #INPUTS GCF_000001405.33_GRCh38.p7_chr20_genomic.fna | |
| #INPUTS GCF_000001405.33_GRCh38.p7_chr20_genomic.fna.fai | |
| #INPUTS dedup_reads.bam | |
| #OUTPUTS recal_reads.bam | |
| #TIME | |
| #real 0m47.264s | |
| #user 1m17.679s | |
| #sys 0m3.246s | |
| gatk -T PrintReads \ | |
| -R GCF_000001405.33_GRCh38.p7_chr20_genomic.fna \ | |
| -I dedup_reads.bam \ | |
| -BQSR recal_data.table \ | |
| -o recal_reads.bam | |
| #JOB gatk_hapcaller | |
| #DEPS gatk_printreads | |
| #INPUTS GCF_000001405.33_GRCh38.p7_chr20_genomic.fna | |
| #INPUTS GCF_000001405.33_GRCh38.p7_chr20_genomic.fna.fai | |
| #INPUTS dedup_reads.bam | |
| #OUTPUTS recal_reads.bam | |
| #TIME | |
| #real 1m22.284s | |
| #user 2m6.945s | |
| #sys 0m2.806s | |
| gatk -T HaplotypeCaller \ | |
| -R GCF_000001405.33_GRCh38.p7_chr20_genomic.fna \ | |
| -I recal_reads.bam \ | |
| -o raw_variants.vcf | |
| #JOB gatk_select_snps | |
| #DEPS gatk_hapcaller | |
| #INPUTS GCF_000001405.33_GRCh38.p7_chr20_genomic.fna | |
| #INPUTS GCF_000001405.33_GRCh38.p7_chr20_genomic.fna.fai | |
| #INPUTS raw_variants.vcf | |
| #OUTPUTS raw_snps.vcf | |
| #TIME | |
| #real 0m6.927s | |
| #user 0m10.721s | |
| #sys 0m0.868s | |
| gatk -T SelectVariants \ | |
| -R GCF_000001405.33_GRCh38.p7_chr20_genomic.fna \ | |
| -V raw_variants.vcf \ | |
| -selectType SNP \ | |
| -o raw_snps.vcf | |
| #JOB gatk_select_indels | |
| #DEPS gatk_hapcaller | |
| #INPUTS GCF_000001405.33_GRCh38.p7_chr20_genomic.fna | |
| #INPUTS GCF_000001405.33_GRCh38.p7_chr20_genomic.fna.fai | |
| #INPUTS raw_variants.vcf | |
| #OUTPUTS raw_indels.vcf | |
| #TIME | |
| #real 0m6.447s | |
| #user 0m9.051s | |
| #sys 0m0.827s | |
| gatk -T SelectVariants \ | |
| -R GCF_000001405.33_GRCh38.p7_chr20_genomic.fna \ | |
| -V raw_variants.vcf \ | |
| -selectType INDEL \ | |
| -o raw_indels.vcf | |
| #JOB gatk_var_filter_snps | |
| #DEPS gatk_select_snps | |
| #INPUTS GCF_000001405.33_GRCh38.p7_chr20_genomic.fna | |
| #INPUTS GCF_000001405.33_GRCh38.p7_chr20_genomic.fna.fai | |
| #INPUTS raw_snps.vcf | |
| #OUTPUTS filtered_snps.vcf | |
| #--filterExpression \" "QD < 2.0 || FS > 60.0 || MQ < 40.0 || MQRankSum < '-12.5' || ReadPosRankSum < '-8.0' || SOR > 4.0" \" \ | |
| #TIME | |
| #real 0m7.004s | |
| #user 0m10.866s | |
| #sys 0m0.993s | |
| gatk -T VariantFiltration \ | |
| -R GCF_000001405.33_GRCh38.p7_chr20_genomic.fna \ | |
| -V raw_snps.vcf \ | |
| -filterName "QD_filter" \ | |
| -filter "QD'<'2.0" \ | |
| -filterName "FS_filter" \ | |
| -filter "FS'>'60.0" \ | |
| -filterName "MQ_filter" \ | |
| -filter "MQ'<'40.0" \ | |
| -filterName "SOR_filter" \ | |
| -filter "SOR'>'4.0" \ | |
| -o filtered_snps.vcf | |
| #JOB gatk_var_filter_indels | |
| #DEPS gatk_select_indels | |
| #INPUTS GCF_000001405.33_GRCh38.p7_chr20_genomic.fna | |
| #INPUTS GCF_000001405.33_GRCh38.p7_chr20_genomic.fna.fai | |
| #INPUTS raw_indels.vcf | |
| #OUTPUTS filtered_indels.vcf | |
| #--filterExpression 'QD < 2.0 || FS > 200.0 || SOR > 10.0' \ | |
| #TIME | |
| #real 0m5.938s | |
| #user 0m7.904s | |
| #sys 0m0.854s | |
| gatk -T VariantFiltration \ | |
| -R GCF_000001405.33_GRCh38.p7_chr20_genomic.fna \ | |
| -V raw_indels.vcf \ | |
| -filterName "QD_filter" \ | |
| -filter "QD'<'2.0" \ | |
| -filterName "FS_filter" \ | |
| -filter "FS'>'200.0" \ | |
| -filterName "SOR_filter" \ | |
| -filter "SOR'>'10.0" \ | |
| -o filtered_indels.vcf | |
| #Get SnpEff database | |
| #wget https://sourceforge.net/projects/snpeff/files/databases/v4_1/snpEff_v4_1_GRCh38.p2.RefSeq.zip | |
| #JOB snpeff | |
| #DEPS gatk_var_filter_snps | |
| #INPUTS filtered_snps_renamed.vcf ****(Where does this come from?) | |
| #OUTPUTS filtered_snps_ann.vcf | |
| #TIME | |
| #real 32m31.022s | |
| #user 63m32.585s | |
| #sys 1m3.405s | |
| sed -i 's/NC_000020.11/20/g' filtered_snps.vcf | |
| /scratch/gencore/software/snpEff/4.1/bin/snpEff \ | |
| -v GRCh38.p2.RefSeq \ | |
| filtered_snps.vcf > filtered_snps.ann.vcf |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment