Last active
July 26, 2024 11:42
-
-
Save mbk0asis/932cda7f68020eb09481 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
############################################################################################# | |
# Commands in GATK 4.0 drastically changed. | |
# to run 'MarkDuplicates', simply | |
$ /dir/to/gatk Markduplicates --INPUT test.bam --OUTPUT test.dedupped.bam [options] | |
# originally | |
$ picard-tools MarkDuplicates I=test.bam O=test.dedupped.bam [options] | |
############################################################################################# | |
############################################################################################# | |
# replace each of the following with correct paths | |
STAR=~/bin/STAR-STAR_2.5.0a/bin/Linux_x86_64/STAR | |
GATK=~/bin/GenomeAnalysisTK-3.5/GenomeAnalysisTK.jar | |
GenomeDir=~/00-NGS/Annotation/Bos_taurus_NCBI_UMD3.1/STARIndex/ | |
GenomeFasta=~/00-NGS/Annotation/Bos_taurus_NCBI_UMD3.1/STARIndex/genome.fa | |
Reads="/00-NGS/RNAseq/bov/niceM/FASTQ/FI*" | |
CommonPars="--runThreadN 8 --outSAMattributes All" | |
############################################################################################# | |
STAR RNA-seq alignment | |
# generate indexed genome for 1st pass alignment | |
mkdir STAR_index | |
cd STAR_index | |
$STAR --genomeDir $GenomeDir --runMode genomeGenerate --genomeFastaFiles $GenomeFasta \ | |
--sjdbGTFfile ~/00-NGS/Annotation/Homo_sapiens_UCSC_GRCh38/GTF/genes.gtf --sjdbOverhang 100 --runThreadN 8 | |
## '--sjdbOverhang' is 'ReadLength-1' | |
# run 1st pass | |
mkdir Pass1 | |
cd Pass1 | |
for f in $Reads; do \ | |
$STAR $CommonPars --genomeDir $GenomeDir --readFilesIn $f --outFileNamePrefix $f_ ; done | |
# make splice junctions database file out of SJ.out.tab, | |
# filter out non-canonical junctions | |
# (if have multiple files, combine them into one and conduct filtering) | |
mkdir GenomeForPass2 | |
cd GenomeForPass2 | |
awk 'BEGIN {OFS="\t"; strChar[0]="."; strChar[1]="+"; strChar[2]="-";} \ | |
{if($5 > 0){print $1,$2,$3,strChar[$4]}}' \ | |
../Pass1/SJ.out.tab > SJ.out.tab.Pass1.sjdb | |
# generate genome with junctions from the 1st pass | |
$STAR --genomeDir ./ --runMode genomeGenerate --genomeFastaFiles $GenomeFasta \ | |
--sjdbFileChrStartEnd SJ.out.tab.Pass1.sjdb --sjdbOverhang 100 --runThreadN 8 | |
cd .. | |
# run 2nd pass with the new genome (for novel junction discovery) | |
mkdir Pass2 | |
cd Pass2 | |
for f in $Reads; do \ | |
$STAR $CommonPars --genomeDir ../GenomeForPass2 --readFilesIn $f \ | |
--outFileNamePrefix $f_ ; done | |
############################################################################################# | |
PICARD - add read groups, sorting, remove duplicates | |
# Picard AddOrReplaceReadGroups & sorting | |
for b in ./*.bam; do \ | |
java -jar ~/picard/build/lib/picard.jar AddOrReplaceReadGroups I=$b O=$b.rg.sort.bam SO=coordinate \ | |
RGID=$b RGLB=TruSeq_DNA_prep RGPL=illumina RGPU=hiseq2000 RGSM=$b ; done | |
# Picard Markduplicates (for amplicon analyses, this step may be skipped) | |
for b in ./*.sort.bam; do \ | |
java -jar ~/picard/build/lib/picard.jar MarkDuplicates I=$b O=$b.dedupped.bam CREATE_INDEX=true \ | |
VALIDATION_STRINGENCY=SILENT M=output.metrics ; done | |
# index bam files | |
for b in ./*.rg.sort.dedupped.bam ; do | |
samtools index $b ; done | |
# Reference Sequence Dictionary file (AAA.dict) | |
# go to the directory containing 'reference.fasta' | |
java -jar ~/picard/build/lib/picard.jar CreateSequenceDictionary R=ref.fasta O=ref.dict | |
############################################################################################# | |
GATK - varant calling | |
# GATK SplitNCigarReads | |
for d in ./*.dedupped.bam ; do \ | |
java -jar $GATK -T SplitNCigarReads -R $GenomeFasta -I $d -o $d.split.bam \ | |
-rf ReassignOneMappingQuality -RMQF 255 -RMQT 60 -U ALLOW_N_CIGAR_READS; done | |
# GATK Indel Realignment (Optional) | |
java -jar $GATK -T IndelRealigner -R $GenomeFasta -I split.bam \ | |
-known indels.vcf -targetIntervals intervalListFromRTC.intervals \ | |
-o realigned.bam | |
# GATK Base recalibration (highly recommended, but not works without known SNP data. | |
# Skip this step, if can't find dbSNP.vcf file for the organism) | |
java -jar $GATK -T BaseRecalibrator -R $GenomeFasta -I realigned.bam \ | |
-knownSites latest_dbsnp.vcf -o recal_data.table; done | |
# Choose one of variant calling tools (UnifiedGenotyper or HaplotypeCaller) | |
# UnifiedGenotyper | |
# Multi-sample SNP calling | |
java -jar GenomeAnalysisTK.jar \ | |
-T UnifiedGenotyper \ | |
-R reference.fasta \ | |
-I sample1.bam [-I sample2.bam ...] \ | |
--dbsnp dbSNP.vcf \ | |
-o snps.raw.vcf \ | |
-stand_call_conf [50.0] \ | |
-stand_emit_conf 10.0 \ | |
[-L targets.interval_list] | |
# Generate calls at all sites | |
java -jar GenomeAnalysisTK.jar \ | |
-nt 8 \ # number of threads | |
-T UnifiedGenotyper \ | |
-R reference.fasta \ | |
-I input.bam \ | |
-o raw_variants.vcf \ | |
--output_mode EMIT_ALL_SITES \ | |
-dt NONE # no downsampling to 250 | |
# GATK HaplotypeCaller | |
for s in ./*.split.bam; do \ | |
java -jar $GATK -T HaplotypeCaller -R $GenomeFasta -I $s \ | |
-stand_call_conf 20 -o $s.vcf -filterNoBases --filter_reads_with_N_cigar; | |
done | |
# If having errors with "incompatible contigs", | |
# run picard ReorderSam then GATK HaplotypeCaller | |
# if picard was installed using 'apt-get' (i.e. Ubuntu repository) | |
picard-tools ReorderSam I=original.bam R=reference.fasta O=reordered.bam | |
# if picard was installed from the source files | |
java -jar picard.jar ReorderSam I=original.bam R=reference.fasta O=reordered.bam | |
# GATK VariantFiltration (with HaplotypeCaller outputs) | |
for v in ./*.bam.vcf; do java -jar ~/bin/GenomeAnalysisTK-3.5/GenomeAnalysisTK.jar \ | |
-T VariantFiltration \ | |
-R ~/00-NGS/Annotation/Bos_taurus_NCBI_UMD3.1/Bowtie2Index/genome.fa \ | |
-V $v \ | |
-window 35 \ | |
-cluster 3 \ | |
-filterName FS -filter "FS > 30" \ | |
-filterName QD -filter "QD < 2" \ | |
-o $v.filtered.vcf; done | |
# vcf files can be uploaded on IGV |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
The commands in this post are definately oudated. It was for GATK3.5. 'GATK4' was released quite while ago, so you should check the latest instruction for proper usage. FYI, '-T' represent tools which was deprecated in GATK4.