Skip to content

Instantly share code, notes, and snippets.

@mbk0asis
Last active April 7, 2016 10:36
Show Gist options
  • Save mbk0asis/1b9abbffba19bcff6a1964a348892c68 to your computer and use it in GitHub Desktop.
Save mbk0asis/1b9abbffba19bcff6a1964a348892c68 to your computer and use it in GitHub Desktop.
multiplex cancer mutation detection
#############################################################################################
#
# Cancer Mutation Targeted NGS by Multiplex PCR
# Alignmnet on cDNA using 'Bowtie2'
# Variants analyses using 'GATK'
#
#############################################################################################
# extract cDNA sequences from COSMIC "All_COSMIC_Genes.fasta"
$ grep ">" cancer_amplicons.fasta | head
>APC_2596
>APC_3275
>APC_3891
>APC_4076
>APC_4247
>APC_4325
>APC_4643
>APC_593
>ARID1A_3766
>ARID1A_3948
# extract gene names
$ grep ">" cancer.fasta | sed 's/>//g;s/_/\t/g' | cut -f 1 | uniq > mut_target_genes
APC
ARID1A
ARID2
ATM
AXIN1
BRAF
CDH1
CDKN2A
CTNNB1
EGFR
# extract target gene cDNA sequences
$ pyfasta extract --fasta=All_COSMIC_Genes.fasta --file --space --header mut_target_genes > mutGenes.fa
# Bowtie2 mapping to cDNAs
$ bowtie2-build mutGenes.fa mutGenes
$ for f in ./*.fq; do bowtie2 -p 8 -x ../../refAmplicons/mutGenes -U $f -S $f.sam ; done
# samtools ('sort' and 'index' may be skipped for SNP analyses)
$ for s in ./*.sam; do samtools view -bS $s | samtools sort -@ 8 - $s.sorted; done
$ for b in ./*.bam; do samtools index $b & done
#############################################################################################
PICARD - add read groups, sorting, remove duplicates
# Picard AddOrReplaceReadGroups & sorting
$ for b in ./*.bam; do \
picard-tools AddOrReplaceReadGroups I=$b O=$b.rg.sort.bam SO=coordinate \
RGID=$b RGLB=TruSeq_DNA_prep RGPL=illumina RGPU=hiseq2000 RGSM=$b ; done
# Picard Markduplicates (for amplicon analyses, this step may be skipped)
$ for b in ./*.sort.bam; do \
picard-tools MarkDuplicates I=$b O=$b.dedupped.bam CREATE_INDEX=true \
VALIDATION_STRINGENCY=SILENT M=output.metrics ; done
# index bam files
$ for b in ./*.rg.sort.dedupped.bam ; do \
samtools index $b ; done
# Reference Sequence Dictionary file (AAA.dict)
# go to the directory containing 'reference.fasta'
$ picard-tools CreateSequenceDictionary R=ref.fasta -O ref.dict
#############################################################################################
GATK - varant calling
# GATK SplitNCigarReads
$ for d in ./*.dedupped.bam ; do \
java -jar GenomeAnalysisTK.jar -T SplitNCigarReads \
-R $GenomeFasta \
-I $d \
-o $d.split.bam \
-rf ReassignOneMappingQuality \
-RMQF 255 -RMQT 60 -U ALLOW_N_CIGAR_READS;
done
# GATK Indel Realignment (Optional)
$ java -jar GenomeAnalysisTK.jar \
-T IndelRealigner \
-R $GenomeFasta \
-I split.bam \
-known indels.vcf \
-targetIntervals intervalListFromRTC.intervals \
-o realigned.bam
# GATK Base recalibration (highly recommended, but not works without known SNP data.
# Skip this step, if can't find dbSNP.vcf file for the organism)
$ java -jar GenomeAnalysisTK.jar \
-T BaseRecalibrator \
-R $GenomeFasta \
-I realigned.bam \
-knownSites latest_dbsnp.vcf \
-o recal_data.table; done
# UnifiedGenotyper (Variant Calling)
$ java -jar ~/bin/GATK/GenomeAnalysisTK.jar \
-T UnifiedGenotyper \
-R ref_cDNA/mutGenes.fa \
-I INPUT.bam -o OUTPUT.vcf \
-glm BOTH \
-deletions 1 \
-minIndelFrac 0.01
###################################################################################################
# Add a column in the front for hotspot extraction
$ for v in ./*.vcf; do grep -v "#" $v | awk 'BEGIN{FS=OFS="\t"}{print $1"_"$2,$0}' > $v.mod; done
# Original
APC 2595 . A G 249.77 . AC=1;AF=0.500;AN=2;BaseQRankSum=-1.377;DP=24;Dels=0.08;ExcessHet=3.0103;FS=3.090;HaplotypeScore=0.0000;MLEAC=1;MLEAF=0.500;MQ=28.44;MQ0=12;MQRankSum=1.531;QD=15.61;ReadPosRankSum=-2.262;SOR=1.773 GT:AD:DP:GQ:PL 0/1:6,10:22:91:278,0,91
# Modified
APC_2595 APC 2595 . A G 249.77 . AC=1;AF=0.500;AN=2;BaseQRankSum=-1.377;DP=24;Dels=0.08;ExcessHet=3.0103;FS=3.090;HaplotypeScore=0.0000;MLEAC=1;MLEAF=0.500;MQ=28.44;MQ0=12;MQRankSum=1.531;QD=15.61;ReadPosRankSum=-2.262;SOR=1.773 GT:AD:DP:GQ:PL 0/1:6,10:22:91:278,0,91
##################################################################################################
# extract hotspots from tables
$ for v in ./*.vcf.mod ; do
cat cancerMutHotSpots.txt | while read line; do grep -w $line $v ; done | cut -f 1,2,3,5,6,11 | sed 's/:/\t/g' | cut -f 1,2,3,4,5,6,7,8 | sed 's/,/\t/g' > $v.hotspots;
done
# hotspot results
CTNNB1_121 CTNNB1 121 A G 0/1 153,97 250
GRIN2A_2884 GRIN2A 2884 G A 0/1 219,31 250
KRAS_35 KRAS 35 G T 0/1 30,205 250
MED12_107 MED12 107 T G 0/1 10,7 17
PIK3CA_3140 PIK3CA 3140 A G 0/1 218,32 250
TP53_818 TP53 818 G A 0/1 23,67 90
##################################################################################################
# join with hotspot list
$ for h in ./*.hotspots; do join -a1 cancerMutHotSpots.txt $h > $h.result; done
# result - only matching hotspots are appended
CDKN2A_322
CDKN2A_329
CDKN2A_330
CDKN2A_341
CTNNB1_110
CTNNB1_121 CTNNB1 121 A G 0/1 153 97 250
CTNNB1_122
CTNNB1_133
CTNNB1_134
CTNNB1_94
CTNNB1_98
EGFR_2235
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment