Last active
April 7, 2016 10:36
-
-
Save mbk0asis/1b9abbffba19bcff6a1964a348892c68 to your computer and use it in GitHub Desktop.
multiplex cancer mutation detection
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
############################################################################################# | |
# | |
# Cancer Mutation Targeted NGS by Multiplex PCR | |
# Alignmnet on cDNA using 'Bowtie2' | |
# Variants analyses using 'GATK' | |
# | |
############################################################################################# | |
# extract cDNA sequences from COSMIC "All_COSMIC_Genes.fasta" | |
$ grep ">" cancer_amplicons.fasta | head | |
>APC_2596 | |
>APC_3275 | |
>APC_3891 | |
>APC_4076 | |
>APC_4247 | |
>APC_4325 | |
>APC_4643 | |
>APC_593 | |
>ARID1A_3766 | |
>ARID1A_3948 | |
# extract gene names | |
$ grep ">" cancer.fasta | sed 's/>//g;s/_/\t/g' | cut -f 1 | uniq > mut_target_genes | |
APC | |
ARID1A | |
ARID2 | |
ATM | |
AXIN1 | |
BRAF | |
CDH1 | |
CDKN2A | |
CTNNB1 | |
EGFR | |
# extract target gene cDNA sequences | |
$ pyfasta extract --fasta=All_COSMIC_Genes.fasta --file --space --header mut_target_genes > mutGenes.fa | |
# Bowtie2 mapping to cDNAs | |
$ bowtie2-build mutGenes.fa mutGenes | |
$ for f in ./*.fq; do bowtie2 -p 8 -x ../../refAmplicons/mutGenes -U $f -S $f.sam ; done | |
# samtools ('sort' and 'index' may be skipped for SNP analyses) | |
$ for s in ./*.sam; do samtools view -bS $s | samtools sort -@ 8 - $s.sorted; done | |
$ for b in ./*.bam; do samtools index $b & done | |
############################################################################################# | |
PICARD - add read groups, sorting, remove duplicates | |
# Picard AddOrReplaceReadGroups & sorting | |
$ for b in ./*.bam; do \ | |
picard-tools AddOrReplaceReadGroups I=$b O=$b.rg.sort.bam SO=coordinate \ | |
RGID=$b RGLB=TruSeq_DNA_prep RGPL=illumina RGPU=hiseq2000 RGSM=$b ; done | |
# Picard Markduplicates (for amplicon analyses, this step may be skipped) | |
$ for b in ./*.sort.bam; do \ | |
picard-tools MarkDuplicates I=$b O=$b.dedupped.bam CREATE_INDEX=true \ | |
VALIDATION_STRINGENCY=SILENT M=output.metrics ; done | |
# index bam files | |
$ for b in ./*.rg.sort.dedupped.bam ; do \ | |
samtools index $b ; done | |
# Reference Sequence Dictionary file (AAA.dict) | |
# go to the directory containing 'reference.fasta' | |
$ picard-tools CreateSequenceDictionary R=ref.fasta -O ref.dict | |
############################################################################################# | |
GATK - varant calling | |
# GATK SplitNCigarReads | |
$ for d in ./*.dedupped.bam ; do \ | |
java -jar GenomeAnalysisTK.jar -T SplitNCigarReads \ | |
-R $GenomeFasta \ | |
-I $d \ | |
-o $d.split.bam \ | |
-rf ReassignOneMappingQuality \ | |
-RMQF 255 -RMQT 60 -U ALLOW_N_CIGAR_READS; | |
done | |
# GATK Indel Realignment (Optional) | |
$ java -jar GenomeAnalysisTK.jar \ | |
-T IndelRealigner \ | |
-R $GenomeFasta \ | |
-I split.bam \ | |
-known indels.vcf \ | |
-targetIntervals intervalListFromRTC.intervals \ | |
-o realigned.bam | |
# GATK Base recalibration (highly recommended, but not works without known SNP data. | |
# Skip this step, if can't find dbSNP.vcf file for the organism) | |
$ java -jar GenomeAnalysisTK.jar \ | |
-T BaseRecalibrator \ | |
-R $GenomeFasta \ | |
-I realigned.bam \ | |
-knownSites latest_dbsnp.vcf \ | |
-o recal_data.table; done | |
# UnifiedGenotyper (Variant Calling) | |
$ java -jar ~/bin/GATK/GenomeAnalysisTK.jar \ | |
-T UnifiedGenotyper \ | |
-R ref_cDNA/mutGenes.fa \ | |
-I INPUT.bam -o OUTPUT.vcf \ | |
-glm BOTH \ | |
-deletions 1 \ | |
-minIndelFrac 0.01 | |
################################################################################################### | |
# Add a column in the front for hotspot extraction | |
$ for v in ./*.vcf; do grep -v "#" $v | awk 'BEGIN{FS=OFS="\t"}{print $1"_"$2,$0}' > $v.mod; done | |
# Original | |
APC 2595 . A G 249.77 . AC=1;AF=0.500;AN=2;BaseQRankSum=-1.377;DP=24;Dels=0.08;ExcessHet=3.0103;FS=3.090;HaplotypeScore=0.0000;MLEAC=1;MLEAF=0.500;MQ=28.44;MQ0=12;MQRankSum=1.531;QD=15.61;ReadPosRankSum=-2.262;SOR=1.773 GT:AD:DP:GQ:PL 0/1:6,10:22:91:278,0,91 | |
# Modified | |
APC_2595 APC 2595 . A G 249.77 . AC=1;AF=0.500;AN=2;BaseQRankSum=-1.377;DP=24;Dels=0.08;ExcessHet=3.0103;FS=3.090;HaplotypeScore=0.0000;MLEAC=1;MLEAF=0.500;MQ=28.44;MQ0=12;MQRankSum=1.531;QD=15.61;ReadPosRankSum=-2.262;SOR=1.773 GT:AD:DP:GQ:PL 0/1:6,10:22:91:278,0,91 | |
################################################################################################## | |
# extract hotspots from tables | |
$ for v in ./*.vcf.mod ; do | |
cat cancerMutHotSpots.txt | while read line; do grep -w $line $v ; done | cut -f 1,2,3,5,6,11 | sed 's/:/\t/g' | cut -f 1,2,3,4,5,6,7,8 | sed 's/,/\t/g' > $v.hotspots; | |
done | |
# hotspot results | |
CTNNB1_121 CTNNB1 121 A G 0/1 153,97 250 | |
GRIN2A_2884 GRIN2A 2884 G A 0/1 219,31 250 | |
KRAS_35 KRAS 35 G T 0/1 30,205 250 | |
MED12_107 MED12 107 T G 0/1 10,7 17 | |
PIK3CA_3140 PIK3CA 3140 A G 0/1 218,32 250 | |
TP53_818 TP53 818 G A 0/1 23,67 90 | |
################################################################################################## | |
# join with hotspot list | |
$ for h in ./*.hotspots; do join -a1 cancerMutHotSpots.txt $h > $h.result; done | |
# result - only matching hotspots are appended | |
CDKN2A_322 | |
CDKN2A_329 | |
CDKN2A_330 | |
CDKN2A_341 | |
CTNNB1_110 | |
CTNNB1_121 CTNNB1 121 A G 0/1 153 97 250 | |
CTNNB1_122 | |
CTNNB1_133 | |
CTNNB1_134 | |
CTNNB1_94 | |
CTNNB1_98 | |
EGFR_2235 | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment