mbk0asis · April 7, 2016 10:36
diff --git a/multiplex cancer mutation detection b/multiplex cancer mutation detection
 #############################################################################################
 #
 # Cancer Mutation Targeted NGS by Multiplex PCR
 # Alignmnet on cDNA using 'Bowtie2'
 # Variants analyses using 'GATK'
 #
 #############################################################################################

 # extract cDNA sequences from COSMIC "All_COSMIC_Genes.fasta"

 $ grep ">" cancer_amplicons.fasta | head
 >APC_2596 
 >APC_3275 
 >APC_3891 
 >APC_4076 
 >APC_4247 
 >APC_4325 
 >APC_4643 
 >APC_593 
 >ARID1A_3766 
 >ARID1A_3948 


 # extract gene names
 $ grep ">" cancer.fasta | sed 's/>//g;s/_/\t/g' | cut -f 1 | uniq > mut_target_genes
 APC
 ARID1A
 ARID2
 ATM
 AXIN1
 BRAF
 CDH1
 CDKN2A
 CTNNB1
 EGFR


 # extract target gene cDNA sequences
 $ pyfasta extract --fasta=All_COSMIC_Genes.fasta --file --space --header mut_target_genes > mutGenes.fa


 # Bowtie2 mapping to cDNAs
 $ bowtie2-build mutGenes.fa mutGenes
 $ for f in ./*.fq; do bowtie2 -p 8 -x ../../refAmplicons/mutGenes -U $f -S $f.sam ; done


 # samtools ('sort' and 'index' may be skipped for SNP analyses)
 $ for s in ./*.sam; do samtools view -bS $s | samtools sort -@ 8 - $s.sorted; done
 $ for b in ./*.bam; do samtools index $b & done


 #############################################################################################
 PICARD - add read groups, sorting, remove duplicates

 # Picard AddOrReplaceReadGroups & sorting
 $ for b in ./*.bam; do \
 picard-tools AddOrReplaceReadGroups I=$b O=$b.rg.sort.bam SO=coordinate \
 RGID=$b RGLB=TruSeq_DNA_prep RGPL=illumina RGPU=hiseq2000 RGSM=$b ; done

 # Picard Markduplicates (for amplicon analyses, this step may be skipped)
 $ for b in ./*.sort.bam; do \
 picard-tools MarkDuplicates I=$b O=$b.dedupped.bam CREATE_INDEX=true \
 VALIDATION_STRINGENCY=SILENT M=output.metrics ; done

 # index bam files
 $ for b in ./*.rg.sort.dedupped.bam ; do \
 samtools index $b ; done

 # Reference Sequence Dictionary file (AAA.dict)
 # go to the directory containing 'reference.fasta'
 $ picard-tools CreateSequenceDictionary R=ref.fasta -O ref.dict



 #############################################################################################
 GATK - varant calling

 # GATK SplitNCigarReads
 $ for d in ./*.dedupped.bam ; do \
 java -jar GenomeAnalysisTK.jar -T SplitNCigarReads \
 -R $GenomeFasta \
 -I $d \
 -o $d.split.bam \
 -rf ReassignOneMappingQuality \
 -RMQF 255 -RMQT 60 -U ALLOW_N_CIGAR_READS; 
 done


 # GATK Indel Realignment (Optional)
 $ java -jar GenomeAnalysisTK.jar \
 -T IndelRealigner \
 -R $GenomeFasta \
 -I split.bam \
 -known indels.vcf \
 -targetIntervals intervalListFromRTC.intervals \
 -o realigned.bam


 # GATK Base recalibration (highly recommended, but not works without known SNP data. 
 # Skip this step, if can't find dbSNP.vcf file for the organism)
 $ java -jar GenomeAnalysisTK.jar \
 -T BaseRecalibrator \
 -R $GenomeFasta \
 -I realigned.bam \
 -knownSites latest_dbsnp.vcf \
 -o recal_data.table; done


 # UnifiedGenotyper (Variant Calling)
 $ java -jar ~/bin/GATK/GenomeAnalysisTK.jar \
 -T UnifiedGenotyper \
 -R ref_cDNA/mutGenes.fa \
 -I INPUT.bam -o OUTPUT.vcf \
 -glm BOTH \
 -deletions 1 \
 -minIndelFrac 0.01


 ###################################################################################################
 # Add a column in the front for hotspot extraction
 $ for v in ./*.vcf; do grep -v "#" $v | awk 'BEGIN{FS=OFS="\t"}{print $1"_"$2,$0}' > $v.mod; done

 # Original
 APC	2595	.	A	G	249.77	.	AC=1;AF=0.500;AN=2;BaseQRankSum=-1.377;DP=24;Dels=0.08;ExcessHet=3.0103;FS=3.090;HaplotypeScore=0.0000;MLEAC=1;MLEAF=0.500;MQ=28.44;MQ0=12;MQRankSum=1.531;QD=15.61;ReadPosRankSum=-2.262;SOR=1.773	GT:AD:DP:GQ:PL	0/1:6,10:22:91:278,0,91

 # Modified
 APC_2595	APC	2595	.	A	G	249.77	.	AC=1;AF=0.500;AN=2;BaseQRankSum=-1.377;DP=24;Dels=0.08;ExcessHet=3.0103;FS=3.090;HaplotypeScore=0.0000;MLEAC=1;MLEAF=0.500;MQ=28.44;MQ0=12;MQRankSum=1.531;QD=15.61;ReadPosRankSum=-2.262;SOR=1.773	GT:AD:DP:GQ:PL	0/1:6,10:22:91:278,0,91


 ##################################################################################################
 # extract hotspots from tables
 $ for v in ./*.vcf.mod ; do 
 cat cancerMutHotSpots.txt | while read line; do grep -w $line $v ; done | cut -f 1,2,3,5,6,11 | sed 's/:/\t/g' | cut -f 1,2,3,4,5,6,7,8 | sed 's/,/\t/g' > $v.hotspots;
 done

 # hotspot results
 CTNNB1_121	CTNNB1	121	A	G	0/1	153,97	250
 GRIN2A_2884	GRIN2A	2884	G	A	0/1	219,31	250
 KRAS_35	KRAS	35	G	T	0/1	30,205	250
 MED12_107	MED12	107	T	G	0/1	10,7	17
 PIK3CA_3140	PIK3CA	3140	A	G	0/1	218,32	250
 TP53_818	TP53	818	G	A	0/1	23,67	90


 ##################################################################################################
 # join with hotspot list
 $ for h in ./*.hotspots; do join -a1 cancerMutHotSpots.txt $h > $h.result; done

 # result - only matching hotspots are appended
 CDKN2A_322
 CDKN2A_329
 CDKN2A_330
 CDKN2A_341
 CTNNB1_110
 CTNNB1_121 CTNNB1 121 A G 0/1 153 97 250
 CTNNB1_122
 CTNNB1_133
 CTNNB1_134
 CTNNB1_94
 CTNNB1_98
 EGFR_2235
	#############################################################################################
	#
	# Cancer Mutation Targeted NGS by Multiplex PCR
	# Alignmnet on cDNA using 'Bowtie2'
	# Variants analyses using 'GATK'
	#
	#############################################################################################

	# extract cDNA sequences from COSMIC "All_COSMIC_Genes.fasta"

	$ grep ">" cancer_amplicons.fasta \| head
	>APC_2596
	>APC_3275
	>APC_3891
	>APC_4076
	>APC_4247
	>APC_4325
	>APC_4643
	>APC_593
	>ARID1A_3766
	>ARID1A_3948


	# extract gene names
	$ grep ">" cancer.fasta \| sed 's/>//g;s/_/\t/g' \| cut -f 1 \| uniq > mut_target_genes
	APC
	ARID1A
	ARID2
	ATM
	AXIN1
	BRAF
	CDH1
	CDKN2A
	CTNNB1
	EGFR


	# extract target gene cDNA sequences
	$ pyfasta extract --fasta=All_COSMIC_Genes.fasta --file --space --header mut_target_genes > mutGenes.fa


	# Bowtie2 mapping to cDNAs
	$ bowtie2-build mutGenes.fa mutGenes
	$ for f in ./*.fq; do bowtie2 -p 8 -x ../../refAmplicons/mutGenes -U $f -S $f.sam ; done


	# samtools ('sort' and 'index' may be skipped for SNP analyses)
	$ for s in ./*.sam; do samtools view -bS $s \| samtools sort -@ 8 - $s.sorted; done
	$ for b in ./*.bam; do samtools index $b & done


	#############################################################################################
	PICARD - add read groups, sorting, remove duplicates

	# Picard AddOrReplaceReadGroups & sorting
	$ for b in ./*.bam; do \
	picard-tools AddOrReplaceReadGroups I=$b O=$b.rg.sort.bam SO=coordinate \
	RGID=$b RGLB=TruSeq_DNA_prep RGPL=illumina RGPU=hiseq2000 RGSM=$b ; done

	# Picard Markduplicates (for amplicon analyses, this step may be skipped)
	$ for b in ./*.sort.bam; do \
	picard-tools MarkDuplicates I=$b O=$b.dedupped.bam CREATE_INDEX=true \
	VALIDATION_STRINGENCY=SILENT M=output.metrics ; done

	# index bam files
	$ for b in ./*.rg.sort.dedupped.bam ; do \
	samtools index $b ; done

	# Reference Sequence Dictionary file (AAA.dict)
	# go to the directory containing 'reference.fasta'
	$ picard-tools CreateSequenceDictionary R=ref.fasta -O ref.dict



	#############################################################################################
	GATK - varant calling

	# GATK SplitNCigarReads
	$ for d in ./*.dedupped.bam ; do \
	java -jar GenomeAnalysisTK.jar -T SplitNCigarReads \
	-R $GenomeFasta \
	-I $d \
	-o $d.split.bam \
	-rf ReassignOneMappingQuality \
	-RMQF 255 -RMQT 60 -U ALLOW_N_CIGAR_READS;
	done


	# GATK Indel Realignment (Optional)
	$ java -jar GenomeAnalysisTK.jar \
	-T IndelRealigner \
	-R $GenomeFasta \
	-I split.bam \
	-known indels.vcf \
	-targetIntervals intervalListFromRTC.intervals \
	-o realigned.bam


	# GATK Base recalibration (highly recommended, but not works without known SNP data.
	# Skip this step, if can't find dbSNP.vcf file for the organism)
	$ java -jar GenomeAnalysisTK.jar \
	-T BaseRecalibrator \
	-R $GenomeFasta \
	-I realigned.bam \
	-knownSites latest_dbsnp.vcf \
	-o recal_data.table; done


	# UnifiedGenotyper (Variant Calling)
	$ java -jar ~/bin/GATK/GenomeAnalysisTK.jar \
	-T UnifiedGenotyper \
	-R ref_cDNA/mutGenes.fa \
	-I INPUT.bam -o OUTPUT.vcf \
	-glm BOTH \
	-deletions 1 \
	-minIndelFrac 0.01


	###################################################################################################
	# Add a column in the front for hotspot extraction
	$ for v in ./*.vcf; do grep -v "#" $v \| awk 'BEGIN{FS=OFS="\t"}{print $1"_"$2,$0}' > $v.mod; done

	# Original
	APC 2595 . A G 249.77 . AC=1;AF=0.500;AN=2;BaseQRankSum=-1.377;DP=24;Dels=0.08;ExcessHet=3.0103;FS=3.090;HaplotypeScore=0.0000;MLEAC=1;MLEAF=0.500;MQ=28.44;MQ0=12;MQRankSum=1.531;QD=15.61;ReadPosRankSum=-2.262;SOR=1.773 GT:AD:DP:GQ:PL 0/1:6,10:22:91:278,0,91

	# Modified
	APC_2595 APC 2595 . A G 249.77 . AC=1;AF=0.500;AN=2;BaseQRankSum=-1.377;DP=24;Dels=0.08;ExcessHet=3.0103;FS=3.090;HaplotypeScore=0.0000;MLEAC=1;MLEAF=0.500;MQ=28.44;MQ0=12;MQRankSum=1.531;QD=15.61;ReadPosRankSum=-2.262;SOR=1.773 GT:AD:DP:GQ:PL 0/1:6,10:22:91:278,0,91


	##################################################################################################
	# extract hotspots from tables
	$ for v in ./*.vcf.mod ; do
	cat cancerMutHotSpots.txt \| while read line; do grep -w $line $v ; done \| cut -f 1,2,3,5,6,11 \| sed 's/:/\t/g' \| cut -f 1,2,3,4,5,6,7,8 \| sed 's/,/\t/g' > $v.hotspots;
	done

	# hotspot results
	CTNNB1_121 CTNNB1 121 A G 0/1 153,97 250
	GRIN2A_2884 GRIN2A 2884 G A 0/1 219,31 250
	KRAS_35 KRAS 35 G T 0/1 30,205 250
	MED12_107 MED12 107 T G 0/1 10,7 17
	PIK3CA_3140 PIK3CA 3140 A G 0/1 218,32 250
	TP53_818 TP53 818 G A 0/1 23,67 90


	##################################################################################################
	# join with hotspot list
	$ for h in ./*.hotspots; do join -a1 cancerMutHotSpots.txt $h > $h.result; done

	# result - only matching hotspots are appended
	CDKN2A_322
	CDKN2A_329
	CDKN2A_330
	CDKN2A_341
	CTNNB1_110
	CTNNB1_121 CTNNB1 121 A G 0/1 153 97 250
	CTNNB1_122
	CTNNB1_133
	CTNNB1_134
	CTNNB1_94
	CTNNB1_98
	EGFR_2235