mbk0asis · December 10, 2016 08:42
diff --git a/multiplex PCR target sites b/multiplex PCR target sites
 # Files used here
 All_COSMIC_Genes.fasta - mRNA sequences for all cancer genes (normal)
 Pongo_abelii.Ensembl.PPYG2.cdna.all.fa - Orangutan mRNAs
 CosmicCodingMuts.vcf  - mutation information
 ID coversion file - Ensembl transcript ID and gene symbols from biomart ( e.g. mart_export.txt)


 1. Select high frequency MUT cancer genes (cnt >= 10)
 $ cat CosmicCodingMuts_SORTED.vcf | grep -v "#" | grep -v SNV | cut -f 1,2,4,5,8 | \
 sed 's/;/\t/g' | cut -f 1,2,3,4,5,6,9,11 | sed 's/=/\t/g' | cut -f 1,2,3,4,6,8,10,12 | \
 awk '$8 >= 10' > hg_cnt10


 2.Extract cancer gene sequences from Orangutan mRNA
 $ cat hg_cnt10 | cut -f 5 | \
 while read line; do grep -w $line pa_mart_export.txt ; done | \
 cut -f 2,3 | sort -k 1 | uniq > pa_cancerGenes_ID_symbols

 ENSPPYT00000021061	BRAF
 ENSPPYT00000003448	HRAS
 ENSPPYT00000033208	HRAS
 ENSPPYT00000005184	KRAS
 ENSPPYT00000001202	NRAS
 ENSPPYT00000018553	CSF1R
 ENSPPYT00000016896	FGFR3
 ENSPPYT00000006206	FLT3
 ENSPPYT00000017165	KIT
 ENSPPYT00000020898	MET

 $ cut -f 1 pa_cancerGenes_ID_symbols > pa_cancerGenes_ID
 $ pyfasta extract --fasta Pongo_abelii.Ensembl.PPYG2.cdna.all.fa \
 --file --header --space pa_cancerGenes_symbols > pa_cancer_mRNA.fa

 # change fasta header (Ensembl ID --> gene symbols)
 $ fasta_formatter -i pa_cancer_mRNA.fa -o pa_cancer_mRNA.tab -t
 $ join pa_cancerGenes_ID_symbols_cnt10 pa_cancerGenes_mRNA.tab | \
 sed 's/ /\t/g' | cut -f 2,3 | awk '{print ">"$0}' | \
 sed 's/\t/\n/g' > pa_cancerGenes_mRNA_symbols.fa

 $ pyfasta split --header "%(seqid)s.fa" pa_cancerGene_mRNA_symbols.fa
 # this will create individual mRNA sequences (e.g. ABL1.fa, DNMT3A.fa,...)


 3. Extract hg cancer gene seq from 'All_COSMIC_Genes.fasta'
 $ fasta_formatter -i All_COSMIC_Genes.fasta -o All_COSMIC_Genes.tab -t
 $ cat pa_cancerGene_symbols | while read line; do grep -w $line ; done | \
 awk '{print ">"$0}' | sed 's/\t/\n/g' > hg_normal_mRNA.fa


 4. blast hg & pa mRNAs
 # move 'hg_normal_mRNA.fa' & individual PA mRNAs in the same directory
 # -outfmt 5 to generate results in XML
 $ for f in ./*.fa;
 do
 bastn -query $f -subject  hg_normal_mRNA.fa -ourfmt 5 > $f.xml
 done

 $ for x in ./*.xml;
 do
 java -jar ~/bin/jvarkit-master/dist-2.0.1/blastn2snp.jar $x | column -t >> all.SNV
 done
 # open 'all.SNV' with excel


 5. combine MUT and SNV data using MySQL
 # extract gene symbols & mutation positon from CosmicCodingMuts.vcf
 # the result file must contain both MUT and SNV positions on transcripts (EXCEL)

 ID  MUT_pos SNV_pos cnt
 gene_A  100 120 50
 gene_B  40  54  3
  .     .   .   .
  .     .   .   . 
  .     .   .   .


 6. Calculate distance between MUT and SNV

 7. Select loci where MUT & SNV are in close proximity

 8. Mark MUT & SNV on transcripts (EXCEL 'REPLACE' function)

 9. Extract sequences around MUT position
  1) make BED file
    ID  start end name
    gene_A  50  150 gene_A_loci_1
    
  2) Using the BED file, extract sequences
    $ bedtools getfasta -fi MUT_SNV_marked_cDNA_seq -bed MUT.bed -fo AROUND_MUT_seq.fa

 10. run BatchPrimer3

 11. Filter unwanted results (e.g. mutation frequency, distance between mutation and SNV)

 12. in-silico PCR to generate PCR product sequences 
     (downlaoded from 'http://hgwdev.cse.ucsc.edu/~kent/exe/linux/isPcr.zip')
 $ isPcr database query output
  - database : fasta to be aligned
  - query : primers (tab separated txt file with 3 columns - name forward reverse)
  - output : output file name (default format = fasta)
  - You may transform output to tsv format using 'fasta_formatter' with '-t' option
	# Files used here
	All_COSMIC_Genes.fasta - mRNA sequences for all cancer genes (normal)
	Pongo_abelii.Ensembl.PPYG2.cdna.all.fa - Orangutan mRNAs
	CosmicCodingMuts.vcf - mutation information
	ID coversion file - Ensembl transcript ID and gene symbols from biomart ( e.g. mart_export.txt)


	1. Select high frequency MUT cancer genes (cnt >= 10)
	$ cat CosmicCodingMuts_SORTED.vcf \| grep -v "#" \| grep -v SNV \| cut -f 1,2,4,5,8 \| \
	sed 's/;/\t/g' \| cut -f 1,2,3,4,5,6,9,11 \| sed 's/=/\t/g' \| cut -f 1,2,3,4,6,8,10,12 \| \
	awk '$8 >= 10' > hg_cnt10


	2.Extract cancer gene sequences from Orangutan mRNA
	$ cat hg_cnt10 \| cut -f 5 \| \
	while read line; do grep -w $line pa_mart_export.txt ; done \| \
	cut -f 2,3 \| sort -k 1 \| uniq > pa_cancerGenes_ID_symbols

	ENSPPYT00000021061 BRAF
	ENSPPYT00000003448 HRAS
	ENSPPYT00000033208 HRAS
	ENSPPYT00000005184 KRAS
	ENSPPYT00000001202 NRAS
	ENSPPYT00000018553 CSF1R
	ENSPPYT00000016896 FGFR3
	ENSPPYT00000006206 FLT3
	ENSPPYT00000017165 KIT
	ENSPPYT00000020898 MET

	$ cut -f 1 pa_cancerGenes_ID_symbols > pa_cancerGenes_ID
	$ pyfasta extract --fasta Pongo_abelii.Ensembl.PPYG2.cdna.all.fa \
	--file --header --space pa_cancerGenes_symbols > pa_cancer_mRNA.fa

	# change fasta header (Ensembl ID --> gene symbols)
	$ fasta_formatter -i pa_cancer_mRNA.fa -o pa_cancer_mRNA.tab -t
	$ join pa_cancerGenes_ID_symbols_cnt10 pa_cancerGenes_mRNA.tab \| \
	sed 's/ /\t/g' \| cut -f 2,3 \| awk '{print ">"$0}' \| \
	sed 's/\t/\n/g' > pa_cancerGenes_mRNA_symbols.fa

	$ pyfasta split --header "%(seqid)s.fa" pa_cancerGene_mRNA_symbols.fa
	# this will create individual mRNA sequences (e.g. ABL1.fa, DNMT3A.fa,...)


	3. Extract hg cancer gene seq from 'All_COSMIC_Genes.fasta'
	$ fasta_formatter -i All_COSMIC_Genes.fasta -o All_COSMIC_Genes.tab -t
	$ cat pa_cancerGene_symbols \| while read line; do grep -w $line ; done \| \
	awk '{print ">"$0}' \| sed 's/\t/\n/g' > hg_normal_mRNA.fa


	4. blast hg & pa mRNAs
	# move 'hg_normal_mRNA.fa' & individual PA mRNAs in the same directory
	# -outfmt 5 to generate results in XML
	$ for f in ./*.fa;
	do
	bastn -query $f -subject hg_normal_mRNA.fa -ourfmt 5 > $f.xml
	done

	$ for x in ./*.xml;
	do
	java -jar ~/bin/jvarkit-master/dist-2.0.1/blastn2snp.jar $x \| column -t >> all.SNV
	done
	# open 'all.SNV' with excel


	5. combine MUT and SNV data using MySQL
	# extract gene symbols & mutation positon from CosmicCodingMuts.vcf
	# the result file must contain both MUT and SNV positions on transcripts (EXCEL)

	ID MUT_pos SNV_pos cnt
	gene_A 100 120 50
	gene_B 40 54 3
	. . . .
	. . . .
	. . . .


	6. Calculate distance between MUT and SNV

	7. Select loci where MUT & SNV are in close proximity

	8. Mark MUT & SNV on transcripts (EXCEL 'REPLACE' function)

	9. Extract sequences around MUT position
	1) make BED file
	ID start end name
	gene_A 50 150 gene_A_loci_1

	2) Using the BED file, extract sequences
	$ bedtools getfasta -fi MUT_SNV_marked_cDNA_seq -bed MUT.bed -fo AROUND_MUT_seq.fa

	10. run BatchPrimer3

	11. Filter unwanted results (e.g. mutation frequency, distance between mutation and SNV)

	12. in-silico PCR to generate PCR product sequences
	(downlaoded from 'http://hgwdev.cse.ucsc.edu/~kent/exe/linux/isPcr.zip')
	$ isPcr database query output
	- database : fasta to be aligned
	- query : primers (tab separated txt file with 3 columns - name forward reverse)
	- output : output file name (default format = fasta)
	- You may transform output to tsv format using 'fasta_formatter' with '-t' option