jimhavrilla · August 29, 2015 13:58
diff --git a/README.md b/README.md
diff --git a/find_somatic_mutations.py b/find_somatic_mutations.py
 #!/usr/bin/env python
 import sys
 import numpy as np

 from gemini import GeminiQuery
 from gemini.gemini_constants import *

 gq = GeminiQuery(sys.argv[1])

 patient_samples = \
  [('s02-N-2379-AJ-0030','s02-T-2379-AJ-0001','s02-R-2379-AJ-0002'),
   ('s03-N-2379-AJ-0031','s03-T-2379-AJ-0003','s03-R-2379-AJ-0004'),
   ('s04-N-2379-AJ-0032','s04-T-2379-AJ-0005','s04-R-2379-AJ-0006'),
   ('s05-N-2379-AJ-0007','s05-T-2379-AJ-0008','s05-R-2379-AJ-0009'),
   ('s09-N-2379-AJ-0033','s09-T-2379-AJ-0010','s09-R-2379-AJ-0011'),
   ('s10-N-2379-AJ-0034','s10-T-2379-AJ-0012','s10-R-2379-AJ-0013'),
   ('s11-N-2379-AJ-0035','s11-T-2379-AJ-0014','s11-R-2379-AJ-0015'),
   ('s12-N-2379-AJ-0036','s12-T-2379-AJ-0016','s12-R-2379-AJ-0017'),
   ('s15-N-2379-AJ-0037','s15-T-2379-AJ-0018','s15-R-2379-AJ-0019'),
   ('s17-N-2379-AJ-0038','s17-T-2379-AJ-0020','s17-R-2379-AJ-0021'),
   ('s18-N-2379-AJ-0039','s18-T-2379-AJ-0022','s18-R-2379-AJ-0023'),
   ('s19-N-2379-AJ-0040','s19-T-2379-AJ-0024','s19-R-2379-AJ-0025'),
   ('s20-N-2379-AJ-0041','s20-T-2379-AJ-0026','s20-R-2379-AJ-0027'),
   ('s21-N-2379-AJ-0042','s21-T-2379-AJ-0028','s21-R-2379-AJ-0029')]

 normal_samples = \
  ['s02-N-2379-AJ-0030',
   's03-N-2379-AJ-0031',
   's04-N-2379-AJ-0032',
   's05-N-2379-AJ-0007',
   's09-N-2379-AJ-0033',
   's10-N-2379-AJ-0034',
   's11-N-2379-AJ-0035',
   's12-N-2379-AJ-0036',
   's15-N-2379-AJ-0037',
   's17-N-2379-AJ-0038',
   's18-N-2379-AJ-0039',
   's19-N-2379-AJ-0040',
   's20-N-2379-AJ-0041',
   's21-N-2379-AJ-0042']

 query = "SELECT chrom, start, end, ref, alt, type, sub_type, qual, gene, rs_ids,\
                impact, impact_severity, num_het, num_hom_alt, \
                gt_ref_depths, gt_alt_depths \
         FROM variants \
         WHERE on_probes=1 \
           AND call_rate >= 0.95 \
           AND aaf_1kg_all is NULL \
           AND aaf_esp_all is NULL \
           AND qual >= 5"


 def in_normals(gt_types, smp2idx):
  for normal in normal_samples:
    norm_idx = smp2idx[normal]
    if gt_types[norm_idx] != HOM_REF and gt_types[norm_idx] != UNKNOWN:
      return True
  return False

 smp2idx = gq.sample_to_idx
 idx2smp = gq.idx_to_sample
 gq.run(query, show_variant_samples=True)

 print '\t'.join(['sample', 'chrom', 'start', 'end', 'qual', 'ref', \
                'alt', 'gene', 'type', 'sub_type', 'rs_ids', \
                'impact', 'impact_severity', 'num_het', 'ref_norm', \
                'ref_tum', 'ref_resist', 'alt_norm', 'alt_tum', 'alt_resist'])
 for row in gq:
  gt_types = row['gt_types']
  gt_depths = row['gt_depths']
  gt_ref_depths = row['gt_ref_depths']
  gt_alt_depths = row['gt_alt_depths']

  if in_normals(gt_types, smp2idx):
    continue
  
  for patient in patient_samples:
    normal = patient[0]
    tumor = patient[1]
    resis = patient[2]

    normal_idx = smp2idx[normal]
    tumor_idx = smp2idx[tumor]
    resis_idx = smp2idx[resis]

    if gt_types[normal_idx] == HOM_REF and \
       ((gt_types[tumor_idx] != HOM_REF and gt_types[tumor_idx] != UNKNOWN) or (gt_types[resis_idx] != HOM_REF and gt_types[resis_idx] != UNKNOWN)) and  \
       gt_alt_depths[normal_idx] == 0 and \
       (gt_alt_depths[tumor_idx] > 1 or gt_alt_depths[resis_idx] > 1) and \
       gt_depths[normal_idx] >= 20 and \
       gt_depths[tumor_idx] >= 20 and \
       gt_depths[resis_idx] >= 20:

      print '\t'.join(str(s) for s in [patient[0][0:3], row['chrom'], row['start'], row['end'], row['qual'], \
              row['ref'], row['alt'], row['gene'], row['type'], row['sub_type'], row['rs_ids'], row['impact'], row['impact_severity'], \
              row['num_het'],gt_ref_depths[normal_idx], gt_ref_depths[tumor_idx], gt_ref_depths[resis_idx], \
              gt_alt_depths[normal_idx], gt_alt_depths[tumor_idx], gt_alt_depths[resis_idx]])


diff --git a/snpfreq.py b/snpfreq.py
 #snpfreq.py by Jim Havrilla @ quinlanlab @ UVA
 #gets snp frequency distribution plots for samples individually (patients) and altogether

 #table
 #A-C > T-G
 #A-G > T-C
 #A-T > T-A
 #C-A > C-A
 #C-G > C-G
 #C-T > C-T
 #G-A > C-T
 #G-C > C-G
 #G-T > C-A
 #T-A > T-A
 #T-C > T-C
 #T-G > T-G

 import fileinput
 import sys
 import rpy2.robjects as ro
 from rpy2.robjects import r
 from collections import Counter
 from rpy2.robjects.packages import importr

 gd=importr('grDevices')

 l=[]

 #take in file
 for line in fileinput.input("mutations.txt"):
 		foo=line.rstrip().split("\t")
 		if foo[8] == "snp":
 			if foo[5]=="A" and foo[6]=="C":
 				foo=(foo[0],"T->G")
 			elif foo[5]=="A" and foo[6]=="G":
 				foo=(foo[0],"T->C")
 			elif foo[5]=="A" and foo[6]=="T":
 				foo=(foo[0],"T->A")
 			elif foo[5]=="G" and foo[6]=="A":
 				foo=(foo[0],"C->T")
 			elif foo[5]=="G" and foo[6]=="C":
 				foo=(foo[0],"C->G")
 			elif foo[5]=="G" and foo[6]=="T":
 				foo=(foo[0],"C->A")
 			else:
 				foo=(foo[0],foo[5]+"->"+foo[6])
 			l.append(foo)
 #sort by sample (patient)
 l.sort()
 #get counts for each sample (patient) and plot
 m=[]
 new=l[0][0]
 for (t1,t2) in l:
 	if t1!=new:
 		c=Counter(m)
 		gd.jpeg("plot"+new+".jpeg")
 		r.barplot(ro.IntVector(c.values()),names=c.keys(),main="Mutational signatures in human cancer - "+new,legend=c.keys(),col=r.c("darkblue","red","black","yellow","green","purple"),xlab="snp type",ylab="number of snps")
 		gd.dev_off()
 		new=t1
 		m=[]
 	m.append(t2)
 #plot from R for all samples (patients)
 c=Counter([t2 for (t1,t2) in l])
 gd.jpeg("allplot.jpeg")
 r.barplot(ro.IntVector(c.values()),names=c.keys(),main="Mutational signatures in human cancer",legend=c.keys(),col=r.c("darkblue","red","black","yellow","green","purple"),xlab="snp type",ylab="number of snps")
 gd.dev_off()
	#!/usr/bin/env python
	import sys
	import numpy as np

	from gemini import GeminiQuery
	from gemini.gemini_constants import *

	gq = GeminiQuery(sys.argv[1])

	patient_samples = \
	[('s02-N-2379-AJ-0030','s02-T-2379-AJ-0001','s02-R-2379-AJ-0002'),
	('s03-N-2379-AJ-0031','s03-T-2379-AJ-0003','s03-R-2379-AJ-0004'),
	('s04-N-2379-AJ-0032','s04-T-2379-AJ-0005','s04-R-2379-AJ-0006'),
	('s05-N-2379-AJ-0007','s05-T-2379-AJ-0008','s05-R-2379-AJ-0009'),
	('s09-N-2379-AJ-0033','s09-T-2379-AJ-0010','s09-R-2379-AJ-0011'),
	('s10-N-2379-AJ-0034','s10-T-2379-AJ-0012','s10-R-2379-AJ-0013'),
	('s11-N-2379-AJ-0035','s11-T-2379-AJ-0014','s11-R-2379-AJ-0015'),
	('s12-N-2379-AJ-0036','s12-T-2379-AJ-0016','s12-R-2379-AJ-0017'),
	('s15-N-2379-AJ-0037','s15-T-2379-AJ-0018','s15-R-2379-AJ-0019'),
	('s17-N-2379-AJ-0038','s17-T-2379-AJ-0020','s17-R-2379-AJ-0021'),
	('s18-N-2379-AJ-0039','s18-T-2379-AJ-0022','s18-R-2379-AJ-0023'),
	('s19-N-2379-AJ-0040','s19-T-2379-AJ-0024','s19-R-2379-AJ-0025'),
	('s20-N-2379-AJ-0041','s20-T-2379-AJ-0026','s20-R-2379-AJ-0027'),
	('s21-N-2379-AJ-0042','s21-T-2379-AJ-0028','s21-R-2379-AJ-0029')]

	normal_samples = \
	['s02-N-2379-AJ-0030',
	's03-N-2379-AJ-0031',
	's04-N-2379-AJ-0032',
	's05-N-2379-AJ-0007',
	's09-N-2379-AJ-0033',
	's10-N-2379-AJ-0034',
	's11-N-2379-AJ-0035',
	's12-N-2379-AJ-0036',
	's15-N-2379-AJ-0037',
	's17-N-2379-AJ-0038',
	's18-N-2379-AJ-0039',
	's19-N-2379-AJ-0040',
	's20-N-2379-AJ-0041',
	's21-N-2379-AJ-0042']

	query = "SELECT chrom, start, end, ref, alt, type, sub_type, qual, gene, rs_ids,\
	impact, impact_severity, num_het, num_hom_alt, \
	gt_ref_depths, gt_alt_depths \
	FROM variants \
	WHERE on_probes=1 \
	AND call_rate >= 0.95 \
	AND aaf_1kg_all is NULL \
	AND aaf_esp_all is NULL \
	AND qual >= 5"


	def in_normals(gt_types, smp2idx):
	for normal in normal_samples:
	norm_idx = smp2idx[normal]
	if gt_types[norm_idx] != HOM_REF and gt_types[norm_idx] != UNKNOWN:
	return True
	return False

	smp2idx = gq.sample_to_idx
	idx2smp = gq.idx_to_sample
	gq.run(query, show_variant_samples=True)

	print '\t'.join(['sample', 'chrom', 'start', 'end', 'qual', 'ref', \
	'alt', 'gene', 'type', 'sub_type', 'rs_ids', \
	'impact', 'impact_severity', 'num_het', 'ref_norm', \
	'ref_tum', 'ref_resist', 'alt_norm', 'alt_tum', 'alt_resist'])
	for row in gq:
	gt_types = row['gt_types']
	gt_depths = row['gt_depths']
	gt_ref_depths = row['gt_ref_depths']
	gt_alt_depths = row['gt_alt_depths']

	if in_normals(gt_types, smp2idx):
	continue

	for patient in patient_samples:
	normal = patient[0]
	tumor = patient[1]
	resis = patient[2]

	normal_idx = smp2idx[normal]
	tumor_idx = smp2idx[tumor]
	resis_idx = smp2idx[resis]

	if gt_types[normal_idx] == HOM_REF and \
	((gt_types[tumor_idx] != HOM_REF and gt_types[tumor_idx] != UNKNOWN) or (gt_types[resis_idx] != HOM_REF and gt_types[resis_idx] != UNKNOWN)) and \
	gt_alt_depths[normal_idx] == 0 and \
	(gt_alt_depths[tumor_idx] > 1 or gt_alt_depths[resis_idx] > 1) and \
	gt_depths[normal_idx] >= 20 and \
	gt_depths[tumor_idx] >= 20 and \
	gt_depths[resis_idx] >= 20:

	print '\t'.join(str(s) for s in [patient[0][0:3], row['chrom'], row['start'], row['end'], row['qual'], \
	row['ref'], row['alt'], row['gene'], row['type'], row['sub_type'], row['rs_ids'], row['impact'], row['impact_severity'], \
	row['num_het'],gt_ref_depths[normal_idx], gt_ref_depths[tumor_idx], gt_ref_depths[resis_idx], \
	gt_alt_depths[normal_idx], gt_alt_depths[tumor_idx], gt_alt_depths[resis_idx]])
	#snpfreq.py by Jim Havrilla @ quinlanlab @ UVA
	#gets snp frequency distribution plots for samples individually (patients) and altogether

	#table
	#A-C > T-G
	#A-G > T-C
	#A-T > T-A
	#C-A > C-A
	#C-G > C-G
	#C-T > C-T
	#G-A > C-T
	#G-C > C-G
	#G-T > C-A
	#T-A > T-A
	#T-C > T-C
	#T-G > T-G

	import fileinput
	import sys
	import rpy2.robjects as ro
	from rpy2.robjects import r
	from collections import Counter
	from rpy2.robjects.packages import importr

	gd=importr('grDevices')

	l=[]

	#take in file
	for line in fileinput.input("mutations.txt"):
	foo=line.rstrip().split("\t")
	if foo[8] == "snp":
	if foo[5]=="A" and foo[6]=="C":
	foo=(foo[0],"T->G")
	elif foo[5]=="A" and foo[6]=="G":
	foo=(foo[0],"T->C")
	elif foo[5]=="A" and foo[6]=="T":
	foo=(foo[0],"T->A")
	elif foo[5]=="G" and foo[6]=="A":
	foo=(foo[0],"C->T")
	elif foo[5]=="G" and foo[6]=="C":
	foo=(foo[0],"C->G")
	elif foo[5]=="G" and foo[6]=="T":
	foo=(foo[0],"C->A")
	else:
	foo=(foo[0],foo[5]+"->"+foo[6])
	l.append(foo)
	#sort by sample (patient)
	l.sort()
	#get counts for each sample (patient) and plot
	m=[]
	new=l[0][0]
	for (t1,t2) in l:
	if t1!=new:
	c=Counter(m)
	gd.jpeg("plot"+new+".jpeg")
	r.barplot(ro.IntVector(c.values()),names=c.keys(),main="Mutational signatures in human cancer - "+new,legend=c.keys(),col=r.c("darkblue","red","black","yellow","green","purple"),xlab="snp type",ylab="number of snps")
	gd.dev_off()
	new=t1
	m=[]
	m.append(t2)
	#plot from R for all samples (patients)
	c=Counter([t2 for (t1,t2) in l])
	gd.jpeg("allplot.jpeg")
	r.barplot(ro.IntVector(c.values()),names=c.keys(),main="Mutational signatures in human cancer",legend=c.keys(),col=r.c("darkblue","red","black","yellow","green","purple"),xlab="snp type",ylab="number of snps")
	gd.dev_off()