First found somatic mutations in patient database using Gemini and Python with varying dosages of carboplatin. Moved on to finding frequency profile for each patient's SNPs in Python and plotting them individually and as a group in R.
Last active
August 29, 2015 13:58
-
-
Save jimhavrilla/10414821 to your computer and use it in GitHub Desktop.
ovarian-cancer-analysis
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import sys | |
import numpy as np | |
from gemini import GeminiQuery | |
from gemini.gemini_constants import * | |
gq = GeminiQuery(sys.argv[1]) | |
patient_samples = \ | |
[('s02-N-2379-AJ-0030','s02-T-2379-AJ-0001','s02-R-2379-AJ-0002'), | |
('s03-N-2379-AJ-0031','s03-T-2379-AJ-0003','s03-R-2379-AJ-0004'), | |
('s04-N-2379-AJ-0032','s04-T-2379-AJ-0005','s04-R-2379-AJ-0006'), | |
('s05-N-2379-AJ-0007','s05-T-2379-AJ-0008','s05-R-2379-AJ-0009'), | |
('s09-N-2379-AJ-0033','s09-T-2379-AJ-0010','s09-R-2379-AJ-0011'), | |
('s10-N-2379-AJ-0034','s10-T-2379-AJ-0012','s10-R-2379-AJ-0013'), | |
('s11-N-2379-AJ-0035','s11-T-2379-AJ-0014','s11-R-2379-AJ-0015'), | |
('s12-N-2379-AJ-0036','s12-T-2379-AJ-0016','s12-R-2379-AJ-0017'), | |
('s15-N-2379-AJ-0037','s15-T-2379-AJ-0018','s15-R-2379-AJ-0019'), | |
('s17-N-2379-AJ-0038','s17-T-2379-AJ-0020','s17-R-2379-AJ-0021'), | |
('s18-N-2379-AJ-0039','s18-T-2379-AJ-0022','s18-R-2379-AJ-0023'), | |
('s19-N-2379-AJ-0040','s19-T-2379-AJ-0024','s19-R-2379-AJ-0025'), | |
('s20-N-2379-AJ-0041','s20-T-2379-AJ-0026','s20-R-2379-AJ-0027'), | |
('s21-N-2379-AJ-0042','s21-T-2379-AJ-0028','s21-R-2379-AJ-0029')] | |
normal_samples = \ | |
['s02-N-2379-AJ-0030', | |
's03-N-2379-AJ-0031', | |
's04-N-2379-AJ-0032', | |
's05-N-2379-AJ-0007', | |
's09-N-2379-AJ-0033', | |
's10-N-2379-AJ-0034', | |
's11-N-2379-AJ-0035', | |
's12-N-2379-AJ-0036', | |
's15-N-2379-AJ-0037', | |
's17-N-2379-AJ-0038', | |
's18-N-2379-AJ-0039', | |
's19-N-2379-AJ-0040', | |
's20-N-2379-AJ-0041', | |
's21-N-2379-AJ-0042'] | |
query = "SELECT chrom, start, end, ref, alt, type, sub_type, qual, gene, rs_ids,\ | |
impact, impact_severity, num_het, num_hom_alt, \ | |
gt_ref_depths, gt_alt_depths \ | |
FROM variants \ | |
WHERE on_probes=1 \ | |
AND call_rate >= 0.95 \ | |
AND aaf_1kg_all is NULL \ | |
AND aaf_esp_all is NULL \ | |
AND qual >= 5" | |
def in_normals(gt_types, smp2idx): | |
for normal in normal_samples: | |
norm_idx = smp2idx[normal] | |
if gt_types[norm_idx] != HOM_REF and gt_types[norm_idx] != UNKNOWN: | |
return True | |
return False | |
smp2idx = gq.sample_to_idx | |
idx2smp = gq.idx_to_sample | |
gq.run(query, show_variant_samples=True) | |
print '\t'.join(['sample', 'chrom', 'start', 'end', 'qual', 'ref', \ | |
'alt', 'gene', 'type', 'sub_type', 'rs_ids', \ | |
'impact', 'impact_severity', 'num_het', 'ref_norm', \ | |
'ref_tum', 'ref_resist', 'alt_norm', 'alt_tum', 'alt_resist']) | |
for row in gq: | |
gt_types = row['gt_types'] | |
gt_depths = row['gt_depths'] | |
gt_ref_depths = row['gt_ref_depths'] | |
gt_alt_depths = row['gt_alt_depths'] | |
if in_normals(gt_types, smp2idx): | |
continue | |
for patient in patient_samples: | |
normal = patient[0] | |
tumor = patient[1] | |
resis = patient[2] | |
normal_idx = smp2idx[normal] | |
tumor_idx = smp2idx[tumor] | |
resis_idx = smp2idx[resis] | |
if gt_types[normal_idx] == HOM_REF and \ | |
((gt_types[tumor_idx] != HOM_REF and gt_types[tumor_idx] != UNKNOWN) or (gt_types[resis_idx] != HOM_REF and gt_types[resis_idx] != UNKNOWN)) and \ | |
gt_alt_depths[normal_idx] == 0 and \ | |
(gt_alt_depths[tumor_idx] > 1 or gt_alt_depths[resis_idx] > 1) and \ | |
gt_depths[normal_idx] >= 20 and \ | |
gt_depths[tumor_idx] >= 20 and \ | |
gt_depths[resis_idx] >= 20: | |
print '\t'.join(str(s) for s in [patient[0][0:3], row['chrom'], row['start'], row['end'], row['qual'], \ | |
row['ref'], row['alt'], row['gene'], row['type'], row['sub_type'], row['rs_ids'], row['impact'], row['impact_severity'], \ | |
row['num_het'],gt_ref_depths[normal_idx], gt_ref_depths[tumor_idx], gt_ref_depths[resis_idx], \ | |
gt_alt_depths[normal_idx], gt_alt_depths[tumor_idx], gt_alt_depths[resis_idx]]) | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#snpfreq.py by Jim Havrilla @ quinlanlab @ UVA | |
#gets snp frequency distribution plots for samples individually (patients) and altogether | |
#table | |
#A-C > T-G | |
#A-G > T-C | |
#A-T > T-A | |
#C-A > C-A | |
#C-G > C-G | |
#C-T > C-T | |
#G-A > C-T | |
#G-C > C-G | |
#G-T > C-A | |
#T-A > T-A | |
#T-C > T-C | |
#T-G > T-G | |
import fileinput | |
import sys | |
import rpy2.robjects as ro | |
from rpy2.robjects import r | |
from collections import Counter | |
from rpy2.robjects.packages import importr | |
gd=importr('grDevices') | |
l=[] | |
#take in file | |
for line in fileinput.input("mutations.txt"): | |
foo=line.rstrip().split("\t") | |
if foo[8] == "snp": | |
if foo[5]=="A" and foo[6]=="C": | |
foo=(foo[0],"T->G") | |
elif foo[5]=="A" and foo[6]=="G": | |
foo=(foo[0],"T->C") | |
elif foo[5]=="A" and foo[6]=="T": | |
foo=(foo[0],"T->A") | |
elif foo[5]=="G" and foo[6]=="A": | |
foo=(foo[0],"C->T") | |
elif foo[5]=="G" and foo[6]=="C": | |
foo=(foo[0],"C->G") | |
elif foo[5]=="G" and foo[6]=="T": | |
foo=(foo[0],"C->A") | |
else: | |
foo=(foo[0],foo[5]+"->"+foo[6]) | |
l.append(foo) | |
#sort by sample (patient) | |
l.sort() | |
#get counts for each sample (patient) and plot | |
m=[] | |
new=l[0][0] | |
for (t1,t2) in l: | |
if t1!=new: | |
c=Counter(m) | |
gd.jpeg("plot"+new+".jpeg") | |
r.barplot(ro.IntVector(c.values()),names=c.keys(),main="Mutational signatures in human cancer - "+new,legend=c.keys(),col=r.c("darkblue","red","black","yellow","green","purple"),xlab="snp type",ylab="number of snps") | |
gd.dev_off() | |
new=t1 | |
m=[] | |
m.append(t2) | |
#plot from R for all samples (patients) | |
c=Counter([t2 for (t1,t2) in l]) | |
gd.jpeg("allplot.jpeg") | |
r.barplot(ro.IntVector(c.values()),names=c.keys(),main="Mutational signatures in human cancer",legend=c.keys(),col=r.c("darkblue","red","black","yellow","green","purple"),xlab="snp type",ylab="number of snps") | |
gd.dev_off() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment