Skip to content

Instantly share code, notes, and snippets.

@cbare
Created February 14, 2015 00:23
Show Gist options
  • Select an option

  • Save cbare/5c4a75337ebec8b8c806 to your computer and use it in GitHub Desktop.

Select an option

Save cbare/5c4a75337ebec8b8c806 to your computer and use it in GitHub Desktop.
Code scraps for cleaning annotations on pilot-63 VCF files.
from itertools import izip
import argparse
import os
import re
import sys
import synapseclient
import synapseclient.utils as utils
from synapseclient import Project, File, Folder, Activity
from synapseclient import Schema, Column, Table, Row, RowSet, as_table_columns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
## names of the Synapse properties that are stripped from query results
property_keys = [
    'id',
    'name',
    'description',
    'modifiedOn',
    'modifiedByPrincipalId',
    'createdOn',
    'createdByPrincipalId',
    'versionLabel',
    'versionComment',
    'versionNumber',
    'nodeType',
    'concreteType',
    'benefactorId',
    'parentId',
    'eTag',
]
## helpers for tidying Synapse query results
def unlist_singletons(value):
    """Unwrap a one-element list to its only element.

    Any other value (longer or empty lists, tuples, scalars, ...) is
    returned unchanged.
    """
    is_singleton_list = isinstance(value, list) and len(value) == 1
    return value[0] if is_singleton_list else value
def clean_key(key):
    """Remove the prefix that Synapse queries add to field names.

    Everything up to and including the first '.' is dropped, so
    'file.id' becomes 'id' and 'file.a.b' becomes 'a.b'.  A key that
    contains no '.' has no prefix and is returned unchanged.
    """
    # Taking the last piece of the at-most-two-way split (rather than
    # unpacking it into two names) avoids a ValueError on un-prefixed keys.
    return key.split('.', 1)[-1]
def clean_result(result):
    """Return a new dict built from one query-result dict.

    Every key is passed through clean_key and every value through
    unlist_singletons; the input dict is not modified.
    """
    # items() instead of the Python-2-only iteritems(): same pairs, and it
    # also exists on Python 3 dicts.
    return {clean_key(key): unlist_singletons(value) for key, value in result.items()}
def filter_keys(dictionary, keys_to_remove):
    """Return a copy of `dictionary` without the keys listed in `keys_to_remove`.

    Arguments:
    - `dictionary`: the dict to filter; it is not modified
    - `keys_to_remove`: any container supporting `in` (list, set, tuple, ...)
    """
    # items() instead of the Python-2-only iteritems(): same pairs, and it
    # also exists on Python 3 dicts.
    return {key: value for key, value in dictionary.items() if key not in keys_to_remove}
def clean_query_results(results, keys_to_remove=[]):
    """Clean every result dict in `results` and return them as a list.

    Each result is passed through clean_result and then stripped of the
    keys named in `keys_to_remove` (nothing is stripped by default).
    The default list is only ever read, never mutated.
    """
    cleaned = []
    for raw_result in results:
        tidy = clean_result(raw_result)
        cleaned.append(filter_keys(tidy, keys_to_remove))
    return cleaned
def query2df(queryContent, keepSynapseFields=set(['id', 'name'])):
    """Converts the returned query object from Synapse into a Pandas DataFrame

    Arguments:
    - `queryContent`: content returned from query or chunkedQuery
    - `keepSynapseFields`: Synapse properties are removed from query results except those named here

    Returns a DataFrame built from the cleaned result dictionaries.
    """
    # Build the keep-set once instead of once per property key; besides the
    # wasted work, re-reading a one-shot iterable would leave it empty after
    # the first key.
    fields_to_keep = set(keepSynapseFields)
    fields_to_drop = set(k for k in property_keys if k not in fields_to_keep)
    return pd.DataFrame(clean_query_results(queryContent, keys_to_remove=fields_to_drop))
## Synapse ID used further down as the benefactorId filter of a file query
## (presumably the project that holds the source data -- confirm)
DATA_SOURCE_PROJECT_ID = "syn2351328"
class Source(object):
    """A named data source paired with the ID of its Synapse folder."""

    def __init__(self, name, folder_id):
        self.name, self.folder_id = name, folder_id

    def __repr__(self):
        # Formats from the instance attribute dict, so it shows whatever
        # `name` and `folder_id` currently hold.
        template = "Source(name=\"{name}\", folder_id=\"{folder_id}\")"
        return template.format(**vars(self))
## one entry per center: the center's name and the Synapse ID of its folder
sources = [
    Source(name="Broad", folder_id="syn3165121"),
    Source(name="BSC", folder_id="syn3165143"),
    Source(name="DKFZ", folder_id="syn3104289"),
    Source(name="EMBL", folder_id="syn3153529"),
    Source(name="McGill", folder_id="syn3165151"),
    Source(name="MDA_HGSC", folder_id="syn3167886"),
    Source(name="MDA_KChen", folder_id="syn3165149"),
    Source(name="OICR", folder_id="syn3167076"),
    Source(name="SANGER", folder_id="syn3155834"),
    Source(name="SFU", folder_id="syn3165152"),
    Source(name="UCSC", folder_id="syn3107237"),
    Source(name="WUSTL", folder_id="syn3165146"),
    Source(name="Yale", folder_id="syn3165120"),
]
## Every query below goes through `syn`, which this file never created;
## log in here (synapseclient.login() uses stored credentials) so the
## script can run on its own.
syn = synapseclient.login()

## query the files in each center's folder into one DataFrame per source,
## tag the rows with the source name, then stack them into df_all
dfs = []
for source in sources:
    results = syn.chunkedQuery('select * from file where parentId=="%s"' % source.folder_id)
    df = query2df(results)
    # single pre-formatted argument: prints the same text on Python 2 and 3
    print("%s %s" % (source.name, df.shape))
    df['source'] = source.name
    dfs.append(df)
df_all = pd.concat(dfs, ignore_index=True)
## find duplicates where an equivalent .vcf and .vcf.gz file both exist
def is_dup(name):
    """Tell whether `name` is a .vcf file whose .vcf.gz twin is also in df_all.

    Returns False for any name not ending in '.vcf'; otherwise reports
    whether name + ".gz" occurs among the values of df_all's `name` column.
    """
    if not name.endswith('.vcf'):
        return False
    gzipped_name = name + ".gz"
    return gzipped_name in df_all.name.values
## flag .vcf files that also exist as .vcf.gz, then keep only the rows that
## have a center, are VCF files with somatic calls, and are not flagged
dups = df_all.name.apply(is_dup)
# The mask must use `dups` (the Series computed above); `dup` was never
# defined.  `.loc` takes the boolean mask directly and replaces the
# deprecated `.ix` indexer.
df = df_all.loc[
    (~df_all.center.isnull())
    & (df_all.fileType == 'vcf')
    & (df_all.call_type == 'somatic')
    & (~dups),
    :
]
## query every file whose benefactorId is DATA_SOURCE_PROJECT_ID into a DataFrame
## NOTE(review): this rebinds `df`, discarding whatever it held before -- confirm that is intended
df = query2df(syn.chunkedQuery('select * from file where benefactorId=="%s"' % DATA_SOURCE_PROJECT_ID))
## annotation columns reported in the per-source summary
annotation_keys = [
    'center',
    'center_name',
    'workflow_name',
    'variant_workflow',
    'call_type',
    'dataType',
    'dataSubType',
    'variant_type',
    'fileType',
]
## for each source, print the value counts of every annotation column
# Counter is not among the imports at the top of the file, so bring it into
# scope here.
from collections import Counter

for df in dfs:
    # per column: list of (value, count) pairs, most frequent first
    series = df.apply(lambda column: Counter(column).most_common())
    # single-argument prints produce the same text on Python 2 and 3
    print("")
    print(series['source'])
    for key in annotation_keys:
        # not every source has every annotation column
        if key in series:
            print("%s = %s" % (key, series[key]))
## cross-tabulate 'center' against 'center_name', 'workflow_name' and
## 'call_type'; NaN is replaced by the string 'NA' in both columns first
## (presumably so missing values get their own row/column -- confirm)
## NOTE(review): the results are neither assigned nor printed, so they are
## only visible when these lines are run in an interactive session
pd.crosstab(df_all['center'].replace(float('NaN'), 'NA'), df_all['center_name'].replace(float('NaN'), 'NA'))
pd.crosstab(df_all['center'].replace(float('NaN'), 'NA'), df_all['workflow_name'].replace(float('NaN'), 'NA'))
pd.crosstab(df_all['center'].replace(float('NaN'), 'NA'), df_all['call_type'].replace(float('NaN'), 'NA'))
def find_sample_id_source(sample_id):
    """Return the names of the sample_df columns whose values include `sample_id`.

    The result is a tuple in sample_df column order; it is empty when no
    column contains the ID.
    """
    return tuple(
        column
        for column in sample_df.columns
        if sample_id in sample_df[column].values
    )
## load Synapse table syn2887105 into sample_df, the DataFrame whose columns
## find_sample_id_source searches
results = syn.tableQuery('select * from syn2887105')
sample_df = results.asDataFrame()
## figure out which centers named their samples with which IDs
## (for every sample_id: the tuple of sample_df columns that contain it)
sample_id_sources = df_all.sample_id.apply(find_sample_id_source)
## NOTE(review): the crosstab is neither assigned nor printed, so it is only
## visible in an interactive session; a recorded output follows below
pd.crosstab(sample_id_sources, df_all.source)
# In [254]: pd.crosstab(sample_id_sources, df_all.source)
# Out[254]:
# source BSC Broad DKFZ EMBL MDA_HGSC \
# sample_id
# () 0 0 0 0 0
# (Donor ID,) 0 0 0 0 0
# (Normal Analysis ID,) 0 0 0 0 0
# (Tumour Analysis ID,) 567 998 1068 252 108
# (Tumour Analyzed Sample/Aliquot GUUID,) 0 0 0 0 0
# source MDA_KChen McGill OICR SANGER SFU \
# sample_id
# () 0 0 0 9024 0
# (Donor ID,) 0 0 0 0 0
# (Normal Analysis ID,) 0 0 0 0 0
# (Tumour Analysis ID,) 216 78 252 0 126
# (Tumour Analyzed Sample/Aliquot GUUID,) 0 0 0 896 0
# source UCSC WUSTL Yale
# sample_id
# () 0 6 0
# (Donor ID,) 165 0 0
# (Normal Analysis ID,) 0 0 59
# (Tumour Analysis ID,) 0 358 0
# (Tumour Analyzed Sample/Aliquot GUUID,) 0 0 0
## fix UCSC sample_id -> donor_id and Tumour Analysis ID -> analysis_id_tumor
## (UCSC files carry a Donor ID in their sample_id annotation, so the tumour
## analysis ID is looked up in sample_df by donor)
results = clean_query_results(syn.chunkedQuery('select id, name from file where parentId=="syn3107237"'))
for result in results:
    # single pre-formatted argument: prints the same text on Python 2 and 3
    print("%s %s" % (result['id'], result['name']))
    # fetch only the entity metadata, not the file itself
    e = syn.get(result['id'], downloadFile=False)
    a = e.annotations
    tumour_ids = sample_df['Tumour Analysis ID'][sample_df['Donor ID'] == e.sample_id[0]]
    if len(tumour_ids) == 0:
        # nothing to map to: leave this entity untouched instead of aborting
        # the whole run part-way through
        print("no Tumour Analysis ID for donor %s; skipping %s" % (e.sample_id[0], result['id']))
        continue
    a['donor_id'] = a['sample_id']
    # .iloc[0] is the first match by position; a plain [0] is a label lookup
    # on an integer index and would miss rows whose label is not 0.
    # NOTE(review): if a donor has several rows, only the first is used -- confirm
    a['analysis_id_tumor'] = tumour_ids.iloc[0]
    print(syn.setAnnotations(e, a))
## fix sample_id -> analysis_id_tumor and Donor ID -> donor_id
## (these centers carry a Tumour Analysis ID in their sample_id annotation,
## so the donor ID is looked up in sample_df by tumour analysis ID)

## IDs of entities that must not be touched again.  This name was never
## defined in the file; it starts empty and collects every entity updated
## below.  Seed it with already-fixed IDs when resuming an interrupted run.
already_done = set()
for source in sources:
    if source.name not in ["BSC", "Broad", "DKFZ", "EMBL", "MDA_HGSC", "MDA_KChen", "McGill", "OICR", "SFU", "WUSTL"]:
        continue
    results = clean_query_results(syn.chunkedQuery('select id, name from file where parentId=="%s"' % source.folder_id))
    for result in results:
        if result['id'] in already_done:
            continue
        # single pre-formatted argument: prints the same text on Python 2 and 3
        print("%s %s" % (result['id'], result['name']))
        # fetch only the entity metadata, not the file itself
        e = syn.get(result['id'], downloadFile=False)
        a = e.annotations
        donor_ids = sample_df['Donor ID'][sample_df['Tumour Analysis ID'] == e.sample_id[0]]
        if len(donor_ids) == 0:
            # The crosstab output recorded in this file shows WUSTL samples
            # that match no sample_df column; skip those instead of crashing
            # part-way through the updates.
            print("no Donor ID for sample %s; skipping %s" % (e.sample_id[0], result['id']))
            continue
        a['analysis_id_tumor'] = a['sample_id']
        # .iloc[0] is the first match by position; a plain [0] is a label
        # lookup on an integer index and would miss rows whose label is not 0
        a['donor_id'] = donor_ids.iloc[0]
        print(syn.setAnnotations(e, a))
        already_done.add(result['id'])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment