cbare · February 14, 2015 00:23
diff --git a/pilot-63_annotation_cleaning_code_scraps.py b/pilot-63_annotation_cleaning_code_scraps.py
 from itertools import izip
 import argparse
 import os
 import re
 import sys
 import synapseclient
 import synapseclient.utils as utils
 from synapseclient import Project, File, Folder, Activity
 from synapseclient import Schema, Column, Table, Row, RowSet, as_table_columns
 import pandas as pd
 import numpy as np
 import matplotlib.pyplot as plt
 import matplotlib.cm as cm


 ## names of the built-in Synapse entity properties; query2df() drops these
 ## from query results unless asked to keep them
 property_keys = [
     'id',
     'name',
     'description',
     'modifiedOn',
     'modifiedByPrincipalId',
     'createdOn',
     'createdByPrincipalId',
     'versionLabel',
     'versionComment',
     'versionNumber',
     'nodeType',
     'concreteType',
     'benefactorId',
     'parentId',
     'eTag',
 ]

 ## remove the annoying prefixes from synapse query results
 def unlist_singletons(value):
     """Unwrap a one-element list to its only item; return anything else as is."""
     is_singleton = isinstance(value, list) and len(value) == 1
     return value[0] if is_singleton else value

 def clean_key(key):
     """Remove the prefix that Synapse queries add to field names.

     Everything up to and including the first '.' is dropped, so
     'file.name' becomes 'name' and 'file.a.b' becomes 'a.b'.  A key
     that contains no '.' is returned unchanged rather than raising
     ValueError.
     """
     _prefix, separator, new_key = key.partition('.')
     return new_key if separator else key

 def clean_result(result):
     """Return a new dict built from one query result.

     Every key is passed through clean_key() and every value through
     unlist_singletons(); the input dict is not modified.
     """
     # items() instead of the Python-2-only iteritems(): same result on
     # Python 2 and also valid on Python 3
     return {clean_key(key): unlist_singletons(value)
             for key, value in result.items()}

 def filter_keys(dictionary, keys_to_remove):
     """Return a copy of `dictionary` without the keys in `keys_to_remove`.

     Arguments:
     - `dictionary`: the dict to filter; it is not modified
     - `keys_to_remove`: any container supporting `in`
     """
     # items() instead of the Python-2-only iteritems(): same result on
     # Python 2 and also valid on Python 3
     return {key: value for key, value in dictionary.items()
             if key not in keys_to_remove}

 def clean_query_results(results, keys_to_remove=()):
     """Clean every result of a Synapse query.

     Arguments:
     - `results`: iterable of result dicts
     - `keys_to_remove`: cleaned key names to drop from every result;
       nothing is dropped by default

     Returns a list with one cleaned dict per result: keys and values are
     normalised by clean_result(), then filtered by filter_keys().
     """
     # the default is an immutable empty tuple rather than a shared list
     return [filter_keys(clean_result(result), keys_to_remove)
             for result in results]

 def query2df(queryContent, keepSynapseFields=('id', 'name')):
     """Converts the returned query object from Synapse into a Pandas DataFrame

     Arguments:
     - `queryContent`: content returned from query or chunkedQuery
     - `keepSynapseFields`: Synapse properties (those listed in
       `property_keys`) are removed from query results except those named
       here; any iterable of names is accepted

     Returns a DataFrame built from the cleaned result dicts.
     """
     # build the lookup set once, not once per property key
     keep = set(keepSynapseFields)
     keys_to_remove = set(k for k in property_keys if k not in keep)
     return pd.DataFrame(clean_query_results(queryContent,
                                             keys_to_remove=keys_to_remove))


 ## Synapse ID matched against `benefactorId` in the file query further down
 ## (presumably the project that holds the source data -- TODO confirm)
 DATA_SOURCE_PROJECT_ID = "syn2351328"



 class Source(object):
     """A center's name paired with the Synapse ID of its folder."""

     def __init__(self, name, folder_id):
         self.name, self.folder_id = name, folder_id

     def __repr__(self):
         template = "Source(name=\"{name}\", folder_id=\"{folder_id}\")"
         return template.format(**vars(self))

 ## one Source per center, built from (center name, Synapse folder ID) pairs
 sources = list(Source(*entry) for entry in (
     #  Center        Folder
     ("Broad",      "syn3165121"),
     ("BSC",        "syn3165143"),
     ("DKFZ",       "syn3104289"),
     ("EMBL",       "syn3153529"),
     ("McGill",     "syn3165151"),
     ("MDA_HGSC",   "syn3167886"),
     ("MDA_KChen",  "syn3165149"),
     ("OICR",       "syn3167076"),
     ("SANGER",     "syn3155834"),
     ("SFU",        "syn3165152"),
     ("UCSC",       "syn3107237"),
     ("WUSTL",      "syn3165146"),
     ("Yale",       "syn3165120"),
 ))

 ## query the files in each center's folder, tag every row with the center
 ## name, and stack everything into one DataFrame
 ## NOTE(review): `syn` is not created anywhere in this file; it is assumed
 ## to be an already logged-in Synapse client
 dfs = []
 for source in sources:
     results = syn.chunkedQuery('select * from file where parentId=="%s"' % source.folder_id)
     df = query2df(results)
     # single-argument print: same output on Python 2, valid on Python 3
     print("%s %s" % (source.name, df.shape))
     df['source'] = source.name
     dfs.append(df)

 df_all = pd.concat(dfs, ignore_index=True)

 ## find duplicates where an equivalent .vcf and .vcf.gz file both exist
 def is_dup(name, names=None):
     """Return True if `name` is a .vcf file whose .vcf.gz twin also exists.

     Arguments:
     - `name`: file name to test
     - `names`: container of all known file names; defaults to the `name`
       column of `df_all`, looked up at call time
     """
     if names is None:
         names = df_all.name.values
     return name.endswith('.vcf') and (name + ".gz") in names

 ## build the set of names once so each membership test is a hash lookup
 ## instead of a scan of the whole column
 dups = df_all.name.apply(is_dup, names=set(df_all.name.values))

 ## keep rows that have a center, are somatic VCFs, and are not a .vcf that
 ## also exists as .vcf.gz (the mask is `dups`; the original `dup` was an
 ## undefined name).  .loc replaces the deprecated .ix for boolean selection.
 df = df_all.loc[ (~df_all.center.isnull()) & (df_all.fileType=='vcf') & (df_all.call_type=='somatic') & (~dups), :]

 ## NOTE(review): this rebinds `df`, discarding the filtered frame above
 df = query2df(syn.chunkedQuery('select * from file where benefactorId=="%s"' % DATA_SOURCE_PROJECT_ID))

 annotation_keys = ['center', 'center_name', 'workflow_name', 'variant_workflow',
                    'call_type', 'dataType', 'dataSubType', 'variant_type', 'fileType']

 ## Counter is used below but was never imported
 from collections import Counter

 ## for each per-center frame, print how often each annotation value occurs
 for df in dfs:
     # Counter(...).most_common() yields (value, count) pairs, most common
     # first; apply() runs it over each column of the frame
     series = df.apply(lambda x: Counter(x).most_common())
     # single-argument prints: same output on Python 2, valid on Python 3
     print("")
     print(series['source'])

     for key in annotation_keys:
         if key in series:
             print("%s = %s" % (key, series[key]))


 ## cross-tabulate `center` against center_name, workflow_name and call_type;
 ## NaN is replaced by the string 'NA' in both columns first (presumably so
 ## that rows missing an annotation still appear in the table -- TODO confirm)
 ## The results are not assigned, so they are only visible when run interactively.
 pd.crosstab(df_all['center'].replace(float('NaN'), 'NA'), df_all['center_name'].replace(float('NaN'), 'NA'))

 pd.crosstab(df_all['center'].replace(float('NaN'), 'NA'), df_all['workflow_name'].replace(float('NaN'), 'NA'))

 pd.crosstab(df_all['center'].replace(float('NaN'), 'NA'), df_all['call_type'].replace(float('NaN'), 'NA'))




 def find_sample_id_source(sample_id):
     """Name every column of `sample_df` in which `sample_id` occurs.

     Returns a tuple of column names in column order; the tuple is empty
     when the ID appears in no column.
     """
     return tuple(col for col in sample_df.columns
                  if sample_id in sample_df[col].values)


 ## load every row of Synapse table syn2887105 into a DataFrame; the code
 ## below reads its 'Donor ID' and 'Tumour Analysis ID' columns
 results = syn.tableQuery('select * from syn2887105')
 sample_df = results.asDataFrame()

 ## figure out which centers named their samples with which IDs
 ## (each sample_id maps to the tuple of sample_df columns containing it)
 sample_id_sources = df_all.sample_id.apply(find_sample_id_source)
 ## rows: matching column tuple, columns: center; output is pasted below
 pd.crosstab(sample_id_sources, df_all.source)

 # In [254]: pd.crosstab(sample_id_sources, df_all.source)
 # Out[254]: 
 # source                                   BSC  Broad  DKFZ  EMBL  MDA_HGSC  \
 # sample_id                                                                   
 # ()                                         0      0     0     0         0   
 # (Donor ID,)                                0      0     0     0         0   
 # (Normal Analysis ID,)                      0      0     0     0         0   
 # (Tumour Analysis ID,)                    567    998  1068   252       108   
 # (Tumour Analyzed Sample/Aliquot GUUID,)    0      0     0     0         0   

 # source                                   MDA_KChen  McGill  OICR  SANGER  SFU  \
 # sample_id                                                                       
 # ()                                               0       0     0    9024    0   
 # (Donor ID,)                                      0       0     0       0    0   
 # (Normal Analysis ID,)                            0       0     0       0    0   
 # (Tumour Analysis ID,)                          216      78   252       0  126   
 # (Tumour Analyzed Sample/Aliquot GUUID,)          0       0     0     896    0   

 # source                                   UCSC  WUSTL  Yale  
 # sample_id                                                   
 # ()                                          0      6     0  
 # (Donor ID,)                               165      0     0  
 # (Normal Analysis ID,)                       0      0    59  
 # (Tumour Analysis ID,)                       0    358     0  
 # (Tumour Analyzed Sample/Aliquot GUUID,)     0      0     0  


 ## fix UCSC sample_id -> donor_id and Tumour Analysis ID -> analysis_id_tumor
 ## (syn3107237 is the UCSC folder listed in `sources`)
 results = clean_query_results(syn.chunkedQuery('select id, name from file where parentId=="syn3107237"'))
 for result in results:
     # single-argument print: same output on Python 2, valid on Python 3
     print("%s %s" % (result['id'], result['name']))
     e = syn.get(result['id'], downloadFile=False)
     a = e.annotations
     # UCSC put the donor ID in sample_id (see the crosstab output above)
     a['donor_id'] = a['sample_id']
     # .values[0] takes the first matching row by position; a bare [0] is a
     # label lookup when the frame has an integer index and then fails
     # unless the match happens to carry label 0
     a['analysis_id_tumor'] = sample_df['Tumour Analysis ID'][ sample_df['Donor ID']==e.sample_id[0] ].values[0]
     print(syn.setAnnotations(e, a))


 ## fix sample_id -> analysis_id_tumor and Donor ID -> donor_id
 ## `already_done` was used without ever being defined in this file;
 ## presumably it holds the IDs handled by an earlier, interrupted pass.
 ## Start with an empty set when the session has not provided one.
 if 'already_done' not in globals():
     already_done = set()

 for source in sources:
     # only the centers whose sample_id is a Tumour Analysis ID
     if source.name not in ("BSC", "Broad", "DKFZ", "EMBL", "MDA_HGSC", "MDA_KChen", "McGill", "OICR", "SFU", "WUSTL"):
         continue
     results = clean_query_results(syn.chunkedQuery('select id, name from file where parentId=="%s"' % source.folder_id))
     for result in results:
         if result['id'] in already_done:
             continue
         # single-argument print: same output on Python 2, valid on Python 3
         print("%s %s" % (result['id'], result['name']))
         e = syn.get(result['id'], downloadFile=False)
         a = e.annotations
         a['analysis_id_tumor'] = a['sample_id']
         # .values[0] takes the first matching row by position; a bare [0]
         # is a label lookup when the frame has an integer index
         a['donor_id'] = sample_df['Donor ID'][ sample_df['Tumour Analysis ID']==e.sample_id[0] ].values[0]
         print(syn.setAnnotations(e, a))
	from itertools import izip
	import argparse
	import os
	import re
	import sys
	import synapseclient
	import synapseclient.utils as utils
	from synapseclient import Project, File, Folder, Activity
	from synapseclient import Schema, Column, Table, Row, RowSet, as_table_columns
	import pandas as pd
	import numpy as np
	import matplotlib.pyplot as plt
	import matplotlib.cm as cm


	## names of the built-in Synapse entity properties; query2df() drops these
	## from query results unless asked to keep them
	property_keys = [
	    'id',
	    'name',
	    'description',
	    'modifiedOn',
	    'modifiedByPrincipalId',
	    'createdOn',
	    'createdByPrincipalId',
	    'versionLabel',
	    'versionComment',
	    'versionNumber',
	    'nodeType',
	    'concreteType',
	    'benefactorId',
	    'parentId',
	    'eTag',
	]

	## remove the annoying prefixes from synapse query results
	def unlist_singletons(value):
	    """Unwrap a one-element list to its only item; return anything else as is."""
	    # body indentation restored; the flattened original did not parse
	    if isinstance(value, list) and len(value) == 1:
	        return value[0]
	    return value

	def clean_key(key):
	    """Remove the prefix that Synapse queries add to field names.

	    Everything up to and including the first '.' is dropped, so
	    'file.name' becomes 'name' and 'file.a.b' becomes 'a.b'.  A key
	    that contains no '.' is returned unchanged rather than raising
	    ValueError.
	    """
	    _prefix, separator, new_key = key.partition('.')
	    return new_key if separator else key

	def clean_result(result):
	    """Return a new dict built from one query result.

	    Every key is passed through clean_key() and every value through
	    unlist_singletons(); the input dict is not modified.
	    """
	    # items() instead of the Python-2-only iteritems(): same result on
	    # Python 2 and also valid on Python 3
	    return {clean_key(key): unlist_singletons(value)
	            for key, value in result.items()}

	def filter_keys(dictionary, keys_to_remove):
	    """Return a copy of `dictionary` without the keys in `keys_to_remove`.

	    Arguments:
	    - `dictionary`: the dict to filter; it is not modified
	    - `keys_to_remove`: any container supporting `in`
	    """
	    # items() instead of the Python-2-only iteritems(): same result on
	    # Python 2 and also valid on Python 3
	    return {key: value for key, value in dictionary.items()
	            if key not in keys_to_remove}

	def clean_query_results(results, keys_to_remove=()):
	    """Clean every result of a Synapse query.

	    Arguments:
	    - `results`: iterable of result dicts
	    - `keys_to_remove`: cleaned key names to drop from every result;
	      nothing is dropped by default

	    Returns a list with one cleaned dict per result: keys and values are
	    normalised by clean_result(), then filtered by filter_keys().
	    """
	    # the default is an immutable empty tuple rather than a shared list
	    return [filter_keys(clean_result(result), keys_to_remove)
	            for result in results]

	def query2df(queryContent, keepSynapseFields=('id', 'name')):
	    """Converts the returned query object from Synapse into a Pandas DataFrame

	    Arguments:
	    - `queryContent`: content returned from query or chunkedQuery
	    - `keepSynapseFields`: Synapse properties (those listed in
	      `property_keys`) are removed from query results except those named
	      here; any iterable of names is accepted

	    Returns a DataFrame built from the cleaned result dicts.
	    """
	    # build the lookup set once, not once per property key
	    keep = set(keepSynapseFields)
	    keys_to_remove = set(k for k in property_keys if k not in keep)
	    return pd.DataFrame(clean_query_results(queryContent,
	                                            keys_to_remove=keys_to_remove))


	## Synapse ID matched against `benefactorId` in the file query further down
	## (presumably the project that holds the source data -- TODO confirm)
	DATA_SOURCE_PROJECT_ID = "syn2351328"



	class Source(object):
	    """A center's name paired with the Synapse ID of its folder."""

	    # method and body indentation restored; the flattened original did
	    # not parse
	    def __init__(self, name, folder_id):
	        self.name = name
	        self.folder_id = folder_id

	    def __repr__(self):
	        return "Source(name=\"{name}\", folder_id=\"{folder_id}\")".format(**self.__dict__)

	## one Source per center, built from (center name, Synapse folder ID) pairs
	sources = list(Source(*entry) for entry in (
	    #  Center        Folder
	    ("Broad",      "syn3165121"),
	    ("BSC",        "syn3165143"),
	    ("DKFZ",       "syn3104289"),
	    ("EMBL",       "syn3153529"),
	    ("McGill",     "syn3165151"),
	    ("MDA_HGSC",   "syn3167886"),
	    ("MDA_KChen",  "syn3165149"),
	    ("OICR",       "syn3167076"),
	    ("SANGER",     "syn3155834"),
	    ("SFU",        "syn3165152"),
	    ("UCSC",       "syn3107237"),
	    ("WUSTL",      "syn3165146"),
	    ("Yale",       "syn3165120"),
	))

	## query the files in each center's folder, tag every row with the center
	## name, and stack everything into one DataFrame
	## NOTE(review): `syn` is not created anywhere in this file; it is assumed
	## to be an already logged-in Synapse client
	dfs = []
	for source in sources:
	    # loop body indentation restored; the flattened original did not parse
	    results = syn.chunkedQuery('select * from file where parentId=="%s"' % source.folder_id)
	    df = query2df(results)
	    # single-argument print: same output on Python 2, valid on Python 3
	    print("%s %s" % (source.name, df.shape))
	    df['source'] = source.name
	    dfs.append(df)

	df_all = pd.concat(dfs, ignore_index=True)

	## find duplicates where an equivalent .vcf and .vcf.gz file both exist
	def is_dup(name, names=None):
	    """Return True if `name` is a .vcf file whose .vcf.gz twin also exists.

	    Arguments:
	    - `name`: file name to test
	    - `names`: container of all known file names; defaults to the `name`
	      column of `df_all`, looked up at call time
	    """
	    if names is None:
	        names = df_all.name.values
	    return name.endswith('.vcf') and (name + ".gz") in names

	## build the set of names once so each membership test is a hash lookup
	## instead of a scan of the whole column
	dups = df_all.name.apply(is_dup, names=set(df_all.name.values))

	## keep rows that have a center, are somatic VCFs, and are not a .vcf that
	## also exists as .vcf.gz (the mask is `dups`; the original `dup` was an
	## undefined name).  .loc replaces the deprecated .ix for boolean selection.
	df = df_all.loc[ (~df_all.center.isnull()) & (df_all.fileType=='vcf') & (df_all.call_type=='somatic') & (~dups), :]

	## NOTE(review): this rebinds `df`, discarding the filtered frame above
	df = query2df(syn.chunkedQuery('select * from file where benefactorId=="%s"' % DATA_SOURCE_PROJECT_ID))

	annotation_keys = ['center', 'center_name', 'workflow_name', 'variant_workflow',
	                   'call_type', 'dataType', 'dataSubType', 'variant_type', 'fileType']

	## Counter is used below but was never imported
	from collections import Counter

	## for each per-center frame, print how often each annotation value occurs
	for df in dfs:
	    # loop body indentation restored; the flattened original did not parse
	    # Counter(...).most_common() yields (value, count) pairs, most common
	    # first; apply() runs it over each column of the frame
	    series = df.apply(lambda x: Counter(x).most_common())
	    # single-argument prints: same output on Python 2, valid on Python 3
	    print("")
	    print(series['source'])

	    for key in annotation_keys:
	        if key in series:
	            print("%s = %s" % (key, series[key]))


	## cross-tabulate `center` against center_name, workflow_name and call_type;
	## NaN is replaced by the string 'NA' in both columns first (presumably so
	## that rows missing an annotation still appear in the table -- TODO confirm)
	## The results are not assigned, so they are only visible when run interactively.
	pd.crosstab(df_all['center'].replace(float('NaN'), 'NA'), df_all['center_name'].replace(float('NaN'), 'NA'))

	pd.crosstab(df_all['center'].replace(float('NaN'), 'NA'), df_all['workflow_name'].replace(float('NaN'), 'NA'))

	pd.crosstab(df_all['center'].replace(float('NaN'), 'NA'), df_all['call_type'].replace(float('NaN'), 'NA'))




	def find_sample_id_source(sample_id):
	    """Name every column of `sample_df` in which `sample_id` occurs.

	    Returns a tuple of column names in column order; the tuple is empty
	    when the ID appears in no column.
	    """
	    # body indentation restored; the flattened original did not parse
	    return tuple(col for col in sample_df.columns
	                 if sample_id in sample_df[col].values)


	## load every row of Synapse table syn2887105 into a DataFrame; the code
	## below reads its 'Donor ID' and 'Tumour Analysis ID' columns
	results = syn.tableQuery('select * from syn2887105')
	sample_df = results.asDataFrame()

	## figure out which centers named their samples with which IDs
	## (each sample_id maps to the tuple of sample_df columns containing it)
	sample_id_sources = df_all.sample_id.apply(find_sample_id_source)
	## rows: matching column tuple, columns: center; output is pasted below
	pd.crosstab(sample_id_sources, df_all.source)

	# In [254]: pd.crosstab(sample_id_sources, df_all.source)
	# Out[254]:
	# source                                   BSC  Broad  DKFZ  EMBL  MDA_HGSC  \
	# sample_id
	# ()                                         0      0     0     0         0
	# (Donor ID,)                                0      0     0     0         0
	# (Normal Analysis ID,)                      0      0     0     0         0
	# (Tumour Analysis ID,)                    567    998  1068   252       108
	# (Tumour Analyzed Sample/Aliquot GUUID,)    0      0     0     0         0

	# source                                   MDA_KChen  McGill  OICR  SANGER  SFU  \
	# sample_id
	# ()                                               0       0     0    9024    0
	# (Donor ID,)                                      0       0     0       0    0
	# (Normal Analysis ID,)                            0       0     0       0    0
	# (Tumour Analysis ID,)                          216      78   252       0  126
	# (Tumour Analyzed Sample/Aliquot GUUID,)          0       0     0     896    0

	# source                                   UCSC  WUSTL  Yale
	# sample_id
	# ()                                          0      6     0
	# (Donor ID,)                               165      0     0
	# (Normal Analysis ID,)                       0      0    59
	# (Tumour Analysis ID,)                       0    358     0
	# (Tumour Analyzed Sample/Aliquot GUUID,)     0      0     0


	## fix UCSC sample_id -> donor_id and Tumour Analysis ID -> analysis_id_tumor
	## (syn3107237 is the UCSC folder listed in `sources`)
	results = clean_query_results(syn.chunkedQuery('select id, name from file where parentId=="syn3107237"'))
	for result in results:
	    # loop body indentation restored; the flattened original did not parse
	    # single-argument print: same output on Python 2, valid on Python 3
	    print("%s %s" % (result['id'], result['name']))
	    e = syn.get(result['id'], downloadFile=False)
	    a = e.annotations
	    # UCSC put the donor ID in sample_id (see the crosstab output above)
	    a['donor_id'] = a['sample_id']
	    # .values[0] takes the first matching row by position; a bare [0] is a
	    # label lookup when the frame has an integer index and then fails
	    # unless the match happens to carry label 0
	    a['analysis_id_tumor'] = sample_df['Tumour Analysis ID'][ sample_df['Donor ID']==e.sample_id[0] ].values[0]
	    print(syn.setAnnotations(e, a))


	## fix sample_id -> analysis_id_tumor and Donor ID -> donor_id
	## `already_done` was used without ever being defined in this file;
	## presumably it holds the IDs handled by an earlier, interrupted pass.
	## Start with an empty set when the session has not provided one.
	if 'already_done' not in globals():
	    already_done = set()

	for source in sources:
	    # nesting restored; the flattened original did not parse
	    # only the centers whose sample_id is a Tumour Analysis ID
	    if source.name not in ("BSC", "Broad", "DKFZ", "EMBL", "MDA_HGSC", "MDA_KChen", "McGill", "OICR", "SFU", "WUSTL"):
	        continue
	    results = clean_query_results(syn.chunkedQuery('select id, name from file where parentId=="%s"' % source.folder_id))
	    for result in results:
	        if result['id'] in already_done:
	            continue
	        # single-argument print: same output on Python 2, valid on Python 3
	        print("%s %s" % (result['id'], result['name']))
	        e = syn.get(result['id'], downloadFile=False)
	        a = e.annotations
	        a['analysis_id_tumor'] = a['sample_id']
	        # .values[0] takes the first matching row by position; a bare [0]
	        # is a label lookup when the frame has an integer index
	        a['donor_id'] = sample_df['Donor ID'][ sample_df['Tumour Analysis ID']==e.sample_id[0] ].values[0]
	        print(syn.setAnnotations(e, a))
No results found