Created
February 14, 2015 00:23
-
-
Save cbare/5c4a75337ebec8b8c806 to your computer and use it in GitHub Desktop.
Code scraps for cleaning annotations on pilot-63 VCF files.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from collections import Counter
from itertools import izip
import argparse
import os
import re
import sys

import matplotlib.cm as cm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import synapseclient
import synapseclient.utils as utils
from synapseclient import Project, File, Folder, Activity
from synapseclient import Schema, Column, Table, Row, RowSet, as_table_columns
## used to remove properties from Synapse query results:
## names of built-in entity properties (as opposed to user annotations)
property_keys = [
    'id', 'name', 'description',
    'modifiedOn', 'modifiedByPrincipalId',
    'createdOn', 'createdByPrincipalId',
    'versionLabel', 'versionComment', 'versionNumber',
    'nodeType', 'concreteType',
    'benefactorId', 'parentId', 'eTag',
]
## remove the annoying prefixes from synapse query results


def unlist_singletons(value):
    """Unwrap a one-element list; return any other value unchanged.

    Arguments:
    - `value`: any object

    Returns `value[0]` when `value` is a list of length one, otherwise
    `value` itself (empty lists and longer lists are left as lists).
    """
    if isinstance(value, list) and len(value) == 1:
        return value[0]
    return value
def clean_key(key):
    """Remove the prefix that Synapse queries add to field names.

    Everything up to and including the first '.' is dropped, so
    "file.sample_id" becomes "sample_id". Only the first '.' is treated as
    the separator; later dots are kept. A key that contains no '.' has no
    prefix to strip and is returned unchanged.
    """
    # partition never raises, unlike unpacking the result of split('.', 1)
    _prefix, separator, new_key = key.partition('.')
    return new_key if separator else key
def clean_result(result):
    """Return a copy of one query-result dict with tidied keys and values.

    Each key is passed through `clean_key` and each value through
    `unlist_singletons`. The input dict is not modified.
    """
    # .items() works on both Python 2 and Python 3 (iteritems is Python 2 only)
    return {clean_key(key): unlist_singletons(value)
            for key, value in result.items()}
def filter_keys(dictionary, keys_to_remove):
    """Return a new dict holding the entries of `dictionary` whose key is
    not in `keys_to_remove`.

    Arguments:
    - `dictionary`: the dict to filter; it is not modified
    - `keys_to_remove`: any container supporting `in` (list, tuple, set, ...)
    """
    # .items() works on both Python 2 and Python 3 (iteritems is Python 2 only)
    return {key: value
            for key, value in dictionary.items()
            if key not in keys_to_remove}
def clean_query_results(results, keys_to_remove=()):
    """Clean every result dict of a query and drop unwanted keys.

    Arguments:
    - `results`: iterable of result dicts
    - `keys_to_remove`: container of (already cleaned) key names to drop
      from each result; nothing is dropped by default

    Returns a list with one cleaned dict per input result.
    """
    return [filter_keys(clean_result(result), keys_to_remove)
            for result in results]
def query2df(queryContent, keepSynapseFields=('id', 'name')):
    """Converts the returned query object from Synapse into a Pandas DataFrame

    Arguments:
    - `queryContent`: content returned from query or chunkedQuery
    - `keepSynapseFields`: Synapse properties are removed from query results except those named here
    """
    # Build the keep-set once rather than once per property key.
    kept = set(keepSynapseFields)
    keys_to_remove = set(key for key in property_keys if key not in kept)
    return pd.DataFrame(clean_query_results(queryContent,
                                            keys_to_remove=keys_to_remove))
## Synapse id used as the `benefactorId` filter when querying all files at once
DATA_SOURCE_PROJECT_ID = "syn2351328"
class Source(object):
    """A named data source paired with the id of its Synapse folder."""

    def __init__(self, name, folder_id):
        """
        Arguments:
        - `name`: label for the source (e.g. the contributing center)
        - `folder_id`: Synapse id of the folder holding that source's files
        """
        self.name = name
        self.folder_id = folder_id

    def __repr__(self):
        return "Source(name=\"{name}\", folder_id=\"{folder_id}\")".format(
            name=self.name, folder_id=self.folder_id)
## One entry per contributing center: (center name, Synapse folder id)
sources = [
    #      Center       Folder
    Source("Broad",     "syn3165121"),
    Source("BSC",       "syn3165143"),
    Source("DKFZ",      "syn3104289"),
    Source("EMBL",      "syn3153529"),
    Source("McGill",    "syn3165151"),
    Source("MDA_HGSC",  "syn3167886"),
    Source("MDA_KChen", "syn3165149"),
    Source("OICR",      "syn3167076"),
    Source("SANGER",    "syn3155834"),
    Source("SFU",       "syn3165152"),
    Source("UCSC",      "syn3107237"),
    Source("WUSTL",     "syn3165146"),
    Source("Yale",      "syn3165120"),
]
## Query the files in every center's folder, tag each row with the center
## name, and stack everything into one DataFrame.
## NOTE(review): `syn` is not defined in this file; presumably a logged-in
## synapseclient.Synapse instance from the interactive session -- confirm.
dfs = []
for source in sources:
    results = syn.chunkedQuery('select * from file where parentId=="%s"' % source.folder_id)
    df = query2df(results)
    # %-formatting prints the same text under Python 2 and Python 3
    print("%s %s" % (source.name, df.shape))
    df['source'] = source.name
    dfs.append(df)

df_all = pd.concat(dfs, ignore_index=True)
## find duplicates where an equivalent .vcf and .vcf.gz file both exist

# Built once so each membership test is a set lookup instead of a scan over
# the whole name column.
_all_file_names = set(df_all['name'].values)


def is_dup(name):
    """Return True when `name` ends in '.vcf' and the same name with '.gz'
    appended is also present in the `name` column of `df_all`."""
    return name.endswith('.vcf') and (name + ".gz") in _all_file_names
dups = df_all['name'].apply(is_dup)

## keep rows that have a center, are somatic VCFs, and are not a .vcf that
## is shadowed by a .vcf.gz of the same name
keep = (
    df_all['center'].notnull()
    & (df_all['fileType'] == 'vcf')
    & (df_all['call_type'] == 'somatic')
    & ~dups  # was `~dup`, an undefined name
)
# .loc replaces the .ix indexer, which has been removed from pandas
df = df_all.loc[keep, :]

## NOTE(review): this rebinds `df`, discarding the filtered frame built just
## above -- looks like an independent scrap; confirm which one is wanted.
df = query2df(syn.chunkedQuery('select * from file where benefactorId=="%s"' % DATA_SOURCE_PROJECT_ID))
## annotations whose value distribution is reported for each center
annotation_keys = ['center', 'center_name', 'workflow_name', 'variant_workflow',
                   'call_type', 'dataType', 'dataSubType', 'variant_type', 'fileType']

## For each center's DataFrame, print the value counts (most common first)
## of the `source` column and of every annotation column that is present.
for df in dfs:
    # An explicit dict of per-column counts avoids relying on how
    # DataFrame.apply reshapes list-valued results.
    summary = {column: Counter(df[column]).most_common() for column in df.columns}
    print(summary['source'])
    for key in annotation_keys:
        if key in summary:
            print("%s = %s" % (key, summary[key]))
## Cross-tabulate `center` against other annotations, with missing values
## shown as 'NA' so they get their own row/column.
## NOTE: these are bare expressions; the tables are only displayed when run
## in an interactive session.
_center_labels = df_all['center'].fillna('NA')
pd.crosstab(_center_labels, df_all['center_name'].fillna('NA'))
pd.crosstab(_center_labels, df_all['workflow_name'].fillna('NA'))
pd.crosstab(_center_labels, df_all['call_type'].fillna('NA'))
def find_sample_id_source(sample_id):
    """Return the names of the `sample_df` columns that contain `sample_id`.

    Arguments:
    - `sample_id`: value to look for

    Returns a tuple of column names, in `sample_df` column order; the tuple
    is empty when no column contains the value. Reads the module-level
    `sample_df`, which must be assigned before this function is called.
    """
    return tuple(column for column in sample_df.columns
                 if sample_id in sample_df[column].values)
## load the Synapse table syn2887105 into a DataFrame; its columns are
## searched by find_sample_id_source
results = syn.tableQuery('select * from syn2887105')
sample_df = results.asDataFrame()

## figure out which centers named their samples with which IDs
sample_id_sources = df_all['sample_id'].apply(find_sample_id_source)
pd.crosstab(sample_id_sources, df_all['source'])
# In [254]: pd.crosstab(sample_id_sources, df_all.source)
# Out[254]:
# source                                    BSC  Broad  DKFZ  EMBL  MDA_HGSC  \
# sample_id
# ()                                          0      0     0     0         0
# (Donor ID,)                                 0      0     0     0         0
# (Normal Analysis ID,)                       0      0     0     0         0
# (Tumour Analysis ID,)                     567    998  1068   252       108
# (Tumour Analyzed Sample/Aliquot GUUID,)     0      0     0     0         0
#
# source                                    MDA_KChen  McGill  OICR  SANGER  SFU  \
# sample_id
# ()                                                0       0     0    9024    0
# (Donor ID,)                                       0       0     0       0    0
# (Normal Analysis ID,)                             0       0     0       0    0
# (Tumour Analysis ID,)                           216      78   252       0  126
# (Tumour Analyzed Sample/Aliquot GUUID,)           0       0     0     896    0
#
# source                                    UCSC  WUSTL  Yale
# sample_id
# ()                                           0      6     0
# (Donor ID,)                                165      0     0
# (Normal Analysis ID,)                        0      0    59
# (Tumour Analysis ID,)                        0    358     0
# (Tumour Analyzed Sample/Aliquot GUUID,)      0      0     0
## fix UCSC sample_id -> donor_id and Tumour Analysis ID -> analysis_id_tumor
## For every file in folder syn3107237: copy `sample_id` into `donor_id`,
## look up the matching `Tumour Analysis ID` in sample_df, and store the
## updated annotations back to Synapse.
results = clean_query_results(syn.chunkedQuery('select id, name from file where parentId=="syn3107237"'))
for result in results:
    print("%s %s" % (result['id'], result['name']))
    e = syn.get(result['id'], downloadFile=False)
    a = e.annotations
    a['donor_id'] = a['sample_id']
    # The filtered Series keeps sample_df's index labels, so a bare [0] is
    # an ambiguous label/positional lookup; .iloc[0] always takes the first
    # match. Raises IndexError when no row matches.
    matches = sample_df['Tumour Analysis ID'][sample_df['Donor ID'] == e.sample_id[0]]
    a['analysis_id_tumor'] = matches.iloc[0]
    print(syn.setAnnotations(e, a))
## fix sample_id -> analysis_id_tumor and Donor ID -> donor_id
## For the listed centers: copy `sample_id` into `analysis_id_tumor`, look
## up the matching `Donor ID` in sample_df, and store the updated
## annotations back to Synapse.
## NOTE(review): `already_done` is not defined anywhere in this file;
## presumably a collection of entity ids processed in an earlier session --
## confirm (or define it) before running.
_tumour_id_centers = ["BSC", "Broad", "DKFZ", "EMBL", "MDA_HGSC", "MDA_KChen", "McGill", "OICR", "SFU", "WUSTL"]
for source in sources:
    if source.name not in _tumour_id_centers:
        continue
    results = clean_query_results(syn.chunkedQuery('select id, name from file where parentId=="%s"' % source.folder_id))
    for result in results:
        if result['id'] in already_done:
            continue
        print("%s %s" % (result['id'], result['name']))
        e = syn.get(result['id'], downloadFile=False)
        a = e.annotations
        a['analysis_id_tumor'] = a['sample_id']
        # The filtered Series keeps sample_df's index labels, so a bare [0]
        # is an ambiguous label/positional lookup; .iloc[0] always takes the
        # first match. Raises IndexError when no row matches.
        matches = sample_df['Donor ID'][sample_df['Tumour Analysis ID'] == e.sample_id[0]]
        a['donor_id'] = matches.iloc[0]
        print(syn.setAnnotations(e, a))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment