Last active
August 29, 2015 14:08
-
-
Save cbare/c5c299930afd77178ac9 to your computer and use it in GitHub Desktop.
runAttractorInCRCSC: A sample analysis script for use in the [Mad Scientist Stu demo](https://www.synapse.org/#!Help:Collaboratorium)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| ########################################################### | |
| ## | |
| ## runAttractorInCRCSC | |
| ## | |
| ## A sample data analysis for use with the Mad Scientist | |
| ## Stu demo of Synapse: | |
| ## https://www.synapse.org/#!Help:Collaboratorium | |
| ########################################################### | |
| """ | |
| Perform Step 5 of the Mad Scientist Stu Synapse Tutorial. | |
| The runAttractorInCRCSC function extracts gene names from a co-expressed | |
| cluster of genes computed by the Attractor Metagenes team. It then analyzed | |
| TCGA expression data finding the correlation of each gene with the mean | |
| expression of the genes in the cluster. We use this to pick out a gene | |
| strongly anti-correlated with the cluster. | |
| """ | |
| import synapseclient | |
| from synapseclient import Activity, File, Folder, Project | |
| import synapseclient.utils as synutils | |
| import os | |
| import pandas as pd | |
| import tempfile | |
| import sys | |
| def runAttractorInCRCSC(syn, project): | |
| """ | |
| Perform Step 5 of the Mad Scientist Stu Synapse Tutorial. | |
| :param syn: a `Synapse connection object <http://python-docs.synapse.org/#connecting-to-synapse>`_. | |
| :param project: a `Synapse project <http://python-docs.synapse.org/Entity.html#synapseclient.entity.Project>`_ or its Synapse ID. | |
| """ | |
| ## gist URL that holds this script: | |
| this_script="https://gist.github.com/cbare/c5c299930afd77178ac9" | |
| ## PULL FROM THE LINK TO "COAD.attractors.rnaseq.txt" THAT WAS CREATED IN THE PROJECT AT AN EARLIER STAGE | |
| print "... finding link to attractors rnaseq file ..." | |
| projectId = synutils.id_of(project) | |
| results = syn.chunkedQuery('SELECT id, name FROM link WHERE benefactorId=="%s" and name=="COAD.attractors.rnaseq.txt"' % projectId) | |
| for result in results: | |
| link = syn.get(result['link.id']) | |
| break | |
| else: | |
| raise RuntimeError('You need to add a link to "COAD.attractors.rnaseq.txt" to your project!') | |
| ## GET THE ATTRACTOR METAGENES FOR TCGA COAD FROM THE LINK | |
| print "... reading in attractor metagenes ..." | |
| syn_amg_coad = syn.get(link.linksTo['targetId']) | |
| amg_coad = pd.read_table(syn_amg_coad.path) | |
| ## extract gene names from strings that look like "MARCH7|64844" where | |
| ## the gene name and gene ID are separated by a pipe character | |
| attractor_genes = [g.split('|')[0] for g in amg_coad['Attractor001:Genes']] | |
| ## GRAB THE CURATED RNA-SEQ EXPRESSION DATA ACROSS TCGA COAD AND READ | |
| print "... reading in large expression matrix from CRCSC (this might take a minute) ..." | |
| ## read a gene expression matrix | |
| syn_crc_expr = syn.get("syn2326100") | |
| crc_expr = pd.read_table(syn_crc_expr.path, index_col=0) | |
| ## filter out genes whose expression has a standard deviation of 0 | |
| crc_expr = crc_expr.ix[crc_expr.std(1)>0.00000001, :] | |
| ## SUBSET TO ONLY ATTRACTORS PULLED FROM COAD | |
| print "... computing the top attractor in CRCSC ..." | |
| crc_am = crc_expr.ix[attractor_genes, :].mean() | |
| ## LOOK AT RANK-BASED CORRELATION (SPEARMAN) | |
| print "... calculating spearman correlation with all genes in CRCSC dataset ..." | |
| def spearman_correlation_with_mean_expr_of_amg_cluster(crc_expr_row): | |
| return crc_am.corr(crc_expr_row, method='spearman') | |
| am_corr = crc_expr.apply(spearman_correlation_with_mean_expr_of_amg_cluster, axis=1) | |
| am_corr.sort() | |
| ## GRAB GENE WITH THE MOST NEGATIVE CORRELATION | |
| this_gene = am_corr.index[0] | |
| ## WRITE OUT TO FILE TO STORE IN SYNAPSE | |
| filepath = os.path.join(tempfile.gettempdir(), "coadAttractorMetagene-correlationCRCSC.tsv") | |
| am_corr.to_csv(filepath, sep='\t') | |
| ## FIND FOLDER TO STORE RESULTS | |
| results = syn.chunkedQuery('SELECT id, name FROM folder WHERE benefactorId=="%s"' % projectId) | |
| for result in results: | |
| if result['folder.name'].lower() == "results": | |
| folder = syn.get(result['folder.id']) | |
| break | |
| else: | |
| raise RuntimeError('You need to add a folder called "results" to your project!') | |
| ## CREATE PROVENANCE: DESCRIBE WHAT WAS USED TO GENERATE THE RESULT | |
| print "... recording provenance for this analysis ..." | |
| act = Activity(name="COAD Attractor Metagene correlation with CRCSC TCGA", | |
| used=[syn_amg_coad, syn_crc_expr], | |
| executed=this_script) | |
| ## SET ANNOTATIONS AND STORE RESULT BACK IN SYNAPSE | |
| print "... storing correlation analysis back in Synapse ..." | |
| syn_file = File(path=filepath, parent=folder, min=this_gene, projectId=projectId) | |
| syn_file = syn.store(syn_file, activity=act) | |
| ## SHOW THE NEWLY CREATED RESULT | |
| ## doesn;t work well in ipython notebook: syn.onweb(syn_file) | |
| return syn_file | |
| if __name__ == '__main__': | |
| ## Create a Synapse object | |
| syn = synapseclient.Synapse() | |
| ## If you once do syn.login("YourUserName", "YourPassword", rememberMe=True), | |
| ## the client will cache an API key and you will be able to login | |
| ## without putting your credentials in your scripts, like this: | |
| syn.login() | |
| project = sys.argv[1] | |
| print "... working in project", project | |
| runAttractorInCRCSC(syn, project) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment