Skip to content

Instantly share code, notes, and snippets.

@cbare
Last active August 29, 2015 14:08
Show Gist options
  • Select an option

  • Save cbare/c5c299930afd77178ac9 to your computer and use it in GitHub Desktop.

Select an option

Save cbare/c5c299930afd77178ac9 to your computer and use it in GitHub Desktop.
runAttractorInCRCSC: A sample analysis script for use in the [Mad Scientist Stu demo](https://www.synapse.org/#!Help:Collaboratorium)
###########################################################
##
## runAttractorInCRCSC
##
## A sample data analysis for use with the Mad Scientist
## Stu demo of Synapse:
## https://www.synapse.org/#!Help:Collaboratorium
###########################################################
"""
Perform Step 5 of the Mad Scientist Stu Synapse Tutorial.
The runAttractorInCRCSC function extracts gene names from a co-expressed
cluster of genes computed by the Attractor Metagenes team. It then analyzed
TCGA expression data finding the correlation of each gene with the mean
expression of the genes in the cluster. We use this to pick out a gene
strongly anti-correlated with the cluster.
"""
import synapseclient
from synapseclient import Activity, File, Folder, Project
import synapseclient.utils as synutils
import os
import pandas as pd
import tempfile
import sys
def runAttractorInCRCSC(syn, project):
"""
Perform Step 5 of the Mad Scientist Stu Synapse Tutorial.
:param syn: a `Synapse connection object <http://python-docs.synapse.org/#connecting-to-synapse>`_.
:param project: a `Synapse project <http://python-docs.synapse.org/Entity.html#synapseclient.entity.Project>`_ or its Synapse ID.
"""
## gist URL that holds this script:
this_script="https://gist.github.com/cbare/c5c299930afd77178ac9"
## PULL FROM THE LINK TO "COAD.attractors.rnaseq.txt" THAT WAS CREATED IN THE PROJECT AT AN EARLIER STAGE
print "... finding link to attractors rnaseq file ..."
projectId = synutils.id_of(project)
results = syn.chunkedQuery('SELECT id, name FROM link WHERE benefactorId=="%s" and name=="COAD.attractors.rnaseq.txt"' % projectId)
for result in results:
link = syn.get(result['link.id'])
break
else:
raise RuntimeError('You need to add a link to "COAD.attractors.rnaseq.txt" to your project!')
## GET THE ATTRACTOR METAGENES FOR TCGA COAD FROM THE LINK
print "... reading in attractor metagenes ..."
syn_amg_coad = syn.get(link.linksTo['targetId'])
amg_coad = pd.read_table(syn_amg_coad.path)
## extract gene names from strings that look like "MARCH7|64844" where
## the gene name and gene ID are separated by a pipe character
attractor_genes = [g.split('|')[0] for g in amg_coad['Attractor001:Genes']]
## GRAB THE CURATED RNA-SEQ EXPRESSION DATA ACROSS TCGA COAD AND READ
print "... reading in large expression matrix from CRCSC (this might take a minute) ..."
## read a gene expression matrix
syn_crc_expr = syn.get("syn2326100")
crc_expr = pd.read_table(syn_crc_expr.path, index_col=0)
## filter out genes whose expression has a standard deviation of 0
crc_expr = crc_expr.ix[crc_expr.std(1)>0.00000001, :]
## SUBSET TO ONLY ATTRACTORS PULLED FROM COAD
print "... computing the top attractor in CRCSC ..."
crc_am = crc_expr.ix[attractor_genes, :].mean()
## LOOK AT RANK-BASED CORRELATION (SPEARMAN)
print "... calculating spearman correlation with all genes in CRCSC dataset ..."
def spearman_correlation_with_mean_expr_of_amg_cluster(crc_expr_row):
return crc_am.corr(crc_expr_row, method='spearman')
am_corr = crc_expr.apply(spearman_correlation_with_mean_expr_of_amg_cluster, axis=1)
am_corr.sort()
## GRAB GENE WITH THE MOST NEGATIVE CORRELATION
this_gene = am_corr.index[0]
## WRITE OUT TO FILE TO STORE IN SYNAPSE
filepath = os.path.join(tempfile.gettempdir(), "coadAttractorMetagene-correlationCRCSC.tsv")
am_corr.to_csv(filepath, sep='\t')
## FIND FOLDER TO STORE RESULTS
results = syn.chunkedQuery('SELECT id, name FROM folder WHERE benefactorId=="%s"' % projectId)
for result in results:
if result['folder.name'].lower() == "results":
folder = syn.get(result['folder.id'])
break
else:
raise RuntimeError('You need to add a folder called "results" to your project!')
## CREATE PROVENANCE: DESCRIBE WHAT WAS USED TO GENERATE THE RESULT
print "... recording provenance for this analysis ..."
act = Activity(name="COAD Attractor Metagene correlation with CRCSC TCGA",
used=[syn_amg_coad, syn_crc_expr],
executed=this_script)
## SET ANNOTATIONS AND STORE RESULT BACK IN SYNAPSE
print "... storing correlation analysis back in Synapse ..."
syn_file = File(path=filepath, parent=folder, min=this_gene, projectId=projectId)
syn_file = syn.store(syn_file, activity=act)
## SHOW THE NEWLY CREATED RESULT
## doesn;t work well in ipython notebook: syn.onweb(syn_file)
return syn_file
if __name__ == '__main__':
## Create a Synapse object
syn = synapseclient.Synapse()
## If you once do syn.login("YourUserName", "YourPassword", rememberMe=True),
## the client will cache an API key and you will be able to login
## without putting your credentials in your scripts, like this:
syn.login()
project = sys.argv[1]
print "... working in project", project
runAttractorInCRCSC(syn, project)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment