alexlenail · November 11, 2016 15:58
diff --git a/analyze_gene_clusters_enrichr.py b/analyze_gene_clusters_enrichr.py
 import pdb
 import pandas as pd
 import os
 import json
 import requests
 from argparse import ArgumentParser

 # Enrichr endpoints: gene lists are POSTed to addList, enrichment tables are
 # fetched from export.
 ENRICHR_ADDLIST = 'http://amp.pharm.mssm.edu/Enrichr/addList'
 ENRICHR_EXPORT = 'http://amp.pharm.mssm.edu/Enrichr/export'

 # Libraries queried for every cluster; each name is sent to the export
 # endpoint as the 'backgroundType' parameter.
 DATABASES = [
 	'KEGG_2016',
 	'Reactome_2016',
 	'BioCarta_2016',
 ]
 # Alternative selection, kept for reference:
 # DATABASES =['KEGG_2016', 'GO_Biological_Process_2015', 'GO_Cellular_Component_2015', 'GO_Molecular_Function_2015', 'WikiPathways_2016']

 def main():
 	"""
 	Command-line entry point.

 	Parses the single positional argument (path to a file of cluster
 	communities) and hands it to get_enrichment, which looks up enriched
 	terms for each cluster through the enrichr API.
 	VERSION 2016-10-25
 	"""
 	cli = ArgumentParser(description='Read in communities from clustering algoritithm')
 	cli.add_argument('filename', help='Input file with the cluster communities')

 	get_enrichment(cli.parse_args().filename)


 def get_enrichment(filename):
 	"""
 	Run an Enrichr analysis for every cluster listed in a two-column file.

 	The file is read without a header row; column 1 is the cluster ID and
 	column 2 a gene name. The genes of each cluster are uploaded to enrichr as
 	one list, then one enrichment table per entry of DATABASES is downloaded
 	into the directory that contains the input file.

 	Args:
 		filename: path to the cluster file.
 	"""
 	print('The file for clustering is ' + filename)
 	outputDir = os.path.dirname(os.path.abspath(filename))

 	df = pd.read_table(filename, header=None)
 	df.columns = ['cluster_number', 'genes']

 	# Group by the column name itself, not a one-element list: recent pandas
 	# yields 1-tuples as keys for list groupers, which would turn the label
 	# into 'Cluster_(1,)'.
 	for cluster_number, genes in df.groupby('cluster_number')['genes']:
 		# Drop empty cells and coerce to str so join() never meets a float NaN.
 		gene_names = genes.dropna().astype(str).tolist()
 		if not gene_names:
 			continue

 		clusterID = 'Cluster_' + str(cluster_number)
 		userListID = add_gene_list('\n'.join(gene_names), clusterID)

 		for database in DATABASES:
 			download_enrichr(userListID, clusterID, outputDir, database)


 def add_gene_list(genes_str, clusterID):
 	"""
 	Upload a gene list to enrichr and return the list ID from the reply.

 	genes_str is sent as the 'list' field and clusterID as its 'description'.
 	Raises Exception when the response is not OK.
 	"""
 	# (None, value) tuples make requests send plain multipart form fields.
 	form_fields = {
 		'list': (None, genes_str),
 		'description': (None, clusterID),
 	}

 	reply = requests.post(ENRICHR_ADDLIST, files=form_fields)
 	if not reply.ok:
 		raise Exception('Error analyzing gene list')

 	return json.loads(reply.text)['userListId']


 def download_enrichr(userListID, clusterID, outputDir, database, max_attempts=5):
 	"""
 	Download the enrichment table of one uploaded gene list for one database.

 	The table is written inside outputDir as '<database>_<clusterID>.txt'.

 	Args:
 		userListID: list ID obtained from add_gene_list.
 		clusterID: cluster label used in the output file name.
 		outputDir: directory the result file is written to.
 		database: library name, sent as the 'backgroundType' parameter.
 		max_attempts: number of times the request is tried before giving up.

 	Raises:
 		Exception: if no attempt returns an OK response.
 	"""
 	filename = os.path.join(outputDir, database + '_' + clusterID)

 	# Pass the values as params so requests URL-encodes them; spliced into the
 	# URL by hand, a path containing '&', '#' or spaces corrupts the query.
 	query = [
 		('userListId', userListID),
 		('filename', filename),
 		('backgroundType', database),
 	]

 	print("Querying " + database + " for " + clusterID)

 	# Bounded loop instead of unconditional self-recursion, which never
 	# terminated while the server kept answering with an error.
 	for attempt in range(max_attempts):
 		if attempt:
 			print("failed once, trying again: ")
 		response = requests.get(ENRICHR_EXPORT, params=query, stream=True)
 		if response.ok:
 			break
 	else:
 		raise Exception('Error fetching enrichment results')

 	with open(filename + '.txt', 'wb') as f:
 		for chunk in response.iter_content(chunk_size=1024):
 			if chunk:
 				f.write(chunk)


 # Run the command-line interface only when executed as a script.
 if __name__ == "__main__":
 	main()
	import pdb
	import pandas as pd
	import os
	import json
	import requests
	from argparse import ArgumentParser

	# Enrichr endpoints: gene lists are POSTed to addList, enrichment tables are
	# fetched from export.
	ENRICHR_ADDLIST = 'http://amp.pharm.mssm.edu/Enrichr/addList'
	ENRICHR_EXPORT = 'http://amp.pharm.mssm.edu/Enrichr/export'

	# Libraries queried for every cluster; each name is sent to the export
	# endpoint as the 'backgroundType' parameter.
	DATABASES = [
		'KEGG_2016',
		'Reactome_2016',
		'BioCarta_2016',
	]
	# Alternative selection, kept for reference:
	# DATABASES =['KEGG_2016', 'GO_Biological_Process_2015', 'GO_Cellular_Component_2015', 'GO_Molecular_Function_2015', 'WikiPathways_2016']

	def main():
		"""
		Command-line entry point.

		Parses the single positional argument (path to a file of cluster
		communities) and hands it to get_enrichment, which looks up enriched
		terms for each cluster through the enrichr API.
		VERSION 2016-10-25
		"""
		# The body is indented under the def again; flattened to the def's own
		# level it could not be parsed as a function body.
		parser = ArgumentParser(description='Read in communities from clustering algoritithm')
		parser.add_argument('filename', help='Input file with the cluster communities')

		args = parser.parse_args()
		get_enrichment(args.filename)


	def get_enrichment(filename):
		"""
		Run an Enrichr analysis for every cluster listed in a two-column file.

		The file is read without a header row; column 1 is the cluster ID and
		column 2 a gene name. The genes of each cluster are uploaded to enrichr
		as one list, then one enrichment table per entry of DATABASES is
		downloaded into the directory that contains the input file.

		Args:
			filename: path to the cluster file.
		"""
		print('The file for clustering is ' + filename)
		outputDir = os.path.dirname(os.path.abspath(filename))

		df = pd.read_table(filename, header=None)
		df.columns = ['cluster_number', 'genes']

		# Group by the column name itself, not a one-element list: recent
		# pandas yields 1-tuples as keys for list groupers, which would turn
		# the label into 'Cluster_(1,)'.
		for cluster_number, genes in df.groupby('cluster_number')['genes']:
			# Drop empty cells and coerce to str so join() never meets a float NaN.
			gene_names = genes.dropna().astype(str).tolist()
			if not gene_names:
				continue

			clusterID = 'Cluster_' + str(cluster_number)
			userListID = add_gene_list('\n'.join(gene_names), clusterID)

			for database in DATABASES:
				download_enrichr(userListID, clusterID, outputDir, database)


	def add_gene_list(genes_str, clusterID):
		"""
		Upload a gene list to enrichr and return the list ID from the reply.

		genes_str is sent as the 'list' field and clusterID as its
		'description'. Raises Exception when the response is not OK.
		"""
		# The body is indented under the def again so the function parses.
		# (None, value) tuples make requests send plain multipart form fields.
		payload = {
			'list': (None, genes_str),
			'description': (None, clusterID),
		}

		response = requests.post(ENRICHR_ADDLIST, files=payload)
		if not response.ok:
			raise Exception('Error analyzing gene list')

		json_response = json.loads(response.text)
		return json_response['userListId']


	def download_enrichr(userListID, clusterID, outputDir, database, max_attempts=5):
		"""
		Download the enrichment table of one uploaded gene list for one database.

		The table is written inside outputDir as '<database>_<clusterID>.txt'.

		Args:
			userListID: list ID obtained from add_gene_list.
			clusterID: cluster label used in the output file name.
			outputDir: directory the result file is written to.
			database: library name, sent as the 'backgroundType' parameter.
			max_attempts: number of times the request is tried before giving up.

		Raises:
			Exception: if no attempt returns an OK response.
		"""
		filename = os.path.join(outputDir, database + '_' + clusterID)

		# Pass the values as params so requests URL-encodes them; spliced into
		# the URL by hand, a path containing '&', '#' or spaces corrupts the query.
		query = [
			('userListId', userListID),
			('filename', filename),
			('backgroundType', database),
		]

		print("Querying " + database + " for " + clusterID)

		# Bounded loop instead of unconditional self-recursion, which never
		# terminated while the server kept answering with an error.
		for attempt in range(max_attempts):
			if attempt:
				print("failed once, trying again: ")
			response = requests.get(ENRICHR_EXPORT, params=query, stream=True)
			if response.ok:
				break
		else:
			raise Exception('Error fetching enrichment results')

		with open(filename + '.txt', 'wb') as f:
			for chunk in response.iter_content(chunk_size=1024):
				if chunk:
					f.write(chunk)


	# Run the command-line interface only when executed as a script; the call
	# is indented under the guard so the statement parses.
	if __name__ == '__main__':
		main()