Created
June 27, 2019 09:34
-
-
Save avrilcoghlan/133806c5caf9d0483dfae4b6741174f8 to your computer and use it in GitHub Desktop.
Script to retrieve predicted targets from ChEMBL for an input list of ChEMBL compounds
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import sys | |
import pandas as pd # uses pandas python module to view and analyse data | |
import requests # this is used to access json files | |
#====================================================================#
# Call the ChEMBL 'target prediction' API to find the predicted targets for our list of compounds:
def find_predicted_targets_of_compounds(cmpd_chembl_ids, limit=100, timeout=60):
    """Retrieve the predicted targets of a list of ChEMBL compounds.

    Calls the ChEMBL 'target prediction' API for the given compounds, follows
    the paginated results until the API reports no further page, and collects
    all predictions in one data frame.

    Args:
        cmpd_chembl_ids: sequence of ChEMBL compound identifiers, e.g. ['CHEMBL10'].
        limit: number of records requested per API call (the original notes say
            the API default is 20 and the maximum is 1000).
        timeout: seconds to wait for each HTTP response before giving up.

    Returns:
        A pandas DataFrame with the columns 'molecule_chembl_id', 'probability',
        'target_accession', 'target_chembl_id' and 'value'. The frame has these
        columns but no rows when there are no predictions, so callers can always
        write a consistent header.

    Raises:
        requests.HTTPError: if the API answers with an error status code.
        KeyError: if a response lacks the expected keys or columns.
    """
    # Only these columns are relevant for the output:
    relevant_columns = ['molecule_chembl_id', 'probability', 'target_accession', 'target_chembl_id', 'value']

    cmpd_chembl_ids = list(cmpd_chembl_ids)
    if not cmpd_chembl_ids:
        # Nothing to look up: avoid sending a query with an empty filter.
        return pd.DataFrame(columns=relevant_columns)

    # Amend the format of the list of compounds so that it is suitable for the API call:
    id_string = ",".join(cmpd_chembl_ids)

    # Set up the call to the ChEMBL 'target prediction' API:
    url_stem = "https://www.ebi.ac.uk"  # This is the stem of the url
    url_full_string = url_stem + "/chembl/api/data/target_prediction.json?molecule_chembl_id__in={}&limit={}".format(id_string, limit)

    # There is a limit to the number of records returned in any one API call,
    # so iterate over the pages of records (if required) and collate the results:
    predictions = []
    next_url = url_full_string
    while next_url:
        response = requests.get(next_url, timeout=timeout)
        response.raise_for_status()  # fail with a clear HTTP error rather than a confusing JSON/KeyError
        page = response.json()
        predictions.extend(page['target_predictions'])
        # 'next' holds the path of the following page, or a false value on the last page:
        next_path = page['page_meta']['next']
        next_url = url_stem + next_path if next_path else None

    if not predictions:
        # Keep the column layout even when there are no predictions:
        return pd.DataFrame(columns=relevant_columns)

    # Convert the list of results into a Pandas dataframe, selecting only the relevant columns:
    return pd.DataFrame(predictions)[relevant_columns]
#====================================================================#
# Read in the input list of compounds of interest and write their predicted targets to the output file:
def read_input_list_of_compounds(input_compoundlist_file, output_file, batch_size=10):
    """Read a list of ChEMBL compounds and write their predicted targets to a file.

    The input file has one compound per line; the first whitespace-separated
    field of each line is taken as the compound identifier (e.g. CHEMBL10) and
    blank lines are skipped. Predicted targets are retrieved for `batch_size`
    compounds at a time and appended to a tab-separated output file, whose
    header line is written once, with the first batch of results.

    Args:
        input_compoundlist_file: path of the input file with the compounds.
        output_file: path of the tab-separated output file (overwritten).
        batch_size: number of compounds sent to the API per call.

    Raises:
        ValueError: if batch_size is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    cnt = 0  # number of compounds read so far
    compounds = []  # the compounds of the current batch
    header_written = False

    # Open the input first, so that a missing input file does not truncate the output file;
    # both files are closed even if an API call fails part-way through.
    with open(input_compoundlist_file, "r") as in_handle, open(output_file, "w") as out_handle:
        for line in in_handle:
            fields = line.split()
            if not fields:
                continue  # skip blank lines
            compounds.append(fields[0])  # e.g. CHEMBL10
            cnt += 1
            # once the batch is full, find the predicted targets for these compounds:
            if len(compounds) == batch_size:
                header_written = _write_predicted_targets(cnt, compounds, out_handle, header_written)
                compounds = []  # start a new batch
        # if there are some compounds left over, find their predicted targets:
        if compounds:
            _write_predicted_targets(cnt, compounds, out_handle, header_written)


def _write_predicted_targets(cnt, compounds, out_handle, header_written):
    """Fetch the predicted targets of one batch and append them to the output.

    Returns True if the header line has been written to the output by now.
    """
    print(cnt,"Finding predicted targets for compounds",compounds)
    targ_df = find_predicted_targets_of_compounds(compounds)
    if targ_df.columns.empty:
        # A frame without columns has nothing to write, and must not be used for the header line.
        return header_written
    # Export the data frame as tab-separated text.
    # Followed examples from https://stackoverflow.com/questions/37357727/pandas-write-tab-separated-dataframe-with-literal-tabs-with-no-quotes
    # and https://datatofish.com/export-dataframe-to-csv and https://stackoverflow.com/questions/17530542/how-to-add-pandas-data-to-an-existing-csv-file
    targ_df.to_csv(out_handle, sep="\t", index=False, header=not header_written)  # only write the header once
    return True
#====================================================================# | |
def main():
    """Command-line entry point.

    Expects two arguments: the input file with a list of ChEMBL compounds of
    interest, and the output file. Exits with status 1, after printing a
    message to standard error, if the number of arguments is wrong or the
    input file does not exist.
    """
    usage = "Usage: %s input_compoundlist_file output_file" % sys.argv[0]

    # check the command-line arguments:
    if len(sys.argv) != 3:
        print(usage, file=sys.stderr)
        sys.exit(1)
    input_compoundlist_file = sys.argv[1]  # input file with a list of ChEMBL compounds of interest
    output_file = sys.argv[2]
    if not os.path.exists(input_compoundlist_file):
        # Say what is actually wrong, rather than only repeating the usage line:
        print("Error: input file '%s' does not exist" % input_compoundlist_file, file=sys.stderr)
        print(usage, file=sys.stderr)
        sys.exit(1)

    # read in the input list of compounds of interest:
    print("Reading in compound list...")
    read_input_list_of_compounds(input_compoundlist_file, output_file)
    print("FINISHED\n")
#====================================================================# | |
# Run the command-line entry point only when this file is executed as a script.
if __name__ == "__main__":
    main()
#====================================================================# | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment