Last active
August 31, 2020 23:47
-
-
Save PatWalters/7614dbadaa4b1dcd6650c80a6e3640e4 to your computer and use it in GitHub Desktop.
A few ChEMBL query examples
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import pandas as pd\n", | |
"import mysql.connector as sql" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"hostname = 'localhost'\n", | |
"database = 'chembl_27'\n", | |
"user = 'pwalters'\n", | |
"password = 'imnottellin'\n", | |
"con = sql.connect(host=hostname, database=database, user=user, password=password)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"**Retrieve marketed drugs from ChEMBL**" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"drug_query = \"\"\"select distinct canonical_smiles, cs.molregno, f.ingredient from compound_structures cs\n", | |
"join formulations f on cs.molregno = f.molregno\n", | |
"join products p on p.product_id = f.product_id\n", | |
"join compound_properties cp on cp.molregno = cs.molregno\n", | |
"where p.oral = 1\n", | |
"and cp.mw_freebase < 1000\"\"\"" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"drug_df = pd.read_sql(drug_query,con=con)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>canonical_smiles</th>\n", | |
" <th>molregno</th>\n", | |
" <th>ingredient</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>Nc1ccc(S(=O)(=O)Nc2ccccn2)cc1</td>\n", | |
" <td>32842</td>\n", | |
" <td>SULFAPYRIDINE</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>CCC(C)C1(CC)C(=O)[N-]C(=O)NC1=O.[Na+]</td>\n", | |
" <td>674933</td>\n", | |
" <td>BUTABARBITAL SODIUM</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>Cl.N=C(N)N</td>\n", | |
" <td>674679</td>\n", | |
" <td>GUANIDINE HYDROCHLORIDE</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>CC1=CC(=O)c2ccccc2C1=O</td>\n", | |
" <td>19344</td>\n", | |
" <td>MENADIONE</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>Cn1c(=O)c2[nH]cnc2n(C)c1=O.Cn1c(=O)c2[nH]cnc2n...</td>\n", | |
" <td>794445</td>\n", | |
" <td>AMINOPHYLLINE</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" canonical_smiles molregno \\\n", | |
"0 Nc1ccc(S(=O)(=O)Nc2ccccn2)cc1 32842 \n", | |
"1 CCC(C)C1(CC)C(=O)[N-]C(=O)NC1=O.[Na+] 674933 \n", | |
"2 Cl.N=C(N)N 674679 \n", | |
"3 CC1=CC(=O)c2ccccc2C1=O 19344 \n", | |
"4 Cn1c(=O)c2[nH]cnc2n(C)c1=O.Cn1c(=O)c2[nH]cnc2n... 794445 \n", | |
"\n", | |
" ingredient \n", | |
"0 SULFAPYRIDINE \n", | |
"1 BUTABARBITAL SODIUM \n", | |
"2 GUANIDINE HYDROCHLORIDE \n", | |
"3 MENADIONE \n", | |
"4 AMINOPHYLLINE " | |
] | |
}, | |
"execution_count": 6, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"drug_df.head()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"**Retrieve journal information along with structures from ChEMBL**" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"journal_query = \"\"\"select canonical_smiles, cs.molregno, journal from docs\n", | |
"join compound_records cr on docs.doc_id = cr.doc_id\n", | |
"join compound_structures cs on cs.molregno = cr.molregno\"\"\"" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"journal_df = pd.read_sql(journal_query,con=con)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>canonical_smiles</th>\n", | |
" <th>molregno</th>\n", | |
" <th>journal</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>Cc1cc(-n2ncc(=O)[nH]c2=O)ccc1C(=O)c1ccccc1Cl</td>\n", | |
" <td>1</td>\n", | |
" <td>J. Med. Chem.</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>Cc1cc(-n2ncc(=O)[nH]c2=O)ccc1C(=O)c1ccc(C#N)cc1</td>\n", | |
" <td>2</td>\n", | |
" <td>J. Med. Chem.</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>Cc1cc(-n2ncc(=O)[nH]c2=O)cc(C)c1C(O)c1ccc(Cl)cc1</td>\n", | |
" <td>3</td>\n", | |
" <td>J. Med. Chem.</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>Cc1ccc(C(=O)c2ccc(-n3ncc(=O)[nH]c3=O)cc2)cc1</td>\n", | |
" <td>4</td>\n", | |
" <td>J. Med. Chem.</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>Cc1cc(-n2ncc(=O)[nH]c2=O)ccc1C(=O)c1ccc(Cl)cc1</td>\n", | |
" <td>5</td>\n", | |
" <td>J. Med. Chem.</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" canonical_smiles molregno journal\n", | |
"0 Cc1cc(-n2ncc(=O)[nH]c2=O)ccc1C(=O)c1ccccc1Cl 1 J. Med. Chem.\n", | |
"1 Cc1cc(-n2ncc(=O)[nH]c2=O)ccc1C(=O)c1ccc(C#N)cc1 2 J. Med. Chem.\n", | |
"2 Cc1cc(-n2ncc(=O)[nH]c2=O)cc(C)c1C(O)c1ccc(Cl)cc1 3 J. Med. Chem.\n", | |
"3 Cc1ccc(C(=O)c2ccc(-n3ncc(=O)[nH]c3=O)cc2)cc1 4 J. Med. Chem.\n", | |
"4 Cc1cc(-n2ncc(=O)[nH]c2=O)ccc1C(=O)c1ccc(Cl)cc1 5 J. Med. Chem." | |
] | |
}, | |
"execution_count": 9, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"journal_df.head()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"**Retrieve COVID-19 drug repurposing data from ChEMBL**" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"covid_query = \"\"\"select cs.canonical_smiles, cs.molregno, usan_tbl.synonyms, act.assay_id, act.standard_value, act.standard_type, act.standard_units from activities act\n", | |
" join compound_structures cs on cs.molregno = act.molregno\n", | |
" join assays a on act.assay_id = a.assay_id\n", | |
" join source s on a.src_id = s.src_id\n", | |
" left join (select molregno, synonyms from molecule_synonyms where syn_type = 'USAN') as usan_tbl on usan_tbl.molregno = cs.molregno\n", | |
" where src_description = 'SARS-CoV-2 Screening Data'\"\"\"" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"covid_df = pd.read_sql(covid_query,con=con)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>canonical_smiles</th>\n", | |
" <th>molregno</th>\n", | |
" <th>synonyms</th>\n", | |
" <th>assay_id</th>\n", | |
" <th>standard_value</th>\n", | |
" <th>standard_type</th>\n", | |
" <th>standard_units</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>O=C(O)c1ccc(N2C(=O)/C(=C/c3ccc(-c4ccc([N+](=O)...</td>\n", | |
" <td>1033892</td>\n", | |
" <td>None</td>\n", | |
" <td>1804482</td>\n", | |
" <td>8.709636e+06</td>\n", | |
" <td>IC50</td>\n", | |
" <td>nM</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>O=C(O)/C(Cc1ccccc1[N+](=O)[O-])=N\\Nc1nc(-c2ccc...</td>\n", | |
" <td>425006</td>\n", | |
" <td>None</td>\n", | |
" <td>1804482</td>\n", | |
" <td>2.089300e+03</td>\n", | |
" <td>IC50</td>\n", | |
" <td>nM</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>CCNC(=O)c1cc2c(-c3cc(C(C)(C)O)ccc3Oc3c(C)cc(F)...</td>\n", | |
" <td>2335425</td>\n", | |
" <td>None</td>\n", | |
" <td>1804482</td>\n", | |
" <td>3.467368e+06</td>\n", | |
" <td>IC50</td>\n", | |
" <td>nM</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>CCC(=O)CCCCC[C@@H]1NC(=O)[C@H]2CCCCN2C(=O)[C@H...</td>\n", | |
" <td>275631</td>\n", | |
" <td>None</td>\n", | |
" <td>1804482</td>\n", | |
" <td>9.996846e+08</td>\n", | |
" <td>IC50</td>\n", | |
" <td>nM</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>C[C@@H](C1CCCCC1)n1c(-c2cc3c(cc2Br)OCO3)nc2cc(...</td>\n", | |
" <td>2260454</td>\n", | |
" <td>None</td>\n", | |
" <td>1804482</td>\n", | |
" <td>6.025600e+02</td>\n", | |
" <td>IC50</td>\n", | |
" <td>nM</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" canonical_smiles molregno synonyms \\\n", | |
"0 O=C(O)c1ccc(N2C(=O)/C(=C/c3ccc(-c4ccc([N+](=O)... 1033892 None \n", | |
"1 O=C(O)/C(Cc1ccccc1[N+](=O)[O-])=N\\Nc1nc(-c2ccc... 425006 None \n", | |
"2 CCNC(=O)c1cc2c(-c3cc(C(C)(C)O)ccc3Oc3c(C)cc(F)... 2335425 None \n", | |
"3 CCC(=O)CCCCC[C@@H]1NC(=O)[C@H]2CCCCN2C(=O)[C@H... 275631 None \n", | |
"4 C[C@@H](C1CCCCC1)n1c(-c2cc3c(cc2Br)OCO3)nc2cc(... 2260454 None \n", | |
"\n", | |
" assay_id standard_value standard_type standard_units \n", | |
"0 1804482 8.709636e+06 IC50 nM \n", | |
"1 1804482 2.089300e+03 IC50 nM \n", | |
"2 1804482 3.467368e+06 IC50 nM \n", | |
"3 1804482 9.996846e+08 IC50 nM \n", | |
"4 1804482 6.025600e+02 IC50 nM " | |
] | |
}, | |
"execution_count": 12, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"covid_df.head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.7.6" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 4 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment