Skip to content

Instantly share code, notes, and snippets.

@PatWalters
Created November 23, 2023 01:55
Show Gist options
  • Save PatWalters/c64d59c262ba1993d20af815cd5e0b5a to your computer and use it in GitHub Desktop.
Save PatWalters/c64d59c262ba1993d20af815cd5e0b5a to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 10,
"id": "a34531cd",
"metadata": {},
"outputs": [],
"source": [
"from astartes.molecules import train_test_split_molecules\n",
"from astartes.utils.warnings import ImperfectSplittingWarning\n",
"from astartes import samplers\n",
"import useful_rdkit_utils as uru\n",
"from lightgbm import LGBMRegressor\n",
"import numpy as np\n",
"from sklearn.metrics import r2_score\n",
"import warnings\n",
"from tqdm.auto import tqdm\n",
"import seaborn as sns\n",
"import pandas as pd\n",
"from rdkit.Chem.Scaffolds import MurckoScaffold"
]
},
{
"cell_type": "markdown",
"id": "f3482017",
"metadata": {},
"source": [
"List the extrapolation samplers implemented in astartes"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "c411d228",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"('dbscan', 'scaffold', 'kmeans', 'optisim', 'sphere_exclusion', 'time_based')"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"samplers.IMPLEMENTED_EXTRAPOLATION_SAMPLERS"
]
},
{
"cell_type": "markdown",
"id": "b250624a",
"metadata": {},
"source": [
"Load the B-raf dataset (SMILES, name, pIC50) and add a fingerprint column for each molecule"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "f3d315ee",
"metadata": {},
"outputs": [],
"source": [
"df = pd.read_csv(\"https://raw.githubusercontent.com/PatWalters/yamc/main/data/B-raf.smi\",\n",
" names=[\"SMILES\",\"Name\",\"pIC50\"],\n",
" sep=\" \")\n",
"df['numpy_fp'] = df.SMILES.apply(uru.smi2numpy_fp)"
]
},
{
"cell_type": "markdown",
"id": "7f8eadcd",
"metadata": {},
"source": [
"Create a list of samplers and hyperparameters. The hyperparameters are just quick estimates based on some empirical tests."
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "d9d570b7",
"metadata": {},
"outputs": [],
"source": [
"param_list = [['random',{}],\n",
" ['scaffold',{}],\n",
" ['dbscan', dict(eps=6.0)],\n",
" ['kmeans',dict(n_clusters=100)],\n",
" ['optisim',dict(n_clusters=100)],\n",
" ['sphere_exclusion',{}]]"
]
},
{
"cell_type": "markdown",
"id": "0b65480c",
"metadata": {},
"source": [
"The astartes package warns when the training and test set sizes are not exactly what you asked for. This will happen with methods that use clusters to generate splits. I know this will happen and don't want to see the warnings."
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "ce225a16",
"metadata": {},
"outputs": [],
"source": [
"warnings.filterwarnings(\"ignore\", category=ImperfectSplittingWarning)"
]
},
{
"cell_type": "markdown",
"id": "bd6bba70",
"metadata": {},
"source": [
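"For each sampler, run 10 train/test splits (random_state 0 through 9), fit a LightGBM regressor on the training set, and record the R2 score on the test set. Note that these are repeated splits rather than true k-fold cross validation."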
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "5bee0345",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "1505821f17ef460c88f3d413c9bd4067",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/6 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "e32702ce5a3a4c769846ad7558603579",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/10 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "dc19a3c79a284705a1a7e89559d5ae4c",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/10 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "d101f4dd8c7a4256b84fc1cc0a22db0d",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/10 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "89589b095b07414899029fc44734c8ed",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/10 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "eb7457a99aaa4b69bc68a5489a65d072",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/10 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "313e1751f54748488ec8105cf1886fbb",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/10 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"output = []\n",
"for sampler, hopts in tqdm(param_list):\n",
" for i in tqdm(range(0,10),leave=True):\n",
" res = train_test_split_molecules(molecules=df.SMILES.values, return_indices=True, sampler=sampler,\n",
" random_state=i, hopts=hopts)\n",
" train_idx, test_idx = res[-2:]\n",
" train = df.iloc[train_idx]\n",
" test = df.iloc[test_idx]\n",
" lgbm = LGBMRegressor()\n",
" lgbm.fit(np.stack(train.numpy_fp),train.pIC50)\n",
" pred = lgbm.predict(np.stack(test.numpy_fp))\n",
" output.append([sampler, len(train), len(test), r2_score(test.pIC50, pred)])"
]
},
{
"cell_type": "code",
"execution_count": 44,
"id": "7dc70551",
"metadata": {},
"outputs": [],
"source": [
"output_df = pd.DataFrame(output,columns=['sampler','n_train','n_test','r2'])"
]
},
{
"cell_type": "code",
"execution_count": 49,
"id": "1f94d823",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<Axes: xlabel='sampler', ylabel='r2'>"
]
},
"execution_count": 49,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "",
"text/plain": [
"<Figure size 1000x600 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"sns.set(rc={'figure.figsize': (10, 6)})\n",
"sns.set_style('whitegrid')\n",
"sns.set_context('notebook')\n",
"sns.boxplot(x='sampler',y='r2',data=output_df)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "56d9c7d3",
"metadata": {},
"outputs": [],
"source": [
"df['murcko'] = df.SMILES.apply(MurckoScaffold.MurckoScaffoldSmiles)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "85a3614c",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"murcko\n",
"c1ccc(Nc2ncnc3ccc(-c4ccccc4)cc23)cc1 58\n",
"c1ccc(-c2ccc3ncnc(Nc4cccc5[nH]ncc45)c3c2)cc1 40\n",
"c1ccc(-c2[nH]c(C3CC3)nc2-c2ccncn2)cc1 27\n",
"O=C(Nc1ccc2c(c1)OCO2)c1ccccc1NCc1ccncc1 19\n",
"c1ccc(-c2ccc3nccc(Nc4cccc5[nH]ncc45)c3c2)cc1 16\n",
"c1ccc(Nc2ncnc3[nH]ccc23)cc1 15\n",
"O=c1oc2ccccc2c(=O)n1Cc1ccccc1 13\n",
"N=C1CCc2cc(-c3c[nH]nc3-c3ccncc3)ccc21 13\n",
"c1ccc(-c2c(-c3ccncc3)nn3c(C4CC5CCC(C4)N5)ccnc23)cc1 12\n",
"c1ccc(Nc2ncnc3ccccc23)cc1 12\n",
"c1ccc2c(c1)[nH]c1cnccc12 12\n",
"c1ccc(Nc2ncccc2-c2ncnc3[nH]cnc23)cc1 12\n",
"c1ccc(Nc2cncc(-c3ccccc3)n2)cc1 12\n",
"c1cc(-c2c(-c3ccncc3)nn3c(C4CC5CCC(C4)N5)ccnc23)c2cn[nH]c2c1 12\n",
"O=C(Nc1ccon1)c1cccc(Nc2ncnc3cncnc23)c1 12\n",
"c1ccc(Nc2nc3cc(Oc4ccncc4)ccc3[nH]2)cc1 10\n",
"O=C(Nc1cccc(Nc2ccnc(NC(=O)C3CC3)c2)c1)c1ccccc1 9\n",
"O=C(Nc1ccccc1)c1cccc(Nc2ncccc2-c2ncnc3[nH]cnc23)c1 9\n",
"c1ccc(-c2ncnc3sccc23)cc1 9\n",
"c1ccc(-c2nc(NC3CC3)nc3sccc23)cc1 8\n",
"c1ccc(Nc2ncnc3cnc(N4CCNCC4)nc23)cc1 8\n",
"O=C(Nc1cccc(-c2ccnc3ccnn23)c1)c1ccccc1 8\n",
"O=C(Nc1cccc(Nc2ncnc3cncnc23)c1)c1ccccc1 8\n",
"O=C(Nc1ccccc1)Nc1cccc(-c2n[nH]cc2-c2ccnc3[nH]ccc23)c1 8\n",
"c1ccc(Nc2ncnc3[nH]cc(-c4ccccc4)c23)cc1 7\n",
"c1ccc(Nc2ncnc3[nH]c(-c4ccccc4)cc23)cc1 7\n",
"O=C(Nc1cccc(Nc2ncccc2-c2ncnc3[nH]cnc23)c1)c1ccccc1 6\n",
"c1cc(-c2nn(C3CCNCC3)cc2-c2ccc3c(c2)Cc2c[nH]nc2-3)ccn1 6\n",
"c1ccc(-c2c(-c3ccncc3)nn3c(-c4ccc(N5CC6CC5CN6)cc4)ccnc23)cc1 6\n",
"O=C(Nc1ccn[nH]1)c1cccc(Nc2ncnc3cncnc23)c1 6\n",
"Name: count, dtype: int64"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.murcko.value_counts().head(30)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8cee79cf",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.3"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment