PatWalters · October 12, 2024 00:58
diff --git a/label_leash_data.ipynb b/label_leash_data.ipynb
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 98,
   "id": "ab054ad6-ffbb-4eb4-84d3-3bc24806e5de",
   "metadata": {},
   "outputs": [],
   "source": [
    "import duckdb\n",
    "import pandas as pd\n",
    "import json\n",
    "from rdkit import Chem\n",
    "from tqdm.auto import tqdm"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f22c222f-68bc-436d-92a2-af68d21d94d7",
   "metadata": {},
   "source": [
    "Read the training and validation parquet files and find the unique R1, R2, R3 building blocks"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "id": "3ef7a9c5-0511-4f77-bf8a-f2bcef4bee3b",
   "metadata": {},
   "outputs": [],
   "source": [
    "con = duckdb.connect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "id": "abb4426b-def2-44ab-b4ee-b8f9cdded222",
   "metadata": {},
   "outputs": [],
   "source": [
    "files = json.dumps(['train*.parquet','../val/*.parquet'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "id": "f311f647-fd50-4e2a-b7b2-d7df2d57b7f5",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build one frame of distinct building-block SMILES per R-group position,\n",
    "# each tagged with an `rgroup` label so the frames can be stacked afterwards.\n",
    "df_list = []\n",
    "for i in (1, 2, 3):\n",
    "    query = f\"\"\"select distinct buildingblock{i}_smiles from read_parquet({files})\"\"\"\n",
    "    tmp_df = (\n",
    "        con.query(query)\n",
    "        .df()\n",
    "        .set_axis([\"SMILES\"], axis=1)\n",
    "        .assign(rgroup=f\"R_{i}\")\n",
    "    )\n",
    "    df_list.append(tmp_df)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d1f7204b-fa9f-4235-a2dd-d86a212e921a",
   "metadata": {},
   "source": [
    "Concatenate the unique reagent lists"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "id": "3ee6e5bf-c897-4ba9-a492-db5f32dfedee",
   "metadata": {},
   "outputs": [],
   "source": [
    "combo_df = pd.concat(df_list)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "de55c3c4-b0b4-4ee1-a43a-4511096cb6f5",
   "metadata": {},
   "source": [
    "Evaluate the reagent counts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "id": "a67f5187-6e17-478f-ae8f-eb514cb2133b",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "rgroup\n",
       "R_3    908\n",
       "R_2    727\n",
       "R_1    288\n",
       "Name: count, dtype: int64"
      ]
     },
     "execution_count": 57,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "combo_df.rgroup.value_counts()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "39d11e46-89a4-42ce-b9f6-ecafabfb9f76",
   "metadata": {},
   "source": [
    "Read the test set"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "id": "dfd220cc-65c0-440d-8d3c-31181fb893b2",
   "metadata": {},
   "outputs": [],
   "source": [
    "test_df = pd.read_parquet(\"../test/test_000000000000.parquet\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "84d6c3ef-f709-454b-a6a6-bd51f0dfa6a9",
   "metadata": {},
   "source": [
    "Label the reagents based on their presence in the training set"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 84,
   "id": "c7710f8b-ceb5-470f-b492-b34c26753137",
   "metadata": {},
   "outputs": [],
   "source": [
    "# For each R-group position, flag whether the test building block also occurs\n",
    "# among the unique reagents collected in combo_df (1 = present, 0 = absent).\n",
    "for i in [1,2,3]:\n",
    "    reagent_set = set(combo_df.query(f\"rgroup == 'R_{i}'\").SMILES.values)\n",
    "    test_df[f\"train_R_{i}\"] = test_df[f\"buildingblock{i}_smiles\"].isin(reagent_set).astype(int)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2913ce2b-de1c-46a9-a796-c2bcba369c42",
   "metadata": {},
   "source": [
    "Create a new column to indicate how many training set reagents each molecule has"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "895ec519-6dee-4815-8a7d-0be1e778e3a0",
   "metadata": {},
   "outputs": [],
   "source": [
    "test_df['reagent_sum'] = test_df.train_R_1 + test_df.train_R_2 + test_df.train_R_3"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a0601a6e-2415-4ae9-8d25-e6e3ea11840c",
   "metadata": {},
   "source": [
    "Look at the **reagent_sum** distribution.  Note that a value of **0** could indicate a molecule in triazine1 or kinase0. Molecules with **reagent_sum** == 3 are in triazine0."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 89,
   "id": "56932968-dff5-49d2-861e-c854946a4b6b",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "reagent_sum\n",
       "3    560085\n",
       "0    533966\n",
       "Name: count, dtype: int64"
      ]
     },
     "execution_count": 89,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "test_df.reagent_sum.value_counts()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "dd911091-01db-469b-9e23-2b509415e46b",
   "metadata": {},
   "source": [
    "A quick class for substructure searching from SMILES"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 90,
   "id": "01be0f3e-6982-4d5e-8606-e40b578362d5",
   "metadata": {},
   "outputs": [],
   "source": [
    "class SmilesSubsearch:\n",
    "    \"\"\"Substructure matcher that takes SMILES strings as input.\"\"\"\n",
    "\n",
    "    def __init__(self, smarts):\n",
    "        \"\"\"Compile `smarts` into the RDKit query used by `search`.\n",
    "\n",
    "        Raises ValueError if the SMARTS cannot be parsed.\n",
    "        \"\"\"\n",
    "        self.query_mol = Chem.MolFromSmarts(smarts)\n",
    "        # MolFromSmarts reports a parse failure by returning None; fail here\n",
    "        # instead of on the first call to search().\n",
    "        if self.query_mol is None:\n",
    "            raise ValueError(f\"Could not parse SMARTS: {smarts!r}\")\n",
    "\n",
    "    def search(self, smi):\n",
    "        \"\"\"Return True if `smi` parses and contains the query, else False.\"\"\"\n",
    "        mol = Chem.MolFromSmiles(smi)\n",
    "        if mol is None:\n",
    "            return False\n",
    "        return mol.HasSubstructMatch(self.query_mol)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a4dd2386-4a3b-4cf7-a278-96da72b9fb3a",
   "metadata": {},
   "source": [
    "Instantiate a **SmilesSubsearch** object to search for triazines"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 93,
   "id": "0c1fba98-aecc-4d62-ae50-c5ebd10f4515",
   "metadata": {},
   "outputs": [],
   "source": [
    "smiles_subsearch = SmilesSubsearch(\"c1ncncn1\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8a270947-ef35-4d7a-ad40-1adf464999c1",
   "metadata": {},
   "source": [
    "Identify the triazines"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 99,
   "id": "d1102d1b-bf74-4b42-b0b1-d5261d832e25",
   "metadata": {},
   "outputs": [],
   "source": [
    "tqdm.pandas()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 107,
   "id": "67c7464a-a659-47bd-9429-1a187a31979d",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "df1eb275d97b4b5bb302076b40038062",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/1094051 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "test_df['triazine'] = test_df.molecule_smiles.progress_apply(smiles_subsearch.search).astype(int)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "76b6ca0e-a23e-4000-8688-a28a01af869f",
   "metadata": {},
   "source": [
    "Count triazines"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 108,
   "id": "481b02bf-19f5-4760-bb8e-1be5675ee6c1",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "triazine\n",
       "1    594051\n",
       "0    500000\n",
       "Name: count, dtype: int64"
      ]
     },
     "execution_count": 108,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "test_df.triazine.value_counts()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ed8eaf2f-cf6c-458b-9197-5e6ad667cb9e",
   "metadata": {},
   "source": [
    "This is the **kinase0** library"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 114,
   "id": "ce57bb0a-2869-476b-9159-1f6923a7c934",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "reagent_sum\n",
       "0    500000\n",
       "Name: count, dtype: int64"
      ]
     },
     "execution_count": 114,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "test_df.query(\"triazine == 0\").reagent_sum.value_counts()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "676ef62d-c26c-4964-81de-7111ee862bf2",
   "metadata": {},
   "source": [
    "These are the two triazine libraries"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 115,
   "id": "bb66aec1-cf39-4857-8d18-a3cecdad9d61",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "reagent_sum\n",
       "3    560085\n",
       "0     33966\n",
       "Name: count, dtype: int64"
      ]
     },
     "execution_count": 115,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "test_df.query(\"triazine == 1\").reagent_sum.value_counts()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4ae3e4c3-3837-452e-85a1-80314a5ddfaa",
   "metadata": {},
   "source": [
    "Create an indicator variable to separate the three libraries"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 111,
   "id": "4853b840-1fd0-420f-8f57-b9cf50ecf9d2",
   "metadata": {},
   "outputs": [],
   "source": [
    "test_df['library'] = test_df.reagent_sum + test_df.triazine"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 116,
   "id": "bd635c9b-b6d5-4106-87b8-2b9572fd5cf0",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "library\n",
       "4    560085\n",
       "0    500000\n",
       "1     33966\n",
       "Name: count, dtype: int64"
      ]
     },
     "execution_count": 116,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "test_df.library.value_counts()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "112abf5f-749f-4880-b709-344cc105ffc4",
   "metadata": {},
   "source": [
    "Give the libraries names "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 117,
   "id": "99c25299-061f-4254-9f31-237a737ac107",
   "metadata": {},
   "outputs": [],
   "source": [
    "name_dict =  {4: \"triazine0\", 0: \"kinase0\", 1: \"triazine1\"}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 119,
   "id": "6a2894bb-7a88-4abe-afe1-79133116e2e2",
   "metadata": {},
   "outputs": [],
   "source": [
    "test_df['lib_name'] = [name_dict[x] for x in test_df.library]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 120,
   "id": "707c7b8e-3769-4ba2-90a1-0ba27440f7dc",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "lib_name\n",
       "triazine0    560085\n",
       "kinase0      500000\n",
       "triazine1     33966\n",
       "Name: count, dtype: int64"
      ]
     },
     "execution_count": 120,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "test_df.lib_name.value_counts()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0efbdb5c-309c-454e-8e39-6ccfb0aef9c8",
   "metadata": {},
   "source": [
    "Save **test_df** to a parquet file "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 122,
   "id": "d9606928-26e0-4a9a-987a-d49346d3df9f",
   "metadata": {},
   "outputs": [],
   "source": [
    "test_df.to_parquet(\"test_processed.parquet\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a5bb031d-cb46-4f57-bd4d-e0611341c1a9",
   "metadata": {},
   "source": [
    "Create a new dataframe with just the binders"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 127,
   "id": "a2aa157e-c152-4bde-aca2-b3eee5e0f118",
   "metadata": {},
   "outputs": [],
   "source": [
    "binder_df = test_df.query(\"binds\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 130,
   "id": "68f593de-ada3-4c50-a433-ded3d161dc03",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "lib_name\n",
       "triazine0    2872\n",
       "triazine1    1319\n",
       "kinase0       539\n",
       "Name: count, dtype: int64"
      ]
     },
     "execution_count": 130,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "binder_df.lib_name.value_counts()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6db850a7-d468-48a1-a64f-19210ebaf2f9",
   "metadata": {},
   "source": [
    "Write the binders to SMILES files, one per protein target and library"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 141,
   "id": "c407fe53-8289-4747-87ba-907573db65e9",
   "metadata": {},
   "outputs": [],
   "source": [
    "!mkdir -p ../PW2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 143,
   "id": "8639a7cc-a61b-473b-a13e-f41b0bf6cd56",
   "metadata": {},
   "outputs": [],
   "source": [
    "# One .smi file per (protein target, library). Each record is\n",
    "# SMILES<space>name with no header row and no index column, so the\n",
    "# output is a plain SMILES file rather than a CSV with a .smi suffix.\n",
    "for (protein_name, lib_name), group_df in binder_df.groupby([\"protein_name\",\"lib_name\"]):\n",
    "    prot_libname = f\"{protein_name}_{lib_name}\"\n",
    "    tmp_df = group_df[[\"molecule_smiles\"]].copy()\n",
    "    # Sequential, zero-padded names restart at 0000 within every file.\n",
    "    tmp_df['mol_name'] = [f\"{prot_libname}_{i:04d}\" for i in range(len(tmp_df))]\n",
    "    tmp_df.to_csv(f\"../PW2/{prot_libname}.smi\", sep=\" \", index=False, header=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 144,
   "id": "06bf811f-4e8c-49db-94a0-c4355ce1f003",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "BRD4_kinase0.smi    HSA_kinase0.smi    sEH_kinase0.smi\n",
      "BRD4_triazine0.smi  HSA_triazine0.smi  sEH_triazine0.smi\n",
      "BRD4_triazine1.smi  HSA_triazine1.smi  sEH_triazine1.smi\n"
     ]
    }
   ],
   "source": [
    "!ls ../PW2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b478b550-c04b-4f37-889e-8384afb22766",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
 }
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 98,
	"id": "ab054ad6-ffbb-4eb4-84d3-3bc24806e5de",
	"metadata": {},
	"outputs": [],
	"source": [
	"import duckdb\n",
	"import pandas as pd\n",
	"import json\n",
	"from rdkit import Chem\n",
	"from tqdm.auto import tqdm"
	]
	},
	{
	"cell_type": "markdown",
	"id": "f22c222f-68bc-436d-92a2-af68d21d94d7",
	"metadata": {},
	"source": [
	"Read the training and validation parquet files and find the unique R1, R2, R3 building blocks"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 48,
	"id": "3ef7a9c5-0511-4f77-bf8a-f2bcef4bee3b",
	"metadata": {},
	"outputs": [],
	"source": [
	"con = duckdb.connect()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 51,
	"id": "abb4426b-def2-44ab-b4ee-b8f9cdded222",
	"metadata": {},
	"outputs": [],
	"source": [
	"files = json.dumps(['train*.parquet','../val/*.parquet'])"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 55,
	"id": "f311f647-fd50-4e2a-b7b2-d7df2d57b7f5",
	"metadata": {},
	"outputs": [],
	"source": [
	"# Build one frame of distinct building-block SMILES per R-group position,\n",
	"# each tagged with an `rgroup` label so the frames can be stacked afterwards.\n",
	"df_list = []\n",
	"for i in (1, 2, 3):\n",
	"    query = f\"\"\"select distinct buildingblock{i}_smiles from read_parquet({files})\"\"\"\n",
	"    tmp_df = (\n",
	"        con.query(query)\n",
	"        .df()\n",
	"        .set_axis([\"SMILES\"], axis=1)\n",
	"        .assign(rgroup=f\"R_{i}\")\n",
	"    )\n",
	"    df_list.append(tmp_df)"
	]
	},
	{
	"cell_type": "markdown",
	"id": "d1f7204b-fa9f-4235-a2dd-d86a212e921a",
	"metadata": {},
	"source": [
	"Concatenate the unique reagent lists"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 56,
	"id": "3ee6e5bf-c897-4ba9-a492-db5f32dfedee",
	"metadata": {},
	"outputs": [],
	"source": [
	"combo_df = pd.concat(df_list)"
	]
	},
	{
	"cell_type": "markdown",
	"id": "de55c3c4-b0b4-4ee1-a43a-4511096cb6f5",
	"metadata": {},
	"source": [
	"Evaluate the reagent counts"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 57,
	"id": "a67f5187-6e17-478f-ae8f-eb514cb2133b",
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"rgroup\n",
	"R_3 908\n",
	"R_2 727\n",
	"R_1 288\n",
	"Name: count, dtype: int64"
	]
	},
	"execution_count": 57,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"combo_df.rgroup.value_counts()"
	]
	},
	{
	"cell_type": "markdown",
	"id": "39d11e46-89a4-42ce-b9f6-ecafabfb9f76",
	"metadata": {},
	"source": [
	"Read the test set"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 58,
	"id": "dfd220cc-65c0-440d-8d3c-31181fb893b2",
	"metadata": {},
	"outputs": [],
	"source": [
	"test_df = pd.read_parquet(\"../test/test_000000000000.parquet\")"
	]
	},
	{
	"cell_type": "markdown",
	"id": "84d6c3ef-f709-454b-a6a6-bd51f0dfa6a9",
	"metadata": {},
	"source": [
	"Label the reagents based on their presence in the training set"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 84,
	"id": "c7710f8b-ceb5-470f-b492-b34c26753137",
	"metadata": {},
	"outputs": [],
	"source": [
	"# For each R-group position, flag whether the test building block also occurs\n",
	"# among the unique reagents collected in combo_df (1 = present, 0 = absent).\n",
	"for i in [1,2,3]:\n",
	"    reagent_set = set(combo_df.query(f\"rgroup == 'R_{i}'\").SMILES.values)\n",
	"    test_df[f\"train_R_{i}\"] = test_df[f\"buildingblock{i}_smiles\"].isin(reagent_set).astype(int)"
	]
	},
	{
	"cell_type": "markdown",
	"id": "2913ce2b-de1c-46a9-a796-c2bcba369c42",
	"metadata": {},
	"source": [
	"Create a new column to indicate how many training set reagents each molecule has"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "895ec519-6dee-4815-8a7d-0be1e778e3a0",
	"metadata": {},
	"outputs": [],
	"source": [
	"test_df['reagent_sum'] = test_df.train_R_1 + test_df.train_R_2 + test_df.train_R_3"
	]
	},
	{
	"cell_type": "markdown",
	"id": "a0601a6e-2415-4ae9-8d25-e6e3ea11840c",
	"metadata": {},
	"source": [
	"Look at the reagent_sum distribution. Note that a value of 0 could indicate a molecule in triazine1 or kinase0. Molecules with reagent_sum == 3 are in triazine0."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 89,
	"id": "56932968-dff5-49d2-861e-c854946a4b6b",
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"reagent_sum\n",
	"3 560085\n",
	"0 533966\n",
	"Name: count, dtype: int64"
	]
	},
	"execution_count": 89,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"test_df.reagent_sum.value_counts()"
	]
	},
	{
	"cell_type": "markdown",
	"id": "dd911091-01db-469b-9e23-2b509415e46b",
	"metadata": {},
	"source": [
	"A quick class for substructure searching from SMILES"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 90,
	"id": "01be0f3e-6982-4d5e-8606-e40b578362d5",
	"metadata": {},
	"outputs": [],
	"source": [
	"class SmilesSubsearch:\n",
	"    \"\"\"Substructure matcher that takes SMILES strings as input.\"\"\"\n",
	"\n",
	"    def __init__(self, smarts):\n",
	"        \"\"\"Compile `smarts` into the RDKit query used by `search`.\n",
	"\n",
	"        Raises ValueError if the SMARTS cannot be parsed.\n",
	"        \"\"\"\n",
	"        self.query_mol = Chem.MolFromSmarts(smarts)\n",
	"        # MolFromSmarts reports a parse failure by returning None; fail here\n",
	"        # instead of on the first call to search().\n",
	"        if self.query_mol is None:\n",
	"            raise ValueError(f\"Could not parse SMARTS: {smarts!r}\")\n",
	"\n",
	"    def search(self, smi):\n",
	"        \"\"\"Return True if `smi` parses and contains the query, else False.\"\"\"\n",
	"        mol = Chem.MolFromSmiles(smi)\n",
	"        if mol is None:\n",
	"            return False\n",
	"        return mol.HasSubstructMatch(self.query_mol)"
	]
	},
	{
	"cell_type": "markdown",
	"id": "a4dd2386-4a3b-4cf7-a278-96da72b9fb3a",
	"metadata": {},
	"source": [
	"Instantiate a SmilesSubsearch object to search for triazines"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 93,
	"id": "0c1fba98-aecc-4d62-ae50-c5ebd10f4515",
	"metadata": {},
	"outputs": [],
	"source": [
	"smiles_subsearch = SmilesSubsearch(\"c1ncncn1\")"
	]
	},
	{
	"cell_type": "markdown",
	"id": "8a270947-ef35-4d7a-ad40-1adf464999c1",
	"metadata": {},
	"source": [
	"Identify the triazines"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 99,
	"id": "d1102d1b-bf74-4b42-b0b1-d5261d832e25",
	"metadata": {},
	"outputs": [],
	"source": [
	"tqdm.pandas()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 107,
	"id": "67c7464a-a659-47bd-9429-1a187a31979d",
	"metadata": {},
	"outputs": [
	{
	"data": {
	"application/vnd.jupyter.widget-view+json": {
	"model_id": "df1eb275d97b4b5bb302076b40038062",
	"version_major": 2,
	"version_minor": 0
	},
	"text/plain": [
	"  0%|          | 0/1094051 [00:00<?, ?it/s]"
	]
	},
	"metadata": {},
	"output_type": "display_data"
	}
	],
	"source": [
	"test_df['triazine'] = test_df.molecule_smiles.progress_apply(smiles_subsearch.search).astype(int)"
	]
	},
	{
	"cell_type": "markdown",
	"id": "76b6ca0e-a23e-4000-8688-a28a01af869f",
	"metadata": {},
	"source": [
	"Count triazines"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 108,
	"id": "481b02bf-19f5-4760-bb8e-1be5675ee6c1",
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"triazine\n",
	"1 594051\n",
	"0 500000\n",
	"Name: count, dtype: int64"
	]
	},
	"execution_count": 108,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"test_df.triazine.value_counts()"
	]
	},
	{
	"cell_type": "markdown",
	"id": "ed8eaf2f-cf6c-458b-9197-5e6ad667cb9e",
	"metadata": {},
	"source": [
	"This is the kinase0 library"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 114,
	"id": "ce57bb0a-2869-476b-9159-1f6923a7c934",
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"reagent_sum\n",
	"0 500000\n",
	"Name: count, dtype: int64"
	]
	},
	"execution_count": 114,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"test_df.query(\"triazine == 0\").reagent_sum.value_counts()"
	]
	},
	{
	"cell_type": "markdown",
	"id": "676ef62d-c26c-4964-81de-7111ee862bf2",
	"metadata": {},
	"source": [
	"These are the two triazine libraries"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 115,
	"id": "bb66aec1-cf39-4857-8d18-a3cecdad9d61",
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"reagent_sum\n",
	"3 560085\n",
	"0 33966\n",
	"Name: count, dtype: int64"
	]
	},
	"execution_count": 115,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"test_df.query(\"triazine == 1\").reagent_sum.value_counts()"
	]
	},
	{
	"cell_type": "markdown",
	"id": "4ae3e4c3-3837-452e-85a1-80314a5ddfaa",
	"metadata": {},
	"source": [
	"Create an indicator variable to separate the three libraries"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 111,
	"id": "4853b840-1fd0-420f-8f57-b9cf50ecf9d2",
	"metadata": {},
	"outputs": [],
	"source": [
	"test_df['library'] = test_df.reagent_sum + test_df.triazine"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 116,
	"id": "bd635c9b-b6d5-4106-87b8-2b9572fd5cf0",
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"library\n",
	"4 560085\n",
	"0 500000\n",
	"1 33966\n",
	"Name: count, dtype: int64"
	]
	},
	"execution_count": 116,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"test_df.library.value_counts()"
	]
	},
	{
	"cell_type": "markdown",
	"id": "112abf5f-749f-4880-b709-344cc105ffc4",
	"metadata": {},
	"source": [
	"Give the libraries names "
	]
	},
	{
	"cell_type": "code",
	"execution_count": 117,
	"id": "99c25299-061f-4254-9f31-237a737ac107",
	"metadata": {},
	"outputs": [],
	"source": [
	"name_dict = {4: \"triazine0\", 0: \"kinase0\", 1: \"triazine1\"}"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 119,
	"id": "6a2894bb-7a88-4abe-afe1-79133116e2e2",
	"metadata": {},
	"outputs": [],
	"source": [
	"test_df['lib_name'] = [name_dict[x] for x in test_df.library]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 120,
	"id": "707c7b8e-3769-4ba2-90a1-0ba27440f7dc",
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"lib_name\n",
	"triazine0 560085\n",
	"kinase0 500000\n",
	"triazine1 33966\n",
	"Name: count, dtype: int64"
	]
	},
	"execution_count": 120,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"test_df.lib_name.value_counts()"
	]
	},
	{
	"cell_type": "markdown",
	"id": "0efbdb5c-309c-454e-8e39-6ccfb0aef9c8",
	"metadata": {},
	"source": [
	"Save test_df to a parquet file "
	]
	},
	{
	"cell_type": "code",
	"execution_count": 122,
	"id": "d9606928-26e0-4a9a-987a-d49346d3df9f",
	"metadata": {},
	"outputs": [],
	"source": [
	"test_df.to_parquet(\"test_processed.parquet\")"
	]
	},
	{
	"cell_type": "markdown",
	"id": "a5bb031d-cb46-4f57-bd4d-e0611341c1a9",
	"metadata": {},
	"source": [
	"Create a new dataframe with just the binders"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 127,
	"id": "a2aa157e-c152-4bde-aca2-b3eee5e0f118",
	"metadata": {},
	"outputs": [],
	"source": [
	"binder_df = test_df.query(\"binds\")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 130,
	"id": "68f593de-ada3-4c50-a433-ded3d161dc03",
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"lib_name\n",
	"triazine0 2872\n",
	"triazine1 1319\n",
	"kinase0 539\n",
	"Name: count, dtype: int64"
	]
	},
	"execution_count": 130,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"binder_df.lib_name.value_counts()"
	]
	},
	{
	"cell_type": "markdown",
	"id": "6db850a7-d468-48a1-a64f-19210ebaf2f9",
	"metadata": {},
	"source": [
	"Write the binders to SMILES files, one per protein target and library"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 141,
	"id": "c407fe53-8289-4747-87ba-907573db65e9",
	"metadata": {},
	"outputs": [],
	"source": [
	"!mkdir -p ../PW2"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 143,
	"id": "8639a7cc-a61b-473b-a13e-f41b0bf6cd56",
	"metadata": {},
	"outputs": [],
	"source": [
	"# One .smi file per (protein target, library). Each record is\n",
	"# SMILES<space>name with no header row and no index column, so the\n",
	"# output is a plain SMILES file rather than a CSV with a .smi suffix.\n",
	"for (protein_name, lib_name), group_df in binder_df.groupby([\"protein_name\",\"lib_name\"]):\n",
	"    prot_libname = f\"{protein_name}_{lib_name}\"\n",
	"    tmp_df = group_df[[\"molecule_smiles\"]].copy()\n",
	"    # Sequential, zero-padded names restart at 0000 within every file.\n",
	"    tmp_df['mol_name'] = [f\"{prot_libname}_{i:04d}\" for i in range(len(tmp_df))]\n",
	"    tmp_df.to_csv(f\"../PW2/{prot_libname}.smi\", sep=\" \", index=False, header=False)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 144,
	"id": "06bf811f-4e8c-49db-94a0-c4355ce1f003",
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"BRD4_kinase0.smi HSA_kinase0.smi sEH_kinase0.smi\n",
	"BRD4_triazine0.smi HSA_triazine0.smi sEH_triazine0.smi\n",
	"BRD4_triazine1.smi HSA_triazine1.smi sEH_triazine1.smi\n"
	]
	}
	],
	"source": [
	"!ls ../PW2"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "b478b550-c04b-4f37-889e-8384afb22766",
	"metadata": {},
	"outputs": [],
	"source": []
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3 (ipykernel)",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.11.9"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 5
	}