PatWalters · October 11, 2025 02:46
diff --git a/bblean_ChEMBL.ipynb b/bblean_ChEMBL.ipynb
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "204e2f2e-1226-4c3b-ab78-d3e5ccac8a61",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import bblean\n",
    "import matplotlib.pyplot as plt\n",
    "import mols2grid\n",
    "from tqdm.auto import tqdm\n",
    "from rdkit import Chem"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "615c9912-ddd8-43ee-a824-9eafbb79d81b",
   "metadata": {},
   "outputs": [],
   "source": [
    "tqdm.pandas()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "1bc3df0c-c90a-4ecc-90f4-d02c9e4d09ff",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 4.06 s, sys: 208 ms, total: 4.27 s\n",
      "Wall time: 4.27 s\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "2854815"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "chembl_df = pd.read_csv(\"chembl_36_chemreps.txt\",sep=\"\\t\")\n",
    "len(chembl_df)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3090db1c-306d-4376-9089-d5dde5835bb2",
   "metadata": {},
   "source": [
    "A quick function to flag janky SMILES: it returns `False` for any SMILES that RDKit can't parse"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "95395ff4-2fd1-431a-a77e-57e162162ab9",
   "metadata": {},
   "outputs": [],
   "source": [
    "def check_smiles(smi):\n",
    "    mol = Chem.MolFromSmiles(smi)\n",
    "    return mol is not None"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a2685de9-ede1-430e-a0bb-792be6b13de2",
   "metadata": {},
   "source": [
    "Flag the janky SMILES, then remove them in the next cell"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "1d847a46-fd15-457c-b9be-a61258832a47",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "866ae6d6c27b43e8ac0fef7b8a8f582f",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/2854815 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[22:24:38] Explicit valence for atom # 13 P, 7, is greater than permitted\n",
      "[22:24:39] Explicit valence for atom # 29 P, 7, is greater than permitted\n",
      "[22:24:39] Explicit valence for atom # 91 P, 7, is greater than permitted\n",
      "[22:25:17] WARNING: not removing hydrogen atom without neighbors\n",
      "[22:25:29] Explicit valence for atom # 17 P, 7, is greater than permitted\n",
      "[22:25:29] Explicit valence for atom # 1 P, 7, is greater than permitted\n",
      "[22:25:29] Explicit valence for atom # 1 P, 7, is greater than permitted\n",
      "[22:25:30] Explicit valence for atom # 1 P, 7, is greater than permitted\n",
      "[22:25:32] WARNING: not removing hydrogen atom without neighbors\n",
      "[22:25:32] WARNING: not removing hydrogen atom without neighbors\n",
      "[22:25:41] Explicit valence for atom # 19 P, 7, is greater than permitted\n",
      "[22:25:41] Explicit valence for atom # 1 P, 7, is greater than permitted\n",
      "[22:25:42] Explicit valence for atom # 16 P, 7, is greater than permitted\n",
      "[22:26:03] Explicit valence for atom # 3 Ar, 1, is greater than permitted\n",
      "[22:26:32] WARNING: not removing hydrogen atom without neighbors\n",
      "[22:26:46] WARNING: not removing hydrogen atom without neighbors\n",
      "[22:26:47] WARNING: not removing hydrogen atom without neighbors\n",
      "[22:26:48] WARNING: not removing hydrogen atom without neighbors\n",
      "[22:27:09] WARNING: not removing hydrogen atom without neighbors\n",
      "[22:27:09] WARNING: not removing hydrogen atom without neighbors\n",
      "[22:27:09] WARNING: not removing hydrogen atom without neighbors\n",
      "[22:27:11] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 5 6 10 11 15 16 17 19 20 21\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 2min 40s, sys: 1.01 s, total: 2min 41s\n",
      "Wall time: 2min 41s\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[22:27:17] Explicit valence for atom # 1 As, 7, is greater than permitted\n",
      "[22:27:17] WARNING: not removing hydrogen atom without neighbors\n",
      "[22:27:17] WARNING: not removing hydrogen atom without neighbors\n",
      "[22:27:17] WARNING: not removing hydrogen atom without neighbors\n",
      "[22:27:17] WARNING: not removing hydrogen atom without neighbors\n",
      "[22:27:17] Explicit valence for atom # 1 P, 7, is greater than permitted\n",
      "[22:27:17] Explicit valence for atom # 34 P, 7, is greater than permitted\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "chembl_df['valid'] = chembl_df.canonical_smiles.progress_apply(check_smiles)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "a6619aef-4d9b-44db-a3e6-7966c3c2113c",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "2854800"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "chembl_df = chembl_df.query(\"valid\").copy()\n",
    "len(chembl_df)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7ed2841e-741d-47d4-b00e-0c73a223fb36",
   "metadata": {},
   "source": [
    "By default **bblean** can't handle SMILES longer than 388 characters, so keep only SMILES shorter than 388 characters"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "19f9c5bd-6f94-4623-9d7f-d1b8e211eac0",
   "metadata": {},
   "outputs": [],
   "source": [
    "chembl_df['len'] = chembl_df.canonical_smiles.str.len()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "aca2a8bf-bf80-4648-ab5f-2830e6330c6f",
   "metadata": {},
   "outputs": [],
   "source": [
    "chembl_df = chembl_df.query(\"len < 388\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "8751fdab-5766-490d-8edb-14916357d9a7",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "2844613"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(chembl_df)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9beb1184-acda-4e4e-8695-5b01782d9cd6",
   "metadata": {},
   "source": [
    "Format the SMILES the way that bblean wants them. Here that means a fixed-width NumPy string array (`<U388`) with a newline appended to each SMILES. There's probably a better way to do this."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "8aaf3b43-1a19-4ff5-a050-24679bef3c9d",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 326 ms, sys: 168 ms, total: 495 ms\n",
      "Wall time: 491 ms\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "smiles = (chembl_df.canonical_smiles.values+\"\\n\").astype(dtype='<U388')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3e132523-c03b-4aee-8cf4-9e6813a633e8",
   "metadata": {},
   "source": [
    "Generate the fingerprints"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "0de13a71-7754-4c1f-81c6-661ab078f545",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[22:28:46] WARNING: not removing hydrogen atom without neighbors\n",
      "[22:29:02] WARNING: not removing hydrogen atom without neighbors\n",
      "[22:29:02] WARNING: not removing hydrogen atom without neighbors\n",
      "[22:30:14] WARNING: not removing hydrogen atom without neighbors\n",
      "[22:30:31] WARNING: not removing hydrogen atom without neighbors\n",
      "[22:30:32] WARNING: not removing hydrogen atom without neighbors\n",
      "[22:30:34] WARNING: not removing hydrogen atom without neighbors\n",
      "[22:30:59] WARNING: not removing hydrogen atom without neighbors\n",
      "[22:30:59] WARNING: not removing hydrogen atom without neighbors\n",
      "[22:30:59] WARNING: not removing hydrogen atom without neighbors\n",
      "[22:31:09] WARNING: not removing hydrogen atom without neighbors\n",
      "[22:31:09] WARNING: not removing hydrogen atom without neighbors\n",
      "[22:31:09] WARNING: not removing hydrogen atom without neighbors\n",
      "[22:31:09] WARNING: not removing hydrogen atom without neighbors\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 4min 7s, sys: 37.3 s, total: 4min 45s\n",
      "Wall time: 5min 42s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "fps = bblean.fingerprints.fps_from_smiles(smiles, pack=True, n_features=2048)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "705cbaa3-8fd0-40e3-8527-e14a3dce3624",
   "metadata": {},
   "source": [
    "Cluster the fingerprints with BitBirch"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "5e1b8fde-dd95-468d-9374-911370b5e1a1",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 3min 13s, sys: 2.05 s, total: 3min 15s\n",
      "Wall time: 3min 16s\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "BitBirch(threshold=0.65, branching_factor=50, merge_criterion='diameter')"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "bb_tree = bblean.BitBirch(branching_factor=50, threshold=0.65, merge_criterion=\"diameter\")\n",
    "bb_tree.fit(fps, n_features=2048)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "8a1f4de2-f91b-4e7d-9d15-3a6231bc5c9d",
   "metadata": {},
   "outputs": [],
   "source": [
    "clusters = bb_tree.get_cluster_mol_ids()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "4b9edd98-3938-4be1-82a3-e3700a9e605a",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1.889534"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(clusters)/1e6"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4cc5a460-7425-453d-aaf9-6f700eea440f",
   "metadata": {},
   "source": [
    "Add a cluster column to the dataframe"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "8e6333af-03d8-4f02-863c-0bf9f49e1fc8",
   "metadata": {},
   "outputs": [],
   "source": [
    "cluster_list = [-1] * len(chembl_df)\n",
    "for idx,cluster in enumerate(clusters):\n",
    "    for id in cluster:\n",
    "        cluster_list[id] = idx\n",
    "chembl_df['cluster'] = cluster_list"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "46e4361e-4143-4853-a554-07fbd7d1a053",
   "metadata": {},
   "source": [
    "Look at some clusters"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "60aed7bd-921f-47f8-b84d-1172b4c2159e",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "cluster\n",
       "65         96\n",
       "124        51\n",
       "136        49\n",
       "27         46\n",
       "171        45\n",
       "           ..\n",
       "1522831     1\n",
       "1796423     1\n",
       "1212826     1\n",
       "1642975     1\n",
       "181076      1\n",
       "Name: count, Length: 1061197, dtype: int64"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "chembl_df.query(\"len < 50\").cluster.value_counts()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a480c2a6-cc21-4039-b558-0b098fc457ec",
   "metadata": {},
   "source": [
    "Repeat the clustering with threshold = 0.55. On this run `fit` fails with a `ValueError` (see the traceback below)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "f2a3747e-93ce-400e-b309-f20573a41d73",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 19.6 s, sys: 208 ms, total: 19.8 s\n",
      "Wall time: 19.8 s\n"
     ]
    },
    {
     "ename": "ValueError",
     "evalue": "could not broadcast input array from shape (0,) into shape (256,)",
     "output_type": "error",
     "traceback": [
      "\u001b[31m---------------------------------------------------------------------------\u001b[39m",
      "\u001b[31mValueError\u001b[39m                                Traceback (most recent call last)",
      "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[18]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m \u001b[43mget_ipython\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m.\u001b[49m\u001b[43mrun_cell_magic\u001b[49m\u001b[43m(\u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43mtime\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43mbb_tree = bblean.BitBirch(branching_factor=50, threshold=0.55, merge_criterion=\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mdiameter\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43m)\u001b[39;49m\u001b[38;5;130;43;01m\\n\u001b[39;49;00m\u001b[33;43mbb_tree.fit(fps, n_features=2048)\u001b[39;49m\u001b[38;5;130;43;01m\\n\u001b[39;49;00m\u001b[33;43m'\u001b[39;49m\u001b[43m)\u001b[49m\n",
      "\u001b[36mFile \u001b[39m\u001b[32m/opt/homebrew/Caskroom/miniforge/base/envs/rdkit_2025_09/lib/python3.11/site-packages/IPython/core/interactiveshell.py:2565\u001b[39m, in \u001b[36mInteractiveShell.run_cell_magic\u001b[39m\u001b[34m(self, magic_name, line, cell)\u001b[39m\n\u001b[32m   2563\u001b[39m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m.builtin_trap:\n\u001b[32m   2564\u001b[39m     args = (magic_arg_s, cell)\n\u001b[32m-> \u001b[39m\u001b[32m2565\u001b[39m     result = \u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m   2567\u001b[39m \u001b[38;5;66;03m# The code below prevents the output from being displayed\u001b[39;00m\n\u001b[32m   2568\u001b[39m \u001b[38;5;66;03m# when using magics with decorator @output_can_be_silenced\u001b[39;00m\n\u001b[32m   2569\u001b[39m \u001b[38;5;66;03m# when the last Python token in the expression is a ';'.\u001b[39;00m\n\u001b[32m   2570\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mgetattr\u001b[39m(fn, magic.MAGIC_OUTPUT_CAN_BE_SILENCED, \u001b[38;5;28;01mFalse\u001b[39;00m):\n",
      "\u001b[36mFile \u001b[39m\u001b[32m/opt/homebrew/Caskroom/miniforge/base/envs/rdkit_2025_09/lib/python3.11/site-packages/IPython/core/magics/execution.py:1470\u001b[39m, in \u001b[36mExecutionMagics.time\u001b[39m\u001b[34m(self, line, cell, local_ns)\u001b[39m\n\u001b[32m   1468\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m interrupt_occured:\n\u001b[32m   1469\u001b[39m     \u001b[38;5;28;01mif\u001b[39;00m exit_on_interrupt \u001b[38;5;129;01mand\u001b[39;00m captured_exception:\n\u001b[32m-> \u001b[39m\u001b[32m1470\u001b[39m         \u001b[38;5;28;01mraise\u001b[39;00m captured_exception\n\u001b[32m   1471\u001b[39m     \u001b[38;5;28;01mreturn\u001b[39;00m\n\u001b[32m   1472\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m out\n",
      "\u001b[36mFile \u001b[39m\u001b[32m/opt/homebrew/Caskroom/miniforge/base/envs/rdkit_2025_09/lib/python3.11/site-packages/IPython/core/magics/execution.py:1439\u001b[39m, in \u001b[36mExecutionMagics.time\u001b[39m\u001b[34m(self, line, cell, local_ns)\u001b[39m\n\u001b[32m   1437\u001b[39m     \u001b[38;5;28;01mif\u001b[39;00m expr_val \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m   1438\u001b[39m         code_2 = \u001b[38;5;28mself\u001b[39m.shell.compile(expr_val, source, \u001b[33m'\u001b[39m\u001b[33meval\u001b[39m\u001b[33m'\u001b[39m)\n\u001b[32m-> \u001b[39m\u001b[32m1439\u001b[39m         out = \u001b[38;5;28;43meval\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mcode_2\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mglob\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mlocal_ns\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m   1440\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mKeyboardInterrupt\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[32m   1441\u001b[39m     captured_exception = e\n",
      "\u001b[36mFile \u001b[39m\u001b[32m<timed exec>:2\u001b[39m\n",
      "\u001b[36mFile \u001b[39m\u001b[32m~/software/bblean/bblean/bitbirch.py:734\u001b[39m, in \u001b[36mBitBirch.fit\u001b[39m\u001b[34m(self, X, reinsert_indices, input_is_packed, n_features)\u001b[39m\n\u001b[32m    730\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m idx, fp \u001b[38;5;129;01min\u001b[39;00m iterable:\n\u001b[32m    731\u001b[39m     subcluster = _BFSubcluster(\n\u001b[32m    732\u001b[39m         linear_sum=fp, mol_indices=[idx], n_features=n_features\n\u001b[32m    733\u001b[39m     )\n\u001b[32m--> \u001b[39m\u001b[32m734\u001b[39m     split = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_root\u001b[49m\u001b[43m.\u001b[49m\u001b[43minsert_bf_subcluster\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m    735\u001b[39m \u001b[43m        \u001b[49m\u001b[43msubcluster\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmerge_accept_fn\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mthreshold\u001b[49m\n\u001b[32m    736\u001b[39m \u001b[43m    \u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m    738\u001b[39m     \u001b[38;5;28;01mif\u001b[39;00m split:\n\u001b[32m    739\u001b[39m         new_subcluster1, new_subcluster2 = _split_node(\u001b[38;5;28mself\u001b[39m._root)\n",
      "\u001b[36mFile \u001b[39m\u001b[32m~/software/bblean/bblean/bitbirch.py:339\u001b[39m, in \u001b[36m_BFNode.insert_bf_subcluster\u001b[39m\u001b[34m(self, subcluster, merge_accept_fn, threshold)\u001b[39m\n\u001b[32m    336\u001b[39m \u001b[38;5;66;03m# If the subcluster has a child, we need a recursive strategy.\u001b[39;00m\n\u001b[32m    337\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m closest_subcluster.child_ \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m339\u001b[39m     split_child = \u001b[43mclosest_subcluster\u001b[49m\u001b[43m.\u001b[49m\u001b[43mchild_\u001b[49m\u001b[43m.\u001b[49m\u001b[43minsert_bf_subcluster\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m    340\u001b[39m \u001b[43m        \u001b[49m\u001b[43msubcluster\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmerge_accept_fn\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mthreshold\u001b[49m\n\u001b[32m    341\u001b[39m \u001b[43m    \u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m    343\u001b[39m     \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m split_child:\n\u001b[32m    344\u001b[39m         \u001b[38;5;66;03m# If it is determined that the child need not be split, we\u001b[39;00m\n\u001b[32m    345\u001b[39m         \u001b[38;5;66;03m# can just update the closest_subcluster\u001b[39;00m\n\u001b[32m    346\u001b[39m         closest_subcluster.update(subcluster)\n",
      "\u001b[36mFile \u001b[39m\u001b[32m~/software/bblean/bblean/bitbirch.py:359\u001b[39m, in \u001b[36m_BFNode.insert_bf_subcluster\u001b[39m\u001b[34m(self, subcluster, merge_accept_fn, threshold)\u001b[39m\n\u001b[32m    352\u001b[39m \u001b[38;5;66;03m# things not too good. we need to redistribute the subclusters in\u001b[39;00m\n\u001b[32m    353\u001b[39m \u001b[38;5;66;03m# our child node, and add a new subcluster in the parent\u001b[39;00m\n\u001b[32m    354\u001b[39m \u001b[38;5;66;03m# subcluster to accommodate the new child.\u001b[39;00m\n\u001b[32m    355\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m    356\u001b[39m     new_subcluster1, new_subcluster2 = _split_node(\n\u001b[32m    357\u001b[39m         closest_subcluster.child_\n\u001b[32m    358\u001b[39m     )\n\u001b[32m--> \u001b[39m\u001b[32m359\u001b[39m     \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mupdate_split_subclusters\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m    360\u001b[39m \u001b[43m        \u001b[49m\u001b[43mclosest_subcluster\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mnew_subcluster1\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mnew_subcluster2\u001b[49m\n\u001b[32m    361\u001b[39m \u001b[43m    \u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m    363\u001b[39m     \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(\u001b[38;5;28mself\u001b[39m.subclusters_) > \u001b[38;5;28mself\u001b[39m.branching_factor:\n\u001b[32m    364\u001b[39m         \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;01mTrue\u001b[39;00m\n",
      "\u001b[36mFile \u001b[39m\u001b[32m~/software/bblean/bblean/bitbirch.py:319\u001b[39m, in \u001b[36m_BFNode.update_split_subclusters\u001b[39m\u001b[34m(self, subcluster, new_subcluster1, new_subcluster2)\u001b[39m\n\u001b[32m    317\u001b[39m idx = \u001b[38;5;28mself\u001b[39m.subclusters_.index(subcluster)\n\u001b[32m    318\u001b[39m \u001b[38;5;28mself\u001b[39m.subclusters_[idx] = new_subcluster1\n\u001b[32m--> \u001b[39m\u001b[32m319\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43minit_centroids_\u001b[49m\u001b[43m[\u001b[49m\u001b[43midx\u001b[49m\u001b[43m]\u001b[49m = new_subcluster1.centroid_\n\u001b[32m    320\u001b[39m \u001b[38;5;66;03m# Append new_subcluster2\u001b[39;00m\n\u001b[32m    321\u001b[39m \u001b[38;5;28mself\u001b[39m.append_subcluster(new_subcluster2)\n",
      "\u001b[31mValueError\u001b[39m: could not broadcast input array from shape (0,) into shape (256,)"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "bb_tree = bblean.BitBirch(branching_factor=50, threshold=0.55, merge_criterion=\"diameter\")\n",
    "bb_tree.fit(fps, n_features=2048)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b6c8d74f-b1f5-44fd-a340-2aa163c4abec",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
 }
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 1,
	"id": "204e2f2e-1226-4c3b-ab78-d3e5ccac8a61",
	"metadata": {},
	"outputs": [],
	"source": [
	"import pandas as pd\n",
	"import bblean\n",
	"import matplotlib.pyplot as plt\n",
	"import mols2grid\n",
	"from tqdm.auto import tqdm\n",
	"from rdkit import Chem"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"id": "615c9912-ddd8-43ee-a824-9eafbb79d81b",
	"metadata": {},
	"outputs": [],
	"source": [
	"tqdm.pandas()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"id": "1bc3df0c-c90a-4ecc-90f4-d02c9e4d09ff",
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"CPU times: user 4.06 s, sys: 208 ms, total: 4.27 s\n",
	"Wall time: 4.27 s\n"
	]
	},
	{
	"data": {
	"text/plain": [
	"2854815"
	]
	},
	"execution_count": 3,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"%%time\n",
	"chembl_df = pd.read_csv(\"chembl_36_chemreps.txt\",sep=\"\\t\")\n",
	"len(chembl_df)"
	]
	},
	{
	"cell_type": "markdown",
	"id": "3090db1c-306d-4376-9089-d5dde5835bb2",
	"metadata": {},
	"source": [
	"A quick function to flag janky SMILES: it returns `False` for any SMILES that RDKit can't parse"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"id": "95395ff4-2fd1-431a-a77e-57e162162ab9",
	"metadata": {},
	"outputs": [],
	"source": [
	"def check_smiles(smi):\n",
	"    mol = Chem.MolFromSmiles(smi)\n",
	"    return mol is not None"
	]
	},
	{
	"cell_type": "markdown",
	"id": "a2685de9-ede1-430e-a0bb-792be6b13de2",
	"metadata": {},
	"source": [
	"Flag the janky SMILES, then remove them in the next cell"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"id": "1d847a46-fd15-457c-b9be-a61258832a47",
	"metadata": {},
	"outputs": [
	{
	"data": {
	"application/vnd.jupyter.widget-view+json": {
	"model_id": "866ae6d6c27b43e8ac0fef7b8a8f582f",
	"version_major": 2,
	"version_minor": 0
	},
	"text/plain": [
	"  0%|          | 0/2854815 [00:00<?, ?it/s]"
	]
	},
	"metadata": {},
	"output_type": "display_data"
	},
	{
	"name": "stderr",
	"output_type": "stream",
	"text": [
	"[22:24:38] Explicit valence for atom # 13 P, 7, is greater than permitted\n",
	"[22:24:39] Explicit valence for atom # 29 P, 7, is greater than permitted\n",
	"[22:24:39] Explicit valence for atom # 91 P, 7, is greater than permitted\n",
	"[22:25:17] WARNING: not removing hydrogen atom without neighbors\n",
	"[22:25:29] Explicit valence for atom # 17 P, 7, is greater than permitted\n",
	"[22:25:29] Explicit valence for atom # 1 P, 7, is greater than permitted\n",
	"[22:25:29] Explicit valence for atom # 1 P, 7, is greater than permitted\n",
	"[22:25:30] Explicit valence for atom # 1 P, 7, is greater than permitted\n",
	"[22:25:32] WARNING: not removing hydrogen atom without neighbors\n",
	"[22:25:32] WARNING: not removing hydrogen atom without neighbors\n",
	"[22:25:41] Explicit valence for atom # 19 P, 7, is greater than permitted\n",
	"[22:25:41] Explicit valence for atom # 1 P, 7, is greater than permitted\n",
	"[22:25:42] Explicit valence for atom # 16 P, 7, is greater than permitted\n",
	"[22:26:03] Explicit valence for atom # 3 Ar, 1, is greater than permitted\n",
	"[22:26:32] WARNING: not removing hydrogen atom without neighbors\n",
	"[22:26:46] WARNING: not removing hydrogen atom without neighbors\n",
	"[22:26:47] WARNING: not removing hydrogen atom without neighbors\n",
	"[22:26:48] WARNING: not removing hydrogen atom without neighbors\n",
	"[22:27:09] WARNING: not removing hydrogen atom without neighbors\n",
	"[22:27:09] WARNING: not removing hydrogen atom without neighbors\n",
	"[22:27:09] WARNING: not removing hydrogen atom without neighbors\n",
	"[22:27:11] Can't kekulize mol. Unkekulized atoms: 1 2 3 4 5 6 10 11 15 16 17 19 20 21\n"
	]
	},
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"CPU times: user 2min 40s, sys: 1.01 s, total: 2min 41s\n",
	"Wall time: 2min 41s\n"
	]
	},
	{
	"name": "stderr",
	"output_type": "stream",
	"text": [
	"[22:27:17] Explicit valence for atom # 1 As, 7, is greater than permitted\n",
	"[22:27:17] WARNING: not removing hydrogen atom without neighbors\n",
	"[22:27:17] WARNING: not removing hydrogen atom without neighbors\n",
	"[22:27:17] WARNING: not removing hydrogen atom without neighbors\n",
	"[22:27:17] WARNING: not removing hydrogen atom without neighbors\n",
	"[22:27:17] Explicit valence for atom # 1 P, 7, is greater than permitted\n",
	"[22:27:17] Explicit valence for atom # 34 P, 7, is greater than permitted\n"
	]
	}
	],
	"source": [
	"%%time\n",
	"chembl_df['valid'] = chembl_df.canonical_smiles.progress_apply(check_smiles)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"id": "a6619aef-4d9b-44db-a3e6-7966c3c2113c",
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"2854800"
	]
	},
	"execution_count": 6,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"chembl_df = chembl_df.query(\"valid\").copy()\n",
	"len(chembl_df)"
	]
	},
	{
	"cell_type": "markdown",
	"id": "7ed2841e-741d-47d4-b00e-0c73a223fb36",
	"metadata": {},
	"source": [
	"By default **bblean** can't handle SMILES longer than 388 characters, so keep only SMILES shorter than 388 characters"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 7,
	"id": "19f9c5bd-6f94-4623-9d7f-d1b8e211eac0",
	"metadata": {},
	"outputs": [],
	"source": [
	"chembl_df['len'] = chembl_df.canonical_smiles.str.len()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 8,
	"id": "aca2a8bf-bf80-4648-ab5f-2830e6330c6f",
	"metadata": {},
	"outputs": [],
	"source": [
	"chembl_df = chembl_df.query(\"len < 388\")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 9,
	"id": "8751fdab-5766-490d-8edb-14916357d9a7",
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"2844613"
	]
	},
	"execution_count": 9,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"len(chembl_df)"
	]
	},
	{
	"cell_type": "markdown",
	"id": "9beb1184-acda-4e4e-8695-5b01782d9cd6",
	"metadata": {},
	"source": [
	"Format the SMILES the way that bblean wants them. Here that means a fixed-width NumPy string array (`<U388`) with a newline appended to each SMILES. There's probably a better way to do this."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 10,
	"id": "8aaf3b43-1a19-4ff5-a050-24679bef3c9d",
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"CPU times: user 326 ms, sys: 168 ms, total: 495 ms\n",
	"Wall time: 491 ms\n"
	]
	}
	],
	"source": [
	"%%time\n",
	"smiles = (chembl_df.canonical_smiles.values+\"\\n\").astype(dtype='<U388')"
	]
	},
	{
	"cell_type": "markdown",
	"id": "3e132523-c03b-4aee-8cf4-9e6813a633e8",
	"metadata": {},
	"source": [
	"Generate the fingerprints"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 11,
	"id": "0de13a71-7754-4c1f-81c6-661ab078f545",
	"metadata": {},
	"outputs": [
	{
	"name": "stderr",
	"output_type": "stream",
	"text": [
	"[22:28:46] WARNING: not removing hydrogen atom without neighbors\n",
	"[22:29:02] WARNING: not removing hydrogen atom without neighbors\n",
	"[22:29:02] WARNING: not removing hydrogen atom without neighbors\n",
	"[22:30:14] WARNING: not removing hydrogen atom without neighbors\n",
	"[22:30:31] WARNING: not removing hydrogen atom without neighbors\n",
	"[22:30:32] WARNING: not removing hydrogen atom without neighbors\n",
	"[22:30:34] WARNING: not removing hydrogen atom without neighbors\n",
	"[22:30:59] WARNING: not removing hydrogen atom without neighbors\n",
	"[22:30:59] WARNING: not removing hydrogen atom without neighbors\n",
	"[22:30:59] WARNING: not removing hydrogen atom without neighbors\n",
	"[22:31:09] WARNING: not removing hydrogen atom without neighbors\n",
	"[22:31:09] WARNING: not removing hydrogen atom without neighbors\n",
	"[22:31:09] WARNING: not removing hydrogen atom without neighbors\n",
	"[22:31:09] WARNING: not removing hydrogen atom without neighbors\n"
	]
	},
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"CPU times: user 4min 7s, sys: 37.3 s, total: 4min 45s\n",
	"Wall time: 5min 42s\n"
	]
	}
	],
	"source": [
	"%%time\n",
	"fps = bblean.fingerprints.fps_from_smiles(smiles, pack=True, n_features=2048)"
	]
	},
	{
	"cell_type": "markdown",
	"id": "705cbaa3-8fd0-40e3-8527-e14a3dce3624",
	"metadata": {},
	"source": [
	"Cluster the fingerprints with BitBirch"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 12,
	"id": "5e1b8fde-dd95-468d-9374-911370b5e1a1",
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"CPU times: user 3min 13s, sys: 2.05 s, total: 3min 15s\n",
	"Wall time: 3min 16s\n"
	]
	},
	{
	"data": {
	"text/plain": [
	"BitBirch(threshold=0.65, branching_factor=50, merge_criterion='diameter')"
	]
	},
	"execution_count": 12,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"%%time\n",
	"bb_tree = bblean.BitBirch(branching_factor=50, threshold=0.65, merge_criterion=\"diameter\")\n",
	"bb_tree.fit(fps, n_features=2048)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 13,
	"id": "8a1f4de2-f91b-4e7d-9d15-3a6231bc5c9d",
	"metadata": {},
	"outputs": [],
	"source": [
	"clusters = bb_tree.get_cluster_mol_ids()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 14,
	"id": "4b9edd98-3938-4be1-82a3-e3700a9e605a",
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"1.889534"
	]
	},
	"execution_count": 14,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"len(clusters)/1e6"
	]
	},
	{
	"cell_type": "markdown",
	"id": "4cc5a460-7425-453d-aaf9-6f700eea440f",
	"metadata": {},
	"source": [
	"Add a cluster column to the dataframe"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 15,
	"id": "8e6333af-03d8-4f02-863c-0bf9f49e1fc8",
	"metadata": {},
	"outputs": [],
	"source": [
	"cluster_list = [-1] * len(chembl_df)\n",
	"for idx,cluster in enumerate(clusters):\n",
	" for id in cluster:\n",
	" cluster_list[id] = idx\n",
	"chembl_df['cluster'] = cluster_list"
	]
	},
	{
	"cell_type": "markdown",
	"id": "46e4361e-4143-4853-a554-07fbd7d1a053",
	"metadata": {},
	"source": [
	"Look at some clusters"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 16,
	"id": "60aed7bd-921f-47f8-b84d-1172b4c2159e",
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"cluster\n",
	"65 96\n",
	"124 51\n",
	"136 49\n",
	"27 46\n",
	"171 45\n",
	" ..\n",
	"1522831 1\n",
	"1796423 1\n",
	"1212826 1\n",
	"1642975 1\n",
	"181076 1\n",
	"Name: count, Length: 1061197, dtype: int64"
	]
	},
	"execution_count": 16,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"chembl_df.query(\"len < 50\").cluster.value_counts()"
	]
	},
	{
	"cell_type": "markdown",
	"id": "a480c2a6-cc21-4039-b558-0b098fc457ec",
	"metadata": {},
	"source": [
	"Repeat the clustering with threshold = 0.55. Note: in the saved run below, `bb_tree.fit` raises a `ValueError` at this threshold."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 18,
	"id": "f2a3747e-93ce-400e-b309-f20573a41d73",
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"CPU times: user 19.6 s, sys: 208 ms, total: 19.8 s\n",
	"Wall time: 19.8 s\n"
	]
	},
	{
	"ename": "ValueError",
	"evalue": "could not broadcast input array from shape (0,) into shape (256,)",
	"output_type": "error",
	"traceback": [
	"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
	"\u001b[31mValueError\u001b[39m Traceback (most recent call last)",
	"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[18]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m \u001b[43mget_ipython\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m.\u001b[49m\u001b[43mrun_cell_magic\u001b[49m\u001b[43m(\u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43mtime\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43mbb_tree = bblean.BitBirch(branching_factor=50, threshold=0.55, merge_criterion=\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mdiameter\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43m)\u001b[39;49m\u001b[38;5;130;43;01m\\n\u001b[39;49;00m\u001b[33;43mbb_tree.fit(fps, n_features=2048)\u001b[39;49m\u001b[38;5;130;43;01m\\n\u001b[39;49;00m\u001b[33;43m'\u001b[39;49m\u001b[43m)\u001b[49m\n",
	"\u001b[36mFile \u001b[39m\u001b[32m/opt/homebrew/Caskroom/miniforge/base/envs/rdkit_2025_09/lib/python3.11/site-packages/IPython/core/interactiveshell.py:2565\u001b[39m, in \u001b[36mInteractiveShell.run_cell_magic\u001b[39m\u001b[34m(self, magic_name, line, cell)\u001b[39m\n\u001b[32m 2563\u001b[39m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m.builtin_trap:\n\u001b[32m 2564\u001b[39m args = (magic_arg_s, cell)\n\u001b[32m-> \u001b[39m\u001b[32m2565\u001b[39m result = \u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[43m\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 2567\u001b[39m \u001b[38;5;66;03m# The code below prevents the output from being displayed\u001b[39;00m\n\u001b[32m 2568\u001b[39m \u001b[38;5;66;03m# when using magics with decorator @output_can_be_silenced\u001b[39;00m\n\u001b[32m 2569\u001b[39m \u001b[38;5;66;03m# when the last Python token in the expression is a ';'.\u001b[39;00m\n\u001b[32m 2570\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mgetattr\u001b[39m(fn, magic.MAGIC_OUTPUT_CAN_BE_SILENCED, \u001b[38;5;28;01mFalse\u001b[39;00m):\n",
	"\u001b[36mFile \u001b[39m\u001b[32m/opt/homebrew/Caskroom/miniforge/base/envs/rdkit_2025_09/lib/python3.11/site-packages/IPython/core/magics/execution.py:1470\u001b[39m, in \u001b[36mExecutionMagics.time\u001b[39m\u001b[34m(self, line, cell, local_ns)\u001b[39m\n\u001b[32m 1468\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m interrupt_occured:\n\u001b[32m 1469\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m exit_on_interrupt \u001b[38;5;129;01mand\u001b[39;00m captured_exception:\n\u001b[32m-> \u001b[39m\u001b[32m1470\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m captured_exception\n\u001b[32m 1471\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m\n\u001b[32m 1472\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m out\n",
	"\u001b[36mFile \u001b[39m\u001b[32m/opt/homebrew/Caskroom/miniforge/base/envs/rdkit_2025_09/lib/python3.11/site-packages/IPython/core/magics/execution.py:1439\u001b[39m, in \u001b[36mExecutionMagics.time\u001b[39m\u001b[34m(self, line, cell, local_ns)\u001b[39m\n\u001b[32m 1437\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m expr_val \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m 1438\u001b[39m code_2 = \u001b[38;5;28mself\u001b[39m.shell.compile(expr_val, source, \u001b[33m'\u001b[39m\u001b[33meval\u001b[39m\u001b[33m'\u001b[39m)\n\u001b[32m-> \u001b[39m\u001b[32m1439\u001b[39m out = \u001b[38;5;28;43meval\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mcode_2\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mglob\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mlocal_ns\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 1440\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mKeyboardInterrupt\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[32m 1441\u001b[39m captured_exception = e\n",
	"\u001b[36mFile \u001b[39m\u001b[32m<timed exec>:2\u001b[39m\n",
	"\u001b[36mFile \u001b[39m\u001b[32m~/software/bblean/bblean/bitbirch.py:734\u001b[39m, in \u001b[36mBitBirch.fit\u001b[39m\u001b[34m(self, X, reinsert_indices, input_is_packed, n_features)\u001b[39m\n\u001b[32m 730\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m idx, fp \u001b[38;5;129;01min\u001b[39;00m iterable:\n\u001b[32m 731\u001b[39m subcluster = _BFSubcluster(\n\u001b[32m 732\u001b[39m linear_sum=fp, mol_indices=[idx], n_features=n_features\n\u001b[32m 733\u001b[39m )\n\u001b[32m--> \u001b[39m\u001b[32m734\u001b[39m split = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_root\u001b[49m\u001b[43m.\u001b[49m\u001b[43minsert_bf_subcluster\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 735\u001b[39m \u001b[43m \u001b[49m\u001b[43msubcluster\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmerge_accept_fn\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mthreshold\u001b[49m\n\u001b[32m 736\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 738\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m split:\n\u001b[32m 739\u001b[39m new_subcluster1, new_subcluster2 = _split_node(\u001b[38;5;28mself\u001b[39m._root)\n",
	"\u001b[36mFile \u001b[39m\u001b[32m~/software/bblean/bblean/bitbirch.py:339\u001b[39m, in \u001b[36m_BFNode.insert_bf_subcluster\u001b[39m\u001b[34m(self, subcluster, merge_accept_fn, threshold)\u001b[39m\n\u001b[32m 336\u001b[39m \u001b[38;5;66;03m# If the subcluster has a child, we need a recursive strategy.\u001b[39;00m\n\u001b[32m 337\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m closest_subcluster.child_ \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m339\u001b[39m split_child = \u001b[43mclosest_subcluster\u001b[49m\u001b[43m.\u001b[49m\u001b[43mchild_\u001b[49m\u001b[43m.\u001b[49m\u001b[43minsert_bf_subcluster\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 340\u001b[39m \u001b[43m \u001b[49m\u001b[43msubcluster\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmerge_accept_fn\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mthreshold\u001b[49m\n\u001b[32m 341\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 343\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m split_child:\n\u001b[32m 344\u001b[39m \u001b[38;5;66;03m# If it is determined that the child need not be split, we\u001b[39;00m\n\u001b[32m 345\u001b[39m \u001b[38;5;66;03m# can just update the closest_subcluster\u001b[39;00m\n\u001b[32m 346\u001b[39m closest_subcluster.update(subcluster)\n",
	"\u001b[36mFile \u001b[39m\u001b[32m~/software/bblean/bblean/bitbirch.py:359\u001b[39m, in \u001b[36m_BFNode.insert_bf_subcluster\u001b[39m\u001b[34m(self, subcluster, merge_accept_fn, threshold)\u001b[39m\n\u001b[32m 352\u001b[39m \u001b[38;5;66;03m# things not too good. we need to redistribute the subclusters in\u001b[39;00m\n\u001b[32m 353\u001b[39m \u001b[38;5;66;03m# our child node, and add a new subcluster in the parent\u001b[39;00m\n\u001b[32m 354\u001b[39m \u001b[38;5;66;03m# subcluster to accommodate the new child.\u001b[39;00m\n\u001b[32m 355\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m 356\u001b[39m new_subcluster1, new_subcluster2 = _split_node(\n\u001b[32m 357\u001b[39m closest_subcluster.child_\n\u001b[32m 358\u001b[39m )\n\u001b[32m--> \u001b[39m\u001b[32m359\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mupdate_split_subclusters\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 360\u001b[39m \u001b[43m \u001b[49m\u001b[43mclosest_subcluster\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mnew_subcluster1\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mnew_subcluster2\u001b[49m\n\u001b[32m 361\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 363\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(\u001b[38;5;28mself\u001b[39m.subclusters_) > \u001b[38;5;28mself\u001b[39m.branching_factor:\n\u001b[32m 364\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;01mTrue\u001b[39;00m\n",
	"\u001b[36mFile \u001b[39m\u001b[32m~/software/bblean/bblean/bitbirch.py:319\u001b[39m, in \u001b[36m_BFNode.update_split_subclusters\u001b[39m\u001b[34m(self, subcluster, new_subcluster1, new_subcluster2)\u001b[39m\n\u001b[32m 317\u001b[39m idx = \u001b[38;5;28mself\u001b[39m.subclusters_.index(subcluster)\n\u001b[32m 318\u001b[39m \u001b[38;5;28mself\u001b[39m.subclusters_[idx] = new_subcluster1\n\u001b[32m--> \u001b[39m\u001b[32m319\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43minit_centroids_\u001b[49m\u001b[43m[\u001b[49m\u001b[43midx\u001b[49m\u001b[43m]\u001b[49m = new_subcluster1.centroid_\n\u001b[32m 320\u001b[39m \u001b[38;5;66;03m# Append new_subcluster2\u001b[39;00m\n\u001b[32m 321\u001b[39m \u001b[38;5;28mself\u001b[39m.append_subcluster(new_subcluster2)\n",
	"\u001b[31mValueError\u001b[39m: could not broadcast input array from shape (0,) into shape (256,)"
	]
	}
	],
	"source": [
	"%%time\n",
	"bb_tree = bblean.BitBirch(branching_factor=50, threshold=0.55, merge_criterion=\"diameter\")\n",
	"bb_tree.fit(fps, n_features=2048)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "b6c8d74f-b1f5-44fd-a340-2aa163c4abec",
	"metadata": {},
	"outputs": [],
	"source": []
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3 (ipykernel)",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.11.13"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 5
	}