Skip to content

Instantly share code, notes, and snippets.

@PatWalters
Created October 11, 2025 02:46
Show Gist options
  • Save PatWalters/f07fcd25362192261ce39966b7742c1c to your computer and use it in GitHub Desktop.
Save PatWalters/f07fcd25362192261ce39966b7742c1c to your computer and use it in GitHub Desktop.
Demoing the bug
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "204e2f2e-1226-4c3b-ab78-d3e5ccac8a61",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import bblean\n",
"import matplotlib.pyplot as plt\n",
"import mols2grid\n",
"from tqdm.auto import tqdm\n",
"from rdkit import Chem"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "615c9912-ddd8-43ee-a824-9eafbb79d81b",
"metadata": {},
"outputs": [],
"source": [
"tqdm.pandas()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "1bc3df0c-c90a-4ecc-90f4-d02c9e4d09ff",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 4.06 s, sys: 208 ms, total: 4.27 s\n",
"Wall time: 4.27 s\n"
]
},
{
"data": {
"text/plain": [
"2854815"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%time\n",
"chembl_df = pd.read_csv(\"chembl_36_chemreps.txt\",sep=\"\\t\")\n",
"len(chembl_df)"
]
},
{
"cell_type": "markdown",
"id": "3090db1c-306d-4376-9089-d5dde5835bb2",
"metadata": {},
"source": [
"A quick function to flag janky SMILES (ones that RDKit cannot parse)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "95395ff4-2fd1-431a-a77e-57e162162ab9",
"metadata": {},
"outputs": [],
"source": [
"def check_smiles(smi):\n",
"    \"\"\"Return True if RDKit can parse ``smi`` into a molecule, else False.\n",
"\n",
"    Non-string inputs (e.g. the NaN pandas uses for an empty CSV field) are\n",
"    reported as invalid instead of raising inside ``Chem.MolFromSmiles``.\n",
"    \"\"\"\n",
"    if not isinstance(smi, str):\n",
"        return False\n",
"    return Chem.MolFromSmiles(smi) is not None"
]
},
{
"cell_type": "markdown",
"id": "a2685de9-ede1-430e-a0bb-792be6b13de2",
"metadata": {},
"source": [
"Remove the janky SMILES"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "1d847a46-fd15-457c-b9be-a61258832a47",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "866ae6d6c27b43e8ac0fef7b8a8f582f",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/2854815 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"[22:24:38] Explicit valence for atom # 13 P, 7, is greater than permitted\n",
"[22:24:39] Explicit valence for atom # 29 P, 7, is greater than permitted\n",
"[22:24:39] Explicit valence for atom # 91 P, 7, is greater than permitted\n",
"[22:25:17] WARNING: not removing hydrogen atom without neighbors\n",
"[22:25:29] Explicit valence for atom # 17 P, 7, is greater than permitted\n",
"[22:25:29] Explicit valence for atom # 1 P, 7, is greater than permitted\n",
"[22:25:29] Explicit valence for atom # 1 P, 7, is greater than permitted\n",
"[22:25:30] Explicit valence for atom # 1 P, 7, is greater than permitted\n",
"[22:25:32] WARNING: not removing hydrogen atom without neighbors\n",
"[22:25:32] WARNING: not removing hydrogen atom without neighbors\n",
"[22:25:41] Explicit valence for atom # 19 P, 7, is greater than permitted\n",
"[22:25:41] Explicit valence for atom # 1 P, 7, is greater than permitted\n",
"[22:25:42] Explicit valence for atom # 16 P, 7, is greater than permitted\n",
"[22:26:03] Explicit valence for atom # 3 Ar, 1, is greater than permitted\n",
"[22:26:32] WARNING: not removing hydrogen atom without neighbors\n",
"[22:26:46] WARNING: not removing hydrogen atom without neighbors\n",
"[22:26:47] WARNING: not removing hydrogen atom without neighbors\n",
"[22:26:48] WARNING: not removing hydrogen atom without neighbors\n",
"[22:27:09] WARNING: not removing hydrogen atom without neighbors\n",
"[22:27:09] WARNING: not removing hydrogen atom without neighbors\n",
"[22:27:09] WARNING: not removing hydrogen atom without neighbors\n",
"[22:27:11] Can't kekulize mol. Unkekulized atoms: 1 2 3 4 5 6 10 11 15 16 17 19 20 21\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 2min 40s, sys: 1.01 s, total: 2min 41s\n",
"Wall time: 2min 41s\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"[22:27:17] Explicit valence for atom # 1 As, 7, is greater than permitted\n",
"[22:27:17] WARNING: not removing hydrogen atom without neighbors\n",
"[22:27:17] WARNING: not removing hydrogen atom without neighbors\n",
"[22:27:17] WARNING: not removing hydrogen atom without neighbors\n",
"[22:27:17] WARNING: not removing hydrogen atom without neighbors\n",
"[22:27:17] Explicit valence for atom # 1 P, 7, is greater than permitted\n",
"[22:27:17] Explicit valence for atom # 34 P, 7, is greater than permitted\n"
]
}
],
"source": [
"%%time\n",
"chembl_df['valid'] = chembl_df.canonical_smiles.progress_apply(check_smiles)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "a6619aef-4d9b-44db-a3e6-7966c3c2113c",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"2854800"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Keep only the rows whose SMILES parsed, as an independent copy, then show how many remain.\n",
"chembl_df = chembl_df.loc[chembl_df[\"valid\"]].copy()\n",
"len(chembl_df)"
]
},
{
"cell_type": "markdown",
"id": "7ed2841e-741d-47d4-b00e-0c73a223fb36",
"metadata": {},
"source": [
"By default **bblean** can't handle SMILES longer than 388 characters, so compute each SMILES length and filter on it"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "19f9c5bd-6f94-4623-9d7f-d1b8e211eac0",
"metadata": {},
"outputs": [],
"source": [
"chembl_df['len'] = chembl_df.canonical_smiles.str.len()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "aca2a8bf-bf80-4648-ab5f-2830e6330c6f",
"metadata": {},
"outputs": [],
"source": [
"# .copy() yields an independent frame (as in the earlier valid-SMILES filter),\n",
"# because a 'cluster' column is assigned onto this filtered frame later on.\n",
"chembl_df = chembl_df.query(\"len < 388\").copy()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "8751fdab-5766-490d-8edb-14916357d9a7",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"2844613"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(chembl_df)"
]
},
{
"cell_type": "markdown",
"id": "9beb1184-acda-4e4e-8695-5b01782d9cd6",
"metadata": {},
"source": [
"Format the SMILES the way that bblean wants them. There's probably a better way to do this. "
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "8aaf3b43-1a19-4ff5-a050-24679bef3c9d",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 326 ms, sys: 168 ms, total: 495 ms\n",
"Wall time: 491 ms\n"
]
}
],
"source": [
"%%time\n",
"smiles = (chembl_df.canonical_smiles.values+\"\\n\").astype(dtype='<U388')"
]
},
{
"cell_type": "markdown",
"id": "3e132523-c03b-4aee-8cf4-9e6813a633e8",
"metadata": {},
"source": [
"Generate the fingerprints"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "0de13a71-7754-4c1f-81c6-661ab078f545",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[22:28:46] WARNING: not removing hydrogen atom without neighbors\n",
"[22:29:02] WARNING: not removing hydrogen atom without neighbors\n",
"[22:29:02] WARNING: not removing hydrogen atom without neighbors\n",
"[22:30:14] WARNING: not removing hydrogen atom without neighbors\n",
"[22:30:31] WARNING: not removing hydrogen atom without neighbors\n",
"[22:30:32] WARNING: not removing hydrogen atom without neighbors\n",
"[22:30:34] WARNING: not removing hydrogen atom without neighbors\n",
"[22:30:59] WARNING: not removing hydrogen atom without neighbors\n",
"[22:30:59] WARNING: not removing hydrogen atom without neighbors\n",
"[22:30:59] WARNING: not removing hydrogen atom without neighbors\n",
"[22:31:09] WARNING: not removing hydrogen atom without neighbors\n",
"[22:31:09] WARNING: not removing hydrogen atom without neighbors\n",
"[22:31:09] WARNING: not removing hydrogen atom without neighbors\n",
"[22:31:09] WARNING: not removing hydrogen atom without neighbors\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 4min 7s, sys: 37.3 s, total: 4min 45s\n",
"Wall time: 5min 42s\n"
]
}
],
"source": [
"%%time\n",
"fps = bblean.fingerprints.fps_from_smiles(smiles, pack=True, n_features=2048)"
]
},
{
"cell_type": "markdown",
"id": "705cbaa3-8fd0-40e3-8527-e14a3dce3624",
"metadata": {},
"source": [
"Cluster"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "5e1b8fde-dd95-468d-9374-911370b5e1a1",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 3min 13s, sys: 2.05 s, total: 3min 15s\n",
"Wall time: 3min 16s\n"
]
},
{
"data": {
"text/plain": [
"BitBirch(threshold=0.65, branching_factor=50, merge_criterion='diameter')"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%time\n",
"bb_tree = bblean.BitBirch(branching_factor=50, threshold=0.65, merge_criterion=\"diameter\")\n",
"bb_tree.fit(fps, n_features=2048)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "8a1f4de2-f91b-4e7d-9d15-3a6231bc5c9d",
"metadata": {},
"outputs": [],
"source": [
"clusters = bb_tree.get_cluster_mol_ids()"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "4b9edd98-3938-4be1-82a3-e3700a9e605a",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1.889534"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(clusters)/1e6"
]
},
{
"cell_type": "markdown",
"id": "4cc5a460-7425-453d-aaf9-6f700eea440f",
"metadata": {},
"source": [
"Add a cluster column to the dataframe"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "8e6333af-03d8-4f02-863c-0bf9f49e1fc8",
"metadata": {},
"outputs": [],
"source": [
"# cluster_list[i] holds the cluster index of the i-th row; -1 marks a row no cluster claimed.\n",
"cluster_list = [-1] * len(chembl_df)\n",
"for cluster_idx, mol_ids in enumerate(clusters):\n",
"    # Loop names avoid `id`, which would shadow the builtin for the rest of the notebook.\n",
"    for mol_id in mol_ids:\n",
"        cluster_list[mol_id] = cluster_idx\n",
"chembl_df['cluster'] = cluster_list"
]
},
{
"cell_type": "markdown",
"id": "46e4361e-4143-4853-a554-07fbd7d1a053",
"metadata": {},
"source": [
"Look at some clusters"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "60aed7bd-921f-47f8-b84d-1172b4c2159e",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"cluster\n",
"65 96\n",
"124 51\n",
"136 49\n",
"27 46\n",
"171 45\n",
" ..\n",
"1522831 1\n",
"1796423 1\n",
"1212826 1\n",
"1642975 1\n",
"181076 1\n",
"Name: count, Length: 1061197, dtype: int64"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Cluster sizes among molecules whose SMILES is shorter than 50 characters.\n",
"chembl_df.loc[chembl_df[\"len\"] < 50, \"cluster\"].value_counts()"
]
},
{
"cell_type": "markdown",
"id": "a480c2a6-cc21-4039-b558-0b098fc457ec",
"metadata": {},
"source": [
"Repeat the clustering with threshold = 0.55"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "f2a3747e-93ce-400e-b309-f20573a41d73",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 19.6 s, sys: 208 ms, total: 19.8 s\n",
"Wall time: 19.8 s\n"
]
},
{
"ename": "ValueError",
"evalue": "could not broadcast input array from shape (0,) into shape (256,)",
"output_type": "error",
"traceback": [
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
"\u001b[31mValueError\u001b[39m Traceback (most recent call last)",
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[18]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m \u001b[43mget_ipython\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m.\u001b[49m\u001b[43mrun_cell_magic\u001b[49m\u001b[43m(\u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43mtime\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43mbb_tree = bblean.BitBirch(branching_factor=50, threshold=0.55, merge_criterion=\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mdiameter\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43m)\u001b[39;49m\u001b[38;5;130;43;01m\\n\u001b[39;49;00m\u001b[33;43mbb_tree.fit(fps, n_features=2048)\u001b[39;49m\u001b[38;5;130;43;01m\\n\u001b[39;49;00m\u001b[33;43m'\u001b[39;49m\u001b[43m)\u001b[49m\n",
"\u001b[36mFile \u001b[39m\u001b[32m/opt/homebrew/Caskroom/miniforge/base/envs/rdkit_2025_09/lib/python3.11/site-packages/IPython/core/interactiveshell.py:2565\u001b[39m, in \u001b[36mInteractiveShell.run_cell_magic\u001b[39m\u001b[34m(self, magic_name, line, cell)\u001b[39m\n\u001b[32m 2563\u001b[39m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m.builtin_trap:\n\u001b[32m 2564\u001b[39m args = (magic_arg_s, cell)\n\u001b[32m-> \u001b[39m\u001b[32m2565\u001b[39m result = \u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 2567\u001b[39m \u001b[38;5;66;03m# The code below prevents the output from being displayed\u001b[39;00m\n\u001b[32m 2568\u001b[39m \u001b[38;5;66;03m# when using magics with decorator @output_can_be_silenced\u001b[39;00m\n\u001b[32m 2569\u001b[39m \u001b[38;5;66;03m# when the last Python token in the expression is a ';'.\u001b[39;00m\n\u001b[32m 2570\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mgetattr\u001b[39m(fn, magic.MAGIC_OUTPUT_CAN_BE_SILENCED, \u001b[38;5;28;01mFalse\u001b[39;00m):\n",
"\u001b[36mFile \u001b[39m\u001b[32m/opt/homebrew/Caskroom/miniforge/base/envs/rdkit_2025_09/lib/python3.11/site-packages/IPython/core/magics/execution.py:1470\u001b[39m, in \u001b[36mExecutionMagics.time\u001b[39m\u001b[34m(self, line, cell, local_ns)\u001b[39m\n\u001b[32m 1468\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m interrupt_occured:\n\u001b[32m 1469\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m exit_on_interrupt \u001b[38;5;129;01mand\u001b[39;00m captured_exception:\n\u001b[32m-> \u001b[39m\u001b[32m1470\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m captured_exception\n\u001b[32m 1471\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m\n\u001b[32m 1472\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m out\n",
"\u001b[36mFile \u001b[39m\u001b[32m/opt/homebrew/Caskroom/miniforge/base/envs/rdkit_2025_09/lib/python3.11/site-packages/IPython/core/magics/execution.py:1439\u001b[39m, in \u001b[36mExecutionMagics.time\u001b[39m\u001b[34m(self, line, cell, local_ns)\u001b[39m\n\u001b[32m 1437\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m expr_val \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m 1438\u001b[39m code_2 = \u001b[38;5;28mself\u001b[39m.shell.compile(expr_val, source, \u001b[33m'\u001b[39m\u001b[33meval\u001b[39m\u001b[33m'\u001b[39m)\n\u001b[32m-> \u001b[39m\u001b[32m1439\u001b[39m out = \u001b[38;5;28;43meval\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mcode_2\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mglob\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mlocal_ns\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 1440\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mKeyboardInterrupt\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[32m 1441\u001b[39m captured_exception = e\n",
"\u001b[36mFile \u001b[39m\u001b[32m<timed exec>:2\u001b[39m\n",
"\u001b[36mFile \u001b[39m\u001b[32m~/software/bblean/bblean/bitbirch.py:734\u001b[39m, in \u001b[36mBitBirch.fit\u001b[39m\u001b[34m(self, X, reinsert_indices, input_is_packed, n_features)\u001b[39m\n\u001b[32m 730\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m idx, fp \u001b[38;5;129;01min\u001b[39;00m iterable:\n\u001b[32m 731\u001b[39m subcluster = _BFSubcluster(\n\u001b[32m 732\u001b[39m linear_sum=fp, mol_indices=[idx], n_features=n_features\n\u001b[32m 733\u001b[39m )\n\u001b[32m--> \u001b[39m\u001b[32m734\u001b[39m split = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_root\u001b[49m\u001b[43m.\u001b[49m\u001b[43minsert_bf_subcluster\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 735\u001b[39m \u001b[43m \u001b[49m\u001b[43msubcluster\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmerge_accept_fn\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mthreshold\u001b[49m\n\u001b[32m 736\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 738\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m split:\n\u001b[32m 739\u001b[39m new_subcluster1, new_subcluster2 = _split_node(\u001b[38;5;28mself\u001b[39m._root)\n",
"\u001b[36mFile \u001b[39m\u001b[32m~/software/bblean/bblean/bitbirch.py:339\u001b[39m, in \u001b[36m_BFNode.insert_bf_subcluster\u001b[39m\u001b[34m(self, subcluster, merge_accept_fn, threshold)\u001b[39m\n\u001b[32m 336\u001b[39m \u001b[38;5;66;03m# If the subcluster has a child, we need a recursive strategy.\u001b[39;00m\n\u001b[32m 337\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m closest_subcluster.child_ \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m339\u001b[39m split_child = \u001b[43mclosest_subcluster\u001b[49m\u001b[43m.\u001b[49m\u001b[43mchild_\u001b[49m\u001b[43m.\u001b[49m\u001b[43minsert_bf_subcluster\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 340\u001b[39m \u001b[43m \u001b[49m\u001b[43msubcluster\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmerge_accept_fn\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mthreshold\u001b[49m\n\u001b[32m 341\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 343\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m split_child:\n\u001b[32m 344\u001b[39m \u001b[38;5;66;03m# If it is determined that the child need not be split, we\u001b[39;00m\n\u001b[32m 345\u001b[39m \u001b[38;5;66;03m# can just update the closest_subcluster\u001b[39;00m\n\u001b[32m 346\u001b[39m closest_subcluster.update(subcluster)\n",
"\u001b[36mFile \u001b[39m\u001b[32m~/software/bblean/bblean/bitbirch.py:359\u001b[39m, in \u001b[36m_BFNode.insert_bf_subcluster\u001b[39m\u001b[34m(self, subcluster, merge_accept_fn, threshold)\u001b[39m\n\u001b[32m 352\u001b[39m \u001b[38;5;66;03m# things not too good. we need to redistribute the subclusters in\u001b[39;00m\n\u001b[32m 353\u001b[39m \u001b[38;5;66;03m# our child node, and add a new subcluster in the parent\u001b[39;00m\n\u001b[32m 354\u001b[39m \u001b[38;5;66;03m# subcluster to accommodate the new child.\u001b[39;00m\n\u001b[32m 355\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m 356\u001b[39m new_subcluster1, new_subcluster2 = _split_node(\n\u001b[32m 357\u001b[39m closest_subcluster.child_\n\u001b[32m 358\u001b[39m )\n\u001b[32m--> \u001b[39m\u001b[32m359\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mupdate_split_subclusters\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 360\u001b[39m \u001b[43m \u001b[49m\u001b[43mclosest_subcluster\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mnew_subcluster1\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mnew_subcluster2\u001b[49m\n\u001b[32m 361\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 363\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(\u001b[38;5;28mself\u001b[39m.subclusters_) > \u001b[38;5;28mself\u001b[39m.branching_factor:\n\u001b[32m 364\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;01mTrue\u001b[39;00m\n",
"\u001b[36mFile \u001b[39m\u001b[32m~/software/bblean/bblean/bitbirch.py:319\u001b[39m, in \u001b[36m_BFNode.update_split_subclusters\u001b[39m\u001b[34m(self, subcluster, new_subcluster1, new_subcluster2)\u001b[39m\n\u001b[32m 317\u001b[39m idx = \u001b[38;5;28mself\u001b[39m.subclusters_.index(subcluster)\n\u001b[32m 318\u001b[39m \u001b[38;5;28mself\u001b[39m.subclusters_[idx] = new_subcluster1\n\u001b[32m--> \u001b[39m\u001b[32m319\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43minit_centroids_\u001b[49m\u001b[43m[\u001b[49m\u001b[43midx\u001b[49m\u001b[43m]\u001b[49m = new_subcluster1.centroid_\n\u001b[32m 320\u001b[39m \u001b[38;5;66;03m# Append new_subcluster2\u001b[39;00m\n\u001b[32m 321\u001b[39m \u001b[38;5;28mself\u001b[39m.append_subcluster(new_subcluster2)\n",
"\u001b[31mValueError\u001b[39m: could not broadcast input array from shape (0,) into shape (256,)"
]
}
],
"source": [
"%%time\n",
"bb_tree = bblean.BitBirch(branching_factor=50, threshold=0.55, merge_criterion=\"diameter\")\n",
"bb_tree.fit(fps, n_features=2048)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b6c8d74f-b1f5-44fd-a340-2aa163c4abec",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.13"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment