Created
October 11, 2025 02:46
-
-
Save PatWalters/f07fcd25362192261ce39966b7742c1c to your computer and use it in GitHub Desktop.
Demoing the bug
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "code", | |
| "execution_count": 1, | |
| "id": "204e2f2e-1226-4c3b-ab78-d3e5ccac8a61", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "import pandas as pd\n", | |
| "import bblean\n", | |
| "import matplotlib.pyplot as plt\n", | |
| "import mols2grid\n", | |
| "from tqdm.auto import tqdm\n", | |
| "from rdkit import Chem" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 2, | |
| "id": "615c9912-ddd8-43ee-a824-9eafbb79d81b", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "tqdm.pandas()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 3, | |
| "id": "1bc3df0c-c90a-4ecc-90f4-d02c9e4d09ff", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "CPU times: user 4.06 s, sys: 208 ms, total: 4.27 s\n", | |
| "Wall time: 4.27 s\n" | |
| ] | |
| }, | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "2854815" | |
| ] | |
| }, | |
| "execution_count": 3, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "%%time\n", | |
| "chembl_df = pd.read_csv(\"chembl_36_chemreps.txt\",sep=\"\\t\")\n", | |
| "len(chembl_df)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "3090db1c-306d-4376-9089-d5dde5835bb2", | |
| "metadata": {}, | |
| "source": [ | |
| "A quick function to flag janky SMILES (ones RDKit can't parse)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 4, | |
| "id": "95395ff4-2fd1-431a-a77e-57e162162ab9", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "def check_smiles(smi):\n", | |
| "    \"\"\"Return True if RDKit can parse `smi` into a molecule, else False.\n", | |
| "\n", | |
| "    Non-string values (e.g. the NaN pandas uses for a missing field) are\n", | |
| "    reported as invalid rather than being handed to RDKit.\n", | |
| "    \"\"\"\n", | |
| "    if not isinstance(smi, str):\n", | |
| "        return False\n", | |
| "    # MolFromSmiles gives back None when it cannot build a molecule\n", | |
| "    return Chem.MolFromSmiles(smi) is not None" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "a2685de9-ede1-430e-a0bb-792be6b13de2", | |
| "metadata": {}, | |
| "source": [ | |
| "Remove the janky SMILES" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 5, | |
| "id": "1d847a46-fd15-457c-b9be-a61258832a47", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "application/vnd.jupyter.widget-view+json": { | |
| "model_id": "866ae6d6c27b43e8ac0fef7b8a8f582f", | |
| "version_major": 2, | |
| "version_minor": 0 | |
| }, | |
| "text/plain": [ | |
| " 0%| | 0/2854815 [00:00<?, ?it/s]" | |
| ] | |
| }, | |
| "metadata": {}, | |
| "output_type": "display_data" | |
| }, | |
| { | |
| "name": "stderr", | |
| "output_type": "stream", | |
| "text": [ | |
| "[22:24:38] Explicit valence for atom # 13 P, 7, is greater than permitted\n", | |
| "[22:24:39] Explicit valence for atom # 29 P, 7, is greater than permitted\n", | |
| "[22:24:39] Explicit valence for atom # 91 P, 7, is greater than permitted\n", | |
| "[22:25:17] WARNING: not removing hydrogen atom without neighbors\n", | |
| "[22:25:29] Explicit valence for atom # 17 P, 7, is greater than permitted\n", | |
| "[22:25:29] Explicit valence for atom # 1 P, 7, is greater than permitted\n", | |
| "[22:25:29] Explicit valence for atom # 1 P, 7, is greater than permitted\n", | |
| "[22:25:30] Explicit valence for atom # 1 P, 7, is greater than permitted\n", | |
| "[22:25:32] WARNING: not removing hydrogen atom without neighbors\n", | |
| "[22:25:32] WARNING: not removing hydrogen atom without neighbors\n", | |
| "[22:25:41] Explicit valence for atom # 19 P, 7, is greater than permitted\n", | |
| "[22:25:41] Explicit valence for atom # 1 P, 7, is greater than permitted\n", | |
| "[22:25:42] Explicit valence for atom # 16 P, 7, is greater than permitted\n", | |
| "[22:26:03] Explicit valence for atom # 3 Ar, 1, is greater than permitted\n", | |
| "[22:26:32] WARNING: not removing hydrogen atom without neighbors\n", | |
| "[22:26:46] WARNING: not removing hydrogen atom without neighbors\n", | |
| "[22:26:47] WARNING: not removing hydrogen atom without neighbors\n", | |
| "[22:26:48] WARNING: not removing hydrogen atom without neighbors\n", | |
| "[22:27:09] WARNING: not removing hydrogen atom without neighbors\n", | |
| "[22:27:09] WARNING: not removing hydrogen atom without neighbors\n", | |
| "[22:27:09] WARNING: not removing hydrogen atom without neighbors\n", | |
| "[22:27:11] Can't kekulize mol. Unkekulized atoms: 1 2 3 4 5 6 10 11 15 16 17 19 20 21\n" | |
| ] | |
| }, | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "CPU times: user 2min 40s, sys: 1.01 s, total: 2min 41s\n", | |
| "Wall time: 2min 41s\n" | |
| ] | |
| }, | |
| { | |
| "name": "stderr", | |
| "output_type": "stream", | |
| "text": [ | |
| "[22:27:17] Explicit valence for atom # 1 As, 7, is greater than permitted\n", | |
| "[22:27:17] WARNING: not removing hydrogen atom without neighbors\n", | |
| "[22:27:17] WARNING: not removing hydrogen atom without neighbors\n", | |
| "[22:27:17] WARNING: not removing hydrogen atom without neighbors\n", | |
| "[22:27:17] WARNING: not removing hydrogen atom without neighbors\n", | |
| "[22:27:17] Explicit valence for atom # 1 P, 7, is greater than permitted\n", | |
| "[22:27:17] Explicit valence for atom # 34 P, 7, is greater than permitted\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "%%time\n", | |
| "chembl_df['valid'] = chembl_df.canonical_smiles.progress_apply(check_smiles)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 6, | |
| "id": "a6619aef-4d9b-44db-a3e6-7966c3c2113c", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "2854800" | |
| ] | |
| }, | |
| "execution_count": 6, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "chembl_df = chembl_df.query(\"valid\").copy()\n", | |
| "len(chembl_df)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "7ed2841e-741d-47d4-b00e-0c73a223fb36", | |
| "metadata": {}, | |
| "source": [ | |
| "By default **bblean** can't handle SMILES longer than 388 characters" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 7, | |
| "id": "19f9c5bd-6f94-4623-9d7f-d1b8e211eac0", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "chembl_df['len'] = chembl_df.canonical_smiles.str.len()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 8, | |
| "id": "aca2a8bf-bf80-4648-ab5f-2830e6330c6f", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "chembl_df = chembl_df.query(\"len < 388\")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 9, | |
| "id": "8751fdab-5766-490d-8edb-14916357d9a7", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "2844613" | |
| ] | |
| }, | |
| "execution_count": 9, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "len(chembl_df)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "9beb1184-acda-4e4e-8695-5b01782d9cd6", | |
| "metadata": {}, | |
| "source": [ | |
| "Format the SMILES the way that bblean wants them. There's probably a better way to do this. " | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 10, | |
| "id": "8aaf3b43-1a19-4ff5-a050-24679bef3c9d", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "CPU times: user 326 ms, sys: 168 ms, total: 495 ms\n", | |
| "Wall time: 491 ms\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "%%time\n", | |
| "smiles = (chembl_df.canonical_smiles.values+\"\\n\").astype(dtype='<U388')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "3e132523-c03b-4aee-8cf4-9e6813a633e8", | |
| "metadata": {}, | |
| "source": [ | |
| "Generate the fingerprints" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 11, | |
| "id": "0de13a71-7754-4c1f-81c6-661ab078f545", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stderr", | |
| "output_type": "stream", | |
| "text": [ | |
| "[22:28:46] WARNING: not removing hydrogen atom without neighbors\n", | |
| "[22:29:02] WARNING: not removing hydrogen atom without neighbors\n", | |
| "[22:29:02] WARNING: not removing hydrogen atom without neighbors\n", | |
| "[22:30:14] WARNING: not removing hydrogen atom without neighbors\n", | |
| "[22:30:31] WARNING: not removing hydrogen atom without neighbors\n", | |
| "[22:30:32] WARNING: not removing hydrogen atom without neighbors\n", | |
| "[22:30:34] WARNING: not removing hydrogen atom without neighbors\n", | |
| "[22:30:59] WARNING: not removing hydrogen atom without neighbors\n", | |
| "[22:30:59] WARNING: not removing hydrogen atom without neighbors\n", | |
| "[22:30:59] WARNING: not removing hydrogen atom without neighbors\n", | |
| "[22:31:09] WARNING: not removing hydrogen atom without neighbors\n", | |
| "[22:31:09] WARNING: not removing hydrogen atom without neighbors\n", | |
| "[22:31:09] WARNING: not removing hydrogen atom without neighbors\n", | |
| "[22:31:09] WARNING: not removing hydrogen atom without neighbors\n" | |
| ] | |
| }, | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "CPU times: user 4min 7s, sys: 37.3 s, total: 4min 45s\n", | |
| "Wall time: 5min 42s\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "%%time\n", | |
| "fps = bblean.fingerprints.fps_from_smiles(smiles, pack=True, n_features=2048)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "705cbaa3-8fd0-40e3-8527-e14a3dce3624", | |
| "metadata": {}, | |
| "source": [ | |
| "Cluster" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 12, | |
| "id": "5e1b8fde-dd95-468d-9374-911370b5e1a1", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "CPU times: user 3min 13s, sys: 2.05 s, total: 3min 15s\n", | |
| "Wall time: 3min 16s\n" | |
| ] | |
| }, | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "BitBirch(threshold=0.65, branching_factor=50, merge_criterion='diameter')" | |
| ] | |
| }, | |
| "execution_count": 12, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "%%time\n", | |
| "bb_tree = bblean.BitBirch(branching_factor=50, threshold=0.65, merge_criterion=\"diameter\")\n", | |
| "bb_tree.fit(fps, n_features=2048)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 13, | |
| "id": "8a1f4de2-f91b-4e7d-9d15-3a6231bc5c9d", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "clusters = bb_tree.get_cluster_mol_ids()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 14, | |
| "id": "4b9edd98-3938-4be1-82a3-e3700a9e605a", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "1.889534" | |
| ] | |
| }, | |
| "execution_count": 14, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "len(clusters)/1e6" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "4cc5a460-7425-453d-aaf9-6f700eea440f", | |
| "metadata": {}, | |
| "source": [ | |
| "Add a cluster column to the dataframe" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 15, | |
| "id": "8e6333af-03d8-4f02-863c-0bf9f49e1fc8", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "# One cluster label per dataframe row, by position; -1 marks a row that no\n", | |
| "# cluster claims.\n", | |
| "cluster_list = [-1] * len(chembl_df)\n", | |
| "for cluster_idx, mol_ids in enumerate(clusters):\n", | |
| "    for mol_id in mol_ids:\n", | |
| "        # mol_id is used as a 0-based row position (presumably the order of\n", | |
| "        # the fingerprints passed to fit), not as a dataframe index label\n", | |
| "        cluster_list[mol_id] = cluster_idx\n", | |
| "chembl_df['cluster'] = cluster_list" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "46e4361e-4143-4853-a554-07fbd7d1a053", | |
| "metadata": {}, | |
| "source": [ | |
| "Look at some clusters" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 16, | |
| "id": "60aed7bd-921f-47f8-b84d-1172b4c2159e", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "cluster\n", | |
| "65 96\n", | |
| "124 51\n", | |
| "136 49\n", | |
| "27 46\n", | |
| "171 45\n", | |
| " ..\n", | |
| "1522831 1\n", | |
| "1796423 1\n", | |
| "1212826 1\n", | |
| "1642975 1\n", | |
| "181076 1\n", | |
| "Name: count, Length: 1061197, dtype: int64" | |
| ] | |
| }, | |
| "execution_count": 16, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "chembl_df.query(\"len < 50\").cluster.value_counts()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "a480c2a6-cc21-4039-b558-0b098fc457ec", | |
| "metadata": {}, | |
| "source": [ | |
| "Repeat the clustering with threshold = 0.55" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 18, | |
| "id": "f2a3747e-93ce-400e-b309-f20573a41d73", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "CPU times: user 19.6 s, sys: 208 ms, total: 19.8 s\n", | |
| "Wall time: 19.8 s\n" | |
| ] | |
| }, | |
| { | |
| "ename": "ValueError", | |
| "evalue": "could not broadcast input array from shape (0,) into shape (256,)", | |
| "output_type": "error", | |
| "traceback": [ | |
| "\u001b[31m---------------------------------------------------------------------------\u001b[39m", | |
| "\u001b[31mValueError\u001b[39m Traceback (most recent call last)", | |
| "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[18]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m \u001b[43mget_ipython\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m.\u001b[49m\u001b[43mrun_cell_magic\u001b[49m\u001b[43m(\u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43mtime\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43mbb_tree = bblean.BitBirch(branching_factor=50, threshold=0.55, merge_criterion=\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mdiameter\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43m)\u001b[39;49m\u001b[38;5;130;43;01m\\n\u001b[39;49;00m\u001b[33;43mbb_tree.fit(fps, n_features=2048)\u001b[39;49m\u001b[38;5;130;43;01m\\n\u001b[39;49;00m\u001b[33;43m'\u001b[39;49m\u001b[43m)\u001b[49m\n", | |
| "\u001b[36mFile \u001b[39m\u001b[32m/opt/homebrew/Caskroom/miniforge/base/envs/rdkit_2025_09/lib/python3.11/site-packages/IPython/core/interactiveshell.py:2565\u001b[39m, in \u001b[36mInteractiveShell.run_cell_magic\u001b[39m\u001b[34m(self, magic_name, line, cell)\u001b[39m\n\u001b[32m 2563\u001b[39m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m.builtin_trap:\n\u001b[32m 2564\u001b[39m args = (magic_arg_s, cell)\n\u001b[32m-> \u001b[39m\u001b[32m2565\u001b[39m result = \u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 2567\u001b[39m \u001b[38;5;66;03m# The code below prevents the output from being displayed\u001b[39;00m\n\u001b[32m 2568\u001b[39m \u001b[38;5;66;03m# when using magics with decorator @output_can_be_silenced\u001b[39;00m\n\u001b[32m 2569\u001b[39m \u001b[38;5;66;03m# when the last Python token in the expression is a ';'.\u001b[39;00m\n\u001b[32m 2570\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mgetattr\u001b[39m(fn, magic.MAGIC_OUTPUT_CAN_BE_SILENCED, \u001b[38;5;28;01mFalse\u001b[39;00m):\n", | |
| "\u001b[36mFile \u001b[39m\u001b[32m/opt/homebrew/Caskroom/miniforge/base/envs/rdkit_2025_09/lib/python3.11/site-packages/IPython/core/magics/execution.py:1470\u001b[39m, in \u001b[36mExecutionMagics.time\u001b[39m\u001b[34m(self, line, cell, local_ns)\u001b[39m\n\u001b[32m 1468\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m interrupt_occured:\n\u001b[32m 1469\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m exit_on_interrupt \u001b[38;5;129;01mand\u001b[39;00m captured_exception:\n\u001b[32m-> \u001b[39m\u001b[32m1470\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m captured_exception\n\u001b[32m 1471\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m\n\u001b[32m 1472\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m out\n", | |
| "\u001b[36mFile \u001b[39m\u001b[32m/opt/homebrew/Caskroom/miniforge/base/envs/rdkit_2025_09/lib/python3.11/site-packages/IPython/core/magics/execution.py:1439\u001b[39m, in \u001b[36mExecutionMagics.time\u001b[39m\u001b[34m(self, line, cell, local_ns)\u001b[39m\n\u001b[32m 1437\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m expr_val \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m 1438\u001b[39m code_2 = \u001b[38;5;28mself\u001b[39m.shell.compile(expr_val, source, \u001b[33m'\u001b[39m\u001b[33meval\u001b[39m\u001b[33m'\u001b[39m)\n\u001b[32m-> \u001b[39m\u001b[32m1439\u001b[39m out = \u001b[38;5;28;43meval\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mcode_2\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mglob\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mlocal_ns\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 1440\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mKeyboardInterrupt\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[32m 1441\u001b[39m captured_exception = e\n", | |
| "\u001b[36mFile \u001b[39m\u001b[32m<timed exec>:2\u001b[39m\n", | |
| "\u001b[36mFile \u001b[39m\u001b[32m~/software/bblean/bblean/bitbirch.py:734\u001b[39m, in \u001b[36mBitBirch.fit\u001b[39m\u001b[34m(self, X, reinsert_indices, input_is_packed, n_features)\u001b[39m\n\u001b[32m 730\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m idx, fp \u001b[38;5;129;01min\u001b[39;00m iterable:\n\u001b[32m 731\u001b[39m subcluster = _BFSubcluster(\n\u001b[32m 732\u001b[39m linear_sum=fp, mol_indices=[idx], n_features=n_features\n\u001b[32m 733\u001b[39m )\n\u001b[32m--> \u001b[39m\u001b[32m734\u001b[39m split = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_root\u001b[49m\u001b[43m.\u001b[49m\u001b[43minsert_bf_subcluster\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 735\u001b[39m \u001b[43m \u001b[49m\u001b[43msubcluster\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmerge_accept_fn\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mthreshold\u001b[49m\n\u001b[32m 736\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 738\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m split:\n\u001b[32m 739\u001b[39m new_subcluster1, new_subcluster2 = _split_node(\u001b[38;5;28mself\u001b[39m._root)\n", | |
| "\u001b[36mFile \u001b[39m\u001b[32m~/software/bblean/bblean/bitbirch.py:339\u001b[39m, in \u001b[36m_BFNode.insert_bf_subcluster\u001b[39m\u001b[34m(self, subcluster, merge_accept_fn, threshold)\u001b[39m\n\u001b[32m 336\u001b[39m \u001b[38;5;66;03m# If the subcluster has a child, we need a recursive strategy.\u001b[39;00m\n\u001b[32m 337\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m closest_subcluster.child_ \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m339\u001b[39m split_child = \u001b[43mclosest_subcluster\u001b[49m\u001b[43m.\u001b[49m\u001b[43mchild_\u001b[49m\u001b[43m.\u001b[49m\u001b[43minsert_bf_subcluster\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 340\u001b[39m \u001b[43m \u001b[49m\u001b[43msubcluster\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmerge_accept_fn\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mthreshold\u001b[49m\n\u001b[32m 341\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 343\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m split_child:\n\u001b[32m 344\u001b[39m \u001b[38;5;66;03m# If it is determined that the child need not be split, we\u001b[39;00m\n\u001b[32m 345\u001b[39m \u001b[38;5;66;03m# can just update the closest_subcluster\u001b[39;00m\n\u001b[32m 346\u001b[39m closest_subcluster.update(subcluster)\n", | |
| "\u001b[36mFile \u001b[39m\u001b[32m~/software/bblean/bblean/bitbirch.py:359\u001b[39m, in \u001b[36m_BFNode.insert_bf_subcluster\u001b[39m\u001b[34m(self, subcluster, merge_accept_fn, threshold)\u001b[39m\n\u001b[32m 352\u001b[39m \u001b[38;5;66;03m# things not too good. we need to redistribute the subclusters in\u001b[39;00m\n\u001b[32m 353\u001b[39m \u001b[38;5;66;03m# our child node, and add a new subcluster in the parent\u001b[39;00m\n\u001b[32m 354\u001b[39m \u001b[38;5;66;03m# subcluster to accommodate the new child.\u001b[39;00m\n\u001b[32m 355\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m 356\u001b[39m new_subcluster1, new_subcluster2 = _split_node(\n\u001b[32m 357\u001b[39m closest_subcluster.child_\n\u001b[32m 358\u001b[39m )\n\u001b[32m--> \u001b[39m\u001b[32m359\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mupdate_split_subclusters\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 360\u001b[39m \u001b[43m \u001b[49m\u001b[43mclosest_subcluster\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mnew_subcluster1\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mnew_subcluster2\u001b[49m\n\u001b[32m 361\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 363\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(\u001b[38;5;28mself\u001b[39m.subclusters_) > \u001b[38;5;28mself\u001b[39m.branching_factor:\n\u001b[32m 364\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;01mTrue\u001b[39;00m\n", | |
| "\u001b[36mFile \u001b[39m\u001b[32m~/software/bblean/bblean/bitbirch.py:319\u001b[39m, in \u001b[36m_BFNode.update_split_subclusters\u001b[39m\u001b[34m(self, subcluster, new_subcluster1, new_subcluster2)\u001b[39m\n\u001b[32m 317\u001b[39m idx = \u001b[38;5;28mself\u001b[39m.subclusters_.index(subcluster)\n\u001b[32m 318\u001b[39m \u001b[38;5;28mself\u001b[39m.subclusters_[idx] = new_subcluster1\n\u001b[32m--> \u001b[39m\u001b[32m319\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43minit_centroids_\u001b[49m\u001b[43m[\u001b[49m\u001b[43midx\u001b[49m\u001b[43m]\u001b[49m = new_subcluster1.centroid_\n\u001b[32m 320\u001b[39m \u001b[38;5;66;03m# Append new_subcluster2\u001b[39;00m\n\u001b[32m 321\u001b[39m \u001b[38;5;28mself\u001b[39m.append_subcluster(new_subcluster2)\n", | |
| "\u001b[31mValueError\u001b[39m: could not broadcast input array from shape (0,) into shape (256,)" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "%%time\n", | |
| "bb_tree = bblean.BitBirch(branching_factor=50, threshold=0.55, merge_criterion=\"diameter\")\n", | |
| "bb_tree.fit(fps, n_features=2048)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "b6c8d74f-b1f5-44fd-a340-2aa163c4abec", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "Python 3 (ipykernel)", | |
| "language": "python", | |
| "name": "python3" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.11.13" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 5 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment