Skip to content

Instantly share code, notes, and snippets.

@tazarov
Created August 5, 2024 11:50
Show Gist options
  • Save tazarov/a5f0d1165234d0a181b139376184b40b to your computer and use it in GitHub Desktop.
Save tazarov/a5f0d1165234d0a181b139376184b40b to your computer and use it in GitHub Desktop.
Reproduces a bug in HNSW when replace_delete=True
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"metadata": {},
"cell_type": "markdown",
"source": "# Try to reproduce the bug by replay of the state machine",
"id": "85a95571ea8c3598"
},
{
"cell_type": "code",
"id": "initial_id",
"metadata": {
"collapsed": true,
"ExecuteTime": {
"end_time": "2024-08-05T08:21:35.711003Z",
"start_time": "2024-08-05T08:21:35.642637Z"
}
},
"source": [
"import random\n",
"\n",
"import hnswlib\n",
"import numpy as np\n",
"\n",
"vectors = np.random.rand(30, 1536).astype(np.float32)\n",
"ids = np.arange(30)\n",
"index = hnswlib.Index(space='l2', dim=1536)\n",
"index.init_index(allow_replace_deleted=True,persistence_location=\"hnsw_label_bug/\",is_persistent_index=True,max_elements=501)\n"
],
"outputs": [],
"execution_count": 1
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-08-05T08:21:36.465732Z",
"start_time": "2024-08-05T08:21:36.463921Z"
}
},
"cell_type": "code",
"source": [
"# Add ID 0\n",
"index.add_items(data=[vectors[0]],ids=[ids[0]],replace_deleted=True)"
],
"id": "4d7c85267ecc743f",
"outputs": [],
"execution_count": 2
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-08-05T08:21:37.269094Z",
"start_time": "2024-08-05T08:21:37.266903Z"
}
},
"cell_type": "code",
"source": [
"#delete 0\n",
"index.mark_deleted(ids[0])"
],
"id": "40c3a1aa79f6b39b",
"outputs": [],
"execution_count": 3
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-08-05T08:21:37.977276Z",
"start_time": "2024-08-05T08:21:37.975284Z"
}
},
"cell_type": "code",
"source": [
"# add 10 vectors\n",
"index.add_items(data=vectors[1:11],ids=ids[1:11],replace_deleted=True)"
],
"id": "7823b1f0b9a2d96b",
"outputs": [],
"execution_count": 4
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-08-05T08:21:38.550548Z",
"start_time": "2024-08-05T08:21:38.548687Z"
}
},
"cell_type": "code",
"source": [
"# delete ID 1\n",
"index.mark_deleted(ids[1])"
],
"id": "958be64f115b85cc",
"outputs": [],
"execution_count": 5
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-08-05T08:21:39.080032Z",
"start_time": "2024-08-05T08:21:39.078103Z"
}
},
"cell_type": "code",
"source": [
"# upsert 11 and 12\n",
"index.add_items(data=vectors[11:13],ids=ids[11:13],replace_deleted=True)"
],
"id": "1df6fba54889425e",
"outputs": [],
"execution_count": 6
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-08-05T08:21:39.611261Z",
"start_time": "2024-08-05T08:21:39.609267Z"
}
},
"cell_type": "code",
"source": [
"# Add 11 thru 16\n",
"index.add_items(data=vectors[11:17],ids=ids[11:17],replace_deleted=True)"
],
"id": "5ed27eaa02dc651d",
"outputs": [],
"execution_count": 7
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-08-05T08:21:40.282052Z",
"start_time": "2024-08-05T08:21:40.280009Z"
}
},
"cell_type": "code",
"source": [
"# Add Emmbeddings 17, 18, and 1\n",
"index.add_items(data=vectors[17:20],ids=ids[17:20],replace_deleted=True)"
],
"id": "d5966e5d96af17c2",
"outputs": [],
"execution_count": 8
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-08-05T08:21:41.210879Z",
"start_time": "2024-08-05T08:21:41.208631Z"
}
},
"cell_type": "code",
"source": [
"# Add 20\n",
"index.add_items(data=[vectors[20]],ids=[ids[20]],replace_deleted=True)"
],
"id": "ea183ec3dfaeaeb4",
"outputs": [],
"execution_count": 9
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-08-05T08:21:41.764222Z",
"start_time": "2024-08-05T08:21:41.762388Z"
}
},
"cell_type": "code",
"source": [
"# Add 21 and 22\n",
"index.add_items(data=vectors[21:23],ids=ids[21:23],replace_deleted=True)"
],
"id": "575d4f7c51b13370",
"outputs": [],
"execution_count": 10
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-08-05T08:21:42.347645Z",
"start_time": "2024-08-05T08:21:42.345325Z"
}
},
"cell_type": "code",
"source": [
"# Add 16\n",
"index.add_items(data=[vectors[16]],ids=[ids[16]],replace_deleted=True)"
],
"id": "fa60cb75d8c5dd67",
"outputs": [],
"execution_count": 11
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-08-05T08:21:43.072866Z",
"start_time": "2024-08-05T08:21:43.070776Z"
}
},
"cell_type": "code",
"source": [
"# Add 23\n",
"index.add_items(data=[vectors[23]],ids=[ids[23]],replace_deleted=True)"
],
"id": "84ffe2f36ba11edf",
"outputs": [],
"execution_count": 12
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-08-05T08:21:43.809598Z",
"start_time": "2024-08-05T08:21:43.807569Z"
}
},
"cell_type": "code",
"source": [
"# Add 24 thru 27\n",
"index.add_items(data=vectors[24:28],ids=ids[24:28],replace_deleted=True)"
],
"id": "ec1951a2a863a670",
"outputs": [],
"execution_count": 13
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-08-05T08:21:45.093522Z",
"start_time": "2024-08-05T08:21:45.091329Z"
}
},
"cell_type": "code",
"source": [
"# Delete 21 thru 26\n",
"# index.mark_deleted(ids[21:27])\n",
"for i in range(21,27):\n",
" index.mark_deleted(ids[i])"
],
"id": "ae44d4e22dcc195f",
"outputs": [],
"execution_count": 14
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-08-05T08:21:46.944606Z",
"start_time": "2024-08-05T08:21:46.940604Z"
}
},
"cell_type": "code",
"source": [
"# Upsert 20, 26 and 27\n",
"index.add_items(data=[vectors[20],vectors[26],vectors[27]],ids=[ids[20],ids[26],ids[27]],replace_deleted=True)"
],
"id": "d2906a7759ffaa75",
"outputs": [],
"execution_count": 15
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-08-05T08:32:56.414844Z",
"start_time": "2024-08-05T08:32:56.412628Z"
}
},
"cell_type": "code",
"source": [
"# Delete 19, 20, 27\n",
"# index.mark_deleted([ids[19],ids[20],ids[27]])\n",
"\n",
"for i in [19,20,27]:\n",
" print(i)\n",
" try:\n",
" index.mark_deleted(ids[i])\n",
" except:\n",
" print(\"Error\")"
],
"id": "6403a57f9e98d9cb",
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"19\n",
"Error\n",
"20\n",
"Error\n",
"27\n",
"Error\n"
]
}
],
"execution_count": 25
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-08-05T08:47:22.725534Z",
"start_time": "2024-08-05T08:47:22.723324Z"
}
},
"cell_type": "code",
"source": [
"\n",
"print(index.element_count)\n",
"res=index.knn_query(vectors[19], k=10)\n",
"print(res[0][0])"
],
"id": "33820ccb85967435",
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"26\n",
"[12 18 3 6 20 4 26 13 14 27]\n"
]
}
],
"execution_count": 30
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-08-05T08:31:34.668700Z",
"start_time": "2024-08-05T08:31:34.666993Z"
}
},
"cell_type": "code",
"source": "print(ids[20])",
"id": "b356e8f82160798c",
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"20\n"
]
}
],
"execution_count": 21
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-08-05T08:47:30.479579Z",
"start_time": "2024-08-05T08:47:30.477757Z"
}
},
"cell_type": "code",
"source": "print(index.get_ids_list())",
"id": "e76e1732806ff74e",
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[26, 27, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2]\n"
]
}
],
"execution_count": 31
},
{
"metadata": {},
"cell_type": "markdown",
"source": "# Reproduce the bug",
"id": "53a3523613d92529"
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-08-05T11:40:31.416876Z",
"start_time": "2024-08-05T11:40:31.092098Z"
}
},
"cell_type": "code",
"source": [
"import hnswlib\n",
"import numpy as np\n",
"import random\n",
"delete_ids = []\n",
"cycles = 2000\n",
"vectors = np.random.uniform(-1,1,(cycles*6, 1536)).astype(np.float32)\n",
"index = hnswlib.Index(space='l2', dim=1536)\n",
"index.init_index(allow_replace_deleted=True, max_elements=501)\n",
"index.set_num_threads(1)\n",
"for cid in range(cycles):\n",
" if random.random()>0.7 and len(index.get_ids_list()) >0: # we randomly delete ~30% of the time\n",
" ids_to_delete = random.sample(index.get_ids_list(), random.randint(1, len(index.get_ids_list())))\n",
" for id in ids_to_delete:\n",
" if id in delete_ids: # weirdly we also have to check that get_ids_list doesn't return deleted ids\n",
" continue\n",
" index.mark_deleted(id)\n",
" delete_ids.append(id)\n",
" else:\n",
" start_id = index.element_count #this is not idea but does the job.\n",
" num_vectors_to_add = random.randint(1, 6) # \n",
" va = vectors[start_id:start_id+num_vectors_to_add]\n",
" index.add_items(data=va,ids=range(start_id,start_id+len(va)),num_threads=1,replace_deleted=True)\n",
" if index.element_count>0:\n",
" try:\n",
" res=index.knn_query(vectors[random.sample(index.get_ids_list(),1)[0]],num_threads=1, k=min(len(index.get_ids_list()),10))\n",
" if len(res[0][0])>0:\n",
" for i in res[0][0]:\n",
" if i in delete_ids:\n",
" raise Exception(f\"Error in cycle {cid}\",f\"ID: {i}\", f\"Deleted IDs: {delete_ids}\", f\"Query Response Labels: {res[0][0].tolist()}\",f\"All index IDs: {index.get_ids_list()}\")\n",
" except Exception as e:\n",
" if \"Cannot return\" not in str(e):\n",
" print(str(e))\n",
" raise e\n",
" "
],
"id": "65115ca986be7d34",
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"('Error in cycle 5', 'ID: 5', 'Deleted IDs: [1, 2, 0, 6, 5, 7]', 'Query Response Labels: [3, 9, 4, 5, 8, 10]', 'All index IDs: [10, 7, 9, 4, 8, 3]')\n"
]
},
{
"ename": "Exception",
"evalue": "('Error in cycle 5', 'ID: 5', 'Deleted IDs: [1, 2, 0, 6, 5, 7]', 'Query Response Labels: [3, 9, 4, 5, 8, 10]', 'All index IDs: [10, 7, 9, 4, 8, 3]')",
"output_type": "error",
"traceback": [
"\u001B[0;31m---------------------------------------------------------------------------\u001B[0m",
"\u001B[0;31mException\u001B[0m Traceback (most recent call last)",
"Cell \u001B[0;32mIn[1], line 33\u001B[0m\n\u001B[1;32m 31\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mCannot return\u001B[39m\u001B[38;5;124m\"\u001B[39m \u001B[38;5;129;01mnot\u001B[39;00m \u001B[38;5;129;01min\u001B[39;00m \u001B[38;5;28mstr\u001B[39m(e):\n\u001B[1;32m 32\u001B[0m \u001B[38;5;28mprint\u001B[39m(\u001B[38;5;28mstr\u001B[39m(e))\n\u001B[0;32m---> 33\u001B[0m \u001B[38;5;28;01mraise\u001B[39;00m e\n",
"Cell \u001B[0;32mIn[1], line 29\u001B[0m\n\u001B[1;32m 27\u001B[0m \u001B[38;5;28;01mfor\u001B[39;00m i \u001B[38;5;129;01min\u001B[39;00m res[\u001B[38;5;241m0\u001B[39m][\u001B[38;5;241m0\u001B[39m]:\n\u001B[1;32m 28\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m i \u001B[38;5;129;01min\u001B[39;00m delete_ids:\n\u001B[0;32m---> 29\u001B[0m \u001B[38;5;28;01mraise\u001B[39;00m \u001B[38;5;167;01mException\u001B[39;00m(\u001B[38;5;124mf\u001B[39m\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mError in cycle \u001B[39m\u001B[38;5;132;01m{\u001B[39;00mcid\u001B[38;5;132;01m}\u001B[39;00m\u001B[38;5;124m\"\u001B[39m,\u001B[38;5;124mf\u001B[39m\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mID: \u001B[39m\u001B[38;5;132;01m{\u001B[39;00mi\u001B[38;5;132;01m}\u001B[39;00m\u001B[38;5;124m\"\u001B[39m, \u001B[38;5;124mf\u001B[39m\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mDeleted IDs: \u001B[39m\u001B[38;5;132;01m{\u001B[39;00mdelete_ids\u001B[38;5;132;01m}\u001B[39;00m\u001B[38;5;124m\"\u001B[39m, \u001B[38;5;124mf\u001B[39m\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mQuery Response Labels: \u001B[39m\u001B[38;5;132;01m{\u001B[39;00mres[\u001B[38;5;241m0\u001B[39m][\u001B[38;5;241m0\u001B[39m]\u001B[38;5;241m.\u001B[39mtolist()\u001B[38;5;132;01m}\u001B[39;00m\u001B[38;5;124m\"\u001B[39m,\u001B[38;5;124mf\u001B[39m\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mAll index IDs: \u001B[39m\u001B[38;5;132;01m{\u001B[39;00mindex\u001B[38;5;241m.\u001B[39mget_ids_list()\u001B[38;5;132;01m}\u001B[39;00m\u001B[38;5;124m\"\u001B[39m)\n\u001B[1;32m 30\u001B[0m \u001B[38;5;28;01mexcept\u001B[39;00m \u001B[38;5;167;01mException\u001B[39;00m \u001B[38;5;28;01mas\u001B[39;00m e:\n\u001B[1;32m 31\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mCannot return\u001B[39m\u001B[38;5;124m\"\u001B[39m \u001B[38;5;129;01mnot\u001B[39;00m \u001B[38;5;129;01min\u001B[39;00m \u001B[38;5;28mstr\u001B[39m(e):\n",
"\u001B[0;31mException\u001B[0m: ('Error in cycle 5', 'ID: 5', 'Deleted IDs: [1, 2, 0, 6, 5, 7]', 'Query Response Labels: [3, 9, 4, 5, 8, 10]', 'All index IDs: [10, 7, 9, 4, 8, 3]')"
]
}
],
"execution_count": 1
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-08-05T11:16:16.752248Z",
"start_time": "2024-08-05T11:16:16.750843Z"
}
},
"cell_type": "code",
"source": "",
"id": "182388f341cbd54a",
"outputs": [],
"execution_count": 1
},
{
"metadata": {},
"cell_type": "code",
"outputs": [],
"execution_count": null,
"source": "",
"id": "bcf7cbd3996dd5b9"
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment