Created
August 5, 2024 11:50
-
-
Save tazarov/a5f0d1165234d0a181b139376184b40b to your computer and use it in GitHub Desktop.
Reproduces a bug in HNSW when replace_delete=True
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"metadata": {}, | |
"cell_type": "markdown", | |
"source": "# Try to reproduce the bug by replay of the state machine", | |
"id": "85a95571ea8c3598" | |
}, | |
{ | |
"cell_type": "code", | |
"id": "initial_id", | |
"metadata": { | |
"collapsed": true, | |
"ExecuteTime": { | |
"end_time": "2024-08-05T08:21:35.711003Z", | |
"start_time": "2024-08-05T08:21:35.642637Z" | |
} | |
}, | |
"source": [ | |
"import random\n", | |
"\n", | |
"import hnswlib\n", | |
"import numpy as np\n", | |
"\n", | |
"vectors = np.random.rand(30, 1536).astype(np.float32)\n", | |
"ids = np.arange(30)\n", | |
"index = hnswlib.Index(space='l2', dim=1536)\n", | |
"index.init_index(allow_replace_deleted=True,persistence_location=\"hnsw_label_bug/\",is_persistent_index=True,max_elements=501)\n" | |
], | |
"outputs": [], | |
"execution_count": 1 | |
}, | |
{ | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2024-08-05T08:21:36.465732Z", | |
"start_time": "2024-08-05T08:21:36.463921Z" | |
} | |
}, | |
"cell_type": "code", | |
"source": [ | |
"# Add ID 0\n", | |
"index.add_items(data=[vectors[0]],ids=[ids[0]],replace_deleted=True)" | |
], | |
"id": "4d7c85267ecc743f", | |
"outputs": [], | |
"execution_count": 2 | |
}, | |
{ | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2024-08-05T08:21:37.269094Z", | |
"start_time": "2024-08-05T08:21:37.266903Z" | |
} | |
}, | |
"cell_type": "code", | |
"source": [ | |
"#delete 0\n", | |
"index.mark_deleted(ids[0])" | |
], | |
"id": "40c3a1aa79f6b39b", | |
"outputs": [], | |
"execution_count": 3 | |
}, | |
{ | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2024-08-05T08:21:37.977276Z", | |
"start_time": "2024-08-05T08:21:37.975284Z" | |
} | |
}, | |
"cell_type": "code", | |
"source": [ | |
"# add 10 vectors\n", | |
"index.add_items(data=vectors[1:11],ids=ids[1:11],replace_deleted=True)" | |
], | |
"id": "7823b1f0b9a2d96b", | |
"outputs": [], | |
"execution_count": 4 | |
}, | |
{ | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2024-08-05T08:21:38.550548Z", | |
"start_time": "2024-08-05T08:21:38.548687Z" | |
} | |
}, | |
"cell_type": "code", | |
"source": [ | |
"# delete ID 1\n", | |
"index.mark_deleted(ids[1])" | |
], | |
"id": "958be64f115b85cc", | |
"outputs": [], | |
"execution_count": 5 | |
}, | |
{ | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2024-08-05T08:21:39.080032Z", | |
"start_time": "2024-08-05T08:21:39.078103Z" | |
} | |
}, | |
"cell_type": "code", | |
"source": [ | |
"# upsert 11 and 12\n", | |
"index.add_items(data=vectors[11:13],ids=ids[11:13],replace_deleted=True)" | |
], | |
"id": "1df6fba54889425e", | |
"outputs": [], | |
"execution_count": 6 | |
}, | |
{ | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2024-08-05T08:21:39.611261Z", | |
"start_time": "2024-08-05T08:21:39.609267Z" | |
} | |
}, | |
"cell_type": "code", | |
"source": [ | |
"# Add 11 thru 16\n", | |
"index.add_items(data=vectors[11:17],ids=ids[11:17],replace_deleted=True)" | |
], | |
"id": "5ed27eaa02dc651d", | |
"outputs": [], | |
"execution_count": 7 | |
}, | |
{ | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2024-08-05T08:21:40.282052Z", | |
"start_time": "2024-08-05T08:21:40.280009Z" | |
} | |
}, | |
"cell_type": "code", | |
"source": [ | |
"# Add Emmbeddings 17, 18, and 1\n", | |
"index.add_items(data=vectors[17:20],ids=ids[17:20],replace_deleted=True)" | |
], | |
"id": "d5966e5d96af17c2", | |
"outputs": [], | |
"execution_count": 8 | |
}, | |
{ | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2024-08-05T08:21:41.210879Z", | |
"start_time": "2024-08-05T08:21:41.208631Z" | |
} | |
}, | |
"cell_type": "code", | |
"source": [ | |
"# Add 20\n", | |
"index.add_items(data=[vectors[20]],ids=[ids[20]],replace_deleted=True)" | |
], | |
"id": "ea183ec3dfaeaeb4", | |
"outputs": [], | |
"execution_count": 9 | |
}, | |
{ | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2024-08-05T08:21:41.764222Z", | |
"start_time": "2024-08-05T08:21:41.762388Z" | |
} | |
}, | |
"cell_type": "code", | |
"source": [ | |
"# Add 21 and 22\n", | |
"index.add_items(data=vectors[21:23],ids=ids[21:23],replace_deleted=True)" | |
], | |
"id": "575d4f7c51b13370", | |
"outputs": [], | |
"execution_count": 10 | |
}, | |
{ | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2024-08-05T08:21:42.347645Z", | |
"start_time": "2024-08-05T08:21:42.345325Z" | |
} | |
}, | |
"cell_type": "code", | |
"source": [ | |
"# Add 16\n", | |
"index.add_items(data=[vectors[16]],ids=[ids[16]],replace_deleted=True)" | |
], | |
"id": "fa60cb75d8c5dd67", | |
"outputs": [], | |
"execution_count": 11 | |
}, | |
{ | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2024-08-05T08:21:43.072866Z", | |
"start_time": "2024-08-05T08:21:43.070776Z" | |
} | |
}, | |
"cell_type": "code", | |
"source": [ | |
"# Add 23\n", | |
"index.add_items(data=[vectors[23]],ids=[ids[23]],replace_deleted=True)" | |
], | |
"id": "84ffe2f36ba11edf", | |
"outputs": [], | |
"execution_count": 12 | |
}, | |
{ | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2024-08-05T08:21:43.809598Z", | |
"start_time": "2024-08-05T08:21:43.807569Z" | |
} | |
}, | |
"cell_type": "code", | |
"source": [ | |
"# Add 24 thru 27\n", | |
"index.add_items(data=vectors[24:28],ids=ids[24:28],replace_deleted=True)" | |
], | |
"id": "ec1951a2a863a670", | |
"outputs": [], | |
"execution_count": 13 | |
}, | |
{ | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2024-08-05T08:21:45.093522Z", | |
"start_time": "2024-08-05T08:21:45.091329Z" | |
} | |
}, | |
"cell_type": "code", | |
"source": [ | |
"# Delete 21 thru 26\n", | |
"# index.mark_deleted(ids[21:27])\n", | |
"for i in range(21,27):\n", | |
" index.mark_deleted(ids[i])" | |
], | |
"id": "ae44d4e22dcc195f", | |
"outputs": [], | |
"execution_count": 14 | |
}, | |
{ | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2024-08-05T08:21:46.944606Z", | |
"start_time": "2024-08-05T08:21:46.940604Z" | |
} | |
}, | |
"cell_type": "code", | |
"source": [ | |
"# Upsert 20, 26 and 27\n", | |
"index.add_items(data=[vectors[20],vectors[26],vectors[27]],ids=[ids[20],ids[26],ids[27]],replace_deleted=True)" | |
], | |
"id": "d2906a7759ffaa75", | |
"outputs": [], | |
"execution_count": 15 | |
}, | |
{ | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2024-08-05T08:32:56.414844Z", | |
"start_time": "2024-08-05T08:32:56.412628Z" | |
} | |
}, | |
"cell_type": "code", | |
"source": [ | |
"# Delete 19, 20, 27\n", | |
"# index.mark_deleted([ids[19],ids[20],ids[27]])\n", | |
"\n", | |
"for i in [19,20,27]:\n", | |
" print(i)\n", | |
" try:\n", | |
" index.mark_deleted(ids[i])\n", | |
" except:\n", | |
" print(\"Error\")" | |
], | |
"id": "6403a57f9e98d9cb", | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"19\n", | |
"Error\n", | |
"20\n", | |
"Error\n", | |
"27\n", | |
"Error\n" | |
] | |
} | |
], | |
"execution_count": 25 | |
}, | |
{ | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2024-08-05T08:47:22.725534Z", | |
"start_time": "2024-08-05T08:47:22.723324Z" | |
} | |
}, | |
"cell_type": "code", | |
"source": [ | |
"\n", | |
"print(index.element_count)\n", | |
"res=index.knn_query(vectors[19], k=10)\n", | |
"print(res[0][0])" | |
], | |
"id": "33820ccb85967435", | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"26\n", | |
"[12 18 3 6 20 4 26 13 14 27]\n" | |
] | |
} | |
], | |
"execution_count": 30 | |
}, | |
{ | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2024-08-05T08:31:34.668700Z", | |
"start_time": "2024-08-05T08:31:34.666993Z" | |
} | |
}, | |
"cell_type": "code", | |
"source": "print(ids[20])", | |
"id": "b356e8f82160798c", | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"20\n" | |
] | |
} | |
], | |
"execution_count": 21 | |
}, | |
{ | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2024-08-05T08:47:30.479579Z", | |
"start_time": "2024-08-05T08:47:30.477757Z" | |
} | |
}, | |
"cell_type": "code", | |
"source": "print(index.get_ids_list())", | |
"id": "e76e1732806ff74e", | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"[26, 27, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2]\n" | |
] | |
} | |
], | |
"execution_count": 31 | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "markdown", | |
"source": "# Reproduce the bug", | |
"id": "53a3523613d92529" | |
}, | |
{ | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2024-08-05T11:40:31.416876Z", | |
"start_time": "2024-08-05T11:40:31.092098Z" | |
} | |
}, | |
"cell_type": "code", | |
"source": [ | |
"import hnswlib\n", | |
"import numpy as np\n", | |
"import random\n", | |
"delete_ids = []\n", | |
"cycles = 2000\n", | |
"vectors = np.random.uniform(-1,1,(cycles*6, 1536)).astype(np.float32)\n", | |
"index = hnswlib.Index(space='l2', dim=1536)\n", | |
"index.init_index(allow_replace_deleted=True, max_elements=501)\n", | |
"index.set_num_threads(1)\n", | |
"for cid in range(cycles):\n", | |
" if random.random()>0.7 and len(index.get_ids_list()) >0: # we randomly delete ~30% of the time\n", | |
" ids_to_delete = random.sample(index.get_ids_list(), random.randint(1, len(index.get_ids_list())))\n", | |
" for id in ids_to_delete:\n", | |
" if id in delete_ids: # weirdly we also have to check that get_ids_list doesn't return deleted ids\n", | |
" continue\n", | |
" index.mark_deleted(id)\n", | |
" delete_ids.append(id)\n", | |
" else:\n", | |
" start_id = index.element_count #this is not idea but does the job.\n", | |
" num_vectors_to_add = random.randint(1, 6) # \n", | |
" va = vectors[start_id:start_id+num_vectors_to_add]\n", | |
" index.add_items(data=va,ids=range(start_id,start_id+len(va)),num_threads=1,replace_deleted=True)\n", | |
" if index.element_count>0:\n", | |
" try:\n", | |
" res=index.knn_query(vectors[random.sample(index.get_ids_list(),1)[0]],num_threads=1, k=min(len(index.get_ids_list()),10))\n", | |
" if len(res[0][0])>0:\n", | |
" for i in res[0][0]:\n", | |
" if i in delete_ids:\n", | |
" raise Exception(f\"Error in cycle {cid}\",f\"ID: {i}\", f\"Deleted IDs: {delete_ids}\", f\"Query Response Labels: {res[0][0].tolist()}\",f\"All index IDs: {index.get_ids_list()}\")\n", | |
" except Exception as e:\n", | |
" if \"Cannot return\" not in str(e):\n", | |
" print(str(e))\n", | |
" raise e\n", | |
" " | |
], | |
"id": "65115ca986be7d34", | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"('Error in cycle 5', 'ID: 5', 'Deleted IDs: [1, 2, 0, 6, 5, 7]', 'Query Response Labels: [3, 9, 4, 5, 8, 10]', 'All index IDs: [10, 7, 9, 4, 8, 3]')\n" | |
] | |
}, | |
{ | |
"ename": "Exception", | |
"evalue": "('Error in cycle 5', 'ID: 5', 'Deleted IDs: [1, 2, 0, 6, 5, 7]', 'Query Response Labels: [3, 9, 4, 5, 8, 10]', 'All index IDs: [10, 7, 9, 4, 8, 3]')", | |
"output_type": "error", | |
"traceback": [ | |
"\u001B[0;31m---------------------------------------------------------------------------\u001B[0m", | |
"\u001B[0;31mException\u001B[0m Traceback (most recent call last)", | |
"Cell \u001B[0;32mIn[1], line 33\u001B[0m\n\u001B[1;32m 31\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mCannot return\u001B[39m\u001B[38;5;124m\"\u001B[39m \u001B[38;5;129;01mnot\u001B[39;00m \u001B[38;5;129;01min\u001B[39;00m \u001B[38;5;28mstr\u001B[39m(e):\n\u001B[1;32m 32\u001B[0m \u001B[38;5;28mprint\u001B[39m(\u001B[38;5;28mstr\u001B[39m(e))\n\u001B[0;32m---> 33\u001B[0m \u001B[38;5;28;01mraise\u001B[39;00m e\n", | |
"Cell \u001B[0;32mIn[1], line 29\u001B[0m\n\u001B[1;32m 27\u001B[0m \u001B[38;5;28;01mfor\u001B[39;00m i \u001B[38;5;129;01min\u001B[39;00m res[\u001B[38;5;241m0\u001B[39m][\u001B[38;5;241m0\u001B[39m]:\n\u001B[1;32m 28\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m i \u001B[38;5;129;01min\u001B[39;00m delete_ids:\n\u001B[0;32m---> 29\u001B[0m \u001B[38;5;28;01mraise\u001B[39;00m \u001B[38;5;167;01mException\u001B[39;00m(\u001B[38;5;124mf\u001B[39m\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mError in cycle \u001B[39m\u001B[38;5;132;01m{\u001B[39;00mcid\u001B[38;5;132;01m}\u001B[39;00m\u001B[38;5;124m\"\u001B[39m,\u001B[38;5;124mf\u001B[39m\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mID: \u001B[39m\u001B[38;5;132;01m{\u001B[39;00mi\u001B[38;5;132;01m}\u001B[39;00m\u001B[38;5;124m\"\u001B[39m, \u001B[38;5;124mf\u001B[39m\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mDeleted IDs: \u001B[39m\u001B[38;5;132;01m{\u001B[39;00mdelete_ids\u001B[38;5;132;01m}\u001B[39;00m\u001B[38;5;124m\"\u001B[39m, \u001B[38;5;124mf\u001B[39m\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mQuery Response Labels: \u001B[39m\u001B[38;5;132;01m{\u001B[39;00mres[\u001B[38;5;241m0\u001B[39m][\u001B[38;5;241m0\u001B[39m]\u001B[38;5;241m.\u001B[39mtolist()\u001B[38;5;132;01m}\u001B[39;00m\u001B[38;5;124m\"\u001B[39m,\u001B[38;5;124mf\u001B[39m\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mAll index IDs: \u001B[39m\u001B[38;5;132;01m{\u001B[39;00mindex\u001B[38;5;241m.\u001B[39mget_ids_list()\u001B[38;5;132;01m}\u001B[39;00m\u001B[38;5;124m\"\u001B[39m)\n\u001B[1;32m 30\u001B[0m \u001B[38;5;28;01mexcept\u001B[39;00m \u001B[38;5;167;01mException\u001B[39;00m \u001B[38;5;28;01mas\u001B[39;00m e:\n\u001B[1;32m 31\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mCannot return\u001B[39m\u001B[38;5;124m\"\u001B[39m \u001B[38;5;129;01mnot\u001B[39;00m \u001B[38;5;129;01min\u001B[39;00m \u001B[38;5;28mstr\u001B[39m(e):\n", | |
"\u001B[0;31mException\u001B[0m: ('Error in cycle 5', 'ID: 5', 'Deleted IDs: [1, 2, 0, 6, 5, 7]', 'Query Response Labels: [3, 9, 4, 5, 8, 10]', 'All index IDs: [10, 7, 9, 4, 8, 3]')" | |
] | |
} | |
], | |
"execution_count": 1 | |
}, | |
{ | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2024-08-05T11:16:16.752248Z", | |
"start_time": "2024-08-05T11:16:16.750843Z" | |
} | |
}, | |
"cell_type": "code", | |
"source": "", | |
"id": "182388f341cbd54a", | |
"outputs": [], | |
"execution_count": 1 | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"outputs": [], | |
"execution_count": null, | |
"source": "", | |
"id": "bcf7cbd3996dd5b9" | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 2 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython2", | |
"version": "2.7.6" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 5 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment