Last active
December 11, 2023 04:32
-
-
Save kacperlukawski/2d3a3225f15a4cc5772cd1c81866340d to your computer and use it in GitHub Desktop.
Qdrant tips&tricks
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2023-03-13T09:01:38.772705Z", | |
"start_time": "2023-03-13T09:01:38.627212Z" | |
} | |
}, | |
"outputs": [], | |
"source": [ | |
"import config\n", | |
"import func" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2023-03-13T09:01:39.116822Z", | |
"start_time": "2023-03-13T09:01:38.774713Z" | |
} | |
}, | |
"outputs": [], | |
"source": [ | |
"import time\n", | |
"\n", | |
"from tqdm import tqdm\n", | |
"from qdrant_client import QdrantClient\n", | |
"from qdrant_client.http import models as rest" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Basic connection" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2023-03-13T09:01:39.205014Z", | |
"start_time": "2023-03-13T09:01:39.118666Z" | |
} | |
}, | |
"outputs": [], | |
"source": [ | |
"# Plain client: only the server URL is given, no transport preference.\n", | |
"client = QdrantClient(url=\"http://localhost\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2023-03-13T09:01:39.952500Z", | |
"start_time": "2023-03-13T09:01:39.207647Z" | |
} | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"True" | |
] | |
}, | |
"execution_count": 4, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Start from a fresh collection sized for the configured vectors, using cosine distance.\n", | |
"vector_params = rest.VectorParams(size=config.VECTOR_SIZE, distance=rest.Distance.COSINE)\n", | |
"client.recreate_collection(collection_name=config.COLLECTION_NAME, vectors_config=vector_params)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2023-03-13T09:14:40.339461Z", | |
"start_time": "2023-03-13T09:01:39.953855Z" | |
} | |
}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"101it [13:00, 7.73s/it] " | |
] | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 12min 7s, sys: 5.23 s, total: 12min 12s\n", | |
"Wall time: 13min\n" | |
] | |
}, | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"\n" | |
] | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"\n", | |
"# `max_num` and `batch_size` are reused by the later upload cells.\n", | |
"max_num = 50_000\n", | |
"batch_size = config.BATCH_SIZE\n", | |
"\n", | |
"# Stream the objects in batches; each batch unpacks into ids, vectors and payloads.\n", | |
"batches = func.batchify_objects(func.iterate_objects(max_num=max_num), n=batch_size)\n", | |
"for ids, vectors, payloads in tqdm(batches, total=max_num // batch_size):\n", | |
"    points = rest.Batch(ids=ids, vectors=vectors, payloads=payloads)\n", | |
"    client.upsert(collection_name=config.COLLECTION_NAME, points=points)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2023-03-13T09:14:40.348596Z", | |
"start_time": "2023-03-13T09:14:40.342240Z" | |
} | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"CollectionInfo(status=<CollectionStatus.GREEN: 'green'>, optimizer_status=<OptimizersStatusOneOf.OK: 'ok'>, vectors_count=50001, indexed_vectors_count=47500, points_count=50001, segments_count=2, config=CollectionConfig(params=CollectionParams(vectors=VectorParams(size=2048, distance=<Distance.COSINE: 'Cosine'>), shard_number=1, replication_factor=1, write_consistency_factor=1, on_disk_payload=True), hnsw_config=HnswConfig(m=16, ef_construct=100, full_scan_threshold=10000, max_indexing_threads=0, on_disk=False, payload_m=None), optimizer_config=OptimizersConfig(deleted_threshold=0.2, vacuum_min_vector_number=1000, default_segment_number=0, max_segment_size=None, memmap_threshold=None, indexing_threshold=20000, flush_interval_sec=5, max_optimization_threads=1), wal_config=WalConfig(wal_capacity_mb=32, wal_segments_ahead=0)), payload_schema={})" | |
] | |
}, | |
"execution_count": 6, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Inspect the collection (status, vector and index counts) after the upload.\n", | |
"client.get_collection(collection_name=config.COLLECTION_NAME)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# gRPC protocol" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2023-03-13T09:14:40.600739Z", | |
"start_time": "2023-03-13T09:14:40.349853Z" | |
} | |
}, | |
"outputs": [], | |
"source": [ | |
"# Same server as before, but ask the client to prefer gRPC.\n", | |
"client = QdrantClient(url=\"http://localhost\", prefer_grpc=True)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2023-03-13T09:14:41.047721Z", | |
"start_time": "2023-03-13T09:14:40.602615Z" | |
} | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"True" | |
] | |
}, | |
"execution_count": 8, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Reset the collection before repeating the upload through the gRPC-preferring client.\n", | |
"vector_params = rest.VectorParams(size=config.VECTOR_SIZE, distance=rest.Distance.COSINE)\n", | |
"client.recreate_collection(collection_name=config.COLLECTION_NAME, vectors_config=vector_params)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2023-03-13T09:21:29.766200Z", | |
"start_time": "2023-03-13T09:14:41.049769Z" | |
}, | |
"pycharm": { | |
"is_executing": true | |
} | |
}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"51it [06:48, 8.01s/it] " | |
] | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 6min 10s, sys: 3.34 s, total: 6min 13s\n", | |
"Wall time: 6min 48s\n" | |
] | |
}, | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"\n" | |
] | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"\n", | |
"# Derive the doubled batch size from the config value instead of the current\n", | |
"# `batch_size`, so re-running this cell does not double it again each time.\n", | |
"batch_size = config.BATCH_SIZE * 2\n", | |
"\n", | |
"# `max_num` comes from the first upload cell.\n", | |
"objects = func.iterate_objects(max_num=max_num)\n", | |
"batched_objects = func.batchify_objects(objects, n=batch_size)\n", | |
"for batch in tqdm(batched_objects, total=max_num // batch_size):\n", | |
"    # Each batch unpacks into parallel ids / vectors / payloads sequences.\n", | |
"    ids, vectors, payloads = batch\n", | |
"    client.upsert(\n", | |
"        collection_name=config.COLLECTION_NAME,\n", | |
"        points=rest.Batch(\n", | |
"            ids=ids,\n", | |
"            vectors=vectors,\n", | |
"            payloads=payloads,\n", | |
"        ),\n", | |
"    )" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2023-03-13T09:21:29.777569Z", | |
"start_time": "2023-03-13T09:21:29.770752Z" | |
}, | |
"pycharm": { | |
"is_executing": true | |
} | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"CollectionInfo(status=<CollectionStatus.YELLOW: 'yellow'>, optimizer_status=<OptimizersStatusOneOf.OK: 'ok'>, vectors_count=50001, indexed_vectors_count=45000, points_count=50001, segments_count=2, config=CollectionConfig(params=CollectionParams(vectors=VectorParams(size=2048, distance=<Distance.COSINE: 'Cosine'>), shard_number=1, replication_factor=1, write_consistency_factor=1, on_disk_payload=True), hnsw_config=HnswConfig(m=16, ef_construct=100, full_scan_threshold=10000, max_indexing_threads=0, on_disk=False, payload_m=None), optimizer_config=OptimizersConfig(deleted_threshold=0.2, vacuum_min_vector_number=1000, default_segment_number=0, max_segment_size=None, memmap_threshold=None, indexing_threshold=20000, flush_interval_sec=5, max_optimization_threads=1), wal_config=WalConfig(wal_capacity_mb=32, wal_segments_ahead=0)), payload_schema={})" | |
] | |
}, | |
"execution_count": 10, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Inspect the collection after the gRPC upload.\n", | |
"client.get_collection(collection_name=config.COLLECTION_NAME)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Indexing threshold" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 22, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2023-03-13T11:47:30.851036Z", | |
"start_time": "2023-03-13T11:47:30.382652Z" | |
}, | |
"pycharm": { | |
"is_executing": true | |
} | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"True" | |
] | |
}, | |
"execution_count": 22, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"client.recreate_collection(\n", | |
"    collection_name=config.COLLECTION_NAME,\n", | |
"    vectors_config=rest.VectorParams(\n", | |
"        size=config.VECTOR_SIZE,\n", | |
"        distance=rest.Distance.COSINE,\n", | |
"    ),\n", | |
"    optimizers_config=rest.OptimizersConfigDiff(\n", | |
"        # Threshold is in KB: 1_000_000_000 KB is one billion KB (~1 TB), far above this\n", | |
"        # dataset, so no index is built during upload (the status cell after the upload\n", | |
"        # reports indexed_vectors_count=0).\n", | |
"        indexing_threshold=1_000_000_000,\n", | |
"    ),\n", | |
")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 23, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2023-03-13T11:52:00.538505Z", | |
"start_time": "2023-03-13T11:47:30.852851Z" | |
}, | |
"pycharm": { | |
"is_executing": true | |
} | |
}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"51it [04:29, 5.29s/it] " | |
] | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 4min 19s, sys: 1.8 s, total: 4min 20s\n", | |
"Wall time: 4min 29s\n" | |
] | |
}, | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"\n" | |
] | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"\n", | |
"# Reuses `max_num` and `batch_size` as left by the earlier upload cells.\n", | |
"batches = func.batchify_objects(func.iterate_objects(max_num=max_num), n=batch_size)\n", | |
"for ids, vectors, payloads in tqdm(batches, total=max_num // batch_size):\n", | |
"    points = rest.Batch(ids=ids, vectors=vectors, payloads=payloads)\n", | |
"    client.upsert(collection_name=config.COLLECTION_NAME, points=points)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 24, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2023-03-13T11:52:00.546753Z", | |
"start_time": "2023-03-13T11:52:00.541890Z" | |
}, | |
"pycharm": { | |
"is_executing": true | |
} | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"CollectionInfo(status=<CollectionStatus.GREEN: 'green'>, optimizer_status=<OptimizersStatusOneOf.OK: 'ok'>, vectors_count=50001, indexed_vectors_count=0, points_count=50001, segments_count=8, config=CollectionConfig(params=CollectionParams(vectors=VectorParams(size=2048, distance=<Distance.COSINE: 'Cosine'>), shard_number=1, replication_factor=1, write_consistency_factor=1, on_disk_payload=True), hnsw_config=HnswConfig(m=16, ef_construct=100, full_scan_threshold=10000, max_indexing_threads=0, on_disk=False, payload_m=None), optimizer_config=OptimizersConfig(deleted_threshold=0.2, vacuum_min_vector_number=1000, default_segment_number=0, max_segment_size=None, memmap_threshold=None, indexing_threshold=1000000000, flush_interval_sec=5, max_optimization_threads=1), wal_config=WalConfig(wal_capacity_mb=32, wal_segments_ahead=0)), payload_schema={})" | |
] | |
}, | |
"execution_count": 24, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Inspect the collection after uploading with the raised indexing threshold.\n", | |
"client.get_collection(collection_name=config.COLLECTION_NAME)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 25, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2023-03-13T11:52:00.634095Z", | |
"start_time": "2023-03-13T11:52:00.548790Z" | |
}, | |
"pycharm": { | |
"is_executing": true | |
} | |
}, | |
"outputs": [], | |
"source": [ | |
"# Lower the indexing threshold so the optimizer builds the index for the uploaded data.\n", | |
"# NOTE(review): newer qdrant-client releases name this argument `optimizers_config` -\n", | |
"# confirm against the installed version.\n", | |
"client.update_collection(\n", | |
"    collection_name=config.COLLECTION_NAME,\n", | |
"    optimizer_config=rest.OptimizersConfigDiff(\n", | |
"        indexing_threshold=10_000,  # 10K KB\n", | |
"    ),\n", | |
")\n", | |
"\n", | |
"# Wait for indexing to finish. The status can still be GREEN right after the update,\n", | |
"# before the optimizer has started, so GREEN alone is not enough: also require that\n", | |
"# some vectors are reported as indexed. Sleep between polls instead of spinning, and\n", | |
"# give up after a timeout rather than hanging the notebook forever.\n", | |
"poll_interval_sec = 1.0\n", | |
"wait_timeout_sec = 1800.0\n", | |
"deadline = time.monotonic() + wait_timeout_sec\n", | |
"while True:\n", | |
"    collection_info = client.get_collection(collection_name=config.COLLECTION_NAME)\n", | |
"    is_green = collection_info.status == rest.CollectionStatus.GREEN\n", | |
"    # `indexed_vectors_count` is treated as 0 when it is missing.\n", | |
"    if is_green and (collection_info.indexed_vectors_count or 0) > 0:\n", | |
"        break\n", | |
"    if time.monotonic() >= deadline:\n", | |
"        raise TimeoutError(\"collection was not indexed within the timeout\")\n", | |
"    time.sleep(poll_interval_sec)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 26, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2023-03-13T11:52:00.641505Z", | |
"start_time": "2023-03-13T11:52:00.636482Z" | |
}, | |
"pycharm": { | |
"is_executing": true | |
} | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"CollectionInfo(status=<CollectionStatus.GREEN: 'green'>, optimizer_status=<OptimizersStatusOneOf.OK: 'ok'>, vectors_count=50001, indexed_vectors_count=0, points_count=50001, segments_count=8, config=CollectionConfig(params=CollectionParams(vectors=VectorParams(size=2048, distance=<Distance.COSINE: 'Cosine'>), shard_number=1, replication_factor=1, write_consistency_factor=1, on_disk_payload=True), hnsw_config=HnswConfig(m=16, ef_construct=100, full_scan_threshold=10000, max_indexing_threads=0, on_disk=False, payload_m=None), optimizer_config=OptimizersConfig(deleted_threshold=0.2, vacuum_min_vector_number=1000, default_segment_number=0, max_segment_size=None, memmap_threshold=None, indexing_threshold=10000, flush_interval_sec=5, max_optimization_threads=1), wal_config=WalConfig(wal_capacity_mb=32, wal_segments_ahead=0)), payload_schema={})" | |
] | |
}, | |
"execution_count": 26, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Inspect the collection after lowering the indexing threshold.\n", | |
"client.get_collection(collection_name=config.COLLECTION_NAME)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3 (ipykernel)", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.10.6" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 1 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment