Last active
October 14, 2024 18:26
-
-
Save kacperlukawski/961aaa7946f55110abfcd37fbe869b8f to your computer and use it in GitHub Desktop.
Using multiple vectors in Qdrant 0.10
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 23, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"Using custom data configuration ChristophSchuhmann--MS_COCO_2017_URL_TEXT-01533eba737b92f6\n", | |
"Found cached dataset parquet (/home/kacper/.cache/huggingface/datasets/ChristophSchuhmann___parquet/ChristophSchuhmann--MS_COCO_2017_URL_TEXT-01533eba737b92f6/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/plain": " 0%| | 0/1 [00:00<?, ?it/s]", | |
"application/vnd.jupyter.widget-view+json": { | |
"version_major": 2, | |
"version_minor": 0, | |
"model_id": "4035dc6877d646aeb575846acf29a91b" | |
} | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"from datasets import load_dataset\n", | |
"\n", | |
"dataset = load_dataset(\"ChristophSchuhmann/MS_COCO_2017_URL_TEXT\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 24, | |
"outputs": [], | |
"source": [ | |
"import pandas as pd\n", | |
"\n", | |
"dataset_df = pd.DataFrame(dataset[\"train\"])" | |
], | |
"metadata": { | |
"collapsed": false | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 25, | |
"outputs": [], | |
"source": [ | |
"from pathlib import Path\n", | |
"from urllib.request import urlretrieve\n", | |
"from embetter.base import EmbetterBase\n", | |
"\n", | |
"\n", | |
"class DownloadFile(EmbetterBase):\n", | |
"\n", | |
" def __init__(self, out_dir: Path):\n", | |
" self.out_dir = out_dir\n", | |
"\n", | |
" def transform(self, X, y=None):\n", | |
" output_paths = []\n", | |
" for x in X:\n", | |
" output_file = self.out_dir / Path(x).name\n", | |
" urlretrieve(x, output_file)\n", | |
" output_paths.append(str(output_file))\n", | |
" return output_paths" | |
], | |
"metadata": { | |
"collapsed": false | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 26, | |
"outputs": [], | |
"source": [ | |
"from qdrant_client import QdrantClient\n", | |
"from qdrant_client.http.models import VectorParams, Distance\n", | |
"\n", | |
"client = QdrantClient(timeout=None)\n", | |
"client.recreate_collection(\n", | |
" collection_name=\"ms-coco-2017\",\n", | |
" vectors_config={\n", | |
" \"text\": VectorParams(\n", | |
" size=384,\n", | |
" distance=Distance.EUCLID,\n", | |
" ),\n", | |
" \"image\": VectorParams(\n", | |
" size=1000,\n", | |
" distance=Distance.COSINE,\n", | |
" ),\n", | |
" },\n", | |
")" | |
], | |
"metadata": { | |
"collapsed": false | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 34, | |
"outputs": [], | |
"source": [ | |
"from sklearn.pipeline import make_pipeline\n", | |
"from embetter.grab import ColumnGrabber\n", | |
"from embetter.vision import ImageLoader, TimmEncoder\n", | |
"from embetter.text import SentenceEncoder\n", | |
"\n", | |
"output_directory = Path(\"./images\")\n", | |
"\n", | |
"image_pipeline = make_pipeline(\n", | |
" ColumnGrabber(\"URL\"),\n", | |
" DownloadFile(output_directory),\n", | |
" ImageLoader(),\n", | |
" TimmEncoder(\"vit_base_patch16_224\"),\n", | |
")\n", | |
"\n", | |
"text_pipeline = make_pipeline(\n", | |
" ColumnGrabber(\"TEXT\"),\n", | |
" SentenceEncoder(\"all-MiniLM-L6-v2\"),\n", | |
")" | |
], | |
"metadata": { | |
"collapsed": false | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 37, | |
"outputs": [], | |
"source": [ | |
"sample_df = dataset_df.sample(n=2000, random_state=643)\n", | |
"image_vectors = image_pipeline.transform(sample_df)\n", | |
"text_vectors = text_pipeline.transform(sample_df)\n", | |
"sample_df[\"image_vector\"] = image_vectors.tolist()\n", | |
"sample_df[\"text_vector\"] = text_vectors.tolist()" | |
], | |
"metadata": { | |
"collapsed": false | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 39, | |
"outputs": [], | |
"source": [ | |
"from qdrant_client.http.models import PointStruct\n", | |
"\n", | |
"for index, row in sample_df.iterrows():\n", | |
" client.upsert(\n", | |
" collection_name=\"ms-coco-2017\",\n", | |
" points=[\n", | |
" PointStruct(\n", | |
" id=index,\n", | |
" vector={\n", | |
" \"text\": row[\"text_vector\"],\n", | |
" \"image\": row[\"image_vector\"],\n", | |
" },\n", | |
" payload={\n", | |
" \"url\": row[\"URL\"],\n", | |
" \"text\": row[\"TEXT\"],\n", | |
" }\n", | |
" )\n", | |
" ]\n", | |
" )" | |
], | |
"metadata": { | |
"collapsed": false | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 46, | |
"outputs": [], | |
"source": [ | |
"test_df = dataset_df.sample(n=3, random_state=724)\n", | |
"test_image_vectors = image_pipeline.transform(test_df)\n", | |
"test_text_vectors = text_pipeline.transform(test_df)\n", | |
"test_df[\"image_vector\"] = test_image_vectors.tolist()\n", | |
"test_df[\"text_vector\"] = test_text_vectors.tolist()" | |
], | |
"metadata": { | |
"collapsed": false | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 71, | |
"outputs": [], | |
"source": [ | |
"pd.set_option(\"display.max_colwidth\", None)\n", | |
"pd.set_option(\"display.max_columns\", None)\n", | |
"pd.set_option(\"display.expand_frame_repr\", False)" | |
], | |
"metadata": { | |
"collapsed": false | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 75, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Query: A pile of burnt carrots and a glove sitting in the snow.\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/html": "<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>text</th>\n <th>url</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>It's tough to find a meal under the snow.</td>\n <td>http://images.cocodataset.org/train2017/000000003926.jpg</td>\n </tr>\n <tr>\n <th>1</th>\n <td>carrots, bananas and some other food sitting on a table</td>\n <td>http://images.cocodataset.org/train2017/000000002624.jpg</td>\n </tr>\n <tr>\n <th>2</th>\n <td>Some noodles with some carrots and chicken in it.</td>\n <td>http://images.cocodataset.org/train2017/000000111606.jpg</td>\n </tr>\n <tr>\n <th>3</th>\n <td>A row of snow boards sticking out of the snow.</td>\n <td>http://images.cocodataset.org/train2017/000000000201.jpg</td>\n </tr>\n <tr>\n <th>4</th>\n <td>A cutting board topped with carrots and onion and a dessert.</td>\n <td>http://images.cocodataset.org/train2017/000000460833.jpg</td>\n </tr>\n </tbody>\n</table>" | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Query: A couple of cows standing in a grassy field.\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/html": "<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>text</th>\n <th>url</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>a couple of cows stand in some tall grass</td>\n <td>http://images.cocodataset.org/train2017/000000007601.jpg</td>\n </tr>\n <tr>\n <th>1</th>\n <td>Multiple cows standing in a field staring forward.</td>\n <td>http://images.cocodataset.org/train2017/000000115680.jpg</td>\n </tr>\n <tr>\n <th>2</th>\n <td>A herd of three cows standing next to each other on a field.</td>\n <td>http://images.cocodataset.org/train2017/000000456919.jpg</td>\n </tr>\n <tr>\n <th>3</th>\n <td>Two cows that are laying down in the grass.</td>\n <td>http://images.cocodataset.org/train2017/000000392556.jpg</td>\n </tr>\n <tr>\n <th>4</th>\n <td>A large black and white cow standing in a desert field.</td>\n <td>http://images.cocodataset.org/train2017/000000134594.jpg</td>\n </tr>\n </tbody>\n</table>" | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Query: A computer desk topped with two monitors next to a TV.\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/html": "<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>text</th>\n <th>url</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>Two computer monitors and a keyboard sitting on a desk.</td>\n <td>http://images.cocodataset.org/train2017/000000416002.jpg</td>\n </tr>\n <tr>\n <th>1</th>\n <td>A computer monitor, keyboard, and tower with peripherals and plugs sit on a desk.</td>\n <td>http://images.cocodataset.org/train2017/000000140122.jpg</td>\n </tr>\n <tr>\n <th>2</th>\n <td>a desk with a laptop a monitor and a keyboard</td>\n <td>http://images.cocodataset.org/train2017/000000273103.jpg</td>\n </tr>\n <tr>\n <th>3</th>\n <td>Two television monitors with a keyboard are on a table.</td>\n <td>http://images.cocodataset.org/train2017/000000416815.jpg</td>\n </tr>\n <tr>\n <th>4</th>\n <td>A computer desk with a laptop computer on it.</td>\n <td>http://images.cocodataset.org/train2017/000000459118.jpg</td>\n </tr>\n </tbody>\n</table>" | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"from qdrant_client.http.models import NamedVector\n", | |
"from IPython.display import HTML, display_html\n", | |
"\n", | |
"for _, row in test_df.iterrows():\n", | |
" results = client.search(\n", | |
" collection_name=\"ms-coco-2017\",\n", | |
" query_vector=NamedVector(\n", | |
" name=\"text\",\n", | |
" vector=row[\"text_vector\"],\n", | |
" ),\n", | |
" limit=5,\n", | |
" with_vectors=False,\n", | |
" with_payload=True,\n", | |
" )\n", | |
"\n", | |
" results_df = pd.DataFrame([point.payload for point in results])\n", | |
" print(\"Query:\", row[\"TEXT\"])\n", | |
" display_html(HTML(results_df.to_html()))" | |
], | |
"metadata": { | |
"collapsed": false | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 77, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Query: http://images.cocodataset.org/train2017/000000399680.jpg\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/html": "<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>text</th>\n <th>url</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>Potted plants, branches and pruning shears are on a table.</td>\n <td>http://images.cocodataset.org/train2017/000000129407.jpg</td>\n </tr>\n <tr>\n <th>1</th>\n <td>Cheese has been cut up beside diced and whole carrots.</td>\n <td>http://images.cocodataset.org/train2017/000000500190.jpg</td>\n </tr>\n <tr>\n <th>2</th>\n <td>A cutting board topped with carrots and onion and a dessert.</td>\n <td>http://images.cocodataset.org/train2017/000000460833.jpg</td>\n </tr>\n <tr>\n <th>3</th>\n <td>A banana with a razor blade in it made to look like an operation.</td>\n <td>http://images.cocodataset.org/train2017/000000435883.jpg</td>\n </tr>\n </tbody>\n</table>" | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Query: http://images.cocodataset.org/train2017/000000544114.jpg\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/html": "<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>text</th>\n <th>url</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>Two cows that are laying down in the grass.</td>\n <td>http://images.cocodataset.org/train2017/000000392556.jpg</td>\n </tr>\n <tr>\n <th>1</th>\n <td>Several cows are grazing in a field along side a dirt road.</td>\n <td>http://images.cocodataset.org/train2017/000000162226.jpg</td>\n </tr>\n <tr>\n <th>2</th>\n <td>A large black and white cow standing in a desert field.</td>\n <td>http://images.cocodataset.org/train2017/000000134594.jpg</td>\n </tr>\n <tr>\n <th>3</th>\n <td>A group of three horses standing on a lush green field.</td>\n <td>http://images.cocodataset.org/train2017/000000249609.jpg</td>\n </tr>\n </tbody>\n</table>" | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Query: http://images.cocodataset.org/train2017/000000495680.jpg\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/html": "<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>text</th>\n <th>url</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>a desk with a laptop a monitor and a keyboard</td>\n <td>http://images.cocodataset.org/train2017/000000273103.jpg</td>\n </tr>\n <tr>\n <th>1</th>\n <td>Cluttered elaborate work station with laptop open and on</td>\n <td>http://images.cocodataset.org/train2017/000000066816.jpg</td>\n </tr>\n <tr>\n <th>2</th>\n <td>A desk along a wall with book cases over head.</td>\n <td>http://images.cocodataset.org/train2017/000000069768.jpg</td>\n </tr>\n <tr>\n <th>3</th>\n <td>A desk in a dimly lit room with computers on it</td>\n <td>http://images.cocodataset.org/train2017/000000499300.jpg</td>\n </tr>\n </tbody>\n</table>" | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"for _, row in test_df.iterrows():\n", | |
" results = client.search(\n", | |
" collection_name=\"ms-coco-2017\",\n", | |
" query_vector=(\"image\", row[\"image_vector\"]),\n", | |
" limit=4,\n", | |
" with_vectors=False,\n", | |
" with_payload=True,\n", | |
" )\n", | |
"\n", | |
" results_df = pd.DataFrame([point.payload for point in results])\n", | |
" print(\"Query:\", row[\"URL\"])\n", | |
" display_html(HTML(results_df.to_html()))" | |
], | |
"metadata": { | |
"collapsed": false | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"outputs": [], | |
"source": [], | |
"metadata": { | |
"collapsed": false | |
} | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 2 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython2", | |
"version": "2.7.6" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 0 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment