Skip to content

Instantly share code, notes, and snippets.

@kacperlukawski
Last active October 14, 2024 18:26
Show Gist options
  • Save kacperlukawski/961aaa7946f55110abfcd37fbe869b8f to your computer and use it in GitHub Desktop.
Save kacperlukawski/961aaa7946f55110abfcd37fbe869b8f to your computer and use it in GitHub Desktop.
Using multiple vectors in Qdrant 0.10
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 23,
"metadata": {
"collapsed": true
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Using custom data configuration ChristophSchuhmann--MS_COCO_2017_URL_TEXT-01533eba737b92f6\n",
"Found cached dataset parquet (/home/kacper/.cache/huggingface/datasets/ChristophSchuhmann___parquet/ChristophSchuhmann--MS_COCO_2017_URL_TEXT-01533eba737b92f6/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)\n"
]
},
{
"data": {
"text/plain": " 0%| | 0/1 [00:00<?, ?it/s]",
"application/vnd.jupyter.widget-view+json": {
"version_major": 2,
"version_minor": 0,
"model_id": "4035dc6877d646aeb575846acf29a91b"
}
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from datasets import load_dataset\n",
"\n",
"dataset = load_dataset(\"ChristophSchuhmann/MS_COCO_2017_URL_TEXT\")"
]
},
{
"cell_type": "code",
"execution_count": 24,
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"dataset_df = pd.DataFrame(dataset[\"train\"])"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 25,
"outputs": [],
"source": [
"from pathlib import Path\n",
"from urllib.request import urlretrieve\n",
"from embetter.base import EmbetterBase\n",
"\n",
"\n",
"class DownloadFile(EmbetterBase):\n",
"\n",
" def __init__(self, out_dir: Path):\n",
" self.out_dir = out_dir\n",
"\n",
" def transform(self, X, y=None):\n",
" output_paths = []\n",
" for x in X:\n",
" output_file = self.out_dir / Path(x).name\n",
" urlretrieve(x, output_file)\n",
" output_paths.append(str(output_file))\n",
" return output_paths"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 26,
"outputs": [],
"source": [
"from qdrant_client import QdrantClient\n",
"from qdrant_client.http.models import VectorParams, Distance\n",
"\n",
"client = QdrantClient(timeout=None)\n",
"client.recreate_collection(\n",
" collection_name=\"ms-coco-2017\",\n",
" vectors_config={\n",
" \"text\": VectorParams(\n",
" size=384,\n",
" distance=Distance.EUCLID,\n",
" ),\n",
" \"image\": VectorParams(\n",
" size=1000,\n",
" distance=Distance.COSINE,\n",
" ),\n",
" },\n",
")"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 34,
"outputs": [],
"source": [
"from sklearn.pipeline import make_pipeline\n",
"from embetter.grab import ColumnGrabber\n",
"from embetter.vision import ImageLoader, TimmEncoder\n",
"from embetter.text import SentenceEncoder\n",
"\n",
"output_directory = Path(\"./images\")\n",
"\n",
"image_pipeline = make_pipeline(\n",
" ColumnGrabber(\"URL\"),\n",
" DownloadFile(output_directory),\n",
" ImageLoader(),\n",
" TimmEncoder(\"vit_base_patch16_224\"),\n",
")\n",
"\n",
"text_pipeline = make_pipeline(\n",
" ColumnGrabber(\"TEXT\"),\n",
" SentenceEncoder(\"all-MiniLM-L6-v2\"),\n",
")"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 37,
"outputs": [],
"source": [
"sample_df = dataset_df.sample(n=2000, random_state=643)\n",
"image_vectors = image_pipeline.transform(sample_df)\n",
"text_vectors = text_pipeline.transform(sample_df)\n",
"sample_df[\"image_vector\"] = image_vectors.tolist()\n",
"sample_df[\"text_vector\"] = text_vectors.tolist()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 39,
"outputs": [],
"source": [
"from qdrant_client.http.models import PointStruct\n",
"\n",
"for index, row in sample_df.iterrows():\n",
" client.upsert(\n",
" collection_name=\"ms-coco-2017\",\n",
" points=[\n",
" PointStruct(\n",
" id=index,\n",
" vector={\n",
" \"text\": row[\"text_vector\"],\n",
" \"image\": row[\"image_vector\"],\n",
" },\n",
" payload={\n",
" \"url\": row[\"URL\"],\n",
" \"text\": row[\"TEXT\"],\n",
" }\n",
" )\n",
" ]\n",
" )"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 46,
"outputs": [],
"source": [
"test_df = dataset_df.sample(n=3, random_state=724)\n",
"test_image_vectors = image_pipeline.transform(test_df)\n",
"test_text_vectors = text_pipeline.transform(test_df)\n",
"test_df[\"image_vector\"] = test_image_vectors.tolist()\n",
"test_df[\"text_vector\"] = test_text_vectors.tolist()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 71,
"outputs": [],
"source": [
"pd.set_option(\"display.max_colwidth\", None)\n",
"pd.set_option(\"display.max_columns\", None)\n",
"pd.set_option(\"display.expand_frame_repr\", False)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 75,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Query: A pile of burnt carrots and a glove sitting in the snow.\n"
]
},
{
"data": {
"text/html": "<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>text</th>\n <th>url</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>It's tough to find a meal under the snow.</td>\n <td>http://images.cocodataset.org/train2017/000000003926.jpg</td>\n </tr>\n <tr>\n <th>1</th>\n <td>carrots, bananas and some other food sitting on a table</td>\n <td>http://images.cocodataset.org/train2017/000000002624.jpg</td>\n </tr>\n <tr>\n <th>2</th>\n <td>Some noodles with some carrots and chicken in it.</td>\n <td>http://images.cocodataset.org/train2017/000000111606.jpg</td>\n </tr>\n <tr>\n <th>3</th>\n <td>A row of snow boards sticking out of the snow.</td>\n <td>http://images.cocodataset.org/train2017/000000000201.jpg</td>\n </tr>\n <tr>\n <th>4</th>\n <td>A cutting board topped with carrots and onion and a dessert.</td>\n <td>http://images.cocodataset.org/train2017/000000460833.jpg</td>\n </tr>\n </tbody>\n</table>"
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Query: A couple of cows standing in a grassy field.\n"
]
},
{
"data": {
"text/html": "<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>text</th>\n <th>url</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>a couple of cows stand in some tall grass</td>\n <td>http://images.cocodataset.org/train2017/000000007601.jpg</td>\n </tr>\n <tr>\n <th>1</th>\n <td>Multiple cows standing in a field staring forward.</td>\n <td>http://images.cocodataset.org/train2017/000000115680.jpg</td>\n </tr>\n <tr>\n <th>2</th>\n <td>A herd of three cows standing next to each other on a field.</td>\n <td>http://images.cocodataset.org/train2017/000000456919.jpg</td>\n </tr>\n <tr>\n <th>3</th>\n <td>Two cows that are laying down in the grass.</td>\n <td>http://images.cocodataset.org/train2017/000000392556.jpg</td>\n </tr>\n <tr>\n <th>4</th>\n <td>A large black and white cow standing in a desert field.</td>\n <td>http://images.cocodataset.org/train2017/000000134594.jpg</td>\n </tr>\n </tbody>\n</table>"
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Query: A computer desk topped with two monitors next to a TV.\n"
]
},
{
"data": {
"text/html": "<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>text</th>\n <th>url</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>Two computer monitors and a keyboard sitting on a desk.</td>\n <td>http://images.cocodataset.org/train2017/000000416002.jpg</td>\n </tr>\n <tr>\n <th>1</th>\n <td>A computer monitor, keyboard, and tower with peripherals and plugs sit on a desk.</td>\n <td>http://images.cocodataset.org/train2017/000000140122.jpg</td>\n </tr>\n <tr>\n <th>2</th>\n <td>a desk with a laptop a monitor and a keyboard</td>\n <td>http://images.cocodataset.org/train2017/000000273103.jpg</td>\n </tr>\n <tr>\n <th>3</th>\n <td>Two television monitors with a keyboard are on a table.</td>\n <td>http://images.cocodataset.org/train2017/000000416815.jpg</td>\n </tr>\n <tr>\n <th>4</th>\n <td>A computer desk with a laptop computer on it.</td>\n <td>http://images.cocodataset.org/train2017/000000459118.jpg</td>\n </tr>\n </tbody>\n</table>"
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from qdrant_client.http.models import NamedVector\n",
"from IPython.display import HTML, display_html\n",
"\n",
"for _, row in test_df.iterrows():\n",
" results = client.search(\n",
" collection_name=\"ms-coco-2017\",\n",
" query_vector=NamedVector(\n",
" name=\"text\",\n",
" vector=row[\"text_vector\"],\n",
" ),\n",
" limit=5,\n",
" with_vectors=False,\n",
" with_payload=True,\n",
" )\n",
"\n",
" results_df = pd.DataFrame([point.payload for point in results])\n",
" print(\"Query:\", row[\"TEXT\"])\n",
" display_html(HTML(results_df.to_html()))"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 77,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Query: http://images.cocodataset.org/train2017/000000399680.jpg\n"
]
},
{
"data": {
"text/html": "<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>text</th>\n <th>url</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>Potted plants, branches and pruning shears are on a table.</td>\n <td>http://images.cocodataset.org/train2017/000000129407.jpg</td>\n </tr>\n <tr>\n <th>1</th>\n <td>Cheese has been cut up beside diced and whole carrots.</td>\n <td>http://images.cocodataset.org/train2017/000000500190.jpg</td>\n </tr>\n <tr>\n <th>2</th>\n <td>A cutting board topped with carrots and onion and a dessert.</td>\n <td>http://images.cocodataset.org/train2017/000000460833.jpg</td>\n </tr>\n <tr>\n <th>3</th>\n <td>A banana with a razor blade in it made to look like an operation.</td>\n <td>http://images.cocodataset.org/train2017/000000435883.jpg</td>\n </tr>\n </tbody>\n</table>"
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Query: http://images.cocodataset.org/train2017/000000544114.jpg\n"
]
},
{
"data": {
"text/html": "<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>text</th>\n <th>url</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>Two cows that are laying down in the grass.</td>\n <td>http://images.cocodataset.org/train2017/000000392556.jpg</td>\n </tr>\n <tr>\n <th>1</th>\n <td>Several cows are grazing in a field along side a dirt road.</td>\n <td>http://images.cocodataset.org/train2017/000000162226.jpg</td>\n </tr>\n <tr>\n <th>2</th>\n <td>A large black and white cow standing in a desert field.</td>\n <td>http://images.cocodataset.org/train2017/000000134594.jpg</td>\n </tr>\n <tr>\n <th>3</th>\n <td>A group of three horses standing on a lush green field.</td>\n <td>http://images.cocodataset.org/train2017/000000249609.jpg</td>\n </tr>\n </tbody>\n</table>"
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Query: http://images.cocodataset.org/train2017/000000495680.jpg\n"
]
},
{
"data": {
"text/html": "<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>text</th>\n <th>url</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>a desk with a laptop a monitor and a keyboard</td>\n <td>http://images.cocodataset.org/train2017/000000273103.jpg</td>\n </tr>\n <tr>\n <th>1</th>\n <td>Cluttered elaborate work station with laptop open and on</td>\n <td>http://images.cocodataset.org/train2017/000000066816.jpg</td>\n </tr>\n <tr>\n <th>2</th>\n <td>A desk along a wall with book cases over head.</td>\n <td>http://images.cocodataset.org/train2017/000000069768.jpg</td>\n </tr>\n <tr>\n <th>3</th>\n <td>A desk in a dimly lit room with computers on it</td>\n <td>http://images.cocodataset.org/train2017/000000499300.jpg</td>\n </tr>\n </tbody>\n</table>"
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"for _, row in test_df.iterrows():\n",
" results = client.search(\n",
" collection_name=\"ms-coco-2017\",\n",
" query_vector=(\"image\", row[\"image_vector\"]),\n",
" limit=4,\n",
" with_vectors=False,\n",
" with_payload=True,\n",
" )\n",
"\n",
" results_df = pd.DataFrame([point.payload for point in results])\n",
" print(\"Query:\", row[\"URL\"])\n",
" display_html(HTML(results_df.to_html()))"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [],
"metadata": {
"collapsed": false
}
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment