Last active
August 13, 2024 10:32
-
-
Save alonsosilvaallende/52abf6d54695193692f07bd72488ca48 to your computer and use it in GitHub Desktop.
Exploration-LanceDB
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"id": "aaeb1763-a761-4024-a795-235f76020c34", | |
"metadata": { | |
"execution": { | |
"iopub.execute_input": "2024-08-12T18:58:24.350170Z", | |
"iopub.status.busy": "2024-08-12T18:58:24.349420Z", | |
"iopub.status.idle": "2024-08-12T18:58:24.374615Z", | |
"shell.execute_reply": "2024-08-12T18:58:24.373367Z", | |
"shell.execute_reply.started": "2024-08-12T18:58:24.350119Z" | |
} | |
}, | |
"outputs": [], | |
"source": [ | |
"%load_ext autoreload\n", | |
"%autoreload 2" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"id": "d176441d-70e4-4ff8-aef5-231c4275b87a", | |
"metadata": { | |
"execution": { | |
"iopub.execute_input": "2024-08-12T18:58:24.375768Z", | |
"iopub.status.busy": "2024-08-12T18:58:24.375470Z", | |
"iopub.status.idle": "2024-08-12T18:58:24.392561Z", | |
"shell.execute_reply": "2024-08-12T18:58:24.391722Z", | |
"shell.execute_reply.started": "2024-08-12T18:58:24.375741Z" | |
} | |
}, | |
"outputs": [], | |
"source": [ | |
"# Clean up all the directories used in this notebook\n", | |
"import shutil\n", | |
"\n", | |
"shutil.rmtree(\"./data\", ignore_errors=True)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"id": "c5ab97e1-fed6-489b-b342-88af3fe50adb", | |
"metadata": { | |
"execution": { | |
"iopub.execute_input": "2024-08-12T18:58:24.393704Z", | |
"iopub.status.busy": "2024-08-12T18:58:24.393450Z", | |
"iopub.status.idle": "2024-08-12T18:58:24.415500Z", | |
"shell.execute_reply": "2024-08-12T18:58:24.414326Z", | |
"shell.execute_reply.started": "2024-08-12T18:58:24.393680Z" | |
} | |
}, | |
"outputs": [], | |
"source": [ | |
"polars_df = {\n", | |
" \"vector\": [[3.1, 4.1], [5.9, 26.5]],\n", | |
" \"text\": [\"Frodo was a happy puppy\", \"There are several kittens playing\"]\n", | |
"}" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"id": "57578e77-7e33-44f4-938e-681b26513967", | |
"metadata": { | |
"execution": { | |
"iopub.execute_input": "2024-08-12T18:58:24.417103Z", | |
"iopub.status.busy": "2024-08-12T18:58:24.416649Z", | |
"iopub.status.idle": "2024-08-12T18:58:24.528264Z", | |
"shell.execute_reply": "2024-08-12T18:58:24.527797Z", | |
"shell.execute_reply.started": "2024-08-12T18:58:24.417072Z" | |
} | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"shape: (2, 2)\n", | |
"┌─────────────┬─────────────────────────────────┐\n", | |
"│ vector ┆ text │\n", | |
"│ --- ┆ --- │\n", | |
"│ list[f64] ┆ str │\n", | |
"╞═════════════╪═════════════════════════════════╡\n", | |
"│ [3.1, 4.1] ┆ Frodo was a happy puppy │\n", | |
"│ [5.9, 26.5] ┆ There are several kittens play… │\n", | |
"└─────────────┴─────────────────────────────────┘\n" | |
] | |
} | |
], | |
"source": [ | |
"import polars as pl\n", | |
"\n", | |
"data = pl.DataFrame(polars_df)\n", | |
"print(data)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"id": "d9b929d2-6ea7-4bdc-9b25-56d799aa97ed", | |
"metadata": { | |
"execution": { | |
"iopub.execute_input": "2024-08-12T18:58:24.529222Z", | |
"iopub.status.busy": "2024-08-12T18:58:24.528899Z", | |
"iopub.status.idle": "2024-08-12T18:58:25.073719Z", | |
"shell.execute_reply": "2024-08-12T18:58:25.072682Z", | |
"shell.execute_reply.started": "2024-08-12T18:58:24.529203Z" | |
} | |
}, | |
"outputs": [], | |
"source": [ | |
"import lancedb\n", | |
"\n", | |
"db = lancedb.connect(\"data/\")\n", | |
"table = db.create_table(\"pl_table\", data=data)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "3dfaae42-e3ef-4d69-8893-83078cb88714", | |
"metadata": {}, | |
"source": [ | |
"## Semantic search" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"id": "6215120a-e8bd-4b88-9286-145587330ab6", | |
"metadata": { | |
"execution": { | |
"iopub.execute_input": "2024-08-12T18:58:25.077144Z", | |
"iopub.status.busy": "2024-08-12T18:58:25.076696Z", | |
"iopub.status.idle": "2024-08-12T18:58:25.098824Z", | |
"shell.execute_reply": "2024-08-12T18:58:25.098066Z", | |
"shell.execute_reply.started": "2024-08-12T18:58:25.077108Z" | |
} | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"shape: (2, 3)\n", | |
"┌───────────────┬─────────────────────────────────┬────────────┐\n", | |
"│ vector ┆ text ┆ _distance │\n", | |
"│ --- ┆ --- ┆ --- │\n", | |
"│ array[f32, 2] ┆ str ┆ f32 │\n", | |
"╞═══════════════╪═════════════════════════════════╪════════════╡\n", | |
"│ [3.1, 4.1] ┆ Frodo was a happy puppy ┆ 0.02 │\n", | |
"│ [5.9, 26.5] ┆ There are several kittens play… ┆ 514.659973 │\n", | |
"└───────────────┴─────────────────────────────────┴────────────┘\n" | |
] | |
} | |
], | |
"source": [ | |
"# make a vector query to find the nearest neighbors\n", | |
"query = [3.0, 4.0]\n", | |
"result = table.search(query).limit(10).to_polars()\n", | |
"print(result)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "3d181a12-c5ad-40d9-ba6e-d1802dc0a1ff", | |
"metadata": {}, | |
"source": [ | |
"## Full text search" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"id": "2da73dd9-5236-44f1-8ce8-42e83a3555e3", | |
"metadata": { | |
"execution": { | |
"iopub.execute_input": "2024-08-12T18:58:25.100750Z", | |
"iopub.status.busy": "2024-08-12T18:58:25.100067Z", | |
"iopub.status.idle": "2024-08-12T18:58:25.413241Z", | |
"shell.execute_reply": "2024-08-12T18:58:25.411764Z", | |
"shell.execute_reply.started": "2024-08-12T18:58:25.100705Z" | |
} | |
}, | |
"outputs": [], | |
"source": [ | |
"# create an index to do full-text search\n", | |
"table.create_fts_index(\"text\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"id": "e3c06e3b-ee4a-4157-a6c1-7a958e7b8b6a", | |
"metadata": { | |
"execution": { | |
"iopub.execute_input": "2024-08-12T18:58:25.415663Z", | |
"iopub.status.busy": "2024-08-12T18:58:25.415080Z", | |
"iopub.status.idle": "2024-08-12T18:58:25.447343Z", | |
"shell.execute_reply": "2024-08-12T18:58:25.445956Z", | |
"shell.execute_reply.started": "2024-08-12T18:58:25.415610Z" | |
} | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"shape: (1, 3)\n", | |
"┌───────────────┬─────────────────────────────────┬──────────┐\n", | |
"│ vector ┆ text ┆ score │\n", | |
"│ --- ┆ --- ┆ --- │\n", | |
"│ array[f32, 2] ┆ str ┆ f64 │\n", | |
"╞═══════════════╪═════════════════════════════════╪══════════╡\n", | |
"│ [5.9, 26.5] ┆ There are several kittens play… ┆ 0.693147 │\n", | |
"└───────────────┴─────────────────────────────────┴──────────┘\n" | |
] | |
} | |
], | |
"source": [ | |
"# only one of the texts contains the word \"kittens\"\n", | |
"result = table.search(\"kittens\").select([\"vector\", \"text\"]).limit(10).to_polars()\n", | |
"print(result)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "b26560d3-ddee-4d84-a4bf-d16bd88b9182", | |
"metadata": {}, | |
"source": [ | |
"## Hybrid search" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"id": "da1d7683-90d8-4b18-ab46-88d758cdcab6", | |
"metadata": { | |
"execution": { | |
"iopub.execute_input": "2024-08-12T18:58:25.449771Z", | |
"iopub.status.busy": "2024-08-12T18:58:25.449124Z", | |
"iopub.status.idle": "2024-08-12T18:58:25.476130Z", | |
"shell.execute_reply": "2024-08-12T18:58:25.474945Z", | |
"shell.execute_reply.started": "2024-08-12T18:58:25.449718Z" | |
} | |
}, | |
"outputs": [], | |
"source": [ | |
"# load the sentence-transformers embedding model through LanceDB's embedding registry\n", | |
"from lancedb.embeddings import get_registry\n", | |
"\n", | |
"model = get_registry().get(\"sentence-transformers\").create(name=\"BAAI/bge-small-en-v1.5\", device=\"cuda\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"id": "3636a7c7-ad83-4806-b367-42011d5f48da", | |
"metadata": { | |
"execution": { | |
"iopub.execute_input": "2024-08-12T18:58:25.478465Z", | |
"iopub.status.busy": "2024-08-12T18:58:25.477720Z", | |
"iopub.status.idle": "2024-08-12T18:58:32.328574Z", | |
"shell.execute_reply": "2024-08-12T18:58:32.327437Z", | |
"shell.execute_reply.started": "2024-08-12T18:58:25.478416Z" | |
} | |
}, | |
"outputs": [], | |
"source": [ | |
"# we store them with a pydantic class\n", | |
"from lancedb.pydantic import LanceModel, Vector\n", | |
"\n", | |
"class Words(LanceModel):\n", | |
" text: str = model.SourceField()\n", | |
" vector: Vector(model.ndims()) = model.VectorField()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"id": "05f16cd9-c2b3-4c6d-9dc5-8c57df0bd23d", | |
"metadata": { | |
"execution": { | |
"iopub.execute_input": "2024-08-12T18:58:32.330478Z", | |
"iopub.status.busy": "2024-08-12T18:58:32.329913Z", | |
"iopub.status.idle": "2024-08-12T18:58:32.379299Z", | |
"shell.execute_reply": "2024-08-12T18:58:32.378467Z", | |
"shell.execute_reply.started": "2024-08-12T18:58:32.330439Z" | |
} | |
}, | |
"outputs": [], | |
"source": [ | |
"# we create a table with the pydantic schema and add some texts\n", | |
"table = db.create_table(\"words\", schema=Words)\n", | |
"table.add(\n", | |
" [\n", | |
" {\"text\": \"hello world\"},\n", | |
" {\"text\": \"goodbye world\"}\n", | |
" ]\n", | |
")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"id": "f5d4d72c-81c1-470c-9cb0-d56bf4c9915c", | |
"metadata": { | |
"execution": { | |
"iopub.execute_input": "2024-08-12T18:58:32.381262Z", | |
"iopub.status.busy": "2024-08-12T18:58:32.380777Z", | |
"iopub.status.idle": "2024-08-12T18:58:32.754448Z", | |
"shell.execute_reply": "2024-08-12T18:58:32.753268Z", | |
"shell.execute_reply.started": "2024-08-12T18:58:32.381217Z" | |
} | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"shape: (2, 3)\n", | |
"┌───────────────┬─────────────────────────────────┬───────────┐\n", | |
"│ text ┆ vector ┆ _distance │\n", | |
"│ --- ┆ --- ┆ --- │\n", | |
"│ str ┆ array[f32, 384] ┆ f32 │\n", | |
"╞═══════════════╪═════════════════════════════════╪═══════════╡\n", | |
"│ hello world ┆ [0.015196, -0.022571, … 0.0260… ┆ 0.0 │\n", | |
"│ goodbye world ┆ [0.010943, -0.005271, … -0.004… ┆ 1.0 │\n", | |
"└───────────────┴─────────────────────────────────┴───────────┘\n" | |
] | |
} | |
], | |
"source": [ | |
"table.create_fts_index(\"text\")\n", | |
"query = \"greetings\"\n", | |
"actual = table.search(query, query_type=\"hybrid\").limit(10).to_polars()\n", | |
"print(actual)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "9ae3f048-9158-49b1-a47d-839040bb1395", | |
"metadata": {}, | |
"source": [ | |
"## Hybrid search with all scores" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"id": "e091e02a-93a6-450a-917c-80cf771c83b4", | |
"metadata": { | |
"execution": { | |
"iopub.execute_input": "2024-08-12T18:58:32.756710Z", | |
"iopub.status.busy": "2024-08-12T18:58:32.756163Z", | |
"iopub.status.idle": "2024-08-12T18:58:32.812473Z", | |
"shell.execute_reply": "2024-08-12T18:58:32.811552Z", | |
"shell.execute_reply.started": "2024-08-12T18:58:32.756661Z" | |
} | |
}, | |
"outputs": [], | |
"source": [ | |
"# pydantic schema for the documents table: text is the source field, vector is the model's vector field\n", | |
"class Documents(LanceModel):\n", | |
" vector: Vector(model.ndims()) = model.VectorField()\n", | |
" text: str = model.SourceField()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"id": "9a071b11-c1e9-4572-8d12-7d38672631a7", | |
"metadata": { | |
"execution": { | |
"iopub.execute_input": "2024-08-12T18:58:32.813647Z", | |
"iopub.status.busy": "2024-08-12T18:58:32.813352Z", | |
"iopub.status.idle": "2024-08-12T18:58:32.841303Z", | |
"shell.execute_reply": "2024-08-12T18:58:32.840529Z", | |
"shell.execute_reply.started": "2024-08-12T18:58:32.813625Z" | |
} | |
}, | |
"outputs": [], | |
"source": [ | |
"# create table with the pydantic schema\n", | |
"table = db.create_table(\"documents\", schema=Documents)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 15, | |
"id": "5b7ce689-b753-47f8-acd9-93e3cdcc3716", | |
"metadata": { | |
"execution": { | |
"iopub.execute_input": "2024-08-12T18:58:32.842356Z", | |
"iopub.status.busy": "2024-08-12T18:58:32.842113Z", | |
"iopub.status.idle": "2024-08-12T18:58:32.893150Z", | |
"shell.execute_reply": "2024-08-12T18:58:32.891778Z", | |
"shell.execute_reply.started": "2024-08-12T18:58:32.842337Z" | |
} | |
}, | |
"outputs": [], | |
"source": [ | |
"# define data\n", | |
"data = [\n", | |
" { \"text\": \"rebel spaceships striking from a hidden base\"},\n", | |
" { \"text\": \"have won their first victory against the evil Galactic Empire\"},\n", | |
" { \"text\": \"during the battle rebel spies managed to steal secret plans\"},\n", | |
" { \"text\": \"to the Empire's ultimate weapon the Death Star\"}\n", | |
"]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 16, | |
"id": "4fd2f73c-2bc7-4278-83c4-99c1dddb81fc", | |
"metadata": { | |
"execution": { | |
"iopub.execute_input": "2024-08-12T18:58:32.895200Z", | |
"iopub.status.busy": "2024-08-12T18:58:32.894722Z", | |
"iopub.status.idle": "2024-08-12T18:58:32.961347Z", | |
"shell.execute_reply": "2024-08-12T18:58:32.960657Z", | |
"shell.execute_reply.started": "2024-08-12T18:58:32.895154Z" | |
} | |
}, | |
"outputs": [], | |
"source": [ | |
"# ingest docs with auto-vectorization\n", | |
"table.add(data)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 17, | |
"id": "4759bc0b-99ad-4ee4-a13e-1fe4ec3fe1bd", | |
"metadata": { | |
"execution": { | |
"iopub.execute_input": "2024-08-12T18:58:32.962194Z", | |
"iopub.status.busy": "2024-08-12T18:58:32.962006Z", | |
"iopub.status.idle": "2024-08-12T18:58:33.477636Z", | |
"shell.execute_reply": "2024-08-12T18:58:33.475962Z", | |
"shell.execute_reply.started": "2024-08-12T18:58:32.962176Z" | |
} | |
}, | |
"outputs": [], | |
"source": [ | |
"# Create a full-text search (FTS) index before running the hybrid search\n", | |
"table.create_fts_index(\"text\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 18, | |
"id": "5c4ef667-3d79-4e7c-9567-c466a1fe96fb", | |
"metadata": { | |
"execution": { | |
"iopub.execute_input": "2024-08-12T18:58:33.480048Z", | |
"iopub.status.busy": "2024-08-12T18:58:33.479478Z", | |
"iopub.status.idle": "2024-08-12T18:58:33.523897Z", | |
"shell.execute_reply": "2024-08-12T18:58:33.522430Z", | |
"shell.execute_reply.started": "2024-08-12T18:58:33.479998Z" | |
} | |
}, | |
"outputs": [], | |
"source": [ | |
"# define a reranker if you want different weights than the defaults and want all scores returned\n", | |
"from lancedb.rerankers import LinearCombinationReranker\n", | |
"\n", | |
"reranker = LinearCombinationReranker(weight=0.7, return_score=\"all\") # Use 0.7 as the weight for vector search" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 19, | |
"id": "80583129-99c6-4862-8d1c-d237243ef1cf", | |
"metadata": { | |
"execution": { | |
"iopub.execute_input": "2024-08-12T18:58:33.526160Z", | |
"iopub.status.busy": "2024-08-12T18:58:33.525584Z", | |
"iopub.status.idle": "2024-08-12T18:58:33.603270Z", | |
"shell.execute_reply": "2024-08-12T18:58:33.602241Z", | |
"shell.execute_reply.started": "2024-08-12T18:58:33.526094Z" | |
} | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"shape: (4, 5)\n", | |
"┌──────────────────────────┬────────────────────────────────┬───────────┬───────┬──────────────────┐\n", | |
"│ vector ┆ text ┆ _distance ┆ score ┆ _relevance_score │\n", | |
"│ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", | |
"│ array[f32, 384] ┆ str ┆ f32 ┆ f32 ┆ f32 │\n", | |
"╞══════════════════════════╪════════════════════════════════╪═══════════╪═══════╪══════════════════╡\n", | |
"│ [-0.068654, 0.05788, … ┆ during the battle rebel spies ┆ 0.0 ┆ 1.0 ┆ 1.0 │\n", | |
"│ -0.0432… ┆ … ┆ ┆ ┆ │\n", | |
"│ [0.007457, 0.010016, … ┆ rebel spaceships striking ┆ 0.408497 ┆ 0.0 ┆ 0.414052 │\n", | |
"│ -0.0164… ┆ from… ┆ ┆ ┆ │\n", | |
"│ [-0.062896, -0.000119, … ┆ to the Empire's ultimate ┆ 0.881678 ┆ null ┆ 0.082825 │\n", | |
"│ 0.035… ┆ weapo… ┆ ┆ ┆ │\n", | |
"│ [-0.044833, 0.007395, … ┆ have won their first victory ┆ 1.0 ┆ null ┆ 0.0 │\n", | |
"│ -0.018… ┆ a… ┆ ┆ ┆ │\n", | |
"└──────────────────────────┴────────────────────────────────┴───────────┴───────┴──────────────────┘\n" | |
] | |
} | |
], | |
"source": [ | |
"# hybrid search with re-ranker\n", | |
"result = table.search(\"battle rebel\", query_type=\"hybrid\").rerank(reranker=reranker).to_polars()\n", | |
"print(result)" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3 (ipykernel)", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.10.12" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 5 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment