Skip to content

Instantly share code, notes, and snippets.

@alonsosilvaallende
Last active August 13, 2024 10:32
Show Gist options
  • Save alonsosilvaallende/52abf6d54695193692f07bd72488ca48 to your computer and use it in GitHub Desktop.
Save alonsosilvaallende/52abf6d54695193692f07bd72488ca48 to your computer and use it in GitHub Desktop.
Exploration-LanceDB
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "aaeb1763-a761-4024-a795-235f76020c34",
"metadata": {
"execution": {
"iopub.execute_input": "2024-08-12T18:58:24.350170Z",
"iopub.status.busy": "2024-08-12T18:58:24.349420Z",
"iopub.status.idle": "2024-08-12T18:58:24.374615Z",
"shell.execute_reply": "2024-08-12T18:58:24.373367Z",
"shell.execute_reply.started": "2024-08-12T18:58:24.350119Z"
}
},
"outputs": [],
"source": [
"%load_ext autoreload\n",
"%autoreload 2"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "d176441d-70e4-4ff8-aef5-231c4275b87a",
"metadata": {
"execution": {
"iopub.execute_input": "2024-08-12T18:58:24.375768Z",
"iopub.status.busy": "2024-08-12T18:58:24.375470Z",
"iopub.status.idle": "2024-08-12T18:58:24.392561Z",
"shell.execute_reply": "2024-08-12T18:58:24.391722Z",
"shell.execute_reply.started": "2024-08-12T18:58:24.375741Z"
}
},
"outputs": [],
"source": [
"# Clean up all the directories used in this notebook\n",
"import shutil\n",
"\n",
"shutil.rmtree(\"./data\", ignore_errors=True)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "c5ab97e1-fed6-489b-b342-88af3fe50adb",
"metadata": {
"execution": {
"iopub.execute_input": "2024-08-12T18:58:24.393704Z",
"iopub.status.busy": "2024-08-12T18:58:24.393450Z",
"iopub.status.idle": "2024-08-12T18:58:24.415500Z",
"shell.execute_reply": "2024-08-12T18:58:24.414326Z",
"shell.execute_reply.started": "2024-08-12T18:58:24.393680Z"
}
},
"outputs": [],
"source": [
"polars_df = {\n",
" \"vector\": [[3.1, 4.1], [5.9, 26.5]],\n",
" \"text\": [\"Frodo was a happy puppy\", \"There are several kittens playing\"]\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "57578e77-7e33-44f4-938e-681b26513967",
"metadata": {
"execution": {
"iopub.execute_input": "2024-08-12T18:58:24.417103Z",
"iopub.status.busy": "2024-08-12T18:58:24.416649Z",
"iopub.status.idle": "2024-08-12T18:58:24.528264Z",
"shell.execute_reply": "2024-08-12T18:58:24.527797Z",
"shell.execute_reply.started": "2024-08-12T18:58:24.417072Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"shape: (2, 2)\n",
"┌─────────────┬─────────────────────────────────┐\n",
"│ vector ┆ text │\n",
"│ --- ┆ --- │\n",
"│ list[f64] ┆ str │\n",
"╞═════════════╪═════════════════════════════════╡\n",
"│ [3.1, 4.1] ┆ Frodo was a happy puppy │\n",
"│ [5.9, 26.5] ┆ There are several kittens play… │\n",
"└─────────────┴─────────────────────────────────┘\n"
]
}
],
"source": [
"import polars as pl\n",
"\n",
"data = pl.DataFrame(polars_df)\n",
"print(data)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "d9b929d2-6ea7-4bdc-9b25-56d799aa97ed",
"metadata": {
"execution": {
"iopub.execute_input": "2024-08-12T18:58:24.529222Z",
"iopub.status.busy": "2024-08-12T18:58:24.528899Z",
"iopub.status.idle": "2024-08-12T18:58:25.073719Z",
"shell.execute_reply": "2024-08-12T18:58:25.072682Z",
"shell.execute_reply.started": "2024-08-12T18:58:24.529203Z"
}
},
"outputs": [],
"source": [
"import lancedb\n",
"\n",
"db = lancedb.connect(\"data/\")\n",
"table = db.create_table(\"pl_table\", data=data)"
]
},
{
"cell_type": "markdown",
"id": "3dfaae42-e3ef-4d69-8893-83078cb88714",
"metadata": {},
"source": [
"## Semantic search"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "6215120a-e8bd-4b88-9286-145587330ab6",
"metadata": {
"execution": {
"iopub.execute_input": "2024-08-12T18:58:25.077144Z",
"iopub.status.busy": "2024-08-12T18:58:25.076696Z",
"iopub.status.idle": "2024-08-12T18:58:25.098824Z",
"shell.execute_reply": "2024-08-12T18:58:25.098066Z",
"shell.execute_reply.started": "2024-08-12T18:58:25.077108Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"shape: (2, 3)\n",
"┌───────────────┬─────────────────────────────────┬────────────┐\n",
"│ vector ┆ text ┆ _distance │\n",
"│ --- ┆ --- ┆ --- │\n",
"│ array[f32, 2] ┆ str ┆ f32 │\n",
"╞═══════════════╪═════════════════════════════════╪════════════╡\n",
"│ [3.1, 4.1] ┆ Frodo was a happy puppy ┆ 0.02 │\n",
"│ [5.9, 26.5] ┆ There are several kittens play… ┆ 514.659973 │\n",
"└───────────────┴─────────────────────────────────┴────────────┘\n"
]
}
],
"source": [
"# make a vector query to find the nearest neighbors\n",
"query = [3.0, 4.0]\n",
"result = table.search(query).limit(10).to_polars()\n",
"print(result)"
]
},
{
"cell_type": "markdown",
"id": "3d181a12-c5ad-40d9-ba6e-d1802dc0a1ff",
"metadata": {},
"source": [
"## Full text search"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "2da73dd9-5236-44f1-8ce8-42e83a3555e3",
"metadata": {
"execution": {
"iopub.execute_input": "2024-08-12T18:58:25.100750Z",
"iopub.status.busy": "2024-08-12T18:58:25.100067Z",
"iopub.status.idle": "2024-08-12T18:58:25.413241Z",
"shell.execute_reply": "2024-08-12T18:58:25.411764Z",
"shell.execute_reply.started": "2024-08-12T18:58:25.100705Z"
}
},
"outputs": [],
"source": [
"# create an index to do full-text search\n",
"table.create_fts_index(\"text\")"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "e3c06e3b-ee4a-4157-a6c1-7a958e7b8b6a",
"metadata": {
"execution": {
"iopub.execute_input": "2024-08-12T18:58:25.415663Z",
"iopub.status.busy": "2024-08-12T18:58:25.415080Z",
"iopub.status.idle": "2024-08-12T18:58:25.447343Z",
"shell.execute_reply": "2024-08-12T18:58:25.445956Z",
"shell.execute_reply.started": "2024-08-12T18:58:25.415610Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"shape: (1, 3)\n",
"┌───────────────┬─────────────────────────────────┬──────────┐\n",
"│ vector ┆ text ┆ score │\n",
"│ --- ┆ --- ┆ --- │\n",
"│ array[f32, 2] ┆ str ┆ f64 │\n",
"╞═══════════════╪═════════════════════════════════╪══════════╡\n",
"│ [5.9, 26.5] ┆ There are several kittens play… ┆ 0.693147 │\n",
"└───────────────┴─────────────────────────────────┴──────────┘\n"
]
}
],
"source": [
"# only one of the texts contain the word \"kittens\"\n",
"result = table.search(\"kittens\").select([\"vector\", \"text\"]).limit(10).to_polars()\n",
"print(result)"
]
},
{
"cell_type": "markdown",
"id": "b26560d3-ddee-4d84-a4bf-d16bd88b9182",
"metadata": {},
"source": [
"## Hybrid search"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "da1d7683-90d8-4b18-ab46-88d758cdcab6",
"metadata": {
"execution": {
"iopub.execute_input": "2024-08-12T18:58:25.449771Z",
"iopub.status.busy": "2024-08-12T18:58:25.449124Z",
"iopub.status.idle": "2024-08-12T18:58:25.476130Z",
"shell.execute_reply": "2024-08-12T18:58:25.474945Z",
"shell.execute_reply.started": "2024-08-12T18:58:25.449718Z"
}
},
"outputs": [],
"source": [
"# weird way of doing embeddings\n",
"from lancedb.embeddings import get_registry\n",
"\n",
"model = get_registry().get(\"sentence-transformers\").create(name=\"BAAI/bge-small-en-v1.5\", device=\"cuda\")"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "3636a7c7-ad83-4806-b367-42011d5f48da",
"metadata": {
"execution": {
"iopub.execute_input": "2024-08-12T18:58:25.478465Z",
"iopub.status.busy": "2024-08-12T18:58:25.477720Z",
"iopub.status.idle": "2024-08-12T18:58:32.328574Z",
"shell.execute_reply": "2024-08-12T18:58:32.327437Z",
"shell.execute_reply.started": "2024-08-12T18:58:25.478416Z"
}
},
"outputs": [],
"source": [
"# we store them with a pydantic class\n",
"from lancedb.pydantic import LanceModel, Vector\n",
"\n",
"class Words(LanceModel):\n",
" text: str = model.SourceField()\n",
" vector: Vector(model.ndims()) = model.VectorField()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "05f16cd9-c2b3-4c6d-9dc5-8c57df0bd23d",
"metadata": {
"execution": {
"iopub.execute_input": "2024-08-12T18:58:32.330478Z",
"iopub.status.busy": "2024-08-12T18:58:32.329913Z",
"iopub.status.idle": "2024-08-12T18:58:32.379299Z",
"shell.execute_reply": "2024-08-12T18:58:32.378467Z",
"shell.execute_reply.started": "2024-08-12T18:58:32.330439Z"
}
},
"outputs": [],
"source": [
"# we create a table with the pydantic schema and add some texts\n",
"table = db.create_table(\"words\", schema=Words)\n",
"table.add(\n",
" [\n",
" {\"text\": \"hello world\"},\n",
" {\"text\": \"goodbye world\"}\n",
" ]\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "f5d4d72c-81c1-470c-9cb0-d56bf4c9915c",
"metadata": {
"execution": {
"iopub.execute_input": "2024-08-12T18:58:32.381262Z",
"iopub.status.busy": "2024-08-12T18:58:32.380777Z",
"iopub.status.idle": "2024-08-12T18:58:32.754448Z",
"shell.execute_reply": "2024-08-12T18:58:32.753268Z",
"shell.execute_reply.started": "2024-08-12T18:58:32.381217Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"shape: (2, 3)\n",
"┌───────────────┬─────────────────────────────────┬───────────┐\n",
"│ text ┆ vector ┆ _distance │\n",
"│ --- ┆ --- ┆ --- │\n",
"│ str ┆ array[f32, 384] ┆ f32 │\n",
"╞═══════════════╪═════════════════════════════════╪═══════════╡\n",
"│ hello world ┆ [0.015196, -0.022571, … 0.0260… ┆ 0.0 │\n",
"│ goodbye world ┆ [0.010943, -0.005271, … -0.004… ┆ 1.0 │\n",
"└───────────────┴─────────────────────────────────┴───────────┘\n"
]
}
],
"source": [
"table.create_fts_index(\"text\")\n",
"query = \"greetings\"\n",
"actual = table.search(query, query_type=\"hybrid\").limit(10).to_polars()\n",
"print(actual)"
]
},
{
"cell_type": "markdown",
"id": "9ae3f048-9158-49b1-a47d-839040bb1395",
"metadata": {},
"source": [
"## Hybrid search with all scores"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "e091e02a-93a6-450a-917c-80cf771c83b4",
"metadata": {
"execution": {
"iopub.execute_input": "2024-08-12T18:58:32.756710Z",
"iopub.status.busy": "2024-08-12T18:58:32.756163Z",
"iopub.status.idle": "2024-08-12T18:58:32.812473Z",
"shell.execute_reply": "2024-08-12T18:58:32.811552Z",
"shell.execute_reply.started": "2024-08-12T18:58:32.756661Z"
}
},
"outputs": [],
"source": [
"# weird way of doing embeddings\n",
"class Documents(LanceModel):\n",
" vector: Vector(model.ndims()) = model.VectorField()\n",
" text: str = model.SourceField()"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "9a071b11-c1e9-4572-8d12-7d38672631a7",
"metadata": {
"execution": {
"iopub.execute_input": "2024-08-12T18:58:32.813647Z",
"iopub.status.busy": "2024-08-12T18:58:32.813352Z",
"iopub.status.idle": "2024-08-12T18:58:32.841303Z",
"shell.execute_reply": "2024-08-12T18:58:32.840529Z",
"shell.execute_reply.started": "2024-08-12T18:58:32.813625Z"
}
},
"outputs": [],
"source": [
"# create table with the pydantic schema\n",
"table = db.create_table(\"documents\", schema=Documents)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "5b7ce689-b753-47f8-acd9-93e3cdcc3716",
"metadata": {
"execution": {
"iopub.execute_input": "2024-08-12T18:58:32.842356Z",
"iopub.status.busy": "2024-08-12T18:58:32.842113Z",
"iopub.status.idle": "2024-08-12T18:58:32.893150Z",
"shell.execute_reply": "2024-08-12T18:58:32.891778Z",
"shell.execute_reply.started": "2024-08-12T18:58:32.842337Z"
}
},
"outputs": [],
"source": [
"# define data\n",
"data = [\n",
" { \"text\": \"rebel spaceships striking from a hidden base\"},\n",
" { \"text\": \"have won their first victory against the evil Galactic Empire\"},\n",
" { \"text\": \"during the battle rebel spies managed to steal secret plans\"},\n",
" { \"text\": \"to the Empire's ultimate weapon the Death Star\"}\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "4fd2f73c-2bc7-4278-83c4-99c1dddb81fc",
"metadata": {
"execution": {
"iopub.execute_input": "2024-08-12T18:58:32.895200Z",
"iopub.status.busy": "2024-08-12T18:58:32.894722Z",
"iopub.status.idle": "2024-08-12T18:58:32.961347Z",
"shell.execute_reply": "2024-08-12T18:58:32.960657Z",
"shell.execute_reply.started": "2024-08-12T18:58:32.895154Z"
}
},
"outputs": [],
"source": [
"# ingest docs with auto-vectorization\n",
"table.add(data)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "4759bc0b-99ad-4ee4-a13e-1fe4ec3fe1bd",
"metadata": {
"execution": {
"iopub.execute_input": "2024-08-12T18:58:32.962194Z",
"iopub.status.busy": "2024-08-12T18:58:32.962006Z",
"iopub.status.idle": "2024-08-12T18:58:33.477636Z",
"shell.execute_reply": "2024-08-12T18:58:33.475962Z",
"shell.execute_reply.started": "2024-08-12T18:58:32.962176Z"
}
},
"outputs": [],
"source": [
"# Create a fts index before the hybrid search\n",
"table.create_fts_index(\"text\")"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "5c4ef667-3d79-4e7c-9567-c466a1fe96fb",
"metadata": {
"execution": {
"iopub.execute_input": "2024-08-12T18:58:33.480048Z",
"iopub.status.busy": "2024-08-12T18:58:33.479478Z",
"iopub.status.idle": "2024-08-12T18:58:33.523897Z",
"shell.execute_reply": "2024-08-12T18:58:33.522430Z",
"shell.execute_reply.started": "2024-08-12T18:58:33.479998Z"
}
},
"outputs": [],
"source": [
"# define reranker if you want different weigths than the defaults and to return all scores\n",
"from lancedb.rerankers import LinearCombinationReranker\n",
"\n",
"reranker = LinearCombinationReranker(weight=0.7, return_score=\"all\") # Use 0.7 as the weight for vector search"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "80583129-99c6-4862-8d1c-d237243ef1cf",
"metadata": {
"execution": {
"iopub.execute_input": "2024-08-12T18:58:33.526160Z",
"iopub.status.busy": "2024-08-12T18:58:33.525584Z",
"iopub.status.idle": "2024-08-12T18:58:33.603270Z",
"shell.execute_reply": "2024-08-12T18:58:33.602241Z",
"shell.execute_reply.started": "2024-08-12T18:58:33.526094Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"shape: (4, 5)\n",
"┌──────────────────────────┬────────────────────────────────┬───────────┬───────┬──────────────────┐\n",
"│ vector ┆ text ┆ _distance ┆ score ┆ _relevance_score │\n",
"│ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n",
"│ array[f32, 384] ┆ str ┆ f32 ┆ f32 ┆ f32 │\n",
"╞══════════════════════════╪════════════════════════════════╪═══════════╪═══════╪══════════════════╡\n",
"│ [-0.068654, 0.05788, … ┆ during the battle rebel spies ┆ 0.0 ┆ 1.0 ┆ 1.0 │\n",
"│ -0.0432… ┆ … ┆ ┆ ┆ │\n",
"│ [0.007457, 0.010016, … ┆ rebel spaceships striking ┆ 0.408497 ┆ 0.0 ┆ 0.414052 │\n",
"│ -0.0164… ┆ from… ┆ ┆ ┆ │\n",
"│ [-0.062896, -0.000119, … ┆ to the Empire's ultimate ┆ 0.881678 ┆ null ┆ 0.082825 │\n",
"│ 0.035… ┆ weapo… ┆ ┆ ┆ │\n",
"│ [-0.044833, 0.007395, … ┆ have won their first victory ┆ 1.0 ┆ null ┆ 0.0 │\n",
"│ -0.018… ┆ a… ┆ ┆ ┆ │\n",
"└──────────────────────────┴────────────────────────────────┴───────────┴───────┴──────────────────┘\n"
]
}
],
"source": [
"# hybrid search with re-ranker\n",
"result = table.search(\"battle rebel\", query_type=\"hybrid\").rerank(reranker=reranker).to_polars()\n",
"print(result)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment