Skip to content

Instantly share code, notes, and snippets.

@pims
Forked from raidendotai/vectorsearch.html
Created October 10, 2024 13:34
Show Gist options
  • Save pims/80cca8b12743b890c7a5e1ff070372c9 to your computer and use it in GitHub Desktop.
Save pims/80cca8b12743b890c7a5e1ff070372c9 to your computer and use it in GitHub Desktop.
In-browser, local vector similarity search, powered by pglite
<!doctype html>
<html lang="en">
<head>
<meta charset="UTF-8" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<title>Semantic Search Demo</title>
<link
href="https://cdn.jsdelivr.net/npm/[email protected]/dist/tailwind.min.css"
rel="stylesheet"
/>
<script src="https://code.jquery.com/jquery-3.6.0.min.js"></script>
<style>
/* Hide scrollbars but keep them functional */
::-webkit-scrollbar {
width: 0;
background: transparent; /* Optional: to make the scrollbar background transparent */
}
/* For Firefox */
* {
scrollbar-width: none; /* Hide scrollbar for Firefox */
}
</style>
</head>
<body class="bg-gray-100">
<div class="container mx-auto p-12 xl:p-24 xl:py-12">
<h1 class="text-xl mb-6">in-browser, local, persistent semantic search & storage<br/>with pglite/pgvector</h1>
<div class="mb-4">
<input
id="search-input"
type="text"
class="w-full p-3 border rounded"
placeholder="Enter search term..."
/>
</div>
<button
id="search-button"
class="bg-blue-500 text-white px-4 py-2 rounded mb-6"
>
Search
</button>
<div class="grid lg:grid-cols-2 gap-4">
<div>
<h2 class="text-xl mb-2">DB content</h2>
<ul
id="db-list"
class="list-disc list-inside bg-white p-4 rounded shadow"
>
<!-- db data will appear here -->
</ul>
</div>
<div>
<h2 class="text-xl mb-2">Search results</h2>
<ul
id="results-list"
class="list-disc list-inside bg-white p-4 rounded shadow"
>
<!-- search results will appear here -->
</ul>
</div>
</div>
</div>
<script type="module">
import { PGlite } from "https://cdn.jsdelivr.net/npm/@electric-sql/[email protected]/dist/index.js";
import { vector } from "https://cdn.jsdelivr.net/npm/@electric-sql/[email protected]/dist/vector/index.js";
import { getEncoding } from "https://cdn.jsdelivr.net/npm/[email protected]/+esm";
// OpenAI API key placeholder. It must be edited by hand before the demo works;
// the value is sent as a Bearer token by getEmbeddings().
const OPENAI_API_KEY = "REPLACE_WITH_YOUR_OPENAI_API_KEY";
const LOCAL_DB_NAME = `vector-search-demo`; // name used in the `idb://` path passed to PGlite by getDB()
// Sample documents inserted by seedDB(). Each entry contains an embedded "\n".
const seed_data = [
"A juicy burger with melted cheese\nand crispy lettuce on a toasted bun.",
"Freshly baked pizza topped with\npepperoni, mushrooms, and olives.",
"A creamy bowl of pasta with\nrich Alfredo sauce and grilled chicken.",
"Crispy fried chicken served with\nbuttery mashed potatoes and gravy.",
"A refreshing salad with mixed greens,\navocado, and a tangy vinaigrette.",
"Grilled salmon with a side of\nsteamed vegetables and lemon butter.",
"A hearty beef stew with tender\ncarrots, potatoes, and savory broth.",
"A ripe banana, sweet and soft,\nperfect for a quick healthy snack.",
];
// Warn when the key is empty or still the placeholder. This only alerts:
// execution continues afterwards, so later embedding requests are still attempted.
if (
!OPENAI_API_KEY ||
OPENAI_API_KEY === "REPLACE_WITH_YOUR_OPENAI_API_KEY"
) {
alert(
'Please replace "REPLACE_WITH_YOUR_OPENAI_API_KEY" with your actual OpenAI API key in the script.',
);
}
// Tokenizer (cl100k_base encoding) used by getEmbeddings() to cut each text
// down to its token limit before the text is sent for embedding.
const enc = getEncoding("cl100k_base");
// No-op placeholder. The document-ready handler reassigns this to the function
// that re-renders #db-list; insertDb() calls whatever is assigned at that time.
let updateDbList = async () => {};
/**
 * Split an array into consecutive chunks of at most `size` elements.
 *
 * @param {Array} array - Items to split; the input is not mutated.
 * @param {number} size - Maximum chunk length; must be a positive integer.
 * @returns {Array<Array>} Chunks in the original order. Only the last chunk
 *   may be shorter than `size`; an empty input yields an empty array.
 * @throws {RangeError} If `size` is not a positive integer. A zero or negative
 *   step would never advance past the end of the array, so the loop would
 *   never terminate.
 */
const _chunkify = (array, size) => {
  if (!Number.isInteger(size) || size < 1) {
    throw new RangeError(
      `chunk size must be a positive integer, got ${String(size)}`,
    );
  }
  const chunks = [];
  for (let start = 0; start < array.length; start += size) {
    chunks.push(array.slice(start, start + size));
  }
  return chunks;
};
// pglite/pgvector integration courtesy of https://supabase.com/blog/in-browser-semantic-search-pglite
// Ready PGlite instance, set once initialization has completed.
let dbInstance = null;
// In-flight initialization shared by callers that arrive before the instance
// is ready, so that only one PGlite instance is ever constructed.
let dbInstancePromise = null;

/**
 * Return the shared PGlite database, creating it on first use.
 *
 * The database is opened at `idb://${LOCAL_DB_NAME}` with the `vector`
 * extension registered. Concurrent callers share a single initialization
 * instead of each constructing their own instance.
 *
 * @returns {Promise<PGlite>} The ready database instance.
 * @throws Whatever the PGlite constructor or `waitReady` rejects with. The
 *   cached promise is cleared on failure so a later call can retry.
 */
async function getDB() {
  if (dbInstance) {
    return dbInstance;
  }
  if (!dbInstancePromise) {
    dbInstancePromise = (async () => {
      const db = new PGlite(`idb://${LOCAL_DB_NAME}`, {
        extensions: {
          vector,
        },
      });
      await db.waitReady;
      dbInstance = db;
      return db;
    })().catch((error) => {
      // Do not cache a failed initialization forever.
      dbInstancePromise = null;
      throw error;
    });
  }
  return dbInstancePromise;
}
/**
 * Ensure the schema used by the demo exists.
 *
 * Runs one multi-statement script through `db.exec` that creates, if missing:
 * the `vector` extension, the `embeddings` table (`uid`, `content`, and a
 * `VECTOR(1536)` `embedding` column) and an HNSW index on `embedding` built
 * with the `vector_ip_ops` operator class.
 *
 * @param {object} db - Database handle exposing an async `exec(sql)` method.
 * @returns {Promise<void>}
 */
async function initSchema(db) {
  const schemaLines = [
    "",
    "CREATE EXTENSION IF NOT EXISTS vector;",
    "CREATE TABLE IF NOT EXISTS embeddings (",
    "uid TEXT NOT NULL UNIQUE,",
    "content TEXT,",
    "embedding VECTOR(1536)",
    ");",
    "CREATE INDEX IF NOT EXISTS embeddings_embedding_idx ON embeddings USING hnsw (embedding vector_ip_ops);",
    "",
  ];
  const schemaSql = schemaLines.join("\n");
  await db.exec(schemaSql);
}
/**
 * Count the rows of a table.
 *
 * NOTE: `table` is interpolated into the SQL text as-is, so it must be a
 * trusted identifier supplied by the code, never user input.
 *
 * @param {object} db - Database handle exposing an async `query(sql)` method.
 * @param {string} table - Name of the table to count.
 * @returns {Promise<*>} The `count` field of the first returned row.
 */
async function countRows(db, table) {
  const sql = `SELECT COUNT(*) FROM ${table};`;
  const { rows } = await db.query(sql);
  const [firstRow] = rows;
  return firstRow.count;
}
/**
 * Compute the SHA-256 digest of a string.
 *
 * The string is encoded with `TextEncoder` and hashed through
 * `window.crypto.subtle.digest`.
 *
 * @param {string} source - Text to hash.
 * @returns {Promise<string>} Lowercase hex digest, two characters per byte.
 */
async function sha256(source) {
  const encoded = new TextEncoder().encode(source);
  const hashBuffer = await window.crypto.subtle.digest("SHA-256", encoded);
  const toHexPair = (byte) => byte.toString(16).padStart(2, "0");
  return Array.from(new Uint8Array(hashBuffer), toHexPair).join("");
}
/**
 * Embed and store the texts of one chunk that are not in the table yet.
 *
 * All values are passed as query parameters, so text content (including
 * apostrophes) is stored unmodified and cannot alter the SQL statement.
 *
 * @param {{ chunk: Array<{uid: string, text: string}>, db: object }} args -
 *   `chunk` must be non-empty; `db` exposes an async `query(sql, params)`.
 * @returns {Promise<void>}
 */
async function _insertChunk({ chunk, db }) {
  const uidPlaceholders = chunk.map((_, idx) => `$${idx + 1}`).join(", ");
  const existing = await db.query(
    `SELECT uid FROM embeddings WHERE uid IN (${uidPlaceholders});`,
    chunk.map((entry) => entry.uid),
  );
  const existingUids = new Set(existing.rows.map((row) => row.uid));
  const newEntries = chunk.filter((entry) => !existingUids.has(entry.uid));
  if (newEntries.length === 0) {
    return;
  }

  const embeddings = await getEmbeddings({
    texts: newEntries.map((entry) => entry.text),
  });
  // getEmbeddings() reports its own errors and returns null on failure.
  if (!embeddings) {
    return;
  }
  if (embeddings.length !== newEntries.length) {
    console.error(
      `Expected ${newEntries.length} embeddings, received ${embeddings.length}; skipping chunk.`,
    );
    return;
  }

  // Three parameters per row: uid, content, embedding.
  const valuePlaceholders = newEntries
    .map((_, idx) => {
      const base = idx * 3;
      return `($${base + 1}, $${base + 2}, $${base + 3}::vector)`;
    })
    .join(",\n");
  const params = newEntries.flatMap((entry, idx) => [
    entry.uid,
    entry.text,
    JSON.stringify(embeddings[idx]),
  ]);
  // ON CONFLICT keeps the insert idempotent if the same uid is written
  // concurrently between the existence check above and this statement.
  await db.query(
    `INSERT INTO embeddings (uid, content, embedding) VALUES\n${valuePlaceholders}\nON CONFLICT (uid) DO NOTHING;`,
    params,
  );
}

/**
 * Store texts with their embeddings, skipping anything already present.
 *
 * Each text is identified by the SHA-256 hex digest of its content (`uid`).
 * Duplicate texts within the call are collapsed before any work is done, the
 * remainder is processed in parallel chunks of 15, and the DB list is
 * refreshed once after all chunks have finished.
 *
 * @param {{ texts: string[], db: object }} args - `texts` are the documents
 *   to store; `db` exposes an async `query(sql, params)` method.
 * @returns {Promise<void>}
 */
async function insertDb({ texts, db }) {
  const hashed = await Promise.all(
    texts.map(async (text) => ({ uid: await sha256(text), text })),
  );
  // De-duplicate by uid: the table has a UNIQUE constraint on uid, and the
  // same text landing in two parallel chunks would otherwise race.
  const uniqueEntries = [
    ...new Map(hashed.map((entry) => [entry.uid, entry])).values(),
  ];
  const chunks = _chunkify(uniqueEntries, 15);
  if (chunks.length === 0) {
    return;
  }

  await Promise.all(chunks.map((chunk) => _insertChunk({ chunk, db })));

  console.dir(await db.query(`SELECT COUNT(*) FROM embeddings;`), {
    depth: null,
  });
  // Awaited, and done once, so overlapping refreshes cannot interleave.
  await updateDbList();
}
/**
 * Insert the built-in `seed_data` sample texts into the database.
 *
 * @param {object} db - Database handle forwarded to insertDb().
 * @returns {Promise<void>}
 */
async function seedDB(db) {
  const request = { db, texts: seed_data };
  await insertDb(request);
}
/**
 * Request embeddings for a batch of texts from the OpenAI embeddings endpoint.
 *
 * Each text longer than 8192 tokens (per the module-level `enc` tokenizer) is
 * cut to its first 8192 tokens before being sent. Failures are reported with
 * `console.error` and `alert`, and signalled to the caller by returning `null`
 * rather than throwing.
 *
 * @param {{ texts: string[] }} args - Texts to embed.
 * @returns {Promise<number[][] | null>} One embedding per input text, in input
 *   order, or `null` if the request failed.
 */
async function getEmbeddings({ texts }) {
  const maxTokens = 8192;
  const inputs = texts.map((text) => {
    const tokens = enc.encode(text);
    return tokens.length > maxTokens
      ? enc.decode(tokens.slice(0, maxTokens))
      : text;
  });
  // Short description of the batch for user-facing error messages.
  const preview = inputs
    .map((input) => (input.length > 60 ? `${input.slice(0, 60)}…` : input))
    .join('", "');

  try {
    const response = await fetch("https://api.openai.com/v1/embeddings", {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
        Authorization: `Bearer ${OPENAI_API_KEY}`,
      },
      body: JSON.stringify({
        model: "text-embedding-3-small",
        input: inputs,
        encoding_format: "float",
      }),
    });
    if (!response.ok) {
      let errorData = null;
      try {
        errorData = await response.json();
      } catch (parseError) {
        // The error body is not guaranteed to be JSON; fall back to the status.
        console.error("Could not parse OpenAI error response:", parseError);
      }
      console.error("OpenAI API Error:", errorData);
      const reason = errorData?.error?.message ?? `HTTP ${response.status}`;
      alert(`Error fetching embedding for "${preview}": ${reason}`);
      return null;
    }
    const data = await response.json();
    // Order by `index` so the result lines up with the input texts; sort a
    // copy so the parsed response is left untouched.
    return [...data.data]
      .sort((a, b) => a.index - b.index)
      .map((item) => item.embedding);
  } catch (error) {
    console.error("Fetch Error:", error);
    alert(`Network error while fetching embedding for "${preview}".`);
    return null;
  }
}
/**
 * Query the `embeddings` table for the rows closest to an embedding.
 *
 * Rows are scored with the `<#>` operator against the given embedding, kept
 * when the score is below `-match_threshold`, ordered by ascending score and
 * capped at `limit`. Any failure is logged, shown via `alert`, and reported
 * to the caller as an empty array.
 *
 * @param {object} args
 * @param {object} args.db - Database handle with an async `query(sql, params)`.
 * @param {number[]} args.embedding - Query vector; sent as its JSON string.
 * @param {number} [args.match_threshold=0.0] - Negated to form the score bound.
 * @param {number} [args.limit=3] - Maximum number of rows returned.
 * @returns {Promise<Array<object>>} Rows with `uid`, `content` and `score`,
 *   or `[]` on error.
 */
async function search({
  db,
  embedding,
  match_threshold = 0.0,
  limit = 3,
}) {
  const sql = [
    "",
    "SELECT uid, content, embedding <#> $1 AS score FROM embeddings",
    "WHERE embeddings.embedding <#> $1 < $2",
    "ORDER BY embeddings.embedding <#> $1",
    "LIMIT $3;",
    "",
  ].join("\n");
  try {
    const serializedEmbedding = JSON.stringify(embedding);
    const scoreBound = -Number(match_threshold);
    const maxRows = Number(limit);
    const res = await db.query(sql, [serializedEmbedding, scoreBound, maxRows]);
    console.log({ debug_search_res: { embedding, res } });
    return res.rows;
  } catch (error) {
    console.error("Search Error:", error);
    alert("Error during search operation.");
    return [];
  }
}
/**
 * Page bootstrap: open the database, ensure the schema exists, seed the sample
 * data, render the stored rows and wire up the search button.
 */
$(document).ready(async function () {
  const db = await getDB();
  await initSchema(db);

  /**
   * Build one result/DB card. Row values are inserted with `.text()` so that
   * stored content is always rendered as text, never parsed as HTML.
   *
   * @param {{ uid: string, content: ?string, score?: number }} row
   * @returns {jQuery} Detached card element.
   */
  const buildCard = ({ uid, content, score }) => {
    const $card = $(
      '<div class="bg-gray-100 p-4 rounded-lg shadow-md max-h-50 overflow-y-auto break-words whitespace-pre-wrap"></div>',
    );
    $card.append(
      $('<p class="text-xs font-semibold text-gray-700"></p>').text(
        `uid: ${uid.slice(0, 20)}...`,
      ),
    );
    // Only search results carry a score.
    if (score != null) {
      $card.append(
        $('<p class="text-xs font-semibold text-gray-700"></p>').text(
          `(score: ${Number(score).toFixed(4)})`,
        ),
      );
    }
    $card.append(
      $('<p class="text-sm text-gray-600 mt-1 max-h-10 overflow-auto"></p>').text(
        content ?? "",
      ),
    );
    return $('<div class="mb-2"></div>').append($card);
  };

  updateDbList = async () => {
    // Fetch first, then swap the DOM without awaiting in between, so that
    // overlapping refreshes cannot interleave and render rows twice.
    const items = await db.query("SELECT uid, content FROM embeddings");
    const $list = $("#db-list").empty();
    items.rows.forEach((row) => $list.append(buildCard(row)));
  };

  try {
    await seedDB(db);
  } catch (error) {
    // A failed seed (e.g. missing API key) must not prevent the page from
    // showing existing rows and wiring up search.
    console.error("Seeding failed:", error);
  }
  await updateDbList();

  $("#search-button").on("click", async function () {
    const query = $("#search-input").val().trim();
    if (query === "") {
      alert("Please enter a search term.");
      return;
    }
    // Block repeated clicks while a search is in flight.
    const $button = $(this).prop("disabled", true);
    try {
      // getEmbeddings() returns null after reporting a failure itself.
      const embeddings = await getEmbeddings({ texts: [query] });
      const queryEmbedding = embeddings?.[0];
      if (!queryEmbedding) {
        return;
      }
      const searchResults = await search({ db, embedding: queryEmbedding });
      const $results = $("#results-list").empty();
      if (searchResults.length === 0) {
        $results.append("<li>No results found.</li>");
        return;
      }
      searchResults.forEach((row) => $results.append(buildCard(row)));
    } finally {
      $button.prop("disabled", false);
    }
  });
});
</script>
</body>
</html>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment