Created
September 10, 2025 07:31
-
-
Save ezzabuzaid/abb9fa72a052d17856cd9c29107442d5 to your computer and use it in GitHub Desktop.
qwen0.6-embedding.ts
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import { embedMany } from 'ai'; | |
import { MarkdownTextSplitter } from 'langchain/text_splitter'; | |
import lmstudio from '../lmstudio.ts'; | |
import { DatabaseSync } from 'node:sqlite'; | |
import type { Embedding } from 'ai'; | |
import * as sqliteVec from 'sqlite-vec'; | |
// Open `embed.sqlite` with extension loading allowed, then load sqlite-vec
// into this connection.
const db = new DatabaseSync('embed.sqlite', { allowExtension: true });
sqliteVec.load(db);

// Connection-level pragmas, applied in order.
for (const pragma of [
  'PRAGMA journal_mode = WAL;',
  'PRAGMA synchronous = NORMAL;',
  'PRAGMA temp_store = MEMORY;',
]) {
  db.exec(pragma);
}
/** Free-form key/value data attached to a piece of content. */
export type Metadata = Record<string, unknown>;

/** A piece of text together with optional metadata describing it. */
export type Content = {
  /** The text itself. */
  content: string;
  /** Optional descriptive data carried alongside the text. */
  metadata?: Metadata;
};

/** A `Content` value paired with its embedding vector. */
export type Chunk = Content & {
  /** Embedding vector computed for `content`. */
  embedding: Embedding;
};
/**
 * Creates a handle for writing embedded chunks to the `vec_chunks` virtual table.
 *
 * @returns An object exposing `addDocuments`.
 */
export async function vectoreStore() {
  return {
    /**
     * Replaces every row stored under `key` with `chunks`, atomically.
     *
     * The table is created on first use; `dimension` only takes effect at that
     * point because of `IF NOT EXISTS`.
     *
     * @param key - Partition key. All existing rows with this key are deleted first.
     * @param dimension - Length of each embedding vector.
     * @param chunks - Chunks to insert under `key`.
     * @throws RangeError when `dimension` is not a positive integer.
     * @throws Any database error raised while writing, after the transaction
     *   has been rolled back.
     */
    addDocuments: async (
      key: string,
      dimension: number,
      chunks: Chunk[],
    ): Promise<void> => {
      // `dimension` is interpolated into DDL below, so reject anything that is
      // not a plain positive integer before building the statement.
      if (!Number.isInteger(dimension) || dimension <= 0) {
        throw new RangeError(
          `dimension must be a positive integer, received ${dimension}`,
        );
      }

      db.exec(`
        CREATE VIRTUAL TABLE IF NOT EXISTS vec_chunks USING vec0(
          key TEXT PARTITION KEY,
          embedding FLOAT[${dimension}] DISTANCE_METRIC=cosine,
          +content TEXT,
          +metadata TEXT
        );
      `);

      const del = db.prepare(`DELETE FROM vec_chunks WHERE key = ?`);
      // Embeddings are passed as JSON text and normalised on the SQL side.
      const insert = db.prepare(`
        INSERT INTO vec_chunks (key, content, embedding, metadata)
        VALUES (?, ?, vec_normalize(vec_f32(?)), ?)
      `);

      // BEGIN stays outside the try block: if it fails there is no open
      // transaction, and a ROLLBACK would only hide the real error.
      db.exec('BEGIN');
      try {
        del.run(key);
        for (const chunk of chunks) {
          insert.run(
            key,
            chunk.content,
            JSON.stringify(chunk.embedding),
            chunk.metadata ? JSON.stringify(chunk.metadata) : null,
          );
        }
        db.exec('COMMIT');
      } catch (error) {
        db.exec('ROLLBACK');
        console.error('Error during insert, rolled back transaction:', error);
        // Propagate so the caller does not treat a failed write as success.
        throw error;
      }
    },
  };
}
/**
 * Builds a reader for a single file hosted on GitHub.
 *
 * @param path - Location in the form `owner/repo/path/to/file`.
 * @returns An object whose `readFile()` fetches the file through the GitHub
 *   contents API and resolves with its text decoded as UTF-8.
 */
function github(path: string) {
  const [owner, repo, ...filePath] = path.split('/');
  return {
    /**
     * Downloads and decodes the file.
     *
     * @throws Error when `path` is malformed, the HTTP response is not OK, or
     *   the payload does not carry base64 content.
     */
    async readFile(): Promise<string> {
      if (!owner || !repo || filePath.length === 0) {
        throw new Error(`Expected "owner/repo/path/to/file", received "${path}"`);
      }

      // Encode each segment separately so characters such as `#` or `?` in a
      // file name cannot be parsed as a URL fragment or query string.
      const encodedPath = filePath.map(encodeURIComponent).join('/');
      const url = `https://api.github.com/repos/${encodeURIComponent(owner)}/${encodeURIComponent(repo)}/contents/${encodedPath}`;

      const res = await fetch(url);
      if (!res.ok) {
        throw new Error(
          `GitHub request failed (${res.status} ${res.statusText}): ${url}`,
        );
      }

      const data = (await res.json()) as { content?: unknown; encoding?: unknown };
      if (
        typeof data.content !== 'string' ||
        (data.encoding !== undefined && data.encoding !== 'base64')
      ) {
        throw new Error(`GitHub response for ${url} has no base64 file content`);
      }

      // `atob` yields one char per byte (Latin-1); re-decode those bytes as
      // UTF-8 so multi-byte characters are not corrupted.
      const binary = atob(data.content.replace(/\s+/g, ''));
      const bytes = Uint8Array.from(binary, (ch) => ch.charCodeAt(0));
      return new TextDecoder().decode(bytes);
    },
  };
}
/**
 * Breaks Markdown text into chunks using a `MarkdownTextSplitter` constructed
 * with no options.
 *
 * @param content - Markdown source to split.
 * @returns The chunks produced by the splitter.
 */
async function split(content: string) {
  return new MarkdownTextSplitter().splitText(content);
}
/**
 * Embeds a batch of texts with the `text-embedding-qwen3-embedding-0.6b` model.
 *
 * @param documents - Texts to embed.
 * @returns The embeddings returned by `embedMany`, plus the vector size that
 *   was requested from the provider.
 */
async function embed(documents: string[]) {
  // Requested through provider options; presumably honoured by the provider,
  // but the returned vectors are not checked against it here.
  const dimensions = 1024;

  // Nothing to embed: skip the provider round-trip entirely.
  if (documents.length === 0) {
    const embeddings: Embedding[] = [];
    return { embeddings, dimensions };
  }

  const { embeddings } = await embedMany({
    model: lmstudio.textEmbeddingModel('text-embedding-qwen3-embedding-0.6b'),
    values: documents,
    providerOptions: { lmstudio: { dimensions } },
  });
  return { embeddings, dimensions };
}
/**
 * Loads a document, splits it, embeds every chunk and stores the result
 * under `key`, replacing whatever was stored under that key before.
 *
 * @param key - Partition key the chunks are stored under.
 * @param getDocument - Supplies the document, synchronously or as a promise.
 * @throws Error when a chunk has no matching embedding, and any error raised
 *   while loading, splitting, embedding or storing.
 */
async function ingest(
  key: string,
  getDocument: () => Content | Promise<Content>,
): Promise<void> {
  const document = await getDocument();
  const values = await split(document.content);
  const { dimensions, embeddings } = await embed(values);

  const chunks: Chunk[] = [];
  for (const [idx, content] of values.entries()) {
    const embedding = embeddings[idx];
    // Guard against a short embedding list so no chunk is stored without a vector.
    if (embedding === undefined) {
      throw new Error(
        `Missing embedding for chunk ${idx}: received ${embeddings.length} embeddings for ${values.length} chunks`,
      );
    }
    chunks.push({ content, embedding, metadata: document.metadata });
  }

  const store = await vectoreStore();
  // Await the write so its failures reject this call instead of surfacing as
  // an unhandled rejection.
  await store.addDocuments(key, dimensions, chunks);
}
/**
 * Finds the stored chunks under `key` that are nearest to `query`.
 *
 * @param key - Partition key to search within.
 * @param query - Text to embed and compare against stored embeddings.
 * @param k - Maximum number of neighbours to return; defaults to 10.
 * @returns Rows with `content`, `distance` and `metadata` columns, ordered by
 *   ascending distance. `metadata` is returned as stored (JSON text or null).
 * @throws RangeError when `k` is not a positive integer.
 * @throws Error when embedding the query produced no vector.
 */
async function search(key: string, query: string, k = 10) {
  if (!Number.isInteger(k) || k <= 0) {
    throw new RangeError(`k must be a positive integer, received ${k}`);
  }

  const { embeddings } = await embed([query]);
  const queryEmbedding = embeddings[0];
  if (queryEmbedding === undefined) {
    throw new Error('Embedding the query returned no vector');
  }

  const stmt = db.prepare(
    `
      SELECT content, distance, metadata
      FROM vec_chunks
      WHERE
        embedding MATCH vec_normalize(vec_f32(?))
        AND k = ?
        AND key = ?
      ORDER BY distance ASC
    `,
  );
  // `k` is bound as a BigInt so SQLite receives it as an INTEGER value.
  return stmt.all(JSON.stringify(queryEmbedding), BigInt(k), key);
}
// Demo run: index one Markdown file from GitHub under its own path as the
// key, then run a single query against it.
const filePath = 'mlschmitt/classic-books-markdown/Franz Kafka/The Trial.md';
const query = 'Why the protagonist is arrested?';

await ingest(filePath, async () => {
  const source = github(filePath);
  const content = await source.readFile();
  return {
    content,
    metadata: { source: `github:${filePath}` },
  };
});

const results = await search(filePath, query);
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment