Skip to content

Instantly share code, notes, and snippets.

@ezzabuzaid
Created September 10, 2025 07:31
Show Gist options
  • Save ezzabuzaid/abb9fa72a052d17856cd9c29107442d5 to your computer and use it in GitHub Desktop.
Save ezzabuzaid/abb9fa72a052d17856cd9c29107442d5 to your computer and use it in GitHub Desktop.
qwen0.6-embedding.ts
import { embedMany } from 'ai';
import { MarkdownTextSplitter } from 'langchain/text_splitter';
import lmstudio from '../lmstudio.ts';
import { DatabaseSync } from 'node:sqlite';
import type { Embedding } from 'ai';
import * as sqliteVec from 'sqlite-vec';
// Module-wide SQLite handle stored in `embed.sqlite`. Extension loading is
// enabled so that the sqlite-vec extension can be registered on the connection.
const db = new DatabaseSync('embed.sqlite', { allowExtension: true });
sqliteVec.load(db);

// Connection pragmas, submitted to SQLite as one newline-separated script.
const pragmas = [
  'PRAGMA journal_mode = WAL;',
  'PRAGMA synchronous = NORMAL;',
  'PRAGMA temp_store = MEMORY;',
];
db.exec(pragmas.join('\n'));
/** Free-form key/value data attached to a piece of content. */
export type Metadata = Record<string, unknown>;
/** A piece of text together with optional metadata describing it. */
export type Content = {
/** The text itself. */
content: string;
/** Optional descriptive data; omitted when there is nothing to record. */
metadata?: Metadata;
};
/** A `Content` item paired with its embedding. */
export type Chunk = Content & {
/** Embedding associated with `content`, typed with `Embedding` from the `ai` package. */
embedding: Embedding;
};
/**
 * Builds a small vector-store facade on top of the module-level `db`
 * connection and its `vec_chunks` sqlite-vec virtual table.
 *
 * @returns An object exposing `addDocuments`.
 */
export async function vectoreStore() {
  return {
    /**
     * Replaces every row stored under `key` with the given chunks.
     *
     * The `vec_chunks` table is created on first use. The delete and all
     * inserts run inside one transaction, so a failure leaves the rows that
     * were previously stored under `key` untouched.
     *
     * @param key Partition key the chunks are stored under.
     * @param dimension Embedding length used in the table definition; must be
     *   a positive integer because it is interpolated into the DDL.
     * @param chunks Chunks to store; metadata is serialized as JSON, or stored
     *   as NULL when absent.
     * @throws RangeError when `dimension` is not a positive integer.
     * @throws Whatever SQLite error aborted the transaction, after rolling back.
     */
    addDocuments: async (key: string, dimension: number, chunks: Chunk[]) => {
      // `dimension` ends up inside the CREATE statement text, so reject
      // anything that is not a plain positive integer (NaN, floats, <= 0).
      if (!Number.isInteger(dimension) || dimension <= 0) {
        throw new RangeError(
          `dimension must be a positive integer, received ${dimension}`,
        );
      }
      db.exec(`
CREATE VIRTUAL TABLE IF NOT EXISTS vec_chunks USING vec0(
key TEXT PARTITION KEY,
embedding FLOAT[${dimension}] DISTANCE_METRIC=cosine,
+content TEXT,
+metadata TEXT
);
`);
      const del = db.prepare(`DELETE FROM vec_chunks WHERE key = ?`);
      const insert = db.prepare(`
INSERT INTO vec_chunks (key, content, embedding, metadata)
VALUES (?, ?, vec_normalize(vec_f32(?)), ?)
`);
      // BEGIN stays outside the try block: if it fails there is no open
      // transaction, and issuing ROLLBACK would only mask the real error.
      db.exec('BEGIN');
      try {
        del.run(key);
        for (const chunk of chunks) {
          insert.run(
            key,
            chunk.content,
            JSON.stringify(chunk.embedding),
            chunk.metadata ? JSON.stringify(chunk.metadata) : null,
          );
        }
        db.exec('COMMIT');
      } catch (error) {
        try {
          db.exec('ROLLBACK');
        } catch (rollbackError) {
          // Keep the original failure as the one that propagates.
          console.error('Failed to roll back transaction:', rollbackError);
        }
        // Surface the failure so callers do not continue as if the chunks
        // had been stored.
        throw error;
      }
    },
  };
}
/**
 * Creates a reader for a single file hosted on GitHub.
 *
 * @param path Location in the form `owner/repo/path/to/file`.
 * @returns An object whose `readFile` fetches the file through the GitHub
 *   repository contents API and resolves to its text.
 */
function github(path: string) {
  const [owner, repo, ...filePath] = path.split('/');
  return {
    /**
     * Downloads the file and decodes it as UTF-8 text.
     *
     * @throws Error when `path` is malformed, when the HTTP response is not
     *   successful, or when the payload carries no base64 file content (for
     *   example because the path points at a directory).
     */
    async readFile(): Promise<string> {
      if (!owner || !repo || filePath.length === 0) {
        throw new Error(
          `Invalid GitHub path "${path}": expected "owner/repo/path/to/file"`,
        );
      }
      // Encode each segment separately so characters such as `#` or `?` in a
      // file name stay part of the path while the `/` separators are preserved.
      const encodedPath = filePath.map(encodeURIComponent).join('/');
      const url = `https://api.github.com/repos/${encodeURIComponent(owner)}/${encodeURIComponent(repo)}/contents/${encodedPath}`;
      const res = await fetch(url);
      if (!res.ok) {
        throw new Error(
          `GitHub request failed (${res.status} ${res.statusText}) for ${url}`,
        );
      }
      const data = (await res.json()) as {
        content?: unknown;
        encoding?: unknown;
      } | null;
      const content = data?.content;
      const encoding = data?.encoding;
      if (
        typeof content !== 'string' ||
        (encoding !== undefined && encoding !== 'base64')
      ) {
        throw new Error(`GitHub response for ${url} has no base64 file content`);
      }
      // `atob` yields one character per byte; those bytes must be decoded as
      // UTF-8, otherwise every non-ASCII character comes out corrupted.
      const binary = atob(content);
      const bytes = Uint8Array.from(binary, (ch) => ch.charCodeAt(0));
      return new TextDecoder().decode(bytes);
    },
  };
}
/**
 * Breaks markdown text into chunks using a `MarkdownTextSplitter` created
 * with its default settings.
 *
 * @param content Markdown text to split.
 * @returns The chunks produced by the splitter.
 */
async function split(content: string) {
  return new MarkdownTextSplitter().splitText(content);
}
/**
 * Embeds every input string with the `text-embedding-qwen3-embedding-0.6b`
 * model obtained from the `lmstudio` provider.
 *
 * @param documents Texts to embed.
 * @returns The embeddings reported by `embedMany`, together with the
 *   dimension count (1024) that was requested through the provider options.
 */
async function embed(documents: string[]) {
  const dimensions = 1024;
  const model = lmstudio.textEmbeddingModel(
    'text-embedding-qwen3-embedding-0.6b',
  );
  const result = await embedMany({
    model,
    values: documents,
    providerOptions: { lmstudio: { dimensions } },
  });
  return { embeddings: result.embeddings, dimensions };
}
/**
 * Loads a document, splits it into chunks, embeds the chunks and stores them
 * under `key`.
 *
 * @param key Key the chunks are stored under; it is passed to `addDocuments`.
 * @param getDocument Callback that supplies the document, synchronously or
 *   as a promise.
 * @returns A promise that settles only after the store call has finished, so
 *   a storage failure rejects this promise instead of going unobserved.
 */
async function ingest(
  key: string,
  getDocument: () => Content | Promise<Content>,
): Promise<void> {
  const document = await getDocument();
  const values = await split(document.content);
  const { dimensions, embeddings } = await embed(values);
  // Every chunk carries the metadata of the document it was cut from.
  const chunks = values.map((it, idx) => ({
    content: it,
    embedding: embeddings[idx],
    metadata: document.metadata,
  }));
  const store = await vectoreStore();
  // Await the write: leaving this promise floating would turn any rejection
  // into an unhandled rejection that the caller of `ingest` cannot catch.
  await store.addDocuments(key, dimensions, chunks);
}
/**
 * Embeds `query` and runs a nearest-neighbour lookup against the rows of
 * `vec_chunks` stored under `key`.
 *
 * @param key Key whose rows are searched.
 * @param query Natural-language query text.
 * @returns Up to 10 rows with `content`, `distance` and `metadata`, ordered
 *   by ascending distance.
 */
async function search(key: string, query: string) {
  const { embeddings } = await embed([query]);
  const sql = `
SELECT content, distance, metadata
FROM vec_chunks
WHERE
embedding MATCH vec_normalize(vec_f32(?))
AND k = 10
AND key = ?
ORDER BY distance ASC
`;
  const statement = db.prepare(sql);
  // The vector is bound as JSON text, matching how it is written on insert.
  const queryVector = JSON.stringify(embeddings[0]);
  return statement.all(queryVector, key);
}
// Driver: ingest one markdown book from GitHub, then query it.
const filePath = 'mlschmitt/classic-books-markdown/Franz Kafka/The Trial.md';
const query = 'Why the protagonist is arrested?';

/** Fetches the book from GitHub and tags it with the location it came from. */
const loadDocument = async () => {
  const content = await github(filePath).readFile();
  return {
    content,
    metadata: { source: `github:${filePath}` },
  };
};

await ingest(filePath, loadDocument);
const results = await search(filePath, query);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment