-
-
Save cdxker/90af4ae4d4873cc2d6acc7586c3c9705 to your computer and use it in GitHub Desktop.
Supa fast bulk_create script with https://trieve.ai and bun.js (unsupported)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Base URL of the Trieve API. Use https so the API key is never sent in clear text.
API_URL="https://api.trieve.ai/api"
API_KEY="tr-***************"
ORGANIZATION_ID="************************************"
# Optional
# If DATASET_ID is not set, the script creates a new dataset in the organization above.
DATASET_ID="*************"
# If the QDRANT settings are not set, Trieve falls back to its own defaults.
QDRANT_URL="https://<my-qdrant-ip>:6334"
QDRANT_API_KEY="my-qdrant-api-key"
QDRANT_COLLECTION_NAME="my-collection"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Base URL of the Trieve API, read from the `API_URL` environment variable.
 *
 * Resolved eagerly and validated so that a missing variable fails at start-up
 * with a clear message instead of producing requests to "undefined/...".
 */
const API_URL: string = (() => {
  const url = Bun.env.API_URL;
  if (!url) {
    throw new Error("API_URL environment variable must be set");
  }
  return url;
})();

// Optional Qdrant overrides; `null` is sent to the server when a variable is unset.
const QDRANT_URL = Bun.env.QDRANT_URL ?? null;
const QDRANT_API_KEY = Bun.env.QDRANT_API_KEY ?? null;
const QDRANT_COLLECTION_NAME = Bun.env.QDRANT_COLLECTION_NAME ?? null;
/**
 * Shape of a single chunk in the JSON array that `createChunk` posts to the
 * `/chunk` endpoint.
 *
 * NOTE(review): field meanings below are inferred from their names; confirm
 * against the Trieve API reference before relying on them.
 */
export interface CreateChunkData {
  /** Chunk content (presumably HTML or plain text). */
  chunk_html: string;
  /** Optional ids of groups the chunk should be attached to. */
  group_ids?: string[];
  /** Link associated with the chunk; the script sends an empty string. */
  link: string;
  /** Optional tags for the chunk. */
  tag_set?: string[];
  /** Optional caller-chosen identifier for the chunk. */
  tracking_id?: string;
  /** Optional flag; presumably updates an existing chunk with the same tracking_id. */
  upsert_by_tracking_id?: boolean;
  /** Arbitrary metadata object stored with the chunk. */
  metadata: object;
}
/**
 * Uploads one batch of chunks with a single POST to `${API_URL}/chunk`.
 *
 * The request is authenticated with `API_KEY` from the environment and is
 * scoped to `DATASET_ID` through the `TR-Dataset` header.
 *
 * @param chunkData - Chunks to send; serialised as one JSON array.
 * @returns `""` when the server answers with a non-2xx status (the status and
 *   response body are logged), `undefined` after a successful upload.
 * @throws Whatever `fetch` or `response.json()` throws (network failure,
 *   invalid JSON); these are not caught here.
 */
const createChunk = async (
  chunkData: CreateChunkData[],
): Promise<"" | undefined> => {
  // Time each call on its own: this function runs concurrently, and a shared
  // console.time label would make the calls overwrite each other's timers.
  const startedAt = performance.now();

  const response = await fetch(`${API_URL}/chunk`, {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
      Authorization: Bun.env.API_KEY ?? "",
      "TR-Dataset": DATASET_ID,
    },
    body: JSON.stringify(chunkData),
  });

  if (!response.ok) {
    console.error("err", response.status, response.statusText);
    // Read the body as text: an error response is not guaranteed to be JSON.
    const respText = await response.text();
    console.error("err", respText);
    return "";
  }

  const responseJson = (await response.json()) as { chunk_metadata?: unknown };
  console.log("succ", responseJson.chunk_metadata);
  console.log(`upload: ${(performance.now() - startedAt).toFixed(2)}ms`);
  return undefined;
};
/**
 * Creates a dataset named "TestDataset" in the organization given by the
 * `ORGANIZATION_ID` environment variable and returns its id.
 *
 * The optional Qdrant settings are forwarded inside `server_configuration`;
 * they are `null` when the corresponding environment variables are unset.
 *
 * @returns The `id` field of the JSON response.
 * @throws Error when the server answers with a non-2xx status or when the
 *   response carries no usable `id`, so callers never continue with an
 *   undefined dataset id.
 */
const createDataset = async (): Promise<string> => {
  const organizationId = Bun.env.ORGANIZATION_ID ?? "";

  const response = await fetch(`${API_URL}/dataset`, {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
      "TR-Organization": organizationId,
      Authorization: Bun.env.API_KEY ?? "",
    },
    body: JSON.stringify({
      dataset_name: "TestDataset",
      organization_id: organizationId,
      server_configuration: {
        LLM_BASE_URL: "",
        LLM_DEFAULT_MODEL: "",
        RAG_PROMPT: "",
        EMBEDDING_SIZE: 1024,
        N_RETRIEVALS_TO_INCLUDE: 8,
        DUPLICATE_DISTANCE_THRESHOLD: 1.1,
        DOCUMENT_UPLOAD_FEATURE: true,
        DOCUMENT_DOWNLOAD_FEATURE: true,
        COLLISIONS_ENABLED: false,
        QDRANT_URL,
        QDRANT_API_KEY,
        QDRANT_COLLECTION_NAME,
      },
      client_configuration: "{}",
    }),
  });

  if (!response.ok) {
    // Include the body text: it usually explains why the request was rejected.
    const detail = await response.text();
    throw new Error(
      `Failed to create dataset: ${response.status} ${response.statusText} ${detail}`,
    );
  }

  const responseJson = (await response.json()) as { id?: unknown };
  if (typeof responseJson.id !== "string" || responseJson.id === "") {
    throw new Error("Dataset creation response did not contain an id");
  }
  return responseJson.id;
};
// Dataset that receives the chunks. `||` (not `??`) is deliberate: an empty
// DATASET_ID in the environment is treated like a missing one, so a new
// dataset is created instead of sending an empty TR-Dataset header.
const DATASET_ID = Bun.env.DATASET_ID || (await createDataset());

// Number of concurrent upload requests and number of chunks in each request.
const BATCH_COUNT = 10;
const CHUNKS_PER_BATCH = 1000;

const prom: Promise<unknown>[] = [];
for (let j = 0; j < BATCH_COUNT; j++) {
  const datas: CreateChunkData[] = [];
  for (let i = 0; i < CHUNKS_PER_BATCH; i++) {
    const chunk_html = `ID ${i} ${j} Seems they’ve built out a PG version of MySQL’s Vitess Query rewriting seems interesting, having a layer between your DB and your application would also allow various ACL stuff as wellr`;
    const chunkData: CreateChunkData = {
      chunk_html,
      link: "",
      // Unique while j stays a single digit, because i is followed by exactly one digit.
      tracking_id: `${i}${j}`,
      metadata: {
        bro: "hi",
      },
    };
    datas.push(chunkData);
  }
  // Not awaited here, so all batches are in flight at the same time.
  prom.push(createChunk(datas));
}

// Wait for every batch upload to settle before the script exits.
await Promise.all(prom);
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment