Last active
May 15, 2024 23:07
-
-
Save densumesh/31cc0ce3e54316a32dcddbdf48c2f11f to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import * as fs from "fs"; | |
import * as readline from "readline"; | |
import * as path from "path"; | |
import { | |
ChunkApi, | |
Configuration, | |
CreateChunkData, | |
} from "@devflowinc/trieve-js-ts-client"; | |
import { it } from "node:test"; | |
/**
 * A text value paired with a language tag.
 * NOTE(review): the tag format (e.g. a locale code) is not visible here — confirm against the dataset.
 */
interface LanguageTaggedValue {
  /** The text itself. */
  value: string;
  /** Tag identifying the language of `value`. */
  language_tag: string;
}
/** One measurement: a numeric magnitude together with the name of its unit. */
interface ItemDimension {
  /** Numeric magnitude of the measurement. */
  value: number;
  /** Label of the unit that `value` is expressed in. */
  unit: string;
}
/** The three measurements recorded for an item: length, width and height. */
interface ItemDimensions {
  length: ItemDimension;
  width: ItemDimension;
  height: ItemDimension;
}
/**
 * A named node reference attached to an item.
 * NOTE(review): presumably a catalogue/browse category node — confirm against the dataset.
 */
interface Node {
  /** Display name of the node. */
  node_name: string;
  /** Numeric identifier of the node. */
  node_id: number;
}
/**
 * Shape of one product listing record, as produced by `JSON.parse` on a line
 * of a listings file.
 *
 * NOTE(review): most array fields are declared required, yet the code in this
 * file reads them with optional chaining, so real records presumably omit
 * some of them — confirm before relying on their presence.
 */
interface Item {
  // Identity and origin
  item_id: string;
  domain_name: string;
  marketplace: string;
  country: string;

  // Language-tagged descriptive text
  item_name: Array<LanguageTaggedValue>;
  brand: Array<LanguageTaggedValue>;
  bullet_point: Array<LanguageTaggedValue>;
  color: Array<LanguageTaggedValue>;
  style: Array<LanguageTaggedValue>;
  model_name: Array<LanguageTaggedValue>;
  item_keywords: Array<LanguageTaggedValue>;

  // Plain-valued attributes
  model_number: Array<{ value: string }>;
  model_year: Array<{ value: number }>;
  product_type: Array<{ value: string }>;

  // Images
  main_image_id: string;
  other_image_id: Array<string>;

  // Classification and physical size
  node: Array<Node>;
  item_dimensions?: ItemDimensions;

  // Optional extras; this file fills them in when building chunk metadata
  price?: number;
  image_url?: string;
}
/**
 * Flattens a listing into the text blob that gets indexed for search.
 *
 * Labelled fields are emitted as `Label: value` followed by a newline; bullet
 * points and keywords are emitted as bare values each terminated by `;`
 * (without a newline). Missing or empty fields are skipped, and the final
 * string is trimmed.
 *
 * @param item  The listing to describe.
 * @param price Price to show on the first line, or `undefined` to omit it.
 * @returns The concatenated, trimmed description.
 */
function itemToSearchableString(item: Item, price: number | undefined): string {
  const fragments: string[] = [];

  // Records `lead + text + tail`, but only when `text` is truthy.
  const append = (text: string | undefined, lead = "", tail = "\n"): void => {
    if (!text) {
      return;
    }
    fragments.push(`${lead}${text}${tail}`);
  };

  // Records every entry's value, each terminated by ";" instead of a newline.
  const appendEach = (
    entries: ReadonlyArray<{ value: string }> | undefined,
  ): void => {
    entries?.forEach((entry) => append(entry.value, "", ";"));
  };

  append(price?.toString(), "Price: $");
  append(item.brand?.[0]?.value, "Brand: ");
  append(item.item_name?.[0]?.value, "Product Name: ");
  appendEach(item.bullet_point);
  append(item.color?.[0]?.value, "Color: ");
  append(item.model_name?.[0]?.value, "Model Name: ");
  append(item.model_number?.[0]?.value, "Model Number: ");

  // The year is numeric, so it is checked against undefined before stringifying.
  const modelYear = item.model_year?.[0]?.value;
  if (modelYear !== undefined) {
    append(modelYear.toString(), "Model Year: ");
  }

  append(item.product_type?.[0]?.value, "Product Type: ");
  append(item.style?.[0]?.value, "Style: ");
  appendEach(item.item_keywords);
  append(item.country, "Country: ");
  append(item.marketplace, "Marketplace: ");
  append(item.domain_name, "Domain: ");

  return fragments.join("").trim();
}
// Trieve credentials are read from the environment; an unset variable falls
// back to the empty string rather than stopping the script.
const trieveApiKey = Bun.env.TRIEVE_API_KEY ?? "";
const trieveDatasetId = Bun.env.TRIEVE_DATASET_ID ?? "";

// Client configuration used for the Trieve API calls made by this script.
const trieveApiConfig = new Configuration({
  basePath: "https://api.trieve.ai",
  apiKey: trieveApiKey,
});
/**
 * Parses one JSON document into an `Item`.
 *
 * The result is not validated: whatever `JSON.parse` yields is returned typed
 * as `Item`.
 *
 * @param jsonString JSON text of a single listing.
 * @returns The parsed value.
 * @throws SyntaxError when `jsonString` is not valid JSON.
 */
function parseItem(jsonString: string): Item {
  return JSON.parse(jsonString);
}
/**
 * Builds the metadata object stored alongside a chunk: a shallow copy of the
 * listing with `image_url` and `price` set from the arguments.
 *
 * The input item is not modified. Both keys are always present on the result,
 * holding `undefined` when the corresponding argument is `undefined`.
 *
 * @param item      Listing whose fields are copied.
 * @param image_url Image URL for the listing, or `undefined` when none was
 *                  found (the caller passes `undefined` in that case, so the
 *                  parameter type admits it).
 * @param price     Price for the listing, or `undefined`.
 * @returns The copied fields plus `image_url` and `price`.
 */
function extractMetadata(
  item: Item,
  image_url: string | undefined,
  price: number | undefined,
): any {
  const metadata: Partial<Item> = { ...item, image_url, price };
  return metadata;
}
// Chunk API client, built from the Trieve configuration, used for uploads.
const chunkApi = new ChunkApi(trieveApiConfig);
/**
 * Reads a file with one JSON listing per line, turns every parsable line into
 * a chunk payload, then uploads the payloads in batches of 50.
 *
 * Lines that fail to convert are logged and skipped. A failed batch upload is
 * logged and the remaining batches are still attempted. Batches are uploaded
 * one after another.
 *
 * @param filePath Path of the listings file to ingest.
 * @returns Every chunk payload built from the file, whether or not its upload
 *          succeeded.
 */
async function parseItemsFromFile(filePath: string) {
  const lineReader = readline.createInterface({
    input: fs.createReadStream(filePath),
    crlfDelay: Infinity,
  });

  // Each listing gets a price picked at random from this list.
  const pricePoints = [10, 25, 50, 100, 500, 1000];

  const items: CreateChunkData = [];
  for await (const rawLine of lineReader) {
    try {
      const item: Item = JSON.parse(rawLine);

      // An empty image id is looked up as null.
      const imageKey = item.main_image_id == "" ? null : item.main_image_id;
      const imagePath = imageHashMap.get(imageKey);
      const price = pricePoints[Math.floor(Math.random() * pricePoints.length)];
      const image_url =
        imagePath != null
          ? `https://amazon-berkeley-objects.s3.amazonaws.com/images/small/${imagePath}`
          : undefined;

      items.push({
        chunk_html: itemToSearchableString(item, price),
        link: `https://${item.domain_name}/dp/${item.item_id}`,
        tracking_id: item.item_id,
        tag_set: item.item_keywords?.map((kw) => kw.value),
        metadata: extractMetadata(item, image_url, price),
        upsert_by_tracking_id: true,
      });
    } catch (error) {
      console.error("Error parsing JSON from line:", error);
    }
  }

  // Upload sequentially, one slice of at most `chunkSize` payloads per request.
  const chunkSize = 50;
  for (let start = 0; start < items.length; start += chunkSize) {
    const batch: CreateChunkData = items.slice(start, start + chunkSize);
    try {
      console.log("Creating chunk");
      await chunkApi.createChunk(trieveDatasetId, batch);
    } catch (error) {
      console.error("Failed to create chunk");
      console.error(error);
    }
  }

  return items;
}
/**
 * Builds a lookup from the first column to the fourth column of a
 * comma-separated text.
 *
 * The first line is treated as a header and skipped. Rows are split on every
 * comma (quoted fields are not recognised), and a row is kept only when both
 * its first and fourth cells are non-empty; both cells are trimmed.
 *
 * @param csvData Full CSV text.
 * @returns Map from trimmed first-column value to trimmed fourth-column value.
 */
function parseCSV(csvData: string): Map<any, any> {
  const lookup = new Map();
  const dataRows = csvData.split("\n").slice(1);

  for (const row of dataRows) {
    const cells = row.trim().split(",");
    const imageId: string | null = cells[0] ? cells[0].trim() : null;
    const imagePath: string | null = cells[3] ? cells[3].trim() : null;

    if (imageId != null && imagePath != null) {
      lookup.set(imageId, imagePath);
    }
  }

  return lookup;
}
// Load the image CSV up front and index it, so that listings can be matched
// to an image path by image id.
const csvFilePath = "/home/denssumesh/Documents/arguflow/amazon-abo/images.csv";
const csvImageData = await Bun.file(csvFilePath).text();
let imageHashMap = parseCSV(csvImageData);

// NOTE(review): parseCSV returns a Map on every path, so this guard looks
// unreachable as written.
if (imageHashMap == null) {
  console.log("Failed");
}
// Entry point: ingest every file found in the listings directory. Each file's
// ingestion is started without waiting for the previous one to finish, and the
// outcome of each is logged independently.
const directoryPath = "/home/denssumesh/Documents/arguflow/amazon-abo/listings";
fs.readdir(directoryPath, (err, files) => {
  if (err) {
    console.error("Error reading directory:", err);
    return;
  }

  for (const file of files) {
    const filePath = path.join(directoryPath, file);
    parseItemsFromFile(filePath)
      .then((items) => {
        console.log(`Processed ${file}: ${items.length} items`);
      })
      .catch((error) => {
        console.error(`Error processing ${file}:`, error);
      });
  }
});
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment