Created
May 7, 2024 22:07
-
-
Save cdxker/84a06d13ed05e299e7a6c56a4d018870 to your computer and use it in GitHub Desktop.
AmazonAboWithImages
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import * as fs from "fs"; | |
import * as readline from "readline"; | |
import * as path from "path"; | |
import { ChunkApi, Configuration } from "@devflowinc/trieve-js-ts-client"; | |
/**
 * A text value paired with a tag naming the language it is written in.
 */
interface LanguageTaggedValue {
  /** Language identifier for `value` (presumably a locale code such as "en_US" — confirm against the dataset). */
  language_tag: string;
  /** The text itself. */
  value: string;
}
/**
 * A single measurement: a numeric magnitude together with its unit label.
 */
interface ItemDimension {
  /** Unit label for `value` (free-form string as far as this file is concerned). */
  unit: string;
  /** Numeric magnitude expressed in `unit`. */
  value: number;
}
/**
 * The three measurements of an item, each carrying its own unit.
 */
interface ItemDimensions {
  height: ItemDimension;
  length: ItemDimension;
  width: ItemDimension;
}
/**
 * An id/name pair attached to an item through `Item.node`.
 * NOTE(review): presumably a catalog browse/category node — confirm against the dataset.
 * The name shadows the global DOM `Node` type inside this module.
 */
interface Node {
  node_id: number;
  node_name: string;
}
/**
 * Shape of one product record as parsed from a line of the input JSON files.
 *
 * NOTE(review): the code consuming this type reads most fields through
 * optional chaining, which suggests real records may omit them. The fields are
 * kept required here so existing call sites keep type-checking; consider
 * marking them optional once the dataset schema is confirmed.
 */
interface Item {
  brand: LanguageTaggedValue[];
  bullet_point: LanguageTaggedValue[];
  color: LanguageTaggedValue[];
  /** Used as the chunk tracking id and as the `/dp/<id>` segment of the product link. */
  item_id: string;
  item_name: LanguageTaggedValue[];
  model_name: LanguageTaggedValue[];
  model_number: { value: string }[];
  model_year: { value: number }[];
  product_type: { value: string }[];
  style: LanguageTaggedValue[];
  /** Key looked up in the image CSV map; an empty string is treated as "no image". */
  main_image_id: string;
  other_image_id: string[];
  /** Keyword values double as the chunk's tag set. */
  item_keywords: LanguageTaggedValue[];
  country: string;
  marketplace: string;
  /** Host name used to build the product link. */
  domain_name: string;
  node: Node[];
  item_dimensions?: ItemDimensions;
}
/**
 * Flattens the descriptive fields of an item into one space-separated string
 * suitable for use as searchable chunk text.
 *
 * Only the first entry of each single-valued attribute (brand, name, color,
 * model, product type, style) is used; every bullet point and keyword is
 * included, each followed by ";". Missing, null, or empty fields are skipped.
 *
 * @param item - Parsed product record; any field may be absent at runtime.
 * @returns The labelled fields joined by single spaces, or "" when nothing is present.
 */
function itemToSearchableString(item: Item): string {
  const parts: string[] = [];

  // Appends `prefix + field + postfix` when the field holds a non-empty value.
  const addField = (
    field: string | null | undefined,
    prefix: string = "",
    postfix: string = "",
  ): void => {
    if (field) {
      parts.push(`${prefix}${field}${postfix}`);
    }
  };

  addField(item.brand?.[0]?.value, "Brand: ");
  addField(item.item_name?.[0]?.value, "Product Name: ");
  // `bp?.` guards against null entries inside the array.
  item.bullet_point?.forEach((bp) => addField(bp?.value, "", ";"));
  addField(item.color?.[0]?.value, "Color: ");
  addField(item.model_name?.[0]?.value, "Model Name: ");
  addField(item.model_number?.[0]?.value, "Model Number: ");

  // `!= null` rejects both undefined and a JSON null, which would otherwise
  // crash on the string conversion; a year of 0 is still emitted.
  const modelYear = item.model_year?.[0]?.value;
  if (modelYear != null) {
    addField(String(modelYear), "Model Year: ");
  }

  addField(item.product_type?.[0]?.value, "Product Type: ");
  addField(item.style?.[0]?.value, "Style: ");
  item.item_keywords?.forEach((kw) => addField(kw?.value, "", ";"));
  addField(item.country, "Country: ");
  addField(item.marketplace, "Marketplace: ");
  addField(item.domain_name, "Domain: ");

  return parts.join(" ").trim();
}
// Trieve credentials come from the environment (Bun runtime). Both fall back
// to an empty string so the script still starts, but the problem is reported
// up front instead of surfacing later as an opaque API failure.
const trieveApiKey = Bun.env.TRIEVE_API_KEY ?? "";
const trieveDatasetId = Bun.env.TRIEVE_DATASET_ID ?? "";
if (trieveApiKey === "" || trieveDatasetId === "") {
  console.warn(
    "TRIEVE_API_KEY and/or TRIEVE_DATASET_ID is not set; uploads to Trieve will likely fail.",
  );
}

// Shared client configuration pointing at the hosted Trieve API.
const trieveApiConfig = new Configuration({
  apiKey: trieveApiKey,
  basePath: "https://api.trieve.ai",
});
/**
 * Parses one JSON document into an `Item`.
 *
 * The result is NOT validated: the parsed value is simply treated as an
 * `Item`, so callers must tolerate missing fields.
 *
 * @param jsonString - JSON text describing a single item.
 * @returns The parsed value, typed as `Item`.
 * @throws SyntaxError when `jsonString` is not valid JSON.
 */
function parseItem(jsonString: string): Item {
  return JSON.parse(jsonString) as Item;
}
// Chunk API client built from the shared Trieve configuration; used for the batch uploads.
const chunkApi = new ChunkApi(trieveApiConfig);
/**
 * Reads a newline-delimited JSON file, converts every line into a chunk
 * payload, and uploads the payloads to Trieve in fixed-size batches.
 *
 * Lines that fail to parse are logged and skipped; blank lines are ignored.
 * A failed batch upload is logged and the remaining batches are still
 * attempted, so the returned array holds every payload that was built, not
 * only the ones that were uploaded successfully.
 *
 * @param filePath - Path of the file to read (one JSON item per line).
 * @returns All chunk payloads built from the file.
 */
async function parseItemsFromFile(filePath: string) {
  const fileStream = fs.createReadStream(filePath);
  const rl = readline.createInterface({
    input: fileStream,
    crlfDelay: Infinity, // treat \r\n as a single line break
  });

  const items = [];
  let lineNumber = 0;
  for await (const line of rl) {
    lineNumber += 1;
    // A blank line (e.g. a trailing newline) is not a malformed record.
    if (line.trim() === "") {
      continue;
    }
    try {
      const item: Item = JSON.parse(line);

      // Resolve the main image to its path; a missing or empty id means no image.
      const imagePath = item.main_image_id
        ? imageHashMap.get(item.main_image_id)
        : undefined;
      const imageUrl =
        imagePath != null
          ? `https://amazon-berkeley-objects.s3.amazonaws.com/images/small/${imagePath}`
          : undefined;

      items.push({
        chunk_html: itemToSearchableString(item),
        link: `https://${item.domain_name}/dp/${item.item_id}`,
        tracking_id: item.item_id,
        tag_set: item.item_keywords?.map((kw) => kw.value),
        upsert_by_tracking_id: true,
        metadata: {
          imageUrl,
        },
      });
    } catch (error) {
      console.error(
        `Error parsing JSON from line ${lineNumber} of ${filePath}:`,
        error,
      );
    }
  }

  // Split the payloads into batches for the bulk endpoint.
  const chunkSize = 30;
  const chunkedItems = [];
  for (let i = 0; i < items.length; i += chunkSize) {
    chunkedItems.push(items.slice(i, i + chunkSize));
  }

  console.log(
    `Bulk uploading ${chunkedItems.length} batches of up to ${chunkSize} items`,
  );
  // Batches are sent sequentially; a failure is logged and does not abort the rest.
  for (const [batchIndex, chunk] of chunkedItems.entries()) {
    try {
      await chunkApi.createChunk(trieveDatasetId, chunk);
    } catch (error) {
      console.error(
        `Failed to create chunk (batch ${batchIndex + 1} of ${chunkedItems.length}, ${filePath})`,
      );
      console.error(error);
    }
  }
  return items;
}
/**
 * Parses CSV text into a map from the first column to the fourth column
 * (the image id and the image path in the images CSV this script loads).
 *
 * The first line is treated as a header and skipped. Cells are split on plain
 * commas (quoted fields are not supported) and trimmed, which also removes
 * "\r" from CRLF files. Rows lacking either value after trimming are ignored.
 *
 * @param csvData - Full CSV text, including the header line.
 * @returns Map of image id to image path; empty when no data row qualifies.
 */
function parseCSV(csvData: string): Map<any, any> {
  const hashmap = new Map<string, string>();

  // Everything after the header line is a candidate data row.
  const rows = csvData.split('\n').slice(1);
  for (const row of rows) {
    const columns = row.trim().split(',');
    const imageId = columns[0]?.trim();
    const imagePath = columns[3]?.trim();
    // Require both values to be non-empty so no blank path enters the map.
    if (imageId && imagePath) {
      hashmap.set(imageId, imagePath);
    }
  }
  return hashmap;
}
// Load the image-id -> image-path lookup table once, before any item file is
// processed. A missing or unreadable CSV rejects here and aborts the script.
const csvFilePath = 'images.csv';
const csvImageData = await Bun.file(csvFilePath).text();
const imageHashMap = parseCSV(csvImageData);
// parseCSV always returns a Map, so an empty map is the only "failure" signal.
if (imageHashMap.size === 0) {
  console.warn(
    `No image paths were loaded from ${csvFilePath}; chunks will be uploaded without imageUrl metadata.`,
  );
}
// Entry point: process every entry of the data directory. Files are handled
// concurrently; a failure in one file is logged and does not stop the others.
const directoryPath = "./amazon-abo";
try {
  const files = await fs.promises.readdir(directoryPath);
  await Promise.all(
    files.map(async (file) => {
      const filePath = path.join(directoryPath, file);
      try {
        const items = await parseItemsFromFile(filePath);
        console.log(`Processed ${file}: ${items.length} items`);
      } catch (error) {
        console.error(`Error processing ${file}:`, error);
      }
    }),
  );
} catch (err) {
  console.error("Error reading directory:", err);
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment