// Gist densumesh/1b4bd2863555dd034b08803fc362b678 — created April 28, 2024 06:57.
// GitHub notice: this file contains bidirectional Unicode text that may be
// interpreted or compiled differently than what appears below. To review it,
// open the file in an editor that reveals hidden Unicode characters.
import * as fs from "fs";
import * as readline from "readline";
// Trieve connection settings, read from environment variables.
// `process.env` is used rather than `Bun.env` (which Bun documents as an alias
// of `process.env`) so the script runs unchanged under both Bun and Node.
// The API key and dataset id fall back to "" when unset, in which case the
// upload requests are sent with empty credentials.
const trieveApiKey = process.env.TRIEVE_API_KEY ?? "";
const trieveDatasetId = process.env.TRIEVE_DATASET_ID ?? "";
const trieveBaseUrl = process.env.TRIEVE_BASE_URL ?? "https://api.trieve.ai";
/**
 * Structured address attached to a business record (`Business.address`).
 * Every field is optional.
 *
 * NOTE(review): the mixed key casing (`City` vs `zip_code`) presumably mirrors
 * the keys of the source data — confirm before renaming anything.
 */
interface Address {
  City?: string;
  Country?: string;
  State?: string;
  zip_code?: string;
}
/** A single amenity entry of a business: its name and whether it is available. */
interface Amenity {
  /** Whether the business offers this amenity. */
  available: boolean;
  /** Display name of the amenity. */
  name: string;
}
/**
 * One chunk as serialized into the request body of the chunk-upload call.
 * Each instance is built from a single `Business` record.
 */
interface UploadChunk {
  /** Searchable text of the chunk (the newline-joined business summary). */
  chunk_html: string;
  /** URL of the business, or "" when the record has none. */
  link: string;
  /** Yelp business id of the record, or "" when the record has none. */
  tracking_id: string;
  /** Coordinates of the business; either value may be missing in the record. */
  location: { lon: number | undefined; lat: number | undefined };
  /**
   * Free-form additional fields. Typed as `unknown` instead of `any` so that
   * any code reading a value has to narrow it first.
   */
  metadata: Record<string, unknown>;
  /** Tags of the chunk (the business categories). */
  tag_set: string[];
  /**
   * NOTE(review): presumably tells the API to update an existing chunk that has
   * the same `tracking_id` instead of creating a duplicate — confirm against
   * the Trieve API documentation.
   */
  upsert_by_tracking_id: boolean;
}
/**
 * Shape of one business record as parsed from a line of the input file.
 * Every field is optional because the parsed JSON is not validated field by
 * field; consumers must handle missing values.
 */
interface Business {
  about_the_business?: string;
  address?: Address;
  amenities?: Amenity[];
  business_id?: string;
  categories?: string[];
  city?: string;
  country?: string;
  full_address?: string;
  highlights?: string[];
  is_claimed?: boolean;
  latitude?: number;
  longitude?: number;
  name?: string;
  /** Opening hours keyed by day. */
  opening_hours?: { [day: string]: string };
  overall_rating?: number;
  phone_number?: string;
  price_range?: string;
  reviews_count?: number;
  services_offered?: string[];
  state?: string;
  updates_from_business?: string[];
  url?: string;
  website?: string;
  /** Used as the chunk tracking id when uploading. */
  yelp_biz_id?: string;
  zip_code?: string;
}
/**
 * Builds the searchable text for a business: one `Label: value` line per
 * populated field, joined with newlines.
 *
 * Fields that are missing, null, or empty (including empty arrays) are left
 * out entirely, so the output never contains lines such as "Name: undefined"
 * or a bare "Categories: ".
 *
 * @param business - The business record to summarize.
 * @returns The newline-joined summary, or "" when no field is populated.
 */
function businessToSearchableString(business: Business): string {
  const lines: string[] = [];
  // Appends a line only when there is an actual, non-empty value to show.
  const addLine = (label: string, value: string | undefined | null): void => {
    if (value) {
      lines.push(`${label}: ${value}`);
    }
  };

  addLine("Name", business.name);
  addLine("About", business.about_the_business);
  addLine("Address", business.full_address);
  addLine("Categories", business.categories?.join("; "));
  addLine(
    "Amenities",
    business.amenities
      ?.map((amenity) => `${amenity.name}: ${amenity.available}`)
      .join(", "),
  );
  addLine("Highlights", business.highlights?.join("; "));

  return lines.join("\n");
}
/**
 * Parses one JSON document into a `Business` record.
 *
 * Only the top-level shape is checked: the document must be a JSON object.
 * Individual fields are not validated.
 *
 * @param jsonString - JSON text describing a single business.
 * @returns The parsed record.
 * @throws SyntaxError when `jsonString` is not valid JSON.
 * @throws TypeError when the JSON value is not an object (null, array, or primitive).
 */
function parseBusiness(jsonString: string): Business {
  const parsed: unknown = JSON.parse(jsonString);
  if (typeof parsed !== "object" || parsed === null || Array.isArray(parsed)) {
    let actual: string = typeof parsed;
    if (parsed === null) {
      actual = "null";
    } else if (Array.isArray(parsed)) {
      actual = "array";
    }
    throw new TypeError(
      `Expected a JSON object describing a business, got ${actual}`,
    );
  }
  // Narrowed to a plain object above; field-level contents are trusted as-is.
  return parsed as Business;
}
/** Maps one parsed business record onto the chunk payload used for upload. */
function businessToUploadChunk(business: Business): UploadChunk {
  return {
    chunk_html: businessToSearchableString(business),
    link: business.url ?? "",
    // NOTE(review): records without a yelp_biz_id all share the tracking id ""
    // while upsert_by_tracking_id is true — confirm how the API treats that.
    tracking_id: business.yelp_biz_id ?? "",
    location: {
      lat: business.latitude,
      lon: business.longitude,
    },
    metadata: {
      phone_number: business.phone_number,
      price_range: business.price_range,
      // Numeric and boolean values are stored as strings; lists as "; "-joined text.
      reviews_count: business.reviews_count?.toString(),
      overall_rating: business.overall_rating?.toString(),
      services_offered: business.services_offered?.join("; "),
      updates_from_business: business.updates_from_business?.join("; "),
      opening_hours: business.opening_hours,
      is_claimed: business.is_claimed?.toString(),
      address: {
        city: business.city,
        country: business.country,
        state: business.state,
        zip_code: business.zip_code,
      },
      amenities: business.amenities,
    },
    tag_set: business.categories ?? [],
    upsert_by_tracking_id: true,
  };
}

/**
 * Reads a newline-delimited JSON file and converts every line into an upload
 * chunk.
 *
 * Blank lines are skipped. A line that cannot be parsed or converted is
 * logged with its 1-based line number and skipped; it does not abort the run.
 *
 * @param filePath - Path of the NDJSON file, one business record per line.
 * @returns The chunks for all lines that were converted successfully, in file order.
 */
async function parseBusinessesFromFile(
  filePath: string,
): Promise<UploadChunk[]> {
  const rl = readline.createInterface({
    input: fs.createReadStream(filePath),
    // Treat "\r\n" as a single line break regardless of the delay between them.
    crlfDelay: Infinity,
  });

  const businesses: UploadChunk[] = [];
  let lineNumber = 0;
  for await (const line of rl) {
    lineNumber += 1;
    if (line.trim() === "") {
      // Empty lines carry no record; skipping them avoids spurious parse errors.
      continue;
    }
    try {
      businesses.push(businessToUploadChunk(parseBusiness(line)));
    } catch (error) {
      console.error(`Error parsing JSON from line ${lineNumber}: ${error}`);
    }
  }
  return businesses;
}
/**
 * Uploads the given chunks to `${trieveBaseUrl}/api/chunk` in batches of 100,
 * one request at a time.
 *
 * A failed batch (network error or non-2xx response) is logged together with
 * the HTTP status and response body, and the upload continues with the next
 * batch; this function itself does not reject because of a failed batch.
 *
 * @param businesses - Chunks to upload; an empty array results in no requests.
 */
async function uploadBusinessesToChunkApi(
  businesses: UploadChunk[],
): Promise<void> {
  const chunkSize = 100;
  for (let start = 0; start < businesses.length; start += chunkSize) {
    const chunk = businesses.slice(start, start + chunkSize);
    try {
      console.log(`Uploading chunk of ${chunk.length} businesses`);
      const res = await fetch(`${trieveBaseUrl}/api/chunk`, {
        method: "POST",
        headers: {
          "Content-Type": "application/json",
          Authorization: trieveApiKey,
          "TR-Dataset": trieveDatasetId,
        },
        body: JSON.stringify(chunk),
      });
      if (!res.ok) {
        // A response body can be consumed only once, and an error body is not
        // guaranteed to be JSON, so read it a single time as plain text.
        const detail = await res.text();
        throw new Error(`HTTP ${res.status} ${res.statusText}: ${detail}`);
      }
    } catch (error) {
      // Best-effort upload: report the failed batch and keep going.
      console.error(`Failed to create chunk: ${error}`);
    }
  }
}
// Example usage: parse the NDJSON file, upload every chunk, then report.
const filePath = "/home/denssumesh/Downloads/yelp_partial.ndjson";
parseBusinessesFromFile(filePath)
  .then((businesses) =>
    // Return the upload promise so the summary is printed only after all
    // batches have been attempted, and so a rejection reaches the catch below.
    uploadBusinessesToChunkApi(businesses).then(() => {
      console.log(`Processed ${filePath}: ${businesses.length} entries`);
    }),
  )
  .catch((error) => {
    console.error(`Error processing ${filePath}: ${error}`);
  });
// GitHub page footer: sign up for free to join this conversation on GitHub,
// or sign in to comment if you already have an account.