// Source: GitHub Gist densumesh/1b4bd2863555dd034b08803fc362b678
// Author: @densumesh
// Created: April 28, 2024 06:57
import * as fs from "fs";
import * as readline from "readline";
// Trieve connection settings, read from the process environment via Bun.env.
// An unset API key or dataset id falls back to the empty string; an unset
// base URL falls back to https://api.trieve.ai.
// Sent as the `Authorization` request header on upload.
const trieveApiKey = Bun.env.TRIEVE_API_KEY ?? "";
// Sent as the `TR-Dataset` request header on upload.
const trieveDatasetId = Bun.env.TRIEVE_DATASET_ID ?? "";
// Prefix for the `/api/chunk` endpoint.
const trieveBaseUrl = Bun.env.TRIEVE_BASE_URL ?? "https://api.trieve.ai";
/**
 * Structured address of a business (the type of `Business.address`).
 * All fields are optional.
 * NOTE(review): key casing is mixed (`City`/`Country`/`State` vs `zip_code`);
 * presumably this mirrors the keys in the input data — confirm before renaming.
 */
interface Address {
City?: string;
Country?: string;
State?: string;
zip_code?: string;
}
/**
 * A single amenity entry of a business: its name and an availability flag.
 * Rendered as `name: available` in the searchable text of a business.
 */
interface Amenity {
available: boolean;
name: string;
}
/**
 * Payload for one chunk in the body POSTed to `/api/chunk`
 * (the request body is a JSON array of these objects).
 */
interface UploadChunk {
  /** Text content of the chunk. */
  chunk_html: string;
  /** URL associated with the chunk; empty string when none is known. */
  link: string;
  /** Caller-supplied identifier for the chunk; empty string when none is known. */
  tracking_id: string;
  /** Coordinates; either side may be undefined when the record has none. */
  location: { lon: number | undefined; lat: number | undefined };
  /**
   * Free-form extra fields. Typed as `unknown` values (instead of `any`) so
   * readers have to narrow before use; any value can still be assigned.
   */
  metadata: Record<string, unknown>;
  /** Tags attached to the chunk. */
  tag_set: string[];
  /**
   * NOTE(review): presumably asks the API to update an existing chunk with
   * the same `tracking_id` instead of creating a duplicate — confirm against
   * the Trieve API documentation.
   */
  upsert_by_tracking_id: boolean;
}
/**
 * One business record as parsed from a single line of the NDJSON input file.
 * Every field is optional, so consumers must handle missing values.
 */
interface Business {
// Included in the searchable text as the "About" line.
about_the_business?: string;
address?: Address;
// Included in the searchable text and copied into the chunk metadata.
amenities?: Amenity[];
business_id?: string;
// Included in the searchable text and used as the chunk's tag_set.
categories?: string[];
city?: string;
country?: string;
// Included in the searchable text as the "Address" line.
full_address?: string;
// Included in the searchable text as the "Highlights" line.
highlights?: string[];
is_claimed?: boolean;
// latitude/longitude become the chunk's location.
latitude?: number;
longitude?: number;
name?: string;
// Keyed by day; copied into the chunk metadata unchanged.
opening_hours?: { [day: string]: string };
overall_rating?: number;
phone_number?: string;
price_range?: string;
reviews_count?: number;
services_offered?: string[];
state?: string;
updates_from_business?: string[];
// Used as the chunk's link.
url?: string;
website?: string;
// Used as the chunk's tracking_id.
yelp_biz_id?: string;
zip_code?: string;
}
/**
 * Builds the plain-text representation of a business that is indexed for
 * search: one `Label: value` line per populated field, joined by newlines.
 *
 * Fields that are missing, null, empty strings or empty arrays are left out
 * entirely, so the text never contains a literal "undefined" or a bare label.
 *
 * @param business Parsed business record; every field is optional.
 * @returns The labelled lines joined with "\n", or "" when nothing is populated.
 */
function businessToSearchableString(business: Business): string {
  const lines: string[] = [];

  // Appends a labelled line only when there is a non-empty value to show.
  const addLine = (label: string, value: string | null | undefined): void => {
    if (value) {
      lines.push(`${label}: ${value}`);
    }
  };

  // Joins a list, yielding undefined for a missing or empty list so that the
  // caller skips the whole line instead of emitting e.g. "Categories: ".
  const joinList = (
    items: readonly string[] | null | undefined,
    separator: string,
  ): string | undefined =>
    items && items.length > 0 ? items.join(separator) : undefined;

  addLine("Name", business.name);
  addLine("About", business.about_the_business);
  addLine("Address", business.full_address);
  addLine("Categories", joinList(business.categories, "; "));
  addLine(
    "Amenities",
    joinList(
      business.amenities?.map(
        (amenity) => `${amenity.name}: ${amenity.available}`,
      ),
      ", ",
    ),
  );
  addLine("Highlights", joinList(business.highlights, "; "));

  return lines.join("\n");
}
/**
 * Parses one JSON document into a Business record.
 *
 * Only the top-level shape is validated: the document must be a JSON object.
 * Individual fields are not checked, so consumers still have to treat every
 * field as optional.
 *
 * @param jsonString JSON text of a single business record.
 * @returns The parsed record.
 * @throws SyntaxError when `jsonString` is not valid JSON.
 * @throws TypeError when the JSON value is not an object (null, array, number, ...).
 */
function parseBusiness(jsonString: string): Business {
  const data: unknown = JSON.parse(jsonString);
  if (typeof data !== "object" || data === null || Array.isArray(data)) {
    const actual =
      data === null ? "null" : Array.isArray(data) ? "array" : typeof data;
    throw new TypeError(
      `Expected a JSON object for a business record, got ${actual}`,
    );
  }
  // The runtime check above guarantees a plain JSON object here.
  return data as Business;
}
/** Maps one parsed business record to the chunk payload that gets uploaded. */
function businessToUploadChunk(business: Business): UploadChunk {
  return {
    chunk_html: businessToSearchableString(business),
    link: business.url ?? "",
    tracking_id: business.yelp_biz_id ?? "",
    location: {
      lat: business.latitude,
      lon: business.longitude,
    },
    metadata: {
      phone_number: business.phone_number,
      price_range: business.price_range,
      reviews_count: business.reviews_count?.toString(),
      overall_rating: business.overall_rating?.toString(),
      services_offered: business.services_offered?.join("; "),
      updates_from_business: business.updates_from_business?.join("; "),
      opening_hours: business.opening_hours,
      is_claimed: business.is_claimed?.toString(),
      address: {
        city: business.city,
        country: business.country,
        state: business.state,
        zip_code: business.zip_code,
      },
      amenities: business.amenities,
    },
    tag_set: business.categories ?? [],
    upsert_by_tracking_id: true,
  };
}

/**
 * Reads a newline-delimited JSON file and converts every record into an
 * upload chunk.
 *
 * Blank lines are skipped silently. A line that cannot be parsed or converted
 * is reported on stderr together with its 1-based line number and then
 * skipped, so one bad record does not abort the whole import.
 *
 * @param filePath Path of the NDJSON file, one business record per line.
 * @returns The chunks for all lines that were converted, in file order.
 */
async function parseBusinessesFromFile(
  filePath: string,
): Promise<UploadChunk[]> {
  const fileStream = fs.createReadStream(filePath);
  const rl = readline.createInterface({
    input: fileStream,
    // Treat every "\r\n" as a single line break, regardless of timing.
    crlfDelay: Infinity,
  });

  const businesses: UploadChunk[] = [];
  let lineNumber = 0;
  for await (const line of rl) {
    lineNumber += 1;
    // Blank separator lines are not records; do not report them as errors.
    if (line.trim() === "") {
      continue;
    }
    try {
      businesses.push(businessToUploadChunk(parseBusiness(line)));
    } catch (error) {
      console.error(`Error parsing JSON from line ${lineNumber}: ${error}`);
    }
  }
  return businesses;
}
/**
 * Uploads chunks to the `/api/chunk` endpoint in batches of 100, one request
 * at a time.
 *
 * Upload is best-effort: a batch that fails (network error or non-2xx
 * response) is reported on stderr and the remaining batches are still sent.
 *
 * @param businesses Chunks to upload; an empty array sends no requests.
 */
async function uploadBusinessesToChunkApi(
  businesses: UploadChunk[],
): Promise<void> {
  const chunkSize = 100;
  for (let start = 0; start < businesses.length; start += chunkSize) {
    const chunk = businesses.slice(start, start + chunkSize);
    try {
      console.log(`Uploading chunk of ${chunk.length} businesses`);
      const res = await fetch(`${trieveBaseUrl}/api/chunk`, {
        method: "POST",
        headers: {
          "Content-Type": "application/json",
          Authorization: trieveApiKey,
          "TR-Dataset": trieveDatasetId,
        },
        body: JSON.stringify(chunk),
      });
      if (!res.ok) {
        // A response body can be consumed only once, so read it a single
        // time. Reading as text also works when the error body is not JSON
        // and avoids "[object Object]" in the message.
        const detail = await res.text();
        throw new Error(`${res.status} ${res.statusText}: ${detail}`);
      }
    } catch (error) {
      console.error(`Failed to create chunk: ${error}`);
    }
  }
}
// Example usage: parse the NDJSON file, upload its records, then report how
// many entries were processed.
const filePath = "/home/denssumesh/Downloads/yelp_partial.ndjson";
parseBusinessesFromFile(filePath)
  .then(async (businesses) => {
    // Wait for the upload so that "Processed" is only logged once every batch
    // has been attempted, and so that a rejection reaches the catch below
    // instead of becoming an unhandled promise rejection.
    await uploadBusinessesToChunkApi(businesses);
    console.log(`Processed ${filePath}: ${businesses.length} entries`);
  })
  .catch((error) => {
    console.error(`Error processing ${filePath}: ${error}`);
  });
// Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment