Last active
March 14, 2024 16:49
-
-
Save databyjp/c13b1cdd474bead8e930702cfaae858e to your computer and use it in GitHub Desktop.
Rough script to export data from Weaviate to a series of JSON files
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import weaviate | |
from weaviate.collections import Collection | |
from weaviate.collections.classes.types import GeoCoordinate | |
from tqdm import tqdm | |
from typing import List | |
import os | |
import json | |
from datetime import date, datetime | |
# Connection settings are read from the environment: WCS_DEMO_URL holds the
# cluster URL and WCS_DEMO_RO_KEY the API key used to authenticate.
wcs_cluster_url = os.getenv("WCS_DEMO_URL")
wcs_api_key = weaviate.auth.AuthApiKey(os.getenv("WCS_DEMO_RO_KEY"))
client = weaviate.connect_to_wcs(
    cluster_url=wcs_cluster_url,
    auth_credentials=wcs_api_key,
)
def export_data(collection_src: Collection) -> List:
    """Export every object of a collection as a list of JSON-serialisable dicts.

    Iterates over ``collection_src`` with vectors included. Each exported dict
    contains the object's properties, one ``vector_<name>`` entry per item in
    the object's ``vector`` mapping, and the object's id as a string under
    ``uuid``.

    Property values are converted recursively, so dates and geo coordinates
    inside lists or nested dicts are handled as well as top-level ones.

    Args:
        collection_src: The collection (or tenant-scoped collection) to read.

    Returns:
        A list with one dict per object, in iteration order.
    """

    def _to_jsonable(value):
        """Return ``value`` with dates and geo coordinates made JSON-friendly."""
        # Convert datetime and date objects to ISO 8601 strings
        if isinstance(value, (datetime, date)):
            return value.isoformat()
        # Convert GeoCoordinates to dict
        if isinstance(value, GeoCoordinate):
            return value._to_dict()
        # Recurse into containers so nested dates/coordinates are converted too
        if isinstance(value, dict):
            return {key: _to_jsonable(item) for key, item in value.items()}
        if isinstance(value, (list, tuple)):
            return [_to_jsonable(item) for item in value]
        return value

    exported = []
    for src_obj in tqdm(collection_src.iterator(include_vector=True)):
        # Build a fresh dict rather than mutating the properties of the source object
        record = {
            prop_name: _to_jsonable(prop_value)
            for prop_name, prop_value in src_obj.properties.items()
        }
        for vector_name, vector in src_obj.vector.items():
            record[f"vector_{vector_name}"] = vector
        record["uuid"] = str(src_obj.uuid)
        exported.append(record)
    return exported
def save_objects(coll_name: str, coll_objects: List):
    """Dump ``coll_objects`` as indented JSON to ``exported_data_<coll_name>.json``.

    The file is written relative to the current working directory and is
    overwritten if it already exists.

    Args:
        coll_name: Collection name, used to build the output file name.
        coll_objects: JSON-serialisable data to write.

    Returns:
        Always ``True``.
    """
    target_path = f"exported_data_{coll_name}.json"
    with open(target_path, "w") as out_file:
        json.dump(coll_objects, fp=out_file, indent=2)
    return True
# Export every collection's objects and schema to JSON files in the current
# working directory. ``obj_count`` is the running total of exported objects.
obj_count = 0
try:
    for c_name in client.collections.list_all(simple=True):
        print(f"Backing up {c_name}")
        collection = client.collections.get(c_name)
        # Fetch the config once; it drives both the tenancy check and the schema dump
        coll_config = collection.config.get()

        # Back up objects
        if coll_config.multi_tenancy_config.enabled:
            # Multi-tenant collections are saved as {tenant: [objects, ...]}
            mt_objects = dict()
            mt_obj_count = 0
            for tenant in collection.tenants.get():
                tenant_collection = collection.with_tenant(tenant)
                mt_coll_objects = export_data(tenant_collection)
                mt_objects[tenant] = mt_coll_objects
                mt_obj_count += len(mt_coll_objects)
                print(f"Tenant {tenant} for Collection {c_name} exported with {len(mt_coll_objects)} objects")
            obj_count += mt_obj_count
            save_objects(c_name, mt_objects)
            print(f"Collection {c_name} saved with {mt_obj_count} objects across {len(mt_objects)} tenants")
        else:
            coll_objects = export_data(collection)
            obj_count += len(coll_objects)
            save_objects(c_name, coll_objects)
            print(f"Collection {c_name} saved with {len(coll_objects)} objects")

        # Back up schema
        with open(f"exported_schema_{c_name}.json", "w") as f:
            json.dump(coll_config.to_dict(), f, indent=2)
        print(f"Schema for Collection {c_name} saved")

    print(f"Exported {obj_count} objects in total")
finally:
    # Always release the connection, even if an export step fails part-way
    client.close()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment