Skip to content

Instantly share code, notes, and snippets.

@eusoubrasileiro
Created November 19, 2024 10:56
Show Gist options
  • Save eusoubrasileiro/d0629be76c7fac263f092ac2d99ee704 to your computer and use it in GitHub Desktop.
Save eusoubrasileiro/d0629be76c7fac263f092ac2d99ee704 to your computer and use it in GitHub Desktop.
Customized save and load InMemoryDocumentStore `Document`s
import json
import zlib
import numpy as np
from pathlib import Path
from haystack import Document
from haystack.document_stores.in_memory import InMemoryDocumentStore
from tqdm import tqdm
# Configuration
ROOT_PATH = Path("/mnt/shared/ipp/")
OUTPUT_DOC_DIR = ROOT_PATH / "haystack/docs"
OUTPUT_EMBED_DIR = ROOT_PATH / "haystack/embeddings/"

# NOTE(review): DOCS_PATH, document_splitter and document_embedder are defined
# elsewhere (not visible in this chunk) -- confirm they are in scope here.

# Warm up the embedder ONCE for the whole run instead of once per file
# (it was previously called inside the loop on every iteration).
document_embedder.warm_up()

for path in tqdm(list(DOCS_PATH.glob("*.txt"))):
    embeddings_file = OUTPUT_EMBED_DIR / f"{path.stem}.npz"
    textnmeta_file = OUTPUT_DOC_DIR / f"{path.stem}.meta"
    # Skip documents already processed (both artifacts present).
    # Some names include '.' so Path.stem is used consistently on save and load.
    if embeddings_file.exists() and textnmeta_file.exists():
        continue
    # Read the document content; be explicit about the text encoding.
    with path.open("r", encoding="utf-8") as f:
        text = f.read()
    doc = Document(content=text, meta={"title": path.name})
    # TODO: enrich meta with more metadata from spotify, soundcloud etc.
    split_docs = document_splitter.run([doc])  # poor split, only by '.' per sentence
    # TODO: needs better cleaning and maybe another splitter?
    # document_embedder.run attaches an .embedding to each chunk Document.
    docs = document_embedder.run(split_docs["documents"])
    # Collect embeddings and (content, meta) pairs side by side, in the same
    # order, so the loader can zip them back together.
    embeddings = []
    metadata = []
    for chunk in docs["documents"]:  # renamed from `doc` to avoid shadowing the outer doc
        metadata.append({
            "content": chunk.content,
            "meta": chunk.meta
        })
        embeddings.append(chunk.embedding)
    # Embeddings go to a compressed .npz; text+meta to zlib-compressed JSON.
    np.savez_compressed(embeddings_file, embeddings=embeddings)
    with textnmeta_file.open("wb") as f:
        compressed_metadata = zlib.compress(json.dumps(metadata).encode("utf-8"))
        f.write(compressed_metadata)
def load_documents_and_embeddings(verbose=False):
    """
    Load metadata and embeddings from disk and recreate documents for the
    InMemoryDocumentStore.

    Reads every ``*.meta`` file in OUTPUT_DOC_DIR (zlib-compressed JSON, a
    list of ``{"content": ..., "meta": ...}`` dicts per chunk) together with
    its matching ``.npz`` embeddings file in OUTPUT_EMBED_DIR, rebuilds the
    Documents in the same order, and writes them into a fresh store.
    Files with a missing counterpart or a count mismatch are skipped with a
    message rather than aborting the whole reconstruction.

    :param verbose: when True, print progress and a final summary.
    :return: the populated InMemoryDocumentStore.
    """
    # Initialize an empty document store
    document_store = InMemoryDocumentStore()
    if verbose:
        print("Reconstructing the InMemoryDocumentStore...")
    for textnmeta_file in tqdm(list(OUTPUT_DOC_DIR.glob("*.meta"))):
        embeddings_file = OUTPUT_EMBED_DIR / f"{textnmeta_file.stem}.npz"
        if not embeddings_file.exists():
            print(f"Warning: Embeddings file missing for {textnmeta_file}. Skipping...")
            continue
        # Load metadata (zlib-compressed JSON written by the save loop).
        with textnmeta_file.open("rb") as f:
            compressed_metadata = f.read()
        metadata = json.loads(zlib.decompress(compressed_metadata).decode("utf-8"))
        # np.load on an .npz returns an NpzFile that holds the file open;
        # use it as a context manager so the handle is released promptly.
        with np.load(embeddings_file) as embeddings_data:
            embeddings = embeddings_data["embeddings"]
        # The two files must describe the same chunks, in the same order.
        if len(metadata) != len(embeddings):
            print(f"Error: Metadata and embeddings count mismatch in {textnmeta_file}. Skipping...")
            continue
        documents = []
        for meta, embedding in zip(metadata, embeddings):
            # .tolist() converts the numpy row back to the plain list[float]
            # that Document.embedding originally held when it was saved.
            documents.append(Document(content=meta["content"], meta=meta["meta"],
                                      embedding=embedding.tolist()))
        document_store.write_documents(documents)
    if verbose:
        # count_documents() is the public API; avoids reaching into the
        # store's internal .storage dict.
        print(f"Reconstruction complete. {document_store.count_documents()} documents loaded into the store.")
    return document_store
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment