eusoubrasileiro · November 19, 2024 10:56
diff --git a/gistfile1.txt b/gistfile1.txt
 import json
 import zlib
 import numpy as np
 from pathlib import Path
 from haystack import Document
 from haystack.document_stores.in_memory import InMemoryDocumentStore
 from tqdm import tqdm

 # Configuration
 # All pipeline artifacts live under one shared root directory.
 ROOT_PATH = Path("/mnt/shared/ipp/")
 OUTPUT_DOC_DIR = ROOT_PATH / "haystack/docs"  # zlib-compressed JSON, one *.meta per source
 OUTPUT_EMBED_DIR = ROOT_PATH / "haystack/embeddings/"  # one *.npz per source

 # Create the output directories up front so the writes below cannot fail
 # with FileNotFoundError on a fresh machine.
 OUTPUT_DOC_DIR.mkdir(parents=True, exist_ok=True)
 OUTPUT_EMBED_DIR.mkdir(parents=True, exist_ok=True)

 # Split and embed every source text file and cache the result on disk as a
 # pair of files per source: <stem>.npz (embeddings) and <stem>.meta
 # (compressed JSON list holding each chunk's content and meta).
 # NOTE(review): DOCS_PATH, document_splitter and document_embedder are not
 # defined in the visible part of this file; they must already exist in the
 # running session.
 for path in tqdm(list(DOCS_PATH.glob("*.txt"))):
     # Output names are built by appending the suffix to path.stem; some
     # source names contain '.' characters.
     embedings_file = OUTPUT_EMBED_DIR / f"{path.stem}.npz"
     textnmeta_file = OUTPUT_DOC_DIR / f"{path.stem}.meta"
     # Skip sources whose two output files are both present already, which
     # makes the script resumable after an interruption.
     if embedings_file.exists() and textnmeta_file.exists():
         continue
     # Read the document content. The encoding is explicit so the result does
     # not depend on the platform locale (the metadata is written as UTF-8 too).
     with path.open("r", encoding="utf-8") as f:
         text = f.read()
     doc = Document(content=text, meta={"title": path.name})
     # TODO: need more metadata from spotify, soundcloud etc.
     # TODO: poor split (only by '.' for sentences); needs a good cleaning
     # step and maybe another splitter.
     split_docs = document_splitter.run([doc])
     # warm_up() stays inside the loop so the model is only loaded when at
     # least one file actually needs embedding.
     # NOTE(review): assumes repeated warm_up() calls are cheap no-ops -
     # confirm for the embedder in use.
     document_embedder.warm_up()
     # Input is a list[Document]; the Documents returned under 'documents'
     # carry their embeddings.
     docs = document_embedder.run(split_docs['documents'])
     # Collect per-chunk metadata and embeddings in matching order. The
     # comprehensions use their own loop variable so `doc` above is not rebound.
     metadata = [
         {"content": chunk.content, "meta": chunk.meta}
         for chunk in docs['documents']
     ]
     embeddings = [chunk.embedding for chunk in docs['documents']]
     # Save embeddings and metadata
     np.savez_compressed(embedings_file, embeddings=embeddings)
     with textnmeta_file.open("wb") as f:
         compressed_metadata = zlib.compress(json.dumps(metadata).encode("utf-8"))
         f.write(compressed_metadata)

 def load_documents_and_embeddings(verbose=False):
     """
     Rebuild an InMemoryDocumentStore from the files cached on disk.

     Each ``*.meta`` file in OUTPUT_DOC_DIR is paired with the ``.npz`` file
     of the same stem in OUTPUT_EMBED_DIR. The metadata file is a
     zlib-compressed UTF-8 JSON list of ``{"content": ..., "meta": ...}``
     entries; the ``.npz`` archive stores the matching vectors under the
     ``"embeddings"`` key. A pair is skipped (with a printed message) when the
     embeddings file is missing or the two files disagree on the entry count.

     Args:
         verbose: When True, print a start message and the final document count.

     Returns:
         The InMemoryDocumentStore holding every document that was loaded.
     """
     # Initialize an empty document store
     document_store = InMemoryDocumentStore()
     if verbose:
         print("Reconstructing the InMemoryDocumentStore...")

     for textnmeta_file in tqdm(list(OUTPUT_DOC_DIR.glob("*.meta"))):
         embedings_file = OUTPUT_EMBED_DIR / f"{textnmeta_file.stem}.npz"
         if not embedings_file.exists():
             print(f"Warning: Embeddings file missing for {textnmeta_file}. Skipping...")
             continue
         # Load metadata
         with textnmeta_file.open("rb") as f:
             compressed_metadata = f.read()
         metadata = json.loads(zlib.decompress(compressed_metadata).decode("utf-8"))
         # Load embeddings. np.load returns an NpzFile that keeps the archive
         # open until closed; the with block releases the handle per file
         # instead of leaking one for every cached document.
         with np.load(embedings_file) as embeddings_data:
             embeddings = embeddings_data["embeddings"]
         # Ensure the counts match
         if len(metadata) != len(embeddings):
             print(f"Error: Metadata and embeddings count mismatch in {textnmeta_file}. Skipping...")
             continue
         # Haystack declares Document.embedding as a list of floats, so hand it
         # plain Python lists rather than NumPy rows.
         documents = [
             Document(content=entry["content"], meta=entry["meta"], embedding=vector)
             for entry, vector in zip(metadata, embeddings.tolist())
         ]
         document_store.write_documents(documents)
     if verbose:
         print(f"Reconstruction complete. {len(document_store.storage)} documents loaded into the store.")
     return document_store
	import json
	import zlib
	import numpy as np
	from pathlib import Path
	from haystack import Document
	from haystack.document_stores.in_memory import InMemoryDocumentStore
	from tqdm import tqdm

	# Configuration
	# All pipeline artifacts live under one shared root directory.
	ROOT_PATH = Path("/mnt/shared/ipp/")
	OUTPUT_DOC_DIR = ROOT_PATH / "haystack/docs"  # zlib-compressed JSON, one *.meta per source
	OUTPUT_EMBED_DIR = ROOT_PATH / "haystack/embeddings/"  # one *.npz per source

	# Create the output directories up front so the writes below cannot fail
	# with FileNotFoundError on a fresh machine.
	OUTPUT_DOC_DIR.mkdir(parents=True, exist_ok=True)
	OUTPUT_EMBED_DIR.mkdir(parents=True, exist_ok=True)

	# Split and embed every source text file and cache the result on disk as a
	# pair of files per source: <stem>.npz (embeddings) and <stem>.meta
	# (compressed JSON list holding each chunk's content and meta).
	# NOTE(review): DOCS_PATH, document_splitter and document_embedder are not
	# defined in the visible part of this file; they must already exist in the
	# running session.
	for path in tqdm(list(DOCS_PATH.glob("*.txt"))):
	    # Output names are built by appending the suffix to path.stem; some
	    # source names contain '.' characters.
	    embedings_file = OUTPUT_EMBED_DIR / f"{path.stem}.npz"
	    textnmeta_file = OUTPUT_DOC_DIR / f"{path.stem}.meta"
	    # Skip sources whose two output files are both present already, which
	    # makes the script resumable after an interruption.
	    if embedings_file.exists() and textnmeta_file.exists():
	        continue
	    # Read the document content. The encoding is explicit so the result does
	    # not depend on the platform locale (the metadata is written as UTF-8 too).
	    with path.open("r", encoding="utf-8") as f:
	        text = f.read()
	    doc = Document(content=text, meta={"title": path.name})
	    # TODO: need more metadata from spotify, soundcloud etc.
	    # TODO: poor split (only by '.' for sentences); needs a good cleaning
	    # step and maybe another splitter.
	    split_docs = document_splitter.run([doc])
	    # warm_up() stays inside the loop so the model is only loaded when at
	    # least one file actually needs embedding.
	    # NOTE(review): assumes repeated warm_up() calls are cheap no-ops -
	    # confirm for the embedder in use.
	    document_embedder.warm_up()
	    # Input is a list[Document]; the Documents returned under 'documents'
	    # carry their embeddings.
	    docs = document_embedder.run(split_docs['documents'])
	    # Collect per-chunk metadata and embeddings in matching order. The
	    # comprehensions use their own loop variable so `doc` above is not rebound.
	    metadata = [
	        {"content": chunk.content, "meta": chunk.meta}
	        for chunk in docs['documents']
	    ]
	    embeddings = [chunk.embedding for chunk in docs['documents']]
	    # Save embeddings and metadata
	    np.savez_compressed(embedings_file, embeddings=embeddings)
	    with textnmeta_file.open("wb") as f:
	        compressed_metadata = zlib.compress(json.dumps(metadata).encode("utf-8"))
	        f.write(compressed_metadata)

	def load_documents_and_embeddings(verbose=False):
	    """
	    Rebuild an InMemoryDocumentStore from the files cached on disk.

	    Each ``*.meta`` file in OUTPUT_DOC_DIR is paired with the ``.npz`` file
	    of the same stem in OUTPUT_EMBED_DIR. The metadata file is a
	    zlib-compressed UTF-8 JSON list of ``{"content": ..., "meta": ...}``
	    entries; the ``.npz`` archive stores the matching vectors under the
	    ``"embeddings"`` key. A pair is skipped (with a printed message) when the
	    embeddings file is missing or the two files disagree on the entry count.

	    Args:
	        verbose: When True, print a start message and the final document count.

	    Returns:
	        The InMemoryDocumentStore holding every document that was loaded.
	    """
	    # Initialize an empty document store
	    document_store = InMemoryDocumentStore()
	    if verbose:
	        print("Reconstructing the InMemoryDocumentStore...")

	    for textnmeta_file in tqdm(list(OUTPUT_DOC_DIR.glob("*.meta"))):
	        embedings_file = OUTPUT_EMBED_DIR / f"{textnmeta_file.stem}.npz"
	        if not embedings_file.exists():
	            print(f"Warning: Embeddings file missing for {textnmeta_file}. Skipping...")
	            continue
	        # Load metadata
	        with textnmeta_file.open("rb") as f:
	            compressed_metadata = f.read()
	        metadata = json.loads(zlib.decompress(compressed_metadata).decode("utf-8"))
	        # Load embeddings. np.load returns an NpzFile that keeps the archive
	        # open until closed; the with block releases the handle per file
	        # instead of leaking one for every cached document.
	        with np.load(embedings_file) as embeddings_data:
	            embeddings = embeddings_data["embeddings"]
	        # Ensure the counts match
	        if len(metadata) != len(embeddings):
	            print(f"Error: Metadata and embeddings count mismatch in {textnmeta_file}. Skipping...")
	            continue
	        # Haystack declares Document.embedding as a list of floats, so hand it
	        # plain Python lists rather than NumPy rows.
	        documents = [
	            Document(content=entry["content"], meta=entry["meta"], embedding=vector)
	            for entry, vector in zip(metadata, embeddings.tolist())
	        ]
	        document_store.write_documents(documents)
	    if verbose:
	        print(f"Reconstruction complete. {len(document_store.storage)} documents loaded into the store.")
	    return document_store