fsndzomga · September 29, 2023 23:11
diff --git a/backend.py b/backend.py
 def Backend(pdf_path):
    """Build a Responder backed by the Pinecone index for a PDF.

    The index is named after the PDF file (base name without extension).
    If an index with that name already exists it is reused as-is and the
    PDF is not processed again. Otherwise a new index is created and
    populated: the PDF is converted to page images, each image is OCR'd,
    the extracted text is chunked and embedded, and the embeddings are
    upserted into the new index.

    Args:
        pdf_path: Path to the PDF file to index.

    Returns:
        A Responder wrapping the Pinecone index for this PDF.
    """
    pdf_name = os.path.basename(pdf_path)
    index_name = os.path.splitext(pdf_name)[0]  # remove extension

    # Fast path: the PDF was indexed before, so skip OCR and embedding.
    if index_name in pinecone.list_indexes():
        return Responder(pinecone.Index(index_name))

    # NOTE(review): if any step below raises after create_index succeeds,
    # the index stays behind empty and later calls take the fast path above.
    pinecone.create_index(
        name=index_name,
        dimension=768,  # because I use bert-base-uncased
        metric='cosine',
    )
    # wait a moment for the index to be fully initialized
    # NOTE(review): a fixed pause may be too short; consider polling the
    # index status instead -- confirm against the Pinecone client docs.
    time.sleep(1)

    # Convert PDF to list of images. The output folder is created up front
    # so the conversion does not fail when it is missing.
    image_dir = 'images'
    os.makedirs(image_dir, exist_ok=True)
    images = convert_from_path(pdf_path, output_folder=image_dir)

    # Extract text from each image
    texts = [pytesseract.image_to_string(image) for image in images]

    chunks = create_chunks_with_metadata(texts)
    embeddings = create_embeddings(chunks)

    index = pinecone.Index(index_name)
    index.upsert(embeddings)

    return Responder(index)
	def Backend(pdf_path):
	    """Build a Responder backed by the Pinecone index for a PDF.

	    The index is named after the PDF file (base name without extension).
	    If an index with that name already exists it is reused as-is and the
	    PDF is not processed again. Otherwise a new index is created and
	    populated: the PDF is converted to page images, each image is OCR'd,
	    the extracted text is chunked and embedded, and the embeddings are
	    upserted into the new index.

	    Args:
	        pdf_path: Path to the PDF file to index.

	    Returns:
	        A Responder wrapping the Pinecone index for this PDF.
	    """
	    pdf_name = os.path.basename(pdf_path)
	    index_name = os.path.splitext(pdf_name)[0]  # remove extension

	    # Fast path: the PDF was indexed before, so skip OCR and embedding.
	    if index_name in pinecone.list_indexes():
	        return Responder(pinecone.Index(index_name))

	    # NOTE(review): if any step below raises after create_index succeeds,
	    # the index stays behind empty and later calls take the fast path above.
	    pinecone.create_index(
	        name=index_name,
	        dimension=768,  # because I use bert-base-uncased
	        metric='cosine',
	    )
	    # wait a moment for the index to be fully initialized
	    # NOTE(review): a fixed pause may be too short; consider polling the
	    # index status instead -- confirm against the Pinecone client docs.
	    time.sleep(1)

	    # Convert PDF to list of images. The output folder is created up front
	    # so the conversion does not fail when it is missing.
	    image_dir = 'images'
	    os.makedirs(image_dir, exist_ok=True)
	    images = convert_from_path(pdf_path, output_folder=image_dir)

	    # Extract text from each image
	    texts = [pytesseract.image_to_string(image) for image in images]

	    chunks = create_chunks_with_metadata(texts)
	    embeddings = create_embeddings(chunks)

	    index = pinecone.Index(index_name)
	    index.upsert(embeddings)

	    return Responder(index)