Skip to content

Instantly share code, notes, and snippets.

@fsndzomga
Last active October 2, 2023 23:53
Show Gist options
  • Save fsndzomga/dcb38f791db1274c3e3291e4e0aaae8b to your computer and use it in GitHub Desktop.
Save fsndzomga/dcb38f791db1274c3e3291e4e0aaae8b to your computer and use it in GitHub Desktop.
def _extract_page_texts(pdf_path):
    """Return one text string per page of the PDF at *pdf_path*.

    The embedded text layer is read first with PyMuPDF. If no page yields
    any non-whitespace text (typical for scanned documents), every page is
    rendered to an image and run through Tesseract OCR instead.
    """
    # The context manager closes the document even if reading a page raises.
    with fitz.open(pdf_path) as pdf_document:
        texts = [page.get_text() for page in pdf_document]

    # Whitespace-only pages carry no usable text, so strip before testing.
    if any(text.strip() for text in texts):
        return texts

    # OCR fallback: the rendered page images are written to this folder,
    # so make sure it exists before handing it to pdf2image.
    os.makedirs('images', exist_ok=True)
    images = convert_from_path(pdf_path, output_folder='images')
    return [pytesseract.image_to_string(image) for image in images]


def Backend(pdf_path):
    """Index a PDF into a Chroma collection and return a Responder for it.

    The collection is named after the PDF file (base name without its
    extension). When no collection with that name exists yet, the PDF's
    per-page text is extracted (with an OCR fallback) and added to a newly
    created collection, one document per page. When the collection already
    exists, the PDF is not read again.

    Args:
        pdf_path: Path to the PDF file to index.

    Returns:
        A ``Responder`` constructed from the collection for this PDF.
    """
    pdf_name = os.path.basename(pdf_path)
    index_name = os.path.splitext(pdf_name)[0]  # remove extension

    # Depending on the chromadb version, list_collections() yields either
    # Collection objects or plain names; normalise to names so the
    # membership test below compares strings with strings.
    existing_names = {
        getattr(entry, "name", entry)
        for entry in chroma_client.list_collections()
    }

    if index_name not in existing_names:
        # Extract first: if extraction fails, no empty collection is left
        # behind that would make later calls skip indexing.
        texts = _extract_page_texts(pdf_path)

        collection = chroma_client.create_collection(name=index_name)
        # wait a moment for the collection to be fully initialized
        time.sleep(1)

        page_numbers = range(1, len(texts) + 1)
        collection.add(
            documents=texts,
            metadatas=[{"source": f"page_{n}"} for n in page_numbers],
            ids=[f"id{n}" for n in page_numbers],
        )

    collection = chroma_client.get_collection(index_name)
    return Responder(collection)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment