Skip to content

Instantly share code, notes, and snippets.

@fsndzomga
Created September 29, 2023 23:11
Show Gist options
  • Save fsndzomga/c3d874239e56b0b8de0fafc65ad7d39b to your computer and use it in GitHub Desktop.
Save fsndzomga/c3d874239e56b0b8de0fafc65ad7d39b to your computer and use it in GitHub Desktop.
def Backend(pdf_path):
    """Return a ``Responder`` backed by a Pinecone index for one PDF.

    The index is named after the PDF's file name with the extension
    removed. If no index with that name exists yet, it is created and
    populated: the PDF is rendered to page images, text is extracted
    from each image with ``pytesseract.image_to_string``, the page texts
    are passed through ``create_chunks_with_metadata`` and
    ``create_embeddings``, and the result is upserted into the index.
    If the index already exists, the PDF is not processed again.

    Args:
        pdf_path: Path to the PDF file to index.

    Returns:
        A ``Responder`` constructed from the Pinecone index for this PDF.
    """
    pdf_name = os.path.basename(pdf_path)
    index_name = os.path.splitext(pdf_name)[0]  # remove extension

    if index_name not in pinecone.list_indexes():
        pinecone.create_index(
            name=index_name,
            dimension=768,  # because I use bert-base-uncased
            metric='cosine',
        )
        # Wait a moment for the index to be fully initialized.
        # NOTE(review): a fixed one-second pause may not be long enough for
        # every index to become ready -- confirm, or poll the index status.
        time.sleep(1)

        # Page images are written into this folder, so it has to exist
        # before the conversion runs.
        os.makedirs('images', exist_ok=True)

        # Convert PDF to list of images
        images = convert_from_path(pdf_path, output_folder='images')

        # Extract text from each image
        texts = [pytesseract.image_to_string(image) for image in images]

        chunks = create_chunks_with_metadata(texts)
        embeddings = create_embeddings(chunks)
        pinecone.Index(index_name).upsert(embeddings)

    index = pinecone.Index(index_name)
    return Responder(index)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment