Created
February 8, 2023 06:36
-
-
Save jryebread/3e9e66e0f131082f8a8b5cf4d8531573 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os
import pickle
import re
from pathlib import Path

import PyPDF2
import streamlit as st
from dotenv import load_dotenv
from langchain.chains import VectorDBQAWithSourcesChain
from langchain.chains.qa_with_sources import load_qa_with_sources_chain
from langchain.docstore.document import Document
from langchain.document_loaders import UnstructuredFileLoader
from langchain.embeddings.cohere import CohereEmbeddings
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.vectorstores.elastic_vector_search import ElasticVectorSearch
from langchain.vectorstores.faiss import FAISS

load_dotenv()
def remove_extra(line):
    """Return *line* with newlines and runs of whitespace collapsed to single spaces.

    Handles both real newlines and the literal two-character ``\\n`` sequence
    that PDF extraction sometimes leaves in the text.

    Bug fixed: the original applied ``replace('  ', ' ')`` exactly twice, which
    fails to fully collapse runs of four or more spaces (e.g. five spaces
    collapsed to two, not one). A single regex pass collapses any whitespace run.
    """
    # Literal backslash-n left behind by the PDF extractor (not a real newline).
    line = line.replace('\\n', ' ')
    # \s+ also covers real '\n', tabs, and arbitrarily long runs of spaces.
    return re.sub(r'\s+', ' ', line)
def pdfToTxt():
    """Extract text from ./file.pdf page by page and write it to ./file.txt.

    Reads at most 545 pages (index 0..544) — pages past 544 are noted as junk
    for the document this script was written for.

    Bug fixed: ``remove_extra`` returns the cleaned string, but the original
    discarded its return value and wrote the raw page text unchanged.
    """
    pages_text = []
    print("Starting pdf to text transcription")
    with open("file.pdf", 'rb') as pdfFileObject:
        pdfReader = PyPDF2.PdfReader(pdfFileObject)
        print(" No. Of Pages :", len(pdfReader.pages))
        for i, page in enumerate(pdfReader.pages):
            # Use the enumerated page directly (original redundantly
            # re-indexed pdfReader.pages[i]).
            pages_text.append(page.extract_text())
            if i == 544:  # pages past 544 trash
                break
    # Write the cleaned text to file.
    with open('file.txt', 'w', encoding="utf-8") as f:
        for line in pages_text:
            # Actually apply the cleaning (original dropped the return value).
            f.write(remove_extra(line))
    print("done with pdf to txt!")
# upload image to streamlit | |
# --- Streamlit app: upload a PDF, index its text, and answer questions about it ---
uploaded_file = st.file_uploader("Choose a pdf file")
if uploaded_file is not None:
    # Persist the upload to disk so pdfToTxt() can read it back as file.pdf.
    with open('file.pdf', 'wb') as handler:
        handler.write(uploaded_file.getvalue())
    with st.spinner(text='In progress'):
        pdfToTxt()
        st.success("pdf uploaded!")

    with open('file.txt', encoding="utf-8") as f:
        data = f.read()
    print("loaded data")

    # Split text into ~1KB chunks so each fits comfortably in the LLM context.
    text_splitter = CharacterTextSplitter(separator=" ", chunk_size=1024, chunk_overlap=0)
    source_chunks = [Document(page_content=chunk) for chunk in text_splitter.split_text(data)]

    # Bug fixed: OPENAI_KEY was referenced below but never defined (NameError).
    # load_dotenv() has already populated os.environ, so read the key from there.
    # NOTE(review): assumes the .env entry is named OPENAI_KEY — confirm.
    OPENAI_KEY = os.getenv("OPENAI_KEY")

    # Embed each chunk (feature vectors via the OpenAI API) and index with FAISS
    # for similarity search.
    # https://developers.google.com/machine-learning/crash-course/embeddings/video-lecture
    embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_KEY)
    vectorStore = FAISS.from_documents(source_chunks, embeddings)
    # NOTE(review): pickling the vector store is convenient but pickle files are
    # unsafe to load from untrusted sources — never unpickle one you didn't create.
    with open("vectorstore.pkl", "wb") as f:
        pickle.dump(vectorStore, f)

    chain = load_qa_with_sources_chain(OpenAI(temperature=0, openai_api_key=OPENAI_KEY),
                                       chain_type="stuff")

    userText = st.text_input('Ask Me Anything, Im an AI that just scanned that pdf :)')
    if userText:  # don't hit the LLM until the user has actually typed a question
        result = chain(
            {
                # Retrieve the 4 most similar chunks as grounding context.
                "input_documents": vectorStore.similarity_search(userText, k=4),
                "question": userText,
            },
            return_only_outputs=True,
        )["output_text"]
        st.balloons()
        st.subheader(result)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment