Last active
July 24, 2025 15:47
-
-
Save libcrack/946685662c2591003335cd87fffd5f53 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import fnmatch
import os
import sys
import textwrap

from dotenv import load_dotenv
from langchain.chains.summarize import load_summarize_chain
from langchain.document_loaders import PyPDFLoader
from langchain.llms import OpenAI
from langchain.text_splitter import CharacterTextSplitter
# Load variables from a local .env file (if one exists) into the process
# environment before looking the key up; load_dotenv was imported for this
# purpose but was never called.
load_dotenv()

# Index os.environ directly so a missing key raises KeyError.
# os.environ.get() returns None instead of raising, which left the error
# branch below unreachable and let the script continue without a key.
try:
    LLM_KEY = os.environ["OPENAI_API_KEY"]
except KeyError:
    print("ERROR: Cannot read OpenAI API key from the environment")
    raise
def file_search_match(pattern) -> list:
    """Return entries of the current working directory that match *pattern*.

    Args:
        pattern: Shell-style wildcard pattern (as understood by ``fnmatch``),
            e.g. ``"*.pdf"``.

    Returns:
        A sorted list of matching entry names (bare names as returned by
        ``os.listdir``, not full paths). The list is empty when nothing
        matches.
    """
    entries = os.listdir(os.getcwd())
    # os.listdir returns names in arbitrary order; sort so callers get a
    # deterministic result from run to run.
    return sorted(fnmatch.filter(entries, pattern))
if __name__ == "__main__":
    # Chunk size and overlap handed to CharacterTextSplitter below.
    size = 1000
    overlap = 100

    # PDFs to summarize: paths given on the command line, otherwise every
    # "*.pdf" entry in the current working directory.
    pdf_paths = sys.argv[1:] or file_search_match("*.pdf")
    if not pdf_paths:
        sys.exit(
            "ERROR: No PDF files given on the command line "
            "or found in the current directory"
        )

    # Collect the text of every page of every PDF, then join once instead of
    # growing a string with repeated "+=".
    page_texts = []
    for pdf_path in pdf_paths:
        # Progress goes to stderr so stdout carries only the summary.
        print(f"-> Reading {pdf_path}", file=sys.stderr)
        pages = PyPDFLoader(pdf_path).load()
        page_texts.extend(page.page_content for page in pages)
    text = "".join(page_texts).replace("\t", " ")

    # Split the combined text on newlines into overlapping chunks.
    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=size,
        chunk_overlap=overlap,
    )
    texts = text_splitter.create_documents([text])

    # Run the "map_reduce" summarize chain over the chunks and print the
    # resulting summary.
    llm = OpenAI(temperature=0)
    chain = load_summarize_chain(llm, chain_type="map_reduce")
    summarized_text = chain.run(texts)
    print(summarized_text)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment