Last active
November 27, 2019 19:52
-
-
Save wbhinton/7e883a9d5babeb776905c35e6d9c34e6 to your computer and use it in GitHub Desktop.
Loop through a folder and summarize all PDFs
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import PyPDF2 | |
import textract | |
import nltk | |
import re | |
from tika import parser | |
import heapq | |
import glob | |
# Glob pattern selecting the PDFs to summarise; one .txt file is written
# next to each matching PDF.
path = "pdf/*.pdf"


def compute_word_frequencies(words, stopwords):
    """Return the relative frequency of each non-stopword in *words*.

    Words are lowercased before counting so that they line up with the
    lowercased tokens used when scoring sentences (and with the lowercase
    stopword list). Each count is divided by the highest count, so the
    most frequent word maps to 1.0.

    Args:
        words: iterable of word tokens.
        stopwords: container of lowercase words to ignore.

    Returns:
        Dict mapping lowercase word -> frequency in (0, 1]; empty when no
        word survives stopword filtering.
    """
    counts = {}
    for word in words:
        word = word.lower()
        if word in stopwords:
            continue
        counts[word] = counts.get(word, 0) + 1
    if not counts:
        return {}
    peak = max(counts.values())
    return {word: count / peak for word, count in counts.items()}


def score_sentences(tokenized_sentences, frequencies, max_words=30):
    """Score sentences by summing the frequencies of the words they contain.

    Args:
        tokenized_sentences: iterable of ``(sentence, tokens)`` pairs, where
            ``tokens`` are the lowercased word tokens of ``sentence``.
        frequencies: mapping produced by ``compute_word_frequencies``.
        max_words: sentences with this many or more space-separated words
            are skipped, keeping the summary to short sentences.

    Returns:
        Dict mapping sentence -> score. Sentences containing no known word
        are omitted.
    """
    scores = {}
    for sentence, tokens in tokenized_sentences:
        if len(sentence.split(' ')) >= max_words:
            continue
        for token in tokens:
            weight = frequencies.get(token)
            if weight is not None:
                scores[sentence] = scores.get(sentence, 0) + weight
    return scores


def summarize_text(article_text, stopwords, top_n=10):
    """Build an extractive summary from the *top_n* best-scoring sentences.

    Args:
        article_text: raw text extracted from a document.
        stopwords: container of lowercase words to ignore.
        top_n: maximum number of sentences in the summary.

    Returns:
        The selected sentences joined by single spaces, best first.

    Raises:
        ValueError: if the text is empty/None or contains no scorable words.
    """
    if not article_text:
        raise ValueError('no extractable text')

    # Keep letters only for the frequency table; the untouched text is
    # still used for sentence splitting so punctuation is preserved.
    letters_only = re.sub('[^a-zA-Z]', ' ', article_text)
    letters_only = re.sub(r'\s+', ' ', letters_only)

    frequencies = compute_word_frequencies(
        nltk.word_tokenize(letters_only), stopwords
    )
    if not frequencies:
        raise ValueError('no scorable words')

    sentences = nltk.sent_tokenize(article_text)
    scores = score_sentences(
        ((sent, nltk.word_tokenize(sent.lower())) for sent in sentences),
        frequencies,
    )
    best = heapq.nlargest(top_n, scores, key=scores.get)
    return ' '.join(best)


def output_name(fname):
    """Return the summary path for *fname*: same path with a .txt suffix.

    Only a trailing ``.pdf`` (any case) is replaced, so names such as
    ``a.pdf.notes.pdf`` are not truncated at the first ``.pd``.
    """
    stem = fname[:-len('.pdf')] if fname.lower().endswith('.pdf') else fname
    return stem + '.txt'


def summarize_folder(pattern):
    """Summarise every file matching *pattern* into a sibling .txt file.

    A failure on one file is reported and does not stop the others.
    """
    files = glob.glob(pattern)
    if not files:
        return

    # Load the stopword list once, as a set for O(1) membership tests.
    # A missing NLTK corpus would break every file, so it is allowed to
    # fail here rather than once per file.
    stopwords = set(nltk.corpus.stopwords.words('english'))

    for fname in files:
        # Computed before the try so the error message can always name it.
        outname = output_name(fname)
        try:
            raw = parser.from_file(fname)
            summary = summarize_text(raw.get('content'), stopwords)
            with open(outname, "w", encoding="utf-8") as text_file:
                print(summary, file=text_file)
        except Exception as exc:
            # Exception (not a bare except) so Ctrl-C still stops the run.
            print(f'{outname} did not work. ({type(exc).__name__}: {exc})')


summarize_folder(path)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment