Summarize all PDFs in a directory using pdfminer
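The script below extracts the text of every PDF matching pdf/*.pdf, builds normalized word frequencies over the non-stopword tokens, scores each sentence shorter than 30 words by summing the weights of its words, and writes the 10 highest-scoring sentences to a matching *-pdfminer.txt file.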
import glob
import heapq
import os
import re

import nltk
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import StringIO

# Glob pattern for the PDFs to summarize.
path = "pdf/*.pdf"
def convert_pdf_to_txt(pdf_path):
    """Extract the full text of a PDF file using pdfminer."""
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    fp = open(pdf_path, 'rb')
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    password = ""
    maxpages = 0      # 0 = no page limit
    caching = True
    pagenos = set()   # empty set = process all pages
    for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                  password=password, caching=caching,
                                  check_extractable=True):
        interpreter.process_page(page)
    text = retstr.getvalue()
    fp.close()
    device.close()
    retstr.close()
    return text
for fname in glob.glob(path):
    try:
        article_text = convert_pdf_to_txt(fname)

        # Strip digits and special characters for frequency counting;
        # keep the raw text for sentence tokenization.
        formatted_article_text = re.sub('[^a-zA-Z]', ' ', article_text)
        formatted_article_text = re.sub(r'\s+', ' ', formatted_article_text)

        sentence_list = nltk.sent_tokenize(article_text)
        stopwords = nltk.corpus.stopwords.words('english')

        # Count each non-stopword; lowercase so stopword filtering and the
        # later sentence matching are case-insensitive.
        word_frequencies = {}
        for word in nltk.word_tokenize(formatted_article_text.lower()):
            if word not in stopwords:
                if word not in word_frequencies:
                    word_frequencies[word] = 1
                else:
                    word_frequencies[word] += 1

        # Normalize counts against the most frequent word.
        maximum_frequency = max(word_frequencies.values())
        for word in word_frequencies:
            word_frequencies[word] = word_frequencies[word] / maximum_frequency

        # Score sentences shorter than 30 words by summing their word weights.
        sentence_scores = {}
        for sent in sentence_list:
            for word in nltk.word_tokenize(sent.lower()):
                if word in word_frequencies:
                    if len(sent.split(' ')) < 30:
                        if sent not in sentence_scores:
                            sentence_scores[sent] = word_frequencies[word]
                        else:
                            sentence_scores[sent] += word_frequencies[word]

        # The 10 highest-scoring sentences form the summary.
        summary_sentences = heapq.nlargest(10, sentence_scores,
                                           key=sentence_scores.get)
        summary = ' '.join(summary_sentences)

        outname = os.path.splitext(fname)[0] + '-pdfminer.txt'
        with open(outname, "w") as text_file:
            print(summary, file=text_file)
    except Exception:
        print(f'{fname} did not work.')
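A minimal setup sketch before the first run, assuming pdfminer.six is the package providing the pdfminer modules used above and that the nltk data has not been downloaded yet:

# One-time setup (assumption: pdfminer.six supplies pdfminer.pdfinterp etc.).
# pip install pdfminer.six nltk
import nltk
nltk.download('punkt')      # tokenizer models used by sent_tokenize/word_tokenize
nltk.download('stopwords')  # English stopword list used for frequency filtering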