Created
December 3, 2019 17:00
-
-
Save wbhinton/638526e3bc7ceb9710d306a7cc15cc73 to your computer and use it in GitHub Desktop.
Summarize all PDFs in a directory using Tika. Update the directory path as needed.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import glob
import heapq
import os
import re
import sys
from collections import Counter

import nltk
import PyPDF2
import textract
from tika import parser
# Glob pattern selecting the PDFs to summarize; update as needed.
PDF_GLOB = "pdf/*.pdf"
# Number of top-scoring sentences kept in each summary.
SUMMARY_SENTENCES = 10
# Sentences with this many (or more) space-separated pieces are never scored.
MAX_SENTENCE_WORDS = 30


def clean_text(text):
    """Return *text* with every non-letter replaced by a space.

    Runs of whitespace are collapsed to a single space afterwards, so the
    result contains only ASCII letters separated by single spaces.
    """
    letters_only = re.sub('[^a-zA-Z]', ' ', text)
    return re.sub(r'\s+', ' ', letters_only)


def word_frequencies(words, stopwords):
    """Return the relative frequency of each non-stopword in *words*.

    Each count is divided by the count of the most frequent word, so the
    values lie in (0, 1]. An empty dict is returned when no word survives
    the stopword filter (instead of failing on ``max()`` of nothing).
    """
    counts = Counter(word for word in words if word not in stopwords)
    if not counts:
        return {}
    highest = max(counts.values())
    return {word: count / highest for word, count in counts.items()}


def score_sentences(tokenized_sentences, frequencies,
                    max_words=MAX_SENTENCE_WORDS):
    """Score sentences by summing the frequencies of their tokens.

    *tokenized_sentences* is an iterable of ``(sentence, tokens)`` pairs,
    where *tokens* are the lower-cased words of *sentence*. Sentences with
    *max_words* or more space-separated pieces are skipped, and a sentence
    without any known token does not appear in the result. Repeated
    sentences accumulate into one entry.
    """
    scores = {}
    for sentence, tokens in tokenized_sentences:
        if len(sentence.split(' ')) >= max_words:
            continue
        for token in tokens:
            weight = frequencies.get(token)
            if weight is not None:
                scores[sentence] = scores.get(sentence, 0) + weight
    return scores


def summarize(article_text, stopwords, sentence_count=SUMMARY_SENTENCES):
    """Return the *sentence_count* best-scoring sentences joined by spaces.

    Words are lower-cased before counting so that they line up with the
    lower-cased tokens used for scoring and with the lower-case stopword
    list; otherwise capitalized words would never contribute to a score.
    """
    words = nltk.word_tokenize(clean_text(article_text).lower())
    frequencies = word_frequencies(words, stopwords)
    tokenized = (
        (sentence, nltk.word_tokenize(sentence.lower()))
        for sentence in nltk.sent_tokenize(article_text)
    )
    scores = score_sentences(tokenized, frequencies)
    best = heapq.nlargest(sentence_count, scores, key=scores.get)
    return ' '.join(best)


def output_path(pdf_path):
    """Return the summary file path for *pdf_path* (``<stem>-tika.txt``).

    Only the final extension is removed, whatever its case, so names that
    contain ``.pd`` elsewhere are not truncated.
    """
    stem, _extension = os.path.splitext(pdf_path)
    return stem + '-tika.txt'


def main():
    """Write a ``-tika.txt`` summary next to every PDF matching PDF_GLOB.

    A failure on one file is reported and does not stop the remaining
    files from being processed.
    """
    # Keep console output printable on Latin-1 terminals: characters that
    # cannot be encoded are emitted as XML character references.
    reconfigure = getattr(sys.stdout, 'reconfigure', None)
    if reconfigure is not None:
        reconfigure(encoding='iso-8859-1', errors='xmlcharrefreplace')

    # Load once; a set gives constant-time membership tests.
    stopwords = set(nltk.corpus.stopwords.words('english'))

    for fname in glob.glob(PDF_GLOB):
        # Computed before the try so the failure message can always name it.
        outname = output_path(fname)
        try:
            raw = parser.from_file(fname)
            article_text = raw.get('content')
            if not article_text:
                # Tika yields no content for PDFs without a text layer.
                raise ValueError('no text could be extracted')
            summary = summarize(article_text, stopwords)
            with open(outname, 'w', encoding='utf-8') as text_file:
                print(summary, file=text_file)
        except Exception as exc:  # best effort: report and continue
            print(f'{outname} did not work: {exc}')


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment