from sys import argv
from glob import glob
from itertools import imap, chain, ifilter, groupby
from collections import defaultdict

from PyPDF2 import PdfFileReader
from pattern.en import parsetree

def ilen(i):
    """
    len() iterables without having to convert to a list()
    """
    c = 0
    for e in i:
        c += 1
    return c
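
# Minimal sketch of how ilen() behaves on a plain generator (the values are
# illustrative only, not from the original gist):
#   >>> ilen(x for x in range(10) if x % 2 == 0)
#   5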

def extract_text(file_name):
    """
    read PDF file and extract plain text
    """
    with open(file_name, 'rb') as fp:
        pdf_doc = PdfFileReader(fp)
        text_pages = (page.extractText() for page in pdf_doc.pages)
        text = '\n\n'.join(text_pages)
    return text
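
# Hedged usage sketch -- 'report.pdf' is a hypothetical file name, and the
# extracted text depends entirely on the PDF's internal encoding:
#   >>> text = extract_text('report.pdf')
#   >>> isinstance(text, basestring)
#   True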

def extract_topics(text):
    """
    extract topic/count pair for each topic
    a topic is either a subject or object
    uses sentence structure to pick the chunks
    """
    tree = parsetree(text, relations=True, lemmata=True)
    subjects = chain(s.subjects for s in tree)  # lists of subject chunks, one list per sentence
    objects = chain(s.objects for s in tree)  # lists of object chunks, one list per sentence
    all_topics = chain(*chain(subjects, objects))  # flatten both collections into one stream of chunks
    # clean up by dropping chunks made up entirely of pronouns and symbols
    topics = ifilter(
        lambda topic: not all(word.type in ('PRP', 'PRP$', 'SYM') for word in topic.words),
        all_topics)
    # normalize and sort topics
    # normalization means extract the string from the chunk object,
    # lower case the text and force a unicode object
    topics = sorted(unicode(t.string.lower()) for t in topics)
    grouped_topics = groupby(topics)
    topic_counter = ((topic, ilen(topic_iterator)) for topic, topic_iterator in grouped_topics)
    return topic_counter
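
# Illustrative sketch of the (topic, count) pairs this yields. The exact
# chunks depend on pattern.en's parser, so the output below is an assumption,
# not a guaranteed result:
#   >>> for topic, count in extract_topics(u'The cat sat. The cat slept.'):
#   ...     print topic, count
#   the cat 2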

if __name__ == '__main__':
    files = (f for f in glob(argv[1]) if 'Arabic' not in f)
    # I'm filtering out files with the word 'Arabic' as I have copies
    # of the Arabic docs but am only interested in the English ones
    texts = imap(extract_text, files)
    per_text_topic_counters = imap(extract_topics, texts)
    topic_counter = defaultdict(int)
    for each_topic_counter in per_text_topic_counters:
        for topic, count in each_topic_counter:
            topic_counter[topic] += count
    # sort the topic counter before slicing for the top topics
    top_topics = sorted(topic_counter.items(), key=lambda t: t[1], reverse=True)
    for topic, count in top_topics[:int(argv[2])]:
        print topic, count
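
# Example invocation (the script name and paths are hypothetical; argv[1] is a
# glob pattern matching the PDFs to process, argv[2] the number of top topics
# to print):
#   python topics.py 'reports/*.pdf' 10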