@medecau
Created September 5, 2015 22:48
from sys import argv
from glob import glob
from itertools import imap, chain, ifilter, groupby
from collections import defaultdict

from PyPDF2 import PdfFileReader
from pattern.en import parsetree

def ilen(i):
    """
    len() for iterables, without having to convert to a list() first
    """
    c = 0
    for e in i:
        c += 1
    return c
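
# e.g. ilen(c for c in 'abc') == 3 -- counts the items of a lazy
# iterable without materializing it as a list, which matters for the
# generator pipelines below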


def extract_text(file_name):
    """
    read a PDF file and extract its plain text
    """
    with open(file_name, 'rb') as fp:
        pdf_doc = PdfFileReader(fp)
        text_pages = (page.extractText() for page in pdf_doc.pages)
        # join while the file is still open -- text_pages is a generator
        # that reads from fp as it is consumed
        text = '\n\n'.join(text_pages)
    return text
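
# e.g. extract_text('report.pdf') returns one string with the text of
# every page, pages separated by blank lines ('report.pdf' is just a
# placeholder name)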


def extract_topics(text):
    """
    extract a (topic, count) pair for each topic;
    a topic is either a subject or an object,
    using sentence structure to pick the chunks
    """
    tree = parsetree(text, relations=True, lemmata=True)

    subjects = chain(s.subjects for s in tree)  # extract subjects
    objects = chain(s.objects for s in tree)    # extract objects
    all_topics = chain(*chain(subjects, objects))  # flatten both collections

    # clean up: drop topics made up entirely of pronouns and symbols
    topics = ifilter(
        lambda topic: not all(word.type in ('PRP', 'PRP$', 'SYM')
                              for word in topic.words),
        all_topics)

    # normalize and sort the topics:
    # normalization means extracting the string from the chunk object,
    # lower-casing it and forcing a unicode object;
    # sorting puts equal topics next to each other for groupby below
    topics = sorted(unicode(t.string.lower()) for t in topics)

    grouped_topics = groupby(topics)
    topic_counter = ((topic, ilen(topic_iterator))
                     for topic, topic_iterator in grouped_topics)

    return topic_counter
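
# yields pairs such as (u'the economy', 12) -- each topic string paired
# with how often it occurs in the text (example values are illustrative)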


if __name__ == '__main__':
    # I'm filtering out files with the word 'Arabic', as I have copies
    # of the Arabic docs but am only interested in the English ones
    files = (f for f in glob(argv[1]) if 'Arabic' not in f)

    texts = imap(extract_text, files)
    per_text_topic_counters = imap(extract_topics, texts)

    # merge the per-document counters into one global counter
    topic_counter = defaultdict(int)
    for each_topic_counter in per_text_topic_counters:
        for topic, count in each_topic_counter:
            topic_counter[topic] += count

    # sort the topic counter before slicing for the top topics
    top_topics = sorted(topic_counter.items(), key=lambda t: t[1], reverse=True)
    for topic, count in top_topics[:int(argv[2])]:
        print topic, count
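
For reference, the script takes a glob pattern and a top-N count from the command line; a typical invocation might look like this (topics.py is a placeholder name for the file above):

    python topics.py 'reports/*.pdf' 20

Note that the code targets Python 2 (imap, ifilter, unicode, and the print statement) and the PyPDF2 / pattern.en APIs as they were in 2015.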