from sys import argv
from glob import glob
from itertools import imap, chain, ifilter, groupby
from collections import defaultdict

from PyPDF2 import PdfFileReader
from pattern.en import parsetree

def ilen(i):
    """
    len() iterables without having to convert to a list()
    """
    c = 0
    for e in i:
        c += 1
    return c
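
# Minimal sketch of how ilen() behaves on a plain generator (the values are
# illustrative only, not from the original gist):
#   >>> ilen(x for x in range(10) if x % 2 == 0)
#   5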

def extract_text(file_name):
    """
    read PDF file and extract plain text
    """
    with open(file_name, 'rb') as fp:
        pdf_doc = PdfFileReader(fp)
        text_pages = (page.extractText() for page in pdf_doc.pages)
        text = '\n\n'.join(text_pages)
    return text
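
# Hedged usage sketch -- 'report.pdf' is a hypothetical file name, and the
# extracted text depends entirely on the PDF's internal encoding:
#   >>> text = extract_text('report.pdf')
#   >>> isinstance(text, basestring)
#   True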

def extract_topics(text):
    """
    extract topic/count pair for each topic
    a topic is either a subject or object
    uses sentence structure to pick the chunks
    """
    tree = parsetree(text, relations=True, lemmata=True)
    subjects = chain(s.subjects for s in tree)  # lists of subject chunks, one list per sentence
    objects = chain(s.objects for s in tree)  # lists of object chunks, one list per sentence
    all_topics = chain(*chain(subjects, objects))  # flatten both collections into one stream of chunks
    # clean up by dropping chunks made up entirely of pronouns and symbols
    topics = ifilter(
        lambda topic: not all(word.type in ('PRP', 'PRP$', 'SYM') for word in topic.words),
        all_topics)
    # normalize and sort topics
    # normalization means extract the string from the chunk object,
    # lower case the text and force a unicode object
    topics = sorted(unicode(t.string.lower()) for t in topics)
    grouped_topics = groupby(topics)
    topic_counter = ((topic, ilen(topic_iterator)) for topic, topic_iterator in grouped_topics)
    return topic_counter
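
# Illustrative sketch of the (topic, count) pairs this yields. The exact
# chunks depend on pattern.en's parser, so the output below is an assumption,
# not a guaranteed result:
#   >>> for topic, count in extract_topics(u'The cat sat. The cat slept.'):
#   ...     print topic, count
#   the cat 2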

if __name__ == '__main__':
    files = (f for f in glob(argv[1]) if 'Arabic' not in f)
    # I'm filtering out files with the word 'Arabic' as I have copies
    # of the Arabic docs but am only interested in the English ones
    texts = imap(extract_text, files)
    per_text_topic_counters = imap(extract_topics, texts)
    topic_counter = defaultdict(int)
    for each_topic_counter in per_text_topic_counters:
        for topic, count in each_topic_counter:
            topic_counter[topic] += count
    # sort the topic counter before slicing for the top topics
    top_topics = sorted(topic_counter.items(), key=lambda t: t[1], reverse=True)
    for topic, count in top_topics[:int(argv[2])]:
        print topic, count
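
# Example invocation (the script name and paths are hypothetical; argv[1] is a
# glob pattern matching the PDFs to process, argv[2] the number of top topics
# to print):
#   python topics.py 'reports/*.pdf' 10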