Text Analyzer
# Text Analyzer
import operator
import re
from collections import Counter
from nltk.stem.porter import PorterStemmer

# Import text & break it up into words
raw_text = open('test.txt').read().lower()

# Clean up special words: replace false sentence stops caused by exception words
def cleanup_special_words(text):
    # Since sentences usually end with one of [.?!], we need a dictionary of
    # exception words that contain these characters
    exceptions = {'u.s.': 'u*s*',
                  'gov.': 'gov*',
                  'mr.': 'mr*',
                  'mrs.': 'mrs*',
                  'r.': 'r*',
                  'd.': 'd*',
                  'dr.': 'dr*',
                  'jan.': 'jan*',
                  'feb.': 'feb*',
                  'mar.': 'mar*',
                  'apr.': 'apr*',
                  'jun.': 'jun*',
                  'jul.': 'jul*',
                  'aug.': 'aug*',
                  'sep.': 'sep*',
                  'oct.': 'oct*',
                  'nov.': 'nov*',
                  'dec.': 'dec*'
                  }  # add new exception words as needed
    words = [x.lower().strip() for x in text.split()]
    clean_words = []
    for word in words:
        if word in exceptions:
            clean_words.append(exceptions[word])
        else:
            clean_words.append(word)
    return " ".join(clean_words)
# Clean up quotes: replace false sentence stops inside quotes
def cleanup_quotes(text):
    characters = list(text)
    # Mark which characters sit inside a “...” quote
    is_quote = [False] * len(characters)
    for i in range(len(characters)):
        if characters[i] == '“':
            is_quote[i] = True
        elif characters[i] == '”':
            is_quote[i] = False
        elif i > 0:
            is_quote[i] = is_quote[i - 1]
    sentence_stops = {'.', '?', '!'}
    # If still inside a quote, replace false sentence stops with '*'
    clean_characters = []
    for i in range(len(characters)):
        if i + 1 < len(characters) and is_quote[i + 1] and characters[i] in sentence_stops:
            clean_characters.append('*')
        else:
            clean_characters.append(characters[i])
    return "".join(clean_characters)
# Find total words
words = cleanup_quotes(cleanup_special_words(raw_text)).split()
print("Total words: ", len(words))
# Find unique words and sort them in descending order of appearance
def cleanup_words(words):
    clean_words = []
    for word in words:
        # Keep only the alphabetic characters of each word
        clean_characters = [c for c in word if c.isalpha()]
        clean_word = "".join(clean_characters)
        if clean_word:
            clean_words.append(clean_word)
    return clean_words

# Find unique words and list them in order of descending frequency
def analyze_words(words):
    word_table = Counter(cleanup_words(words))
    return sorted(word_table.items(), key=operator.itemgetter(1))

sorted_words = analyze_words(words)
sorted_words.reverse()
print('Unique words: ', len(sorted_words))
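# Illustrative peek (not in the original gist): sorted_words is a list of
# (word, count) pairs, most frequent first, so the head of the list shows
# the ten most common words in test.txt.
print('Top 10 words: ', sorted_words[:10])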
# Process sentences
def process_sentences(text):
    processed_text = cleanup_quotes(cleanup_special_words(text))
    characters = list(processed_text)
    sentence_stops = {'.', '?', '!'}
    # A sentence ends one character after its stop (the space that follows it)
    sentence_end_idx = []
    for i, character in enumerate(characters):
        if character in sentence_stops:
            sentence_end_idx.append(i + 2)
    sentence_start_idx = [0] + sentence_end_idx[:-1]
    sentences = []
    for i in range(len(sentence_end_idx)):
        sentence = "".join(characters[sentence_start_idx[i]:sentence_end_idx[i]]).strip()
        sentences.append(sentence)
    return sentences
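# Illustrative sanity check (not in the original gist): text is split at
# '.', '?' and '!' into stripped sentences.
assert process_sentences('first one. second one? third!') == \
    ['first one.', 'second one?', 'third!']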
# Find sentences
sentences = process_sentences(raw_text)
print('Sentences: ', len(sentences))
# Calculate the average number of words per sentence
def avg_words(sentences):
    word_count = []
    for sentence in sentences:
        word_count.append(len(sentence.split()))
    return sum(word_count) / len(sentences)
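# Illustrative sanity check (not in the original gist): (2 + 4) / 2 = 3.0
assert avg_words(['hello there.', 'how are you today?']) == 3.0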
avg_sentence_length = avg_words(sentences)
print('Avg. sentence length is (words): ', round(avg_sentence_length, 1))
# The ability to find often-used phrases (a phrase of 3 or more words used over 3 times)
# Get stem words
def stem_(words):
    ps = PorterStemmer()
    stem_words = []
    for word in words:
        # Strip non-alphabetic characters before stemming so markers like '*'
        # don't end up inside the phrases
        stem_words.append(ps.stem(re.sub('[^a-zA-Z]', '', word)))
    return stem_words

stem_words = stem_(words)
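# Illustrative example (not in the original gist): the Porter stemmer reduces
# inflected forms to a common stem, e.g.
#   stem_(['running', 'stops.'])  ->  ['run', 'stop']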
# Find phrases
phrase_length = 5
phrase_bank = []
# Slide a phrase_length-word window over the stemmed words (+1 so the last
# window is included)
for i in range(len(stem_words) - phrase_length + 1):
    phrase_bank.append(" ".join(stem_words[i:i + phrase_length]))
# Reduce to unique phrases and their counts (Counter is used directly so the
# spaces inside each phrase are preserved)
phrases = sorted(Counter(phrase_bank).items(), key=operator.itemgetter(1))
phrases.reverse()
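# A minimal sketch (not in the original gist) of the stated goal above:
# report phrases that appear more than 3 times.
often_used = [(phrase, count) for phrase, count in phrases if count > 3]
print('Often used phrases: ', often_used)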