Text Analyzer
# Text Analyzer
import operator
import re
from collections import Counter
from nltk.stem.porter import PorterStemmer

# Import text & break it up into words
raw_text = open('test.txt').read().lower()

# Clean up special words: replace false sentence stops caused by exception words
def cleanup_special_words(text):
    # Since sentences usually end with one of [.?!], we need a dictionary of
    # exception words that contain these characters
    exceptions = {'u.s.': 'u*s*',
                  'gov.': 'gov*',
                  'mr.': 'mr*',
                  'mrs.': 'mrs*',
                  'r.': 'r*',
                  'd.': 'd*',
                  'dr.': 'dr*',
                  'jan.': 'jan*',
                  'feb.': 'feb*',
                  'mar.': 'mar*',
                  'apr.': 'apr*',
                  'jun.': 'jun*',
                  'jul.': 'jul*',
                  'aug.': 'aug*',
                  'sep.': 'sep*',
                  'oct.': 'oct*',
                  'nov.': 'nov*',
                  'dec.': 'dec*'
                  }  # add new exception words as needed
    words = [x.lower().strip() for x in text.split()]
    clean_words = []
    for word in words:
        if word in exceptions:
            clean_words.append(exceptions[word])
        else:
            clean_words.append(word)
    return " ".join(clean_words)
# Clean up quotes: replace false sentence stops inside quotes
def cleanup_quotes(text):
    characters = list(text)
    # Mark which characters sit inside a “...” quote
    is_quote = [False] * len(characters)
    for i in range(len(characters)):
        if characters[i] == '“':
            is_quote[i] = True
        elif characters[i] == '”':
            is_quote[i] = False
        elif i > 0:
            is_quote[i] = is_quote[i - 1]
    sentence_stops = {'.', '?', '!'}
    # If still inside a quote, replace false sentence stops with '*'
    clean_characters = []
    for i in range(len(characters)):
        if i + 1 < len(characters) and is_quote[i + 1] and characters[i] in sentence_stops:
            clean_characters.append('*')
        else:
            clean_characters.append(characters[i])
    return "".join(clean_characters)
# Find total words
words = cleanup_quotes(cleanup_special_words(raw_text)).split()
print("Total words: ", len(words))
# Find unique words and sort them in descending order of appearance
def cleanup_words(words):
    clean_words = []
    for word in words:
        # Keep only the alphabetic characters of each word
        clean_characters = [c for c in word if c.isalpha()]
        clean_word = "".join(clean_characters)
        if clean_word:
            clean_words.append(clean_word)
    return clean_words

# Find unique words and list them in order of descending frequency
def analyze_words(words):
    word_table = Counter(cleanup_words(words))
    return sorted(word_table.items(), key=operator.itemgetter(1))

sorted_words = analyze_words(words)
sorted_words.reverse()
print('Unique words: ', len(sorted_words))
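# Illustrative peek (not in the original gist): sorted_words is a list of
# (word, count) pairs, most frequent first, so the head of the list shows
# the ten most common words in test.txt.
print('Top 10 words: ', sorted_words[:10])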
# Process sentences
def process_sentences(text):
    processed_text = cleanup_quotes(cleanup_special_words(text))
    characters = list(processed_text)
    sentence_stops = {'.', '?', '!'}
    # A sentence ends one character after its stop (the space that follows it)
    sentence_end_idx = []
    for i, character in enumerate(characters):
        if character in sentence_stops:
            sentence_end_idx.append(i + 2)
    sentence_start_idx = [0] + sentence_end_idx[:-1]
    sentences = []
    for i in range(len(sentence_end_idx)):
        sentence = "".join(characters[sentence_start_idx[i]:sentence_end_idx[i]]).strip()
        sentences.append(sentence)
    return sentences
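# Illustrative sanity check (not in the original gist): text is split at
# '.', '?' and '!' into stripped sentences.
assert process_sentences('first one. second one? third!') == \
    ['first one.', 'second one?', 'third!']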
# Find sentences
sentences = process_sentences(raw_text)
print('Sentences: ', len(sentences))
# Calculate the average number of words per sentence
def avg_words(sentences):
    word_count = []
    for sentence in sentences:
        word_count.append(len(sentence.split()))
    return sum(word_count) / len(sentences)
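# Illustrative sanity check (not in the original gist): (2 + 4) / 2 = 3.0
assert avg_words(['hello there.', 'how are you today?']) == 3.0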
avg_sentence_length = avg_words(sentences)
print('Avg. sentence length is (words): ', round(avg_sentence_length, 1))
# The ability to find often-used phrases (a phrase of 3 or more words used over 3 times)
# Get stem words
def stem_(words):
    ps = PorterStemmer()
    stem_words = []
    for word in words:
        # Strip non-alphabetic characters before stemming so markers like '*'
        # don't end up inside the phrases
        stem_words.append(ps.stem(re.sub('[^a-zA-Z]', '', word)))
    return stem_words

stem_words = stem_(words)
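# Illustrative example (not in the original gist): the Porter stemmer reduces
# inflected forms to a common stem, e.g.
#   stem_(['running', 'stops.'])  ->  ['run', 'stop']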
# Find phrases
phrase_length = 5
phrase_bank = []
# Slide a phrase_length-word window over the stemmed words (+1 so the last
# window is included)
for i in range(len(stem_words) - phrase_length + 1):
    phrase_bank.append(" ".join(stem_words[i:i + phrase_length]))
# Reduce to unique phrases and their counts (Counter is used directly so the
# spaces inside each phrase are preserved)
phrases = sorted(Counter(phrase_bank).items(), key=operator.itemgetter(1))
phrases.reverse()
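# A minimal sketch (not in the original gist) of the stated goal above:
# report phrases that appear more than 3 times.
often_used = [(phrase, count) for phrase, count in phrases if count > 3]
print('Often used phrases: ', often_used)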