Skip to content

Instantly share code, notes, and snippets.

@dast1
Created February 3, 2018 22:10
Show Gist options
  • Save dast1/d3c69d575151436be9059756ba9a38e1 to your computer and use it in GitHub Desktop.
Text Analyzer
# Text Analyzer
# Import text & break up into words.
# Use a context manager so the file handle is closed promptly instead of
# leaking until garbage collection (the original open() was never closed).
with open('test.txt') as _fh:
    raw_text = _fh.read().lower()
# Clean up special words: replacing false sentence stop from exception words
def cleanup_special_words(text):
    """Mask the periods inside known abbreviations so they are not
    mistaken for sentence stops.

    Each whitespace-separated token is lowercased and stripped; tokens
    found in the exception table are replaced by their '*'-masked form.
    Returns the tokens re-joined with single spaces (original spacing is
    not preserved).
    """
    # Since sentences usually end with a [.?!], we keep a dictionary of
    # exception words that contain these characters.
    exceptions = {'u.s.': 'u*s*',
                  'gov.': 'gov*',
                  'mr.': 'mr*',
                  'mrs.': 'mrs*',
                  'r.': 'r*',
                  'd.': 'd*',
                  'dr.': 'dr*',
                  'jan.': 'jan*',
                  'feb.': 'feb*',
                  'mar.': 'mar*',
                  'apr.': 'apr*',
                  'jun.': 'jun*',
                  'jul.': 'jul*',
                  'aug.': 'aug*',
                  'sep.': 'sep*',
                  'oct.': 'oct*',
                  # was 'nov.':'nov' — missing '*', inconsistent with every
                  # other entry (the period was silently dropped instead of
                  # masked)
                  'nov.': 'nov*',
                  'dec.': 'dec*'
                  }  # add new word exceptions as needed
    words = [w.lower().strip() for w in text.split()]
    # dict.get with the word itself as default replaces the manual
    # if-in-keys/else-append loop.
    return " ".join(exceptions.get(word, word) for word in words)
# Clean up quotes: replacing false sentence stops inside quotes
def cleanup_quotes(text):
    """Replace [.?!] that occur inside curly quotes with '*' so quoted
    punctuation is not counted as a sentence boundary.

    Fixes three defects in the original:
    - the output loop ran to len-1 and silently dropped the last character;
    - masking tested ``is_quote[i+1]``, so a stop directly before the
      closing quote (e.g. the '.' in "“stop.”") was NOT masked while a stop
      directly before an OPENING quote was wrongly masked;
    - indexing ``is_quote[0]`` crashed on empty input.
    """
    sentence_stops = {'.', '?', '!'}
    in_quote = False
    clean_characters = []
    for ch in text:
        if ch == '“':
            in_quote = True
        elif ch == '”':
            in_quote = False
        if in_quote and ch in sentence_stops:
            clean_characters.append('*')
        else:
            clean_characters.append(ch)
    return "".join(clean_characters)
# Find total words: tokenize the cleaned-up text on whitespace.
normalized_text = cleanup_quotes(cleanup_special_words(raw_text))
words = normalized_text.split()
print("Total words: ", len(words))
# Strip non-letter characters from every word, dropping words that end up empty.
def cleanup_words(words):
    """Keep only the alphabetic characters of each word; words reduced to
    the empty string are discarded entirely."""
    stripped = ("".join(ch for ch in word if ch.isalpha()) for word in words)
    return [word for word in stripped if word]
# Find unique words and list them in the order of descending frequency
def analyze_words(words):
    """Count the cleaned words and return (word, count) pairs sorted by
    ASCENDING count — callers reverse the list to get descending order."""
    from collections import Counter
    frequency = Counter(cleanup_words(words))
    return sorted(frequency.items(), key=lambda pair: pair[1])
# Most frequent words first.
sorted_words = analyze_words(words)[::-1]
print('Unique words: ', len(sorted_words))
# Process sentences: split the cleaned text on sentence-ending punctuation.
def process_sentences(text):
    # Normalize abbreviations and quoted punctuation first so only genuine
    # sentence boundaries remain as [.?!].
    processed_text = cleanup_quotes(cleanup_special_words(text))
    characters = list(processed_text)
    # Only the keys are used (membership test below); the '*' values are
    # never read.
    sentence_stops = {'.':'*',
                      '?':'*',
                      '!':'*'
                      }
    sentence_end_idx = []
    for i, character in enumerate(characters):
        if character in sentence_stops:
            # i+2 makes the slice cover the stop character plus the one
            # after it (presumably a following space, eaten by strip()).
            # NOTE(review): with a stop as the very last character this
            # index exceeds len(characters) — slicing clamps it, so no
            # crash, but confirm two adjacent stops behave as intended.
            sentence_end_idx.append(i+2)
    # Each sentence starts where the previous one ended; the first at 0.
    sentence_start_idx = [0] + sentence_end_idx[:-1]
    sentences = []
    for i in range(0, len(sentence_end_idx)):
        sentence = "".join(characters[sentence_start_idx[i]:sentence_end_idx[i]]).strip()
        sentences.append(sentence)
    # NOTE(review): any trailing text after the last stop is dropped.
    return sentences
# Find sentences in the raw text and report how many there are.
sentences = process_sentences(raw_text)
print('Sentences', len(sentences))
# Calculate the average # of words per sentence
def avg_words(sentences):
    """Return the mean number of whitespace-separated words per sentence.

    Returns 0.0 for an empty sentence list instead of raising
    ZeroDivisionError. The unused ``import numpy`` of the original has
    been removed — the computation is pure stdlib.
    """
    if not sentences:
        return 0.0
    return sum(len(sentence.split()) for sentence in sentences) / len(sentences)
# Use a distinct name for the result so the avg_words() function is not
# shadowed by its own return value (the original rebound the name).
avg_sentence_len = avg_words(sentences)
print('Avg. sentence length is (words): ', round(avg_sentence_len, 1))
# The ability to find often used phrases (a phrase of 3 or more words used over 3 times)
# Get stem words
def stem_(words):
    """Porter-stem each word after replacing every non-ASCII-letter
    character with a space.

    Changes from the original: the unused ``import nltk`` is removed
    (PorterStemmer is imported directly from nltk.stem.porter), the regex
    is compiled once outside the loop instead of re-matched via re.sub per
    word, and the pattern is a raw string.
    """
    import re
    from nltk.stem.porter import PorterStemmer
    ps = PorterStemmer()
    non_letter = re.compile(r'[^a-zA-Z]')
    return [ps.stem(non_letter.sub(' ', word)) for word in words]
stem_words = stem_(words)
# Find phrases: every run of `phrase_length` consecutive stem words.
phrase_length = 5
# The original range stopped at len - phrase_length, an off-by-one that
# silently dropped the final window; +1 includes the phrase ending at the
# last word.
phrase_bank = [" ".join(stem_words[i:i + phrase_length])
               for i in range(len(stem_words) - phrase_length + 1)]
# Reduce to unique phrases sorted by descending frequency.
# NOTE(review): analyze_words() runs cleanup_words(), which strips the
# spaces out of each phrase ("the quick brown" -> "thequickbrown") — the
# counts are still per-phrase, but confirm that is the intended output form.
phrases = analyze_words(phrase_bank)
phrases.reverse()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment