Created
August 13, 2014 14:35
-
-
Save johndavidback/ea5ad1387f8c9320127f to your computer and use it in GitHub Desktop.
A quick text analysis of the most often repeating words in a block of text
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Usage:
# $ python analyze.py somefile.txt
# Easy breezy.
import collections
import operator
import string
import sys
# The 50 most common English words, taken from Wikipedia:
# http://en.wikipedia.org/wiki/Most_common_words_in_English
COMMON_WORDS = 'the be to of and a in that have I it for not on with he as you do at this but his by from they we say her she or an will my one all would there their what so up out if about who get which go me'.split()

# Characters stripped from the text before counting.  This is
# string.punctuation minus the apostrophe and the hyphen, so contractions
# ("don't") and hyphenated words ("dog-eared") survive as single words.
_PUNCTUATION = frozenset('!"#$%&()*+,./:;<=>?@[\\]^_`{|}~')


def analyze():
    """Print the uncommon words of a text file with their counts.

    The file name is taken from ``sys.argv[1]``.  The text is lower-cased,
    stripped of the characters in ``_PUNCTUATION`` and split on whitespace.
    Words listed in ``COMMON_WORDS`` are ignored (case-insensitively).  Each
    remaining word is written to standard output as ``"<word> <count>"``,
    one per line, most frequent first; words with equal counts are listed
    alphabetically.

    Raises:
        SystemExit: if no file name was given on the command line.
    """
    if len(sys.argv) < 2:
        # Exit with a readable message instead of a bare IndexError.
        sys.exit('Usage: python analyze.py somefile.txt')
    textfile = sys.argv[1]

    with open(textfile) as f:
        text = f.read()

    # Drop punctuation, normalise case, then split into words.
    words = ''.join(
        ch.lower() for ch in text if ch not in _PUNCTUATION
    ).split()

    # The text has been lower-cased, so the stop words must be too
    # (COMMON_WORDS contains "I").  A frozenset gives O(1) membership tests.
    stopwords = frozenset(word.lower() for word in COMMON_WORDS)
    counts = collections.Counter(
        word for word in words if word not in stopwords
    )

    # Sort alphabetically first, then by count descending.  The sort is
    # stable (also with reverse=True), so ties stay in alphabetical order
    # and the output is deterministic.
    ranked = sorted(counts.items())
    ranked.sort(key=operator.itemgetter(1), reverse=True)

    for word, count in ranked:
        # A single parenthesised argument prints the same on Python 2 and 3.
        print('%s %d' % (word, count))


if __name__ == '__main__':
    analyze()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment