Created
April 24, 2009 21:55
-
-
Save gcr/101359 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Count how often each word occurs in a text file.

When run as a script, reads ``wordlist.txt`` from the current directory,
removes a small fixed set of punctuation characters, lowercases every
word, and prints the number of unique words, the total number of words,
and an alphabetically sorted table of words with their frequencies.
"""

# Characters removed from every line before it is split into words.
PUNCTUATION = '".!,;?'


def count_words(lines):
    """Return a dict mapping each lowercased word in *lines* to its count.

    *lines* is any iterable of strings (for example an open text file).
    The characters in ``PUNCTUATION`` are deleted, not replaced by a
    space, so ``"a,b"`` is counted as the single word ``"ab"``.
    """
    counts = {}
    for line in lines:
        for char in PUNCTUATION:
            line = line.replace(char, '')
        # split() with no argument splits on any run of whitespace, so
        # tab-separated words are counted separately and no empty tokens
        # are produced for repeated spaces or the trailing newline.
        for word in line.split():
            word = word.lower()
            counts[word] = counts.get(word, 0) + 1
    return counts


def format_report(counts):
    """Return the report for *counts* as a list of output lines.

    The first two lines give the number of unique words and the total
    number of words; the remaining lines list each word with its
    frequency, sorted alphabetically by word.
    """
    sorted_words = sorted(counts.items())
    report = [
        "Unique words found: %d" % len(sorted_words),
        "Total words: %d" % sum(counts.values()),
    ]
    report.extend("%15s %d" % pair for pair in sorted_words)
    return report


def main(path='wordlist.txt'):
    """Count the words in the file at *path* and print the report."""
    # The with-block guarantees the file is closed even if counting fails.
    with open(path) as handle:
        counts = count_words(handle)
    for line in format_report(counts):
        # A single parenthesized argument prints identically on
        # Python 2 and Python 3.
        print(line)


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment