Created
November 28, 2016 19:34
-
-
Save srivathsanmurali/5fc56fd62ac496cbb599a11b8daf84b1 to your computer and use it in GitHub Desktop.
Quick Python script to find the histogram of words used in a text file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python
import json
import sys
from collections import Counter, OrderedDict
from operator import itemgetter
def getWordMap(filename):
    """Count how often each word occurs in a text file.

    Every whitespace-separated token is lower-cased and stripped of all
    non-alphanumeric characters, so ``"Hello,"`` and ``"hello"`` are tallied
    as the same word. Tokens that have nothing left after stripping (for
    example ``"--"`` or ``"..."``) are not words and are skipped.

    Args:
        filename: Path of the text file to read.

    Returns:
        A ``(totalWords, wordMap)`` tuple: the number of words counted and a
        dict mapping each normalised word to its number of occurrences.

    Raises:
        IOError / OSError: If the file cannot be opened for reading.
    """
    counts = Counter()
    with open(filename, 'r') as f:
        for line in f:
            for token in line.split():
                word = ''.join(ch for ch in token.lower() if ch.isalnum())
                # Punctuation-only tokens normalise to '' and must not be
                # tallied as a word (nor stored under an empty key).
                if word:
                    counts[word] += 1
    # Return a plain dict so callers get the same type as before.
    return (sum(counts.values()), dict(counts))
if __name__ == "__main__":
    # Command line: <script> <input text file> <output file>.
    # Both arguments are required; otherwise only the usage line is printed.
    if len(sys.argv) < 3:
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("python wordFun.py textFile.txt out.yml")
    else:
        total, wordMap = getWordMap(sys.argv[1])
        # Most frequent words first; OrderedDict keeps that order in the dump.
        wm = OrderedDict(sorted(wordMap.items(), key=itemgetter(1), reverse=True))
        print('Total word count = {}'.format(total))
        print('Total word dictionary = {}'.format(len(wm)))
        # NOTE: the report is written as JSON whatever extension the output
        # path carries (the usage line suggests ".yml").
        with open(sys.argv[2], 'w') as f:
            json.dump({'Total word count': total, 'Total words used': len(wm), 'Word and count': wm}, f, indent=4)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment