Skip to content

Instantly share code, notes, and snippets.

@srivathsanmurali
Created November 28, 2016 19:34
Show Gist options
  • Save srivathsanmurali/5fc56fd62ac496cbb599a11b8daf84b1 to your computer and use it in GitHub Desktop.
Save srivathsanmurali/5fc56fd62ac496cbb599a11b8daf84b1 to your computer and use it in GitHub Desktop.
Quick Python script to compute a histogram of the words used in a text file.
#!/usr/bin/python
import sys
import json
from collections import OrderedDict
from operator import itemgetter
def getWordMap(filename):
    """Count how often each word occurs in a text file.

    Every line is split on whitespace. Each token is lower-cased and
    stripped of all non-alphanumeric characters before being counted, so
    "Hello," and "hello" are the same word.

    Tokens that contain no alphanumeric characters at all (for example
    "--" or "...") are ignored: they are neither counted in the total nor
    stored in the map under an empty-string key.

    Args:
        filename: Path of the text file to read.

    Returns:
        A tuple ``(totalWords, wordMap)`` where ``totalWords`` is the number
        of counted words and ``wordMap`` is a dict mapping each normalized
        word to its number of occurrences.
    """
    wordMap = dict()
    totalWords = 0
    with open(filename, 'r') as f:
        for line in f:
            for w in line.split():
                word = ''.join(e for e in w.lower() if e.isalnum())
                if not word:
                    # Punctuation-only token: nothing left to count.
                    continue
                totalWords += 1
                wordMap[word] = wordMap.get(word, 0) + 1
    return (totalWords, wordMap)
# Script entry point: wordFun.py <input text file> <output file>
# Prints summary counts and writes the word histogram as JSON.
if __name__ == "__main__":
    if len(sys.argv) < 3:
        # Single-argument print(...) works on both Python 2 and Python 3.
        print("python wordFun.py textFile.txt out.yml")
        # Report the usage error to the calling shell.
        sys.exit(1)
    else:
        total, wordMap = getWordMap(sys.argv[1])
        # Order words by descending count; sorted() is stable, so words with
        # equal counts keep the order in which they were first seen.
        wm = OrderedDict(sorted(wordMap.items(), key=itemgetter(1), reverse=True))
        print('Total word count = {}'.format(total))
        print('Total word dictionary = {}'.format(len(wm)))
        # NOTE: the output is written as JSON regardless of the file
        # extension given on the command line.
        with open(sys.argv[2], 'w') as f:
            json.dump({'Total word count': total, 'Total words used': len(wm), 'Word and count': wm}, f, indent=4)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment