Skip to content

Instantly share code, notes, and snippets.

@terrettaz
Created February 12, 2011 10:17
Show Gist options
  • Save terrettaz/823672 to your computer and use it in GitHub Desktop.
Save terrettaz/823672 to your computer and use it in GitHub Desktop.
It counts the number of occurrences of each word in a file and sorts the words by that count. The script accepts a file as its first parameter, or it uses stdin if no arguments are given.
#!/usr/bin/python
# -*- coding: iso-8859-1 -*-
__version__ = "$Revision: 0.5 $"
__author__ = "Pierrick Terrettaz"
__date__ = "2006-12-09"
import sys
import time
import re
def count_words(lines):
    """Count case-insensitive word occurrences in an iterable of lines.

    Words are the non-empty tokens obtained by splitting each line on
    whitespace; each token is lower-cased before being counted.

    Returns a tuple ``(counts, line_count)`` where ``counts`` maps every
    lower-cased word to its number of occurrences and ``line_count`` is the
    number of lines consumed from ``lines``.
    """
    counts = {}
    line_count = 0
    for line in lines:
        line_count += 1
        # split() with no argument splits on runs of whitespace and never
        # yields empty strings, so the trailing newline needs no special
        # handling and no empty-token filter is required.
        for word in line.split():
            word = word.lower()
            counts[word] = counts.get(word, 0) + 1
    return counts, line_count


def main():
    """Print word frequencies for the file in argv[1], or for stdin.

    Each word is printed with its count and its percentage of all words,
    in ascending order of count (ties ordered by word), followed by the
    line, word and vocabulary totals and the elapsed time.
    """
    start = time.time()
    # Fall back to stdin only when no argument is given; a bad path now
    # raises instead of silently waiting for input on stdin.
    if len(sys.argv) > 1:
        f = open(sys.argv[1])
    else:
        f = sys.stdin
    try:
        d, lsum = count_words(f)
    finally:
        f.close()

    wsum = sum(d.values())
    rev_items = sorted([(v, k) for k, v in d.items()])
    # Single-argument print(...) behaves the same under Python 2 and 3.
    for v, k in rev_items:
        ratio = float(v) / wsum * 100
        print("%s (%.2f%%): %s" % (v, ratio, k))
    print("lines: %s" % lsum)
    print("words: %s" % wsum)
    print("vocabulary: %s" % len(d))
    print("script ended in : %d secondes" % (time.time() - start))


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment