Skip to content

Instantly share code, notes, and snippets.

@korc
Created September 10, 2016 02:42
Show Gist options
  • Save korc/b4aa15e14bdfc2bb71a4a348936d7a2e to your computer and use it in GitHub Desktop.
Save korc/b4aa15e14bdfc2bb71a4a348936d7a2e to your computer and use it in GitHub Desktop.
#!/usr/bin/python
import sys
import re
# Very common English words. They are still counted (and can absorb the
# counts of their plurals) but are never printed in the final report.
generics = set([
    'the', 'and', 'to', 'of', 'a', 'in', 'i', 'we', 'this', 'that', 'for', 'it', 'on', 'will',
    'be', 'are', 'can', 'with', 'by', 'from', 'or', 'how', 'an', 's', 'these', 'have', 'not',
    'our', 'used', 'at', 'their', 'has', 'such', 'also', 'which', 'using', 'they', 'but',
    'all', 'what', 'about', 'based', 'you', 'more', 'been', 'some', 'use', 'other', 'one',
    'when', 'them', 'many', 'well', 'o', 'll', 'up',
])

# A "word" is any maximal run of word characters.
word_re = re.compile(r'\w+')

# (plural suffix, singular replacement) pairs, tried in this order.
_PLURAL_RULES = (('s', ''), ('ies', 'y'))


def count_words(lines):
    """Return a dict mapping each lower-cased word in *lines* to its count.

    *lines* is any iterable of strings (for example an open text file).
    """
    counts = {}
    for line in lines:
        for match in word_re.finditer(line):
            word = match.group(0).lower()
            counts[word] = counts.get(word, 0) + 1
    return counts


def merge_plurals(counts):
    """Fold the count of each plural into its singular form, in place.

    A word is treated as a plural only if stripping the suffix yields a
    word that is itself present in *counts*; the plural entry is then
    removed and its count added to the singular. Returns *counts*.
    """
    # Iterate over a snapshot: entries are deleted while looping.
    for word in list(counts):
        for plural, singular in _PLURAL_RULES:
            if not word.endswith(plural):
                continue
            base = word[:-len(plural)] + singular
            if base in counts:
                counts[base] += counts[word]
                del counts[word]
                # The word no longer exists; trying the remaining rules
                # would look it up again and raise KeyError.
                break
    return counts


def word_frequencies(lines):
    """Return ``[(word, count), ...]`` for *lines*, most frequent first.

    Plurals are merged into their singular forms and words listed in
    ``generics`` are omitted. Words with equal counts keep the iteration
    order of the underlying dict (the sort is stable).
    """
    counts = merge_plurals(count_words(lines))
    ranked = sorted(counts, key=counts.get, reverse=True)
    return [(word, counts[word]) for word in ranked if word not in generics]


def main():
    """Read text from standard input and print one "word count" line per word."""
    for word, count in word_frequencies(sys.stdin):
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("%s %d" % (word, count))


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment