-
-
Save xim/1279283 to your computer and use it in GitHub Desktop.
import sys
import codecs

import numpy

import nltk.corpus
import nltk.stem
from nltk import decorators
from nltk.cluster import KMeansClusterer, GAAClusterer, euclidean_distance
# Stemmer + stopword setup used by the helpers below.
# NOTE: the English stemmer moved in NLTK; nltk.stem.EnglishStemmer no longer
# exists in nltk >= 2.0.4 — it must be taken from nltk.stem.snowball.
stemmer_func = nltk.stem.snowball.EnglishStemmer().stem
stopwords = set(nltk.corpus.stopwords.words('english'))
@decorators.memoize
def normalize_word(word):
    """Lower-case ``word`` and reduce it to its stem (result is memoized)."""
    lowered = word.lower()
    return stemmer_func(lowered)
def get_words(titles):
    """Return a list of the distinct normalized words found in ``titles``.

    Bug fix: the loop previously iterated over the global ``job_titles``
    instead of the ``titles`` parameter, so the function silently ignored
    its argument.
    """
    words = set()
    for title in titles:
        for word in title.split():
            words.add(normalize_word(word))
    return list(words)
@decorators.memoize
def vectorspaced(title):
    """Encode ``title`` as a binary vector over the global ``words`` vocabulary.

    Each position is 1 when the corresponding vocabulary word occurs in the
    (normalized) title and is not a stopword, else 0.  Memoized per title.
    """
    components = [normalize_word(token) for token in title.split()]
    flags = [
        (vocab_word in components and vocab_word not in stopwords)
        for vocab_word in words
    ]
    return numpy.array(flags, numpy.short)
if __name__ == '__main__': | |
filename = 'example.txt' | |
if len(sys.argv) == 2: | |
filename = sys.argv[1] | |
with open(filename) as title_file: | |
job_titles = [line.strip() for line in title_file.readlines()] | |
words = get_words(job_titles) | |
# cluster = KMeansClusterer(5, euclidean_distance) | |
cluster = GAAClusterer(5) | |
cluster.cluster([vectorspaced(title) for title in job_titles if title]) | |
# NOTE: This is inefficient, cluster.classify should really just be | |
# called when you are classifying previously unseen examples! | |
classified_examples = [ | |
cluster.classify(vectorspaced(title)) for title in job_titles | |
] | |
for cluster_id, title in sorted(zip(classified_examples, job_titles)): | |
print cluster_id, title |
Not so skilled worker | |
Skilled worker | |
Banana picker | |
Police officer | |
Office worker | |
Fireman | |
IT consultant | |
Rapist of old ladies | |
Engineer | |
Stupid bastard son | |
Genious computer analyst | |
Computer banana peeler | |
Potato peeler | |
CEO of a major business | |
Business economist | |
Data analyst | |
Economist analyst bastard | |
Psychologist data enumerator | |
Psychologist genious | |
Evil genious | |
Murderer and rapist of cats | |
Cat psychologist | |
Top Software Engineer in IT with NLTK experience | |
xim | |
fission6 |
Stemmers have moved, line #9 should be changed to: stemmer_func = nltk.stem.snowball.EnglishStemmer().stem
If my text file is encoded as UTF-8, I get the following error:
Traceback (most recent call last):
File "cluster_example.py", line 40, in
words = get_words(job_titles)
File "cluster_example.py", line 20, in get_words
words.add(normalize_word(word))
File "", line 1, in
File "/usr/local/lib/python2.7/dist-packages/nltk/decorators.py", line 183, in memoize
result = func(*args)
File "cluster_example.py", line 14, in normalize_word
return stemmer_func(word.lower())
File "/usr/local/lib/python2.7/dist-packages/nltk/stem/snowball.py", line 694, in stem
word = (word.replace(u"\u2019", u"\x27")
UnicodeDecodeError: 'ascii' codec can't decode byte 0xe2 in position 13: ordinal not in range(128)
Can you suggest what to do in this case? Thank you.
line 9 won't work with nltk==2.0.4
. it needs to be changed to:
stemmer_func = nltk.stem.snowball.EnglishStemmer().stem
Lines 16-18:
def get_words(titles):
words = set()
for title in job_titles:
Shouldn't `job_titles` be `titles`?
For those suffering with UTF-8 files, a simple solution is to use codecs package to open the file:
..
import codecs
..
and replace line 39 with this:
with codecs.open(filename, encoding='utf-8') as title_file:
this is a great example of clustering - thanks to xim for his excellent assistance and mentorship.