Created
October 11, 2011 20:19
-
-
Save xim/1279283 to your computer and use it in GitHub Desktop.
Clustering K-Means by Euclidean distance, yay!
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys | |
import numpy | |
from nltk.cluster import KMeansClusterer, GAAClusterer, euclidean_distance | |
import nltk.corpus | |
from nltk import decorators | |
import nltk.stem | |
# English Snowball stemmer, exposed as a plain callable (word -> stem).
# It is referenced through the `snowball` submodule because `EnglishStemmer`
# is not reachable as `nltk.stem.EnglishStemmer` in every NLTK release
# (reported to fail on nltk==2.0.4).
stemmer_func = nltk.stem.snowball.EnglishStemmer().stem

# English stopword list from the NLTK corpus, stored as a set for membership
# tests.
stopwords = set(nltk.corpus.stopwords.words('english'))
@decorators.memoize
def normalize_word(word):
    """Return the canonical form of ``word``: lower-cased, then stemmed.

    The function is wrapped in ``decorators.memoize`` and delegates the
    stemming to the module-level ``stemmer_func``.
    """
    lowered = word.lower()
    return stemmer_func(lowered)
def get_words(titles):
    """Build the vocabulary of normalized words found in ``titles``.

    Each title is split on whitespace and every token is passed through
    ``normalize_word``; duplicates are collapsed.

    Args:
        titles: iterable of title strings.

    Returns:
        A list of the distinct normalized words, in no particular order.
    """
    vocabulary = set()
    # Iterate the argument itself; do not reach for a module-level
    # ``job_titles``, which only exists when the file is run as a script.
    for title in titles:
        for word in title.split():
            vocabulary.add(normalize_word(word))
    return list(vocabulary)
@decorators.memoize
def vectorspaced(title):
    """Encode ``title`` as a 0/1 vector over the module-level ``words`` list.

    Position ``i`` is 1 when ``words[i]`` is one of the title's normalized
    tokens and is not in ``stopwords``; otherwise it is 0. The result is a
    numpy array of dtype ``numpy.short`` with one entry per vocabulary word.
    """
    stems = [normalize_word(token) for token in title.split()]
    flags = []
    for candidate in words:
        present = candidate in stems and candidate not in stopwords
        flags.append(present)
    return numpy.array(flags, numpy.short)
if __name__ == '__main__':
    # Input file: the single command-line argument if exactly one is given,
    # otherwise the default example file.
    filename = 'example.txt'
    if len(sys.argv) == 2:
        filename = sys.argv[1]

    with open(filename) as title_file:
        stripped_lines = [line.strip() for line in title_file]
    # Drop blank lines once, up front, so that training and classification
    # operate on exactly the same titles (blank lines used to be excluded
    # from training but were still classified and printed).
    job_titles = [title for title in stripped_lines if title]

    # Module-level vocabulary read by vectorspaced(); it has to be assigned
    # before the first vectorspaced() call.
    words = get_words(job_titles)

    # Alternative clusterer: KMeansClusterer(5, euclidean_distance)
    cluster = GAAClusterer(5)
    cluster.cluster([vectorspaced(title) for title in job_titles])

    # NOTE: This is inefficient, cluster.classify should really just be
    # called when you are classifying previously unseen examples!
    classified_examples = [
        cluster.classify(vectorspaced(title)) for title in job_titles
    ]

    # Print "<cluster id> <title>", ordered by cluster id and then title.
    # The %-formatting gives the same output on Python 2 and Python 3.
    for cluster_id, title in sorted(zip(classified_examples, job_titles)):
        print('%s %s' % (cluster_id, title))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Not so skilled worker | |
Skilled worker | |
Banana picker | |
Police officer | |
Office worker | |
Fireman | |
IT consultant | |
Rapist of old ladies | |
Engineer | |
Stupid bastard son | |
Genious computer analyst | |
Computer banana peeler | |
Potato peeler | |
CEO of a major business | |
Business economist | |
Data analyst | |
Economist analyst bastard | |
Psychologist data enumerator | |
Psychologist genious | |
Evil genious | |
Murderer and rapist of cats | |
Cat psychologist | |
Top Software Engineer in IT with NLTK experience | |
xim | |
fission6 |
Line 9 won't work with nltk==2.0.4; it needs to be changed to:
stemmer_func = nltk.stem.snowball.EnglishStemmer().stem
Lines 16-18:
def get_words(titles):
words = set()
for title in job_titles:
Shouldn't `job_titles` be `titles`?
For those suffering with UTF-8 files, a simple solution is to use codecs package to open the file:
..
import codecs
..
and replace line 39 with this:
with codecs.open(filename,encoding='latin1') as title_file:
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
If my text file is encoded with utf-8, there is this error occurring:
Traceback (most recent call last):
File "cluster_example.py", line 40, in
words = get_words(job_titles)
File "cluster_example.py", line 20, in get_words
words.add(normalize_word(word))
File "", line 1, in
File "/usr/local/lib/python2.7/dist-packages/nltk/decorators.py", line 183, in memoize
result = func(*args)
File "cluster_example.py", line 14, in normalize_word
return stemmer_func(word.lower())
File "/usr/local/lib/python2.7/dist-packages/nltk/stem/snowball.py", line 694, in stem
word = (word.replace(u"\u2019", u"\x27")
UnicodeDecodeError: 'ascii' codec can't decode byte 0xe2 in position 13: ordinal not in range(128)
Can you suggest what to do in this case? Thank you.