Created
October 11, 2011 20:19
-
-
Save xim/1279283 to your computer and use it in GitHub Desktop.
K-Means clustering by Euclidean distance, yay!
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys | |
import numpy | |
from nltk.cluster import KMeansClusterer, GAAClusterer, euclidean_distance | |
import nltk.corpus | |
from nltk import decorators | |
import nltk.stem | |
# Bound ``stem`` method of NLTK's English stemmer; maps a word to its stem.
_english_stemmer = nltk.stem.EnglishStemmer()
stemmer_func = _english_stemmer.stem

# English stop words from the NLTK corpus, held in a set for fast membership
# tests.
_english_stopword_list = nltk.corpus.stopwords.words('english')
stopwords = set(_english_stopword_list)
@decorators.memoize
def normalize_word(word):
    """Return the normalized form of ``word``: lower-cased, then stemmed.

    The function is wrapped in ``decorators.memoize`` so repeated tokens
    are not re-stemmed.
    """
    lowered = word.lower()
    return stemmer_func(lowered)
def get_words(titles):
    """Collect the distinct normalized words that occur in ``titles``.

    Args:
        titles: iterable of title strings. Each title is split on
            whitespace and every token is passed through
            ``normalize_word``.

    Returns:
        A list of the unique normalized words. The order is whatever the
        underlying set yields, so it is not guaranteed to be stable.
    """
    words = set()
    # Iterate the ``titles`` argument. The previous version looped over the
    # global ``job_titles`` instead, silently ignoring the parameter and
    # raising NameError whenever that global did not exist.
    for title in titles:
        for word in title.split():
            words.add(normalize_word(word))
    return list(words)
@decorators.memoize
def vectorspaced(title):
    """Encode ``title`` as a 0/1 vector over the module-level ``words`` list.

    Entry ``i`` is 1 when ``words[i]`` is one of the title's normalized
    tokens and is not in ``stopwords``; otherwise it is 0. ``words`` is a
    global that must be assigned before the first call (the script entry
    point does this).

    Returns:
        A ``numpy`` array of dtype ``numpy.short`` with one entry per word
        in ``words``.
    """
    title_components = [normalize_word(token) for token in title.split()]
    flags = [
        candidate in title_components and candidate not in stopwords
        for candidate in words
    ]
    return numpy.array(flags, numpy.short)
if __name__ == '__main__':
    # Titles are read one per line from the file named by the single
    # command-line argument, falling back to 'example.txt'.
    filename = 'example.txt'
    if len(sys.argv) == 2:
        filename = sys.argv[1]

    with open(filename) as title_file:
        # Drop blank lines once, at load time, so the clusterer is trained
        # on and asked to classify exactly the same titles. Previously
        # blanks were skipped for cluster() but still handed to classify()
        # as all-zero vectors.
        stripped_lines = (line.strip() for line in title_file)
        job_titles = [title for title in stripped_lines if title]

    # NOTE: ``words`` must stay a module-level name -- vectorspaced() reads
    # it as a global to decide the layout of every vector.
    words = get_words(job_titles)

    # Alternative clusterer: KMeansClusterer(5, euclidean_distance)
    cluster = GAAClusterer(5)
    cluster.cluster([vectorspaced(title) for title in job_titles])

    # NOTE: This is inefficient, cluster.classify should really just be
    # called when you are classifying previously unseen examples!
    classified_examples = [
        cluster.classify(vectorspaced(title)) for title in job_titles
    ]

    for cluster_id, title in sorted(zip(classified_examples, job_titles)):
        # A single pre-formatted argument prints "<id> <title>" identically
        # under the Python 2 print statement and the Python 3 print function.
        print('%s %s' % (cluster_id, title))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Not so skilled worker | |
Skilled worker | |
Banana picker | |
Police officer | |
Office worker | |
Fireman | |
IT consultant | |
Rapist of old ladies | |
Engineer | |
Stupid bastard son | |
Genious computer analyst | |
Computer banana peeler | |
Potato peeler | |
CEO of a major business | |
Business economist | |
Data analyst | |
Economist analyst bastard | |
Psychologist data enumerator | |
Psychologist genious | |
Evil genious | |
Murderer and rapist of cats | |
Cat psychologist | |
Top Software Engineer in IT with NLTK experience | |
xim | |
fission6 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
For those struggling with UTF-8 files, a simple solution is to use the codecs package to open the file:
..
import codecs
..
and replace line 39 with this:
with codecs.open(filename,encoding='latin1') as title_file: