-
-
Save rasika/692fae312c7776c672e3 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys | |
from nltk import numpy | |
from nltk.cluster import kmeans, gaac, euclidean_distance | |
import nltk.corpus | |
from nltk import decorators | |
import nltk.stem | |
# English stopword list, loaded once from nltk.corpus.stopwords.
stopwords = nltk.corpus.stopwords.words("english")

# Bound ``stem`` method of an English Snowball stemmer; normalize_word
# calls it on every lowercased word.
_english_stemmer = nltk.stem.snowball.EnglishStemmer()
stemmer_func = _english_stemmer.stem
# Normalise a single word: lowercase it, then stem it.
@decorators.memoize
def normalize_word(word):
    """Return the stem of the lowercased *word*.

    The function is wrapped in ``decorators.memoize``, so repeated words
    are expected to be served from that decorator's cache rather than
    being stemmed again.
    """
    lowered = word.lower()
    return stemmer_func(lowered)
# Build the vocabulary: every distinct normalised word across all titles.
def get_words(job_titles):
    """Return a list of the unique normalised words found in *job_titles*.

    Each title is split on whitespace and every token is passed through
    ``normalize_word``. Duplicates are removed with a set, so the order
    of the returned list is whatever order the set iterates in.
    """
    unique_stems = {
        normalize_word(token)
        for title in job_titles
        for token in title.split()
    }
    return list(unique_stems)
# Turn a title into a 0/1 vector over the vocabulary, ignoring stopwords.
def vectorspaced(title, words):
    """Return a ``numpy.short`` array with one 0/1 entry per item of *words*.

    An entry is 1 when the corresponding vocabulary word is one of the
    normalised words of *title* and is not listed in ``stopwords``;
    otherwise it is 0.

    NOTE(review): vocabulary entries produced by ``get_words`` are stems,
    while ``stopwords`` holds unstemmed words, so a stopword whose stem
    differs from its surface form is presumably not filtered here; confirm
    whether that is intended.
    """
    # A set gives constant-time membership tests; the original scanned a
    # list once for every vocabulary word.
    title_stems = {normalize_word(word) for word in title.split()}
    return numpy.array(
        [word in title_stems and word not in stopwords for word in words],
        numpy.short,
    )
def doCluster(clusterer='gaac', filename='example.txt', num_clusters=5):
    """Cluster the job titles in *filename* and print ``<cluster id> <title>``.

    Parameters:
        clusterer: ``'gaac'`` to use ``GAAClusterer`` or ``'kmeans'`` to use
            ``KMeansClusterer`` with ``euclidean_distance``.
        filename: path of a text file holding one job title per line.
            Blank lines are ignored.
        num_clusters: number of clusters requested from the clusterer
            (defaults to 5, the value that was previously hard-coded).

    Raises:
        ValueError: if *clusterer* is not one of the supported names.

    Prints one line per title, sorted by cluster id and then by title.
    Prints nothing if the file contains no non-blank titles.
    """
    # Close the file deterministically instead of leaking the handle.
    with open(filename) as title_file:
        stripped_lines = [line.strip() for line in title_file]

    # Drop blank lines once, so the titles used for training and for
    # classification are exactly the same.
    job_titles = [title for title in stripped_lines if title]
    if not job_titles:
        # Nothing to cluster; avoid handing the clusterer an empty list.
        return

    words = get_words(job_titles)

    # Honour the requested algorithm. Previously a trailing assignment
    # always replaced the choice with GAAClusterer.
    if clusterer == 'gaac':
        cluster = nltk.cluster.gaac.GAAClusterer(num_clusters)
    elif clusterer == 'kmeans':
        cluster = nltk.cluster.kmeans.KMeansClusterer(
            num_clusters, euclidean_distance)
    else:
        raise ValueError(
            "unknown clusterer %r (expected 'gaac' or 'kmeans')" % (clusterer,))

    # Vectorise every title once and reuse the vectors for both steps.
    vectors = [vectorspaced(title, words) for title in job_titles]
    cluster.cluster(vectors)

    # NOTE: This is inefficient, cluster.classify should really just be
    # called when you are classifying previously unseen examples!
    classified_examples = [cluster.classify(vector) for vector in vectors]

    for cluster_id, title in sorted(zip(classified_examples, job_titles)):
        # Formatting into one string prints the same text on Python 2 and 3.
        print("%s %s" % (cluster_id, title))
if __name__ == '__main__':
    # Use the path given as the sole command-line argument; otherwise fall
    # back to the bundled sample file.
    filename = sys.argv[1] if len(sys.argv) == 2 else 'example.txt'
    doCluster('gaac', filename)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Not so skilled worker | |
Skilled worker | |
Banana picker | |
Police officer | |
Office worker | |
Fireman | |
IT consultant | |
Rapist of old ladies | |
Engineer | |
Stupid bastard son | |
Genious computer analyst | |
Computer banana peeler | |
Potato peeler | |
CEO of a major business | |
Business economist | |
Data analyst | |
Economist analyst bastard | |
Psychologist data enumerator | |
Psychologist genious | |
Evil genious | |
Murderer and rapist of cats | |
Cat psychologist | |
Top Software Engineer in IT with NLTK experience | |
xim | |
fission6 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Changes made to comply with NLTK 3.0.