Created
May 15, 2009 07:15
-
-
Save bmaland/112080 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python | |
| # Simple algorithm for clustering WordNet synsets. Requires Python 2.5 or 2.6, | |
| # in addition to the NLTK toolkit which is available at http://www.nltk.org/. | |
| import nltk | |
| from nltk.corpus import wordnet as wn | |
| from nltk.corpus import wordnet_ic | |
def cluster_senses(word, treshold=0.30,
                   ic_corpus=wordnet_ic.ic('ic-treebank.dat')):
    """Group the noun synsets of *word* into clusters of similar senses.

    Greedy single-pass clustering: the first cluster is seeded with the
    last synset returned by WordNet, then every remaining synset is added
    to the first existing cluster that contains at least one member whose
    Lin similarity to it is strictly greater than ``treshold``.  A synset
    that matches no existing cluster starts a new cluster of its own.

    Parameters:
        word      -- the word whose noun senses are looked up in WordNet.
        treshold  -- similarity cut-off (sic: the misspelled name is kept
                     so existing keyword callers keep working).
        ic_corpus -- information-content data passed to
                     ``Synset.lin_similarity``.  The default is loaded once,
                     when this function is defined, not on every call.

    Returns:
        A list of clusters, each cluster being a list of synsets.  Every
        noun synset of *word* appears in exactly one cluster.  An empty
        list is returned when *word* has no noun synsets.
    """
    synsets = wn.synsets(word, pos=wn.NOUN)  # nouns only
    if not synsets:
        # Nothing to seed the first cluster with; previously this case
        # crashed with IndexError on the pop() below.
        return []

    clusters = [[synsets.pop()]]
    for s in synsets:
        for c in clusters:
            # One sufficiently similar member is enough to join a cluster.
            if any(s.lin_similarity(ss, ic_corpus) > treshold for ss in c):
                c.append(s)
                # Make sure that a synset is only added to one cluster.
                break
        else:
            # The synset doesn't fit in any of the existing clusters, so
            # we create a new one.
            clusters.append([s])
    return clusters
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment