Created
April 24, 2015 03:15
-
-
Save e-mon/28ca194a80450311a342 to your computer and use it in GitHub Desktop.
Upgrade WordNet in NLTK from version 3.0 to 3.1
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Upgrade the WordNet data used by NLTK from version 3.0 to 3.1.
#
# Steps performed by main():
#   1. move the existing corpus files into a "wn3.0" backup sub-directory,
#   2. download (if needed) and unpack the WordNet 3.1 "dict" tarball,
#   3. generate the "lexnames" file from the names found in "dbfiles",
#   4. sanity-check the generated file and exercise NLTK's WordNet reader.
#
# Only the standard library is used for the file handling, so the script does
# not depend on the `mv`, `wget` and `tar` shell commands being available.
import os
import shutil
import tarfile
import urllib.request

# Edit this so that it points at the local NLTK WordNet corpus directory.
nltkdata_wn = '/path/to/nltk_data/corpora/wordnet/'
wn31 = "http://wordnetcode.princeton.edu/wn3.1.dict.tar.gz"

ARCHIVE_NAME = 'wn3.1.dict.tar.gz'
BACKUP_DIRNAME = 'wn3.0'

# Third column of a lexnames row, keyed by the part of the file name that
# precedes the first dot.
SYNCAT_BY_POS = {"noun": 1, "verb": 2, "adj": 3, "adv": 4}


def move_contents(src_dir, dst_dir, skip=()):
    """Move every visible entry of *src_dir* into *dst_dir*.

    Entries whose name is in *skip* or starts with a dot are left alone
    (the shell glob ``*`` used by the original script did not match dot
    files either).  An entry that already exists in *dst_dir* is replaced,
    so that re-running the script behaves the same as the first run.
    """
    for name in sorted(os.listdir(src_dir)):
        if name in skip or name.startswith('.'):
            continue
        target = os.path.join(dst_dir, name)
        if os.path.isdir(target) and not os.path.islink(target):
            shutil.rmtree(target)
        elif os.path.lexists(target):
            os.remove(target)
        shutil.move(os.path.join(src_dir, name), target)


def backup_wordnet30(wn_dir=nltkdata_wn):
    """Move the current corpus files into ``<wn_dir>/wn3.0``.

    Returns True when a backup was made and False when the backup directory
    already existed.  Nothing is moved in the latter case, so a second run
    cannot overwrite the saved 3.0 files with already-upgraded ones.
    """
    backup_dir = os.path.join(wn_dir, BACKUP_DIRNAME)
    if os.path.exists(backup_dir):
        return False
    os.mkdir(backup_dir)
    # The backup directory lives inside wn_dir, so it must not be moved
    # into itself.
    move_contents(wn_dir, backup_dir, skip=(BACKUP_DIRNAME,))
    return True


def install_wordnet31(wn_dir=nltkdata_wn, archive=ARCHIVE_NAME, url=wn31):
    """Download *url* to *archive* if it is missing and unpack it into *wn_dir*.

    The tarball is expected to contain a top-level ``dict`` directory; its
    contents are moved up into *wn_dir* and the then-empty directory removed.
    """
    if not os.path.exists(archive):
        # Download under a temporary name so that an interrupted transfer is
        # not mistaken for a complete archive on the next run.
        partial = archive + '.part'
        urllib.request.urlretrieve(url, partial)
        os.replace(partial, archive)

    with tarfile.open(archive, 'r:gz') as tar:
        if hasattr(tarfile, 'data_filter'):
            # Refuse members that would be written outside wn_dir.
            tar.extractall(wn_dir, filter='data')
        else:
            tar.extractall(wn_dir)

    unpacked = os.path.join(wn_dir, 'dict')
    move_contents(unpacked, wn_dir)
    os.rmdir(unpacked)


def syncat_for(dbfile_name):
    """Return the value written in the third lexnames column for a file.

    ``noun.*`` -> 1, ``verb.*`` -> 2, ``adj.*`` -> 3, ``adv.*`` -> 4 and the
    special file ``cntlist`` -> ``"cntlist"``.

    Raises:
        ValueError: for any other name.  The original script would either
            fail with NameError or silently reuse the previous file's value.
    """
    if dbfile_name == "cntlist":
        return "cntlist"
    pos = dbfile_name.partition('.')[0]
    try:
        return SYNCAT_BY_POS[pos]
    except KeyError:
        raise ValueError(
            "unrecognised file in dbfiles: %r" % (dbfile_name,)
        ) from None


def write_lexnames(wn_dir=nltkdata_wn):
    """Create ``<wn_dir>/lexnames`` from the names in ``<wn_dir>/dbfiles``.

    Each row is ``<two-digit index>\\t<file name>\\t<syncat>``.  Returns the
    number of rows written.

    NOTE(review): the index is the position in the sorted directory listing,
    exactly as in the original script.  That may not match the lexicographer
    file numbers used inside the WordNet data files -- confirm before relying
    on lexname lookups.
    """
    dbfiles = os.path.join(wn_dir, 'dbfiles')
    # Build all rows first: an unrecognised name then raises before the
    # output file is opened, so no truncated lexnames file is left behind.
    rows = [
        "\t".join([str(i).zfill(2), name, str(syncat_for(name))]) + "\n"
        for i, name in enumerate(sorted(os.listdir(dbfiles)))
    ]
    with open(os.path.join(wn_dir, 'lexnames'), 'w') as fout:
        fout.writelines(rows)
    return len(rows)


def check_lexnames(wn_dir=nltkdata_wn):
    """Verify that every lexnames row has three fields and a running index.

    Raises:
        ValueError: if a row does not split into exactly three fields or its
            index differs from its zero-based line number.
    """
    with open(os.path.join(wn_dir, 'lexnames'), 'r') as fin:
        for i, line in enumerate(fin):
            index, _lexname, _ = line.split()
            if int(index) != i:
                raise ValueError(
                    "lexnames line %d carries index %s" % (i, index)
                )


def main():
    """Run the whole upgrade and print a few WordNet lookups as a smoke test."""
    backup_wordnet30()
    install_wordnet31()
    # Creating lexnames file.
    write_lexnames()
    # Checking generated lexnames file.
    check_lexnames()

    # Imported here, after the data files are in place (the original script
    # also imported it only at this point); this keeps the functions above
    # usable without NLTK installed.
    from nltk.corpus import wordnet as wn

    # Testing wordnet function.
    print(wn.synsets('dog'))
    for synset in wn.all_synsets():
        print(synset, synset.pos(), synset.definition())


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Great, it worked! I just needed to make a few changes since I am running it on Windows...