-
-
Save mugenen/1972633 to your computer and use it in GitHub Desktop.
A frontend for the WordNet-Ja database file (sqlite3 format), which is available at http://nlpwww.nict.go.jp/wn-ja/
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# encoding: utf-8 | |
import sys | |
import sqlite3 | |
from collections import namedtuple | |
# Module-wide SQLite connection shared by every query helper below.
# The path is relative, so the database file is looked up in the current
# working directory.
conn = sqlite3.connect("wnjpn.db")
# Record type for one row of the `word` table. Rows fetched with
# `select *` are unpacked positionally, so the field order has to match
# the column order of the table.
Word = namedtuple('Word', ['wordid', 'lang', 'lemma', 'pron', 'pos'])
def getWords(lemma):
    """Return every `word` row whose lemma equals *lemma*.

    The result is a list of Word tuples; it is empty when no row matches.
    """
    matches = conn.execute("select * from word where lemma=?", (lemma,))
    found = []
    for record in matches:
        found.append(Word(*record))
    return found
def getWord(wordid):
    """Return the `word` row with the given *wordid* as a Word tuple.

    Only the first matching row is used. When no row matches, unpacking
    the missing row raises TypeError.
    """
    result = conn.execute("select * from word where wordid=?", (wordid,))
    record = result.fetchone()
    return Word(*record)
# Record type for one row of the `sense` table. Rows fetched with
# `select *` are unpacked positionally, so the field order has to match
# the column order of the table.
Sense = namedtuple(
    'Sense',
    ['synset', 'wordid', 'lang', 'rank', 'lexid', 'freq', 'src'],
)
def getSenses(word):
    """Return every `sense` row that belongs to *word*.

    *word* only needs a ``wordid`` attribute (for example a Word tuple).
    The result is a list of Sense tuples; it is empty when no row matches.
    """
    matches = conn.execute("select * from sense where wordid=?", (word.wordid,))
    found = []
    for record in matches:
        found.append(Sense(*record))
    return found
def getSense(synset, lang='jpn'):
    """Return the first `sense` row for *synset* in language *lang*.

    Returns a Sense tuple, or None when the query yields no row. Any
    further matching rows are ignored.
    """
    result = conn.execute("select * from sense where synset=? and lang=?",
                          (synset, lang))
    first = result.fetchone()
    if not first:
        return None
    return Sense(*first)
# Record type for one row of the `synset` table. Rows fetched with
# `select *` are unpacked positionally, so the field order has to match
# the column order of the table.
Synset = namedtuple('Synset', ['synset', 'pos', 'name', 'src'])
def getSynset(synset):
    """Return the `synset` row identified by *synset* as a Synset tuple.

    Only the first matching row is used. When no row matches, unpacking
    the missing row raises TypeError.
    """
    result = conn.execute("select * from synset where synset=?", (synset,))
    record = result.fetchone()
    return Synset(*record)
# Record type for one row of the `synlink` table. Rows fetched with
# `select *` are unpacked positionally, so the field order has to match
# the column order of the table.
SynLink = namedtuple('SynLink', ['synset1', 'synset2', 'link', 'src'])
def getSynLinks(sense, link):
    """Return the `synlink` rows that start at *sense*'s synset.

    Selects rows whose ``synset1`` equals ``sense.synset`` and whose
    ``link`` equals *link*. The result is a list of SynLink tuples; it is
    empty when no row matches.
    """
    params = (sense.synset, link)
    matches = conn.execute("select * from synlink where synset1=? and link=?",
                           params)
    found = []
    for record in matches:
        found.append(SynLink(*record))
    return found
def getSynLinksRecursive(senses, link, lang='jpn', _depth=0, _path=()):
    """Print the tree of synsets reachable from *senses* via *link*.

    For every sense that has at least one outgoing link of type *link*,
    one line is printed: indentation proportional to the depth, the lemma
    of the sense's word, a space, and the name of its synset. The targets
    of those links are then looked up with getSense in language *lang* and
    processed one level deeper. Senses without outgoing links print
    nothing.

    Parameters:
        senses: iterable of Sense-like objects (``synset`` and ``wordid``
            attributes are read).
        link: link type to follow.
        lang: language used to pick a sense for each linked synset.
        _depth: internal; current nesting level, used for indentation.
        _path: internal; synsets already visited on the current branch.

    A synset that is already on the current branch is skipped. Without
    that guard, any link relation that loops back on itself (A -> B -> A)
    would recurse until the interpreter's recursion limit is hit. Data
    without such loops produces exactly the same output as before.
    """
    indent = ' ' * 2 * _depth
    for sense in senses:
        if sense.synset in _path:
            # Already being expanded higher up on this branch: a cycle.
            continue
        synLinks = getSynLinks(sense, link)
        if not synLinks:
            continue
        # Single-argument print(...) is the same under the Python 2 print
        # statement and the Python 3 print function.
        print(''.join([indent,
                       getWord(sense.wordid).lemma,
                       ' ',
                       getSynset(sense.synset).name]))
        children = []
        for synLink in synLinks:
            child = getSense(synLink.synset2, lang)
            if child:
                children.append(child)
        getSynLinksRecursive(children, link, lang, _depth + 1,
                             _path + (sense.synset,))
def getWordsFromSynset(synset, lang):
    """Return the words of language *lang* that have a sense in *synset*.

    Joins the `sense` and `word` tables on ``wordid`` and returns the
    matching `word` rows as a list of Word tuples (empty when nothing
    matches).
    """
    query = "select word.* from sense, word where synset=? and word.lang=? and sense.wordid = word.wordid;"
    matches = conn.execute(query, (synset, lang))
    found = []
    for record in matches:
        found.append(Word(*record))
    return found
def getWordsFromSenses(sense, lang):
    """Print, for each given sense, its synset name and the synset's words.

    *sense* is an iterable of Sense-like objects (despite the singular
    name). For each one the synset name is printed on its own line,
    followed by one line per word of language *lang* in that synset, each
    prefixed with a space.
    """
    # Single-argument print(...) is the same under the Python 2 print
    # statement and the Python 3 print function.
    for entry in sense:
        print(getSynset(entry.synset).name)
        for member in getWordsFromSynset(entry.synset, lang):
            print(' ' + member.lemma)
def toUnicode(arg, encoding=None):
    """Return the command-line argument *arg* as text.

    On Python 2 the elements of sys.argv are byte strings and have to be
    decoded; on Python 3 they are already text and are returned unchanged.

    *encoding* is the preferred codec. When it is None or empty (for
    example sys.stdin.encoding while stdin is a pipe), the filesystem
    encoding is used, and 'utf-8' as a last resort.
    """
    if not isinstance(arg, bytes):
        return arg
    return arg.decode(encoding or sys.getfilesystemencoding() or 'utf-8')


if __name__ == '__main__':
    if len(sys.argv) >= 3:
        lemma = toUnicode(sys.argv[1], getattr(sys.stdin, 'encoding', None))
        # An empty argument falls back to the default, as before.
        link = sys.argv[2] or 'hypo'
        # ">= 4" so that trailing extra arguments no longer silently
        # reset the language to the default.
        lang = (sys.argv[3] if len(sys.argv) >= 4 else '') or 'jpn'
        words = getWords(lemma)
        if words:
            # The same spelling may exist as several words (for example a
            # noun and a verb); each of them is reported.
            for w in words:
                senses = getSenses(w)
                if link == 'syns':
                    # 'syns' is not followed as a synlink relation: list
                    # the words sharing a synset with the argument.
                    getWordsFromSenses(senses, lang)
                else:
                    getSynLinksRecursive(senses, link, lang)
        else:
            sys.stderr.write("(nothing found)\n")
    else:
        # NOTE(review): the five-letter codes ending in "g" below (entag,
        # causg, dmncg, ...) look like typos for four-letter link codes;
        # confirm against the link names stored in the database.
        print("""usage: wn.py word link [lang]
word
  word to investigate
link
  syns - Synonyms
  hype - Hypernyms
  inst - Instances
  hypo - Hyponym
  hasi - Has Instance
  mero - Meronyms
  mmem - Meronyms --- Member
  msub - Meronyms --- Substance
  mprt - Meronyms --- Part
  holo - Holonyms
  hmem - Holonyms --- Member
  hsub - Holonyms --- Substance
  hprt - Holonyms -- Part
  attr - Attributes
  sim - Similar to
  entag - Entails
  causg - Causes
  dmncg - Domain --- Category
  dmnug - Domain --- Usage
  dmnrg - Domain --- Region
  dmtcg - In Domain --- Category
  dmtug - In Domain --- Usage
  dmtrg - In Domain --- Region
  antsg - Antonyms
lang (default: jpn)
  jpn - Japanese
  eng - English
""")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
以下の点を改変しました.
- ファイル名が変わっていたので対応: "wnjpn-0.9.db" → "wnjpn.db"
- Windowsなどの非UTF-8環境だと動かなかったため,decode('utf-8') から decode(sys.stdin.encoding) に変更
- 同表記語が複数あった場合は両方表示(例: dog(名詞),dog(動詞))
- synsのリンクがデータベース上に見つからなかったので,link=synsの場合には引数の単語と同じsynsetに含まれる単語をすべて表示