Last active
May 10, 2019 10:45
-
-
Save ikegami-yukino/2a5ed3fd3ccde2938f020e47a8e4c9af to your computer and use it in GitHub Desktop.
SentiWordNet を日本語化する
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re
import sqlite3
import time
from urllib.parse import quote

import requests
# SQLite database whose `sense` and `word` tables are queried for
# non-English lemmas (presumably the Japanese WordNet dump -- confirm).
DB_PATH = 'wnjpn.db'
# Tab-separated SentiWordNet data file read as input.
SWN_PATH = 'SentiWordNet_3.0.0_20130122.txt'
# Translation endpoint template (English -> Japanese); `%s` receives the text
# to translate. `Please_write_here` is a placeholder for your own script ID.
URL = 'https://script.google.com/macros/s/Please_write_here/exec?text=%s&source=en&target=ja'
# Output file. Rows are written tab-separated despite the .csv extension.
RESULT_PATH = 'result.csv'
# Captures the text between each pair of double quotes in a gloss.
re_sentence = re.compile('"([^"]+)"')
def fetch_jp_lemma(synset, cursor):
    """Return the non-English lemmas linked to a synset.

    Args:
        synset: Synset identifier as stored in ``sense.synset`` (the script
            builds it as ``'<ID>-<POS>'``).
        cursor: Open ``sqlite3`` cursor on a database that has the ``sense``
            and ``word`` tables.

    Returns:
        A list of lemma strings in lookup order; empty when the synset has
        no non-English sense.
    """
    # Bound parameters instead of string interpolation: a synset id that
    # contains a quote can neither break nor alter the statement.
    cursor.execute(
        "SELECT wordid FROM sense WHERE synset = ? AND lang != 'eng'",
        (synset,))
    # Materialise the ids first, because the same cursor is reused below.
    word_ids = [row[0] for row in cursor.fetchall()]

    jp_lemma = []
    for word_id in word_ids:
        cursor.execute(
            "SELECT lemma FROM word WHERE wordid = ? AND lang != 'eng'",
            (word_id,))
        jp_lemma.extend(row[0] for row in cursor.fetchall())
    return jp_lemma
def translate(sentence):
    """Translate ``sentence`` through the web endpoint configured in ``URL``.

    Args:
        sentence: Text to translate; substituted into the ``text`` query
            parameter of ``URL``.

    Returns:
        The response body decoded as UTF-8.
    """
    # Do not shorten this delay: a shorter interval runs into the
    # service's usage limit.
    time.sleep(5)
    # Percent-encode the text so characters such as '&', '#', '+' or '%'
    # cannot truncate or corrupt the query string.
    encoded = quote(sentence, safe='')
    return requests.get(URL % encoded).content.decode('utf8')
# Convert each SentiWordNet entry into one tab-separated output row:
#   synset id, lemmas (or the original terms), PosScore, NegScore,
#   translated example sentences.
# Both files are opened as UTF-8 explicitly: the output holds Japanese text
# and must not depend on the platform's default encoding.
# NOTE: `with sqlite3.connect(...)` only manages the transaction; it does not
# close the connection, which is released when the script exits.
with open(SWN_PATH, encoding='utf-8') as fd, \
        open(RESULT_PATH, 'w', encoding='utf-8') as rfd, \
        sqlite3.connect(DB_PATH) as conn:
    cursor = conn.cursor()
    # Stream the file line by line instead of loading it all into memory.
    for raw_line in fd:
        line = raw_line.rstrip('\n')
        # Skip comment lines and blank lines; a blank line would otherwise
        # crash the six-way unpacking below.
        if not line.strip() or line.startswith('#'):
            continue
        POS, ID, PosScore, NegScore, SynsetTerms, Gloss = line.split('\t')
        synset_id = '%s-%s' % (ID, POS)
        jp_lemma = fetch_jp_lemma(synset_id, cursor)
        # Only the double-quoted parts of the gloss are translated.
        sentences = [translate(sentence)
                     for sentence in re_sentence.findall(Gloss)]
        # Fall back to the original synset terms when no lemma was found.
        terms = ','.join(jp_lemma) if jp_lemma else SynsetTerms
        rfd.write('%s\t%s\t%s\t%s\t%s\n' % (
            synset_id, terms, PosScore, NegScore, ','.join(sentences)))
        # Each translation sleeps for seconds, so a full run is long; flush
        # per row so an interruption does not lose buffered rows.
        rfd.flush()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment