Last active
October 1, 2020 16:29
-
-
Save petitviolet/4950279 to your computer and use it in GitHub Desktop.
人名をMeCabの辞書に登録するためのスクリプト.
編集権限が無くても使える (http://d.hatena.ne.jp/petitviolet/20130214/1360809625)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- encoding:utf-8 -*- | |
import os
import subprocess
import sys
csv_path = '/home/hoge/user_dic/celebs.csv'  # CSV file that serves as the source of the user dictionary
def add_dic(fname):
    '''Add person names to the MeCab user dictionary CSV and rebuild it.

    Args:
        fname: Either the path of a file ending in ``.csv`` that holds
            comma-separated person names, or a single person name.

    Names that are not yet present in ``csv_path`` are appended to it as
    noun / proper-noun / person-name rows.  ``mecab-dict-index`` is then run
    to compile ``csv_path`` into a ``.dic`` file with the same base name.

    Returns:
        None.  A failure of the dictionary build is reported on stderr
        rather than raised.
    '''
    # Plain open() is used so the block runs on both Python 2 and Python 3.
    if fname.endswith('.csv'):
        with open(fname) as src:
            raw = src.read()
        # Line breaks are dropped, not treated as separators (as before).
        candidates = raw.replace('\r', '').replace('\n', '').split(',')
    else:
        candidates = [fname]

    if os.path.exists(csv_path):
        with open(csv_path) as dic:
            old_contents = dic.read()
    else:
        old_contents = ''

    # Compare against the first column of each existing row instead of doing
    # a substring search over the whole file; otherwise a name contained in
    # a longer, already registered name could never be added.
    known = set(line.split(',', 1)[0] for line in old_contents.splitlines())

    with open(csv_path, 'a') as dic:
        # Do not glue the first new row onto an unterminated last line.
        if old_contents and not old_contents.endswith('\n'):
            dic.write('\n')
        for celeb in candidates:
            celeb = celeb.strip()
            # Empty fields come from trailing commas or an empty input file.
            if not celeb or celeb in known:
                continue
            known.add(celeb)  # also de-duplicates within this batch
            # Fixed, very low cost; presumably so MeCab always prefers these
            # entries -- confirm against the MeCab user dictionary docs.
            # Alternative once considered:
            #   int(max(-36000, -400 * len(celeb) ** 1.5))
            cost = -99999
            dic.write('%s,0,0,%d,名詞,固有名詞,人名,*,*,*,%s,%s,%s\n'
                      % (celeb, cost, celeb, celeb, celeb))

    # Derive both paths from csv_path so they cannot drift apart from it.
    output_dic = os.path.splitext(csv_path)[0] + '.dic'
    command = [
        '/usr/local/libexec/mecab/mecab-dict-index',
        '-d/usr/local/lib/mecab/dic/ipadic',
        '-u', output_dic,
        '-f', 'utf-8',
        '-t', 'utf-8',
        csv_path,
    ]
    # An argument list (no shell) avoids quoting problems in the paths.
    try:
        status = subprocess.call(command)
    except OSError as err:
        # Raised e.g. when the mecab-dict-index binary cannot be executed.
        sys.stderr.write('MeCab辞書の更新に失敗しました: %s\n' % err)
        return None
    if status != 0:
        sys.stderr.write(
            'MeCab辞書の更新に失敗しました (exit status %d)\n' % status)
        return None
    print('MeCab辞書の更新が終わりました')
    return None
def main():
    '''Command-line entry point.

    Usage:
        python add_dictionary.py <fname or name> [<clear?>]

    <fname or name>: path of a csv file of names, or a single person name.
    <clear?>: when a non-empty second argument is given, ``csv_path`` is
        emptied before the names are added.

    Raises:
        ValueError: if the number of command-line arguments is not 1 or 2.
    '''
    args = sys.argv[1:]
    # Validate before touching any argument or file.
    if len(args) not in (1, 2):
        raise ValueError(
            'usage: python add_dictionary.py <fname or name> [<clear?>]')

    fname = args[0]
    clear = len(args) == 2 and args[1]
    if clear:
        print('Clear...')
        # Truncate directly instead of shelling out to ``echo -n > path``:
        # the behavior of ``echo -n`` depends on the shell, and the unquoted
        # path would break on spaces or shell metacharacters.
        with open(csv_path, 'w'):
            pass
    add_dic(fname)
# Run the command-line interface only when executed as a script.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment