Last active
June 3, 2021 11:43
-
-
Save BrambleXu/2d443e3c894f230195bf1b098c63b963 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys
import unicodedata
from collections import defaultdict
from pathlib import Path
from typing import List

from ahocorasick import Automaton
def read_dictionary(dict_path: str) -> dict:
    """Load a company-name dictionary file into a name -> line-number mapping.

    Each line of the file is treated as one name. A name is stripped of
    surrounding whitespace and NFKC-normalized before it is stored. Blank
    lines are skipped, but still count towards the line numbering.

    Args:
        dict_path: Location of the UTF-8 encoded dictionary file; it is
            passed straight to ``open()``.

    Returns:
        A dict mapping each normalized name to its 1-based line number. When
        a name occurs on several lines, the last occurrence wins.
    """
    company_dict = {}
    with open(dict_path, 'r', encoding='utf-8') as f:
        for line_no, line in enumerate(f, start=1):
            name = unicodedata.normalize('NFKC', line.strip())
            # A blank line would otherwise register the empty string as a
            # company name.
            if not name:
                continue
            company_dict[name] = line_no
    print('Read dictionary done.')
    return company_dict
def build_trie(company_dict: dict) -> Automaton:
    """Build an ``Automaton`` from the company-name dictionary.

    Every key of ``company_dict`` is added as a word whose stored value is
    the tuple ``(index, name)``; ``make_automaton()`` is called once all
    words have been added.

    Args:
        company_dict: Mapping of company name to its index.

    Returns:
        The populated automaton.
    """
    automaton = Automaton()
    for company_name, company_idx in company_dict.items():
        payload = (company_idx, company_name)
        automaton.add_word(company_name, payload)
    automaton.make_automaton()
    print('Build dictionary trie done.')
    return automaton
def filter_chunks(chunks: list) -> list:
    """Drop every match that lies inside another, longer match.

    Args:
        chunks: Matches as ``[start, end, word]`` lists, e.g.
            ``[131, 139, 'ジャパンエナジー']``. May be empty and in any order.

    Returns:
        The surviving chunks ordered by start offset. Among chunks sharing a
        start or an end only the longest is kept, and a chunk lying entirely
        inside another one is removed, e.g. ``[131, 135, 'ジャパン']`` and
        ``[133, 134, 'パ']`` both lose to ``[131, 139, 'ジャパンエナジー']``.
        Chunks that merely overlap are all kept. The input list itself is
        not modified.
    """
    # Visit chunks left to right, longest first among those with the same
    # start, so any chunk that could contain the current one was seen before.
    ordered = sorted(chunks, key=lambda chunk: (chunk[0], -chunk[1]))
    kept = []
    furthest_end = None
    for chunk in ordered:
        # Every earlier chunk starts at or before this one, so this chunk is
        # contained in one of them exactly when it does not reach past the
        # furthest end kept so far. Starting from None (instead of comparing
        # the first chunk with itself) keeps the leftmost match.
        if furthest_end is not None and chunk[1] <= furthest_end:
            continue
        kept.append(chunk)
        furthest_end = chunk[1]
    return kept
def tag_with_dict(company_trie: Automaton, sents: list, duplicate=None) -> list:
    """Find dictionary company names in every sentence.

    Args:
        company_trie: Automaton whose stored values are ``(index, name)``
            tuples, as produced by ``build_trie``.
        sents: Sentences to search. Each one is either a string or a sequence
            of string pieces; the pieces are concatenated before matching.
        duplicate: Unused; kept so existing callers keep working.

    Returns:
        A flat list of ``[start, end, name]`` matches for all sentences, in
        sentence order. Offsets are character positions within that
        sentence's stripped, NFKC-normalized text, so that
        ``text[start:end] == name``. The list is empty when ``sents`` is
        empty or nothing matches.
    """
    tagged = []
    for sent in sents:
        # Apply the same NFKC normalization that read_dictionary applies to
        # the dictionary names.
        text = unicodedata.normalize('NFKC', ''.join(sent).strip())
        # iter() reports the index of the last character of each match;
        # turn it into a [start, end) span.
        matches = [
            [last_idx + 1 - len(name), last_idx + 1, name]
            for last_idx, (_, name) in company_trie.iter(text)
        ]
        # e.g. [[131, 135, 'ジャパン'], [131, 139, 'ジャパンエナジー'], [133, 134, 'パ']]
        # is reduced to the longest match [[131, 139, 'ジャパンエナジー']].
        if matches:
            tagged.extend(filter_chunks(matches))
    return tagged
if __name__ == "__main__":
    # The dictionary location may be given as the first command-line argument
    # (e.g. ``jcl_slim.csv``); without one, fall back to the original
    # hard-coded location.
    default_dict_path = '/Users/smap10/Project/japanese-company-lexicon/data/dictionaries/output/jcl_slim.csv'
    dict_path = Path(sys.argv[1] if len(sys.argv) > 1 else default_dict_path)
    company_dict = read_dictionary(dict_path)
    company_trie = build_trie(company_dict)
    # Demo sentence: tag it and print the [start, end, name] matches.
    sents = ['TISインテックグループのTIS株式会社は、自然言語処理で企業名認識を行うための辞書JCLdic(日本会社名辞書)を無償公開。']
    chunks = tag_with_dict(company_trie, sents)
    print(chunks)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
not working.