Created
February 12, 2021 10:02
-
-
Save kzinmr/104ae2e7cee0f617cca82d68ea25d3bb to your computer and use it in GitHub Desktop.
Convert the KWDLC (Kyoto University Web Document Leads Corpus) to the camphr NER training format using kyoto-reader (https://github.com/ku-nlp/kyoto-reader).
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import kyoto_reader #import KyotoReader, Document | |
def convert_camphr_dataset(doc: kyoto_reader.Document) -> list:
    """Convert one parsed document into a camphr-style NER example.

    The document text is rebuilt by concatenating the surface form
    (``midasi``) of every morpheme returned by ``doc.mrph_list()``. Each
    named entity is then mapped from its morpheme index range
    (``dmid_range``) to character offsets in that text.

    Args:
        doc: Object exposing ``mrph_list()`` (items with a ``midasi``
            attribute) and ``named_entities`` (items with ``category``,
            ``name`` and ``dmid_range`` attributes).

    Returns:
        ``[text, {'entities': [[start_char, end_char, label], ...]}]``.
        An entity whose reconstructed span differs from ``ent.name`` is
        left out of the list; its details are printed to stdout instead.
    """
    midasis = [mrph.midasi for mrph in doc.mrph_list()]
    text = ''.join(midasis)

    # char_starts[i] is the character offset where morpheme i begins, and
    # char_starts[len(midasis)] == len(text). Building the table once avoids
    # re-summing every preceding morpheme length for each entity.
    char_starts = [0]
    for midasi in midasis:
        char_starts.append(char_starts[-1] + len(midasi))

    entities = []
    for ent in doc.named_entities:
        # dmid_range is used to index doc.mrph_list(); mid_range, by
        # contrast, is the range within a single sentence.
        mrph_is = list(ent.dmid_range)
        # The span runs from the first to the last listed morpheme, inclusive.
        start, end = mrph_is[0], mrph_is[-1] + 1
        begin_char, end_char = char_starts[start], char_starts[end]
        span = text[begin_char:end_char]
        if span == ent.name:
            entities.append([begin_char, end_char, ent.category])
        else:
            # Surface text and annotated name disagree: report and skip.
            print(start, end, begin_char, end_char)
            print(span)
            print(midasis[start:end])
            print(ent.name)
            print()
    return [text, {'entities': entities}]
def read_document(filepath: str, filename: str) -> kyoto_reader.Document:
    """Load a single document through ``kyoto_reader.KyotoReader``.

    The document ID handed to the reader is the part of ``filename`` that
    precedes its first ``.``.

    Args:
        filepath: Path given to ``KyotoReader`` (a file or a directory).
        filename: File name from which the document ID is derived.

    Returns:
        The result of ``KyotoReader.process_document`` for that ID.
    """
    doc_id, _, _ = filename.partition('.')

    # Only the ga, wo and ni cases are targeted.
    cases = ['ガ', 'ヲ', 'ニ']
    # Relations that are treated as coreference.
    coref_relations = ['=', '=構', '=≒', '=構≒']

    reader = kyoto_reader.KyotoReader(
        filepath,
        target_cases=cases,
        target_corefs=coref_relations,
        extract_nes=True,  # also extract named entities from the corpus
    )
    return reader.process_document(doc_id)
# Root directory that is walked recursively for input files.
dataset_dir = './KWDLC/knp'

# One parsed document per file found anywhere below dataset_dir.
dataset = [
    read_document(os.path.join(root, name), name)
    for root, _subdirs, names in os.walk(dataset_dir)
    for name in names
]

# Documents that carry at least one named entity become positive examples.
dataset_nes = [document for document in dataset if document.named_entities]
camphr_dataset_positive = list(map(convert_camphr_dataset, dataset_nes))

# Documents without named entities become negative examples: the plain text
# paired with an empty entity list.
dataset_no_nes = [document for document in dataset if not document.named_entities]
camphr_dataset_negative = [
    [''.join(morpheme.midasi for morpheme in document.mrph_list()), {'entities': []}]
    for document in dataset_no_nes
]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment