Skip to content

Instantly share code, notes, and snippets.

@kzinmr
Created February 12, 2021 10:02
Show Gist options
  • Save kzinmr/104ae2e7cee0f617cca82d68ea25d3bb to your computer and use it in GitHub Desktop.
Save kzinmr/104ae2e7cee0f617cca82d68ea25d3bb to your computer and use it in GitHub Desktop.
Convert KWDLC to camphr NER format using https://github.com/ku-nlp/kyoto-reader
import os
import kyoto_reader #import KyotoReader, Document
def convert_camphr_dataset(doc: kyoto_reader.Document) -> list:
    """Convert a parsed document into one camphr-style NER example.

    The document text is rebuilt by concatenating the surface form
    (``midasi``) of every morpheme returned by ``doc.mrph_list()``.  Each
    named entity is then mapped from its morpheme index range
    (``dmid_range``) to a character span inside that text.

    Args:
        doc: Document exposing ``mrph_list()`` and ``named_entities``.

    Returns:
        A two-element list ``[text, {'entities': spans}]``.  Each span is
        ``[start_char, end_char, label]`` with an exclusive end offset.
        An entity whose reconstructed text differs from ``ent.name`` is
        left out of ``spans`` and its details are printed to stdout.
    """
    midasis = [mrph.midasi for mrph in doc.mrph_list()]
    text = ''.join(midasis)

    # char_offsets[i] is the character position where morpheme i starts and
    # char_offsets[-1] == len(text).  Building the table once avoids
    # re-summing morpheme lengths from the start for every entity.
    char_offsets = [0]
    for midasi in midasis:
        char_offsets.append(char_offsets[-1] + len(midasi))

    entities = []
    for ent in doc.named_entities:
        label = ent.category
        # dmid_range is used to index the document's morpheme list;
        # mid_range is a range within a single sentence.
        mrph_is = list(ent.dmid_range)
        start, end = mrph_is[0], mrph_is[-1] + 1
        begin_char, end_char = char_offsets[start], char_offsets[end]
        if text[begin_char:end_char] == ent.name:
            entities.append([begin_char, end_char, label])
        else:
            # Surface text and annotated name disagree: report the
            # mismatch for inspection and skip the entity.
            print(start, end, begin_char, end_char)
            print(text[begin_char:end_char])
            print(midasis[start:end])
            print(ent.name)
            print()
    return [text, {'entities': entities}]
def read_document(filepath: str, filename: str) -> kyoto_reader.Document:
    """Load one document from ``filepath`` using kyoto-reader.

    The document ID handed to the reader is the part of ``filename`` that
    precedes its first ``.``.

    Args:
        filepath: Path to a file or directory passed to ``KyotoReader``.
        filename: File name from which the document ID is derived.

    Returns:
        The result of ``KyotoReader.process_document`` for that ID.
    """
    doc_id = filename.partition('.')[0]
    reader_options = dict(
        target_cases=['ガ', 'ヲ', 'ニ'],  # only the ga, wo and ni cases are targeted
        target_corefs=['=', '=構', '=≒', '=構≒'],  # relations treated as coreference
        extract_nes=True,  # also extract named entities from the corpus
    )
    knp_reader = kyoto_reader.KyotoReader(filepath, **reader_options)
    return knp_reader.process_document(doc_id)
# Root directory that is walked for corpus files.
dataset_dir = './KWDLC/knp'

# Read every file found anywhere below dataset_dir.
dataset = [
    read_document(os.path.join(parent, name), name)
    for parent, _subdirs, names in os.walk(dataset_dir)
    for name in names
]

# Documents with at least one named entity become positive examples.
dataset_nes = [document for document in dataset if document.named_entities]
camphr_dataset_positive = [
    convert_camphr_dataset(document) for document in dataset_nes
]

# Documents without named entities become negative examples: the plain
# concatenated text paired with an empty entity list.
dataset_no_nes = [
    document for document in dataset if not document.named_entities
]
camphr_dataset_negative = [
    [''.join(m.midasi for m in document.mrph_list()), {'entities': []}]
    for document in dataset_no_nes
]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment