Last active
October 8, 2020 08:17
-
-
Save kzinmr/5115091949fdf5ae93e8becaf750eab5 to your computer and use it in GitHub Desktop.
split_text_and_spans for NER
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def get_block_index_offset(index, step):
    """Convert a flat index into a ``(block_index, offset)`` pair.

    The sequence is thought of as being cut into consecutive blocks of
    ``step`` items; ``index`` then lives at position ``offset`` inside block
    ``block_index``, i.e. ``index == block_index * step + offset``.

    Args:
        index: Position in the unsplit sequence (expected to be an integer).
        step: Block size; must be positive.

    Returns:
        A ``(block_index, offset)`` tuple of plain ``int`` values.

    Raises:
        AssertionError: If ``step`` is not positive.
    """
    assert step > 0, "step must be positive"
    # divmod keeps the arithmetic in exact integers; going through float
    # division would lose precision for indices larger than 2**53.
    block_index, offset = divmod(index, step)
    return int(block_index), int(offset)
def split_text_and_spans(text, spans, tagtypes, step):
    """Split ``text`` into blocks of ``step`` characters and re-map entity spans.

    The document is cut into chunks of the maximum input length so that it
    can be fed to a model, and the span annotations are rewritten so that
    their offsets are relative to the block they end up in.

    An annotation may straddle the boundary between two adjacent blocks. As
    a quick workaround, the text window of the first block is shifted to the
    right by twice the overflow size (clamped so that the annotation's start
    never falls before the window), which places the whole annotation inside
    that block. Other spans starting in the same block are shifted by the
    same amount.

    Args:
        text: The full document text.
        spans: Iterable of ``(start, end)`` character offsets into ``text``
            with an exclusive ``end``. Spans are assumed not to overlap.
        tagtypes: One tag per span, in the same order as ``spans``.
        step: Block length in characters; must be positive.

    Returns:
        A list with one dict per block:
        ``{'text': str, 'entities': [(start, end, tagtype), ...],
        'has_positive': bool}`` where the entity offsets are relative to the
        block's ``text``.

    Raises:
        AssertionError: If ``step`` is not positive, if ``spans`` and
            ``tagtypes`` differ in length, or if a span cannot be placed
            inside a single block (it is longer than ``step`` or crosses
            more than one block boundary).
    """
    assert step > 0, "step must be positive"
    assert len(spans) == len(tagtypes), "spans and tagtypes must have the same length"

    block_texts = [text[i:i + step] for i in range(0, len(text), step)]

    # Locate every span as ((start_block, start_offset), (end_block, end_offset)).
    located = []
    for start, end in spans:
        s_block, s_offset = (int(v) for v in divmod(start, step))
        e_block, e_offset = (int(v) for v in divmod(end, step))
        if e_offset == 0 and e_block > s_block:
            # An exclusive end lying exactly on a block boundary belongs to
            # the end of the previous block, not to the start of the next.
            e_block, e_offset = e_block - 1, step
        located.append(((s_block, s_offset), (e_block, e_offset)))

    for idx, ((s_block, s_offset), (e_block, e_offset)) in enumerate(located):
        if e_block - s_block != 1:
            # Same-block spans need no adjustment; spans crossing more than
            # one boundary are rejected by the assertion further down.
            continue

        overflow = e_offset  # number of characters spilling into the next block
        assert overflow <= s_offset, (
            "span is longer than step and cannot fit into a single block"
        )
        # Shift the block window right by twice the overflow, but never past
        # the start of the span itself (that would yield a negative offset).
        shift = min(2 * overflow, s_offset)
        window_start = s_block * step + shift
        block_texts[s_block] = text[window_start:window_start + step]

        # Every span that starts in the shifted block moves with the window.
        # NOTE: a span lying in the first ``shift`` characters of the block
        # falls outside the new window and comes out with negative offsets.
        for j, ((sb, so), (eb, eo)) in enumerate(located):
            if sb == s_block:
                located[j] = ((sb, so - shift), (eb, eo - shift))
        located[idx] = (
            (s_block, s_offset - shift),
            (s_block, step + overflow - shift),
        )

    entities_per_block = [[] for _ in block_texts]
    for ((s_block, s_offset), (e_block, e_offset)), tagtype in zip(located, tagtypes):
        assert s_block == e_block, "span crosses more than one block boundary"
        entities_per_block[s_block].append((s_offset, e_offset, tagtype))

    return [
        {'text': block_text, 'entities': entities, 'has_positive': bool(entities)}
        for block_text, entities in zip(block_texts, entities_per_block)
    ]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment