Skip to content

Instantly share code, notes, and snippets.

@kzinmr
Last active October 8, 2020 08:17
Show Gist options
  • Save kzinmr/5115091949fdf5ae93e8becaf750eab5 to your computer and use it in GitHub Desktop.
Save kzinmr/5115091949fdf5ae93e8becaf750eab5 to your computer and use it in GitHub Desktop.
split_text_and_spans for NER
def get_block_index_offset(index, step):
    """Convert an absolute position into ``(block_index, offset)``.

    The text is thought of as consecutive blocks of ``step`` characters;
    ``index == block_index * step + offset`` with ``0 <= offset < step``.

    Args:
        index: Absolute position in the text.
        step: Block size; must be positive.

    Returns:
        A ``(block_index, offset)`` tuple of ints.

    Raises:
        AssertionError: If ``step`` is not positive.
    """
    assert step > 0, "step must be positive"
    # divmod keeps the arithmetic in exact integers; going through float
    # true division (``/``) silently loses precision for very large indices.
    block_index, offset = divmod(index, step)
    return int(block_index), int(offset)
def split_text_and_spans(text, spans, tagtypes, step):
    """Cut ``text`` into ``step``-sized blocks and remap ``spans`` onto them.

    The document is cut into units of the maximum input length so that it can
    be fed to a deep-learning model, and the span information is adjusted to
    match the split.

    An annotation may straddle two adjacent blocks. As a quick-and-dirty
    workaround, the text of the block in which the annotation starts is
    shifted forward locally (by twice the overhang, capped so that the
    annotation start stays inside the block) so that the whole annotation
    falls inside that block. Consequences of this workaround:

    * the characters skipped at the front of the shifted block no longer
      appear in any returned block;
    * every other span starting in the same block is shifted by the same
      amount, so a span that started before the new block start ends up with
      negative offsets.

    Args:
        text: The full document text.
        spans: Sequence of ``(start, end)`` absolute positions in ``text``.
        tagtypes: One tag per span, in the same order as ``spans``.
        step: Block size (maximum number of characters per block).

    Returns:
        A list with one dict per block, each of the form
        ``{'text': block_text, 'entities': [(start, end, tagtype), ...],
        'has_positive': bool}`` where the entity offsets are relative to
        ``block_text`` and ``has_positive`` tells whether the block holds at
        least one entity.

    Raises:
        AssertionError: If ``spans`` and ``tagtypes`` differ in length, or if
            a span still starts and ends in different blocks after
            adjustment (e.g. it covers more than two blocks).
    """
    assert len(spans) == len(tagtypes)
    splitted_texts = [text[i:i + step] for i in range(0, len(text), step)]
    tag_block_spans = [
        [get_block_index_offset(s, step), get_block_index_offset(e, step)]
        for s, e in spans
    ]

    for i, ((s_block, s_offset), (e_block, e_offset)) in enumerate(tag_block_spans):
        if s_block == e_block or e_offset == 0:
            if e_offset == 0:
                # The span ends exactly on a block boundary: express the end
                # as "end of the start block" rather than "offset 0 of the
                # following block".
                # NOTE: this also applies when the span covers several whole
                # blocks, in which case it is truncated to the end of its
                # first block (behaviour kept from the original).
                tag_block_spans[i] = (s_block, s_offset), (s_block, step)
        elif e_block - s_block == 1:
            # Shift the block text locally by "overhang * 2", but never past
            # the start of the span: an uncapped shift would push the span
            # start to a negative offset and out of the block.
            shift = min(e_offset * 2, s_offset)
            block_start = s_block * step + shift
            splitted_texts[s_block] = text[block_start:block_start + step]
            # Every span that starts in the shifted block must move with it.
            for j, ((blk, start), (end_blk, end)) in enumerate(tag_block_spans):
                if blk == s_block:
                    tag_block_spans[j] = (blk, start - shift), (end_blk, end - shift)
            # The current span now lies entirely inside the shifted block;
            # its end, measured from the new block start, is
            # (step + e_offset) - shift.
            tag_block_spans[i] = (
                (s_block, s_offset - shift),
                (s_block, step + e_offset - shift),
            )
        # Spans covering more than two blocks are left untouched here and are
        # rejected by the assertion below.

    entities_list = [[] for _ in splitted_texts]
    for ((s_block, s_offset), (e_block, e_offset)), tagtype in zip(tag_block_spans, tagtypes):
        assert s_block == e_block, "span does not fit into a single block"
        entities_list[s_block].append((s_offset, e_offset, tagtype))

    return [
        {'text': block_text, 'entities': entities, 'has_positive': bool(entities)}
        for block_text, entities in zip(splitted_texts, entities_list)
    ]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment