Skip to content

Instantly share code, notes, and snippets.

@alexeyev
Created October 16, 2022 08:43
Show Gist options
  • Save alexeyev/3bde556288e2c80ce2712965e020d669 to your computer and use it in GitHub Desktop.
Save alexeyev/3bde556288e2c80ce2712965e020d669 to your computer and use it in GitHub Desktop.
# coding: utf-8
"""Rewrite a column-formatted tagging file as bracketed plain-text sentences.

Input (default ``test.txt``): the first line is skipped; every following line
is split on single spaces.  A line that yields exactly one field (e.g. a blank
line) starts a new sentence; any other line contributes a
``(first field, last field)`` pair, i.e. ``(token, label)``.

Labels are expected to be ``"O"`` or to start with ``B``/``I`` followed by
``-TYPE`` (presumably an IOB-style entity scheme -- confirm against the data).

Output (default ``sentences.txt``): one line per sentence, with every labelled
span wrapped as ``[ tok tok ]TYPE``.
"""
import sys


def group_sentences(lines):
    """Group an iterable of raw text lines into sentences.

    Args:
        lines: iterable of strings, one per input line (newline optional).

    Returns:
        A list of sentences; each sentence is a list of ``(token, label)``
        tuples taken from the first and last space-separated field of a line.
        A line with a single field opens a new (possibly empty) sentence.
    """
    sentences = []
    for line in lines:
        fields = line.strip().split(" ")
        if len(fields) == 1:
            # Separator line: start a new sentence.
            sentences.append([])
            continue
        if not sentences:
            # Token line before any separator: open the first sentence
            # implicitly instead of failing on sentences[-1].
            sentences.append([])
        sentences[-1].append((fields[0], fields[-1]))
    return sentences


def read_sentences(path):
    """Read *path* (UTF-8), skip its first line, and group the rest.

    Returns:
        The list of sentences produced by :func:`group_sentences`.
    """
    with open(path, "r", encoding="utf-8") as handle:
        next(handle, None)  # skip the first line; tolerate an empty file
        return group_sentences(handle)


def extract_spans(sentence):
    """Collect labelled spans from one sentence.

    Args:
        sentence: list of ``(token, label)`` tuples.

    Returns:
        ``(spans, types)`` where ``spans`` is a list of lists of consecutive
        token indices and ``types`` holds the matching type string for each
        span (the second ``-``-separated piece of the opening ``B`` label).

    Raises:
        ValueError: if a label is neither ``"O"``, a ``B-TYPE`` label, nor an
            ``I-TYPE`` label that directly continues an open span of the same
            type.  The message is ``"Incorrect labeling <idx> <sentence>"``.
    """
    spans, types = [], []
    current_type = None
    for idx, (_token, label) in enumerate(sentence):
        if label == "O":
            current_type = None
            continue
        pieces = label.split("-")
        # A label without a hyphen carries no type and is treated as malformed.
        entity = pieces[1] if len(pieces) > 1 else None
        if entity is not None and label.startswith("B"):
            current_type = entity
            spans.append([idx])
            types.append(current_type)
        elif (
            entity is not None
            and label.startswith("I")
            and entity == current_type
            and spans[-1][-1] + 1 == idx
        ):
            spans[-1].append(idx)
        else:
            raise ValueError(f"Incorrect labeling {idx} {sentence}")
    return spans, types


def render_sentence(sentence, spans, types):
    """Render one sentence as text with spans wrapped as ``[ ... ]TYPE``.

    Args:
        sentence: list of ``(token, label)`` tuples.
        spans: list of index lists, in left-to-right order.
        types: type string for each span, parallel to ``spans``.

    Returns:
        The tokens joined by single spaces, with ``"["`` inserted before each
        span and ``"]" + type`` after it.
    """
    words = [pair[0] for pair in sentence]
    pieces, cursor = [], 0
    for span, entity_type in zip(spans, types):
        start, end = span[0], span[-1] + 1
        pieces.extend(words[cursor:start])
        pieces.append("[")
        pieces.extend(words[start:end])
        pieces.append("]" + entity_type)
        cursor = end
    pieces.extend(words[cursor:])
    return " ".join(pieces)


def main(input_path="test.txt", output_path="sentences.txt"):
    """Convert *input_path* into bracketed sentences written to *output_path*.

    All sentences are validated before the output file is opened, so a
    labeling error leaves no partial output behind.  On such an error the
    message is printed and the process exits with status 1.
    """
    sentences = read_sentences(input_path)
    try:
        annotations = [extract_spans(sentence) for sentence in sentences]
    except ValueError as err:
        print(err)
        sys.exit(1)

    with open(output_path, "w", encoding="utf-8") as wf:
        for sentence, (spans, types) in zip(sentences, annotations):
            wf.write(render_sentence(sentence, spans, types))
            wf.write("\n")


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment