alexeyev · October 16, 2022 08:43
diff --git a/sequence_to_sentences.py b/sequence_to_sentences.py
 # coding: utf-8
 #
 # Converts a BIO-tagged token file into bracketed plain-text sentences.
 #
 # Input  ("test.txt"):      the first line is skipped; every other line is split
 #                           on single spaces, the first field being the token and
 #                           the last field its label ("O", "B-<type>", "I-<type>").
 #                           A line with a single field (e.g. a blank line) starts
 #                           a new sentence.
 # Output ("sentences.txt"): one sentence per line, tokens joined by spaces, each
 #                           labeled span rendered as "[ tok ... ]<type>".

 sentences = []

 # `with` closes the input file deterministically; iterating the handle avoids
 # loading every line into memory just to drop the first one.
 with open("test.txt", "r", encoding="utf-8") as rf:

     next(rf, None)  # skip the first line; harmless on an empty file

     for line in rf:

         seq = line.strip().split(" ")

         if len(seq) == 1:
             # Single-field row (blank lines included): sentence boundary.
             sentences.append([])
             continue

         if not sentences:
             # A token row arrived before any boundary row: open the first
             # sentence implicitly instead of failing on sentences[-1].
             sentences.append([])

         sentences[-1].append((seq[0], seq[-1]))

 # For every sentence: the token indices of each labeled span, and the span types.
 spans_all, types_all = [], []

 for sentence in sentences:

     spans, types = [], []
     current_type = None  # type of the span still open at the previous token

     for idx, (_token, label) in enumerate(sentence):

         # Split on the first hyphen only, so a type that itself contains "-"
         # is kept whole; `sep` is empty when the label has no hyphen at all.
         _prefix, sep, entity_type = label.partition("-")

         if label == "O":
             current_type = None
         elif label.startswith("B") and sep:
             current_type = entity_type
             spans.append([idx])
             types.append(current_type)
         elif label.startswith("I") and sep and entity_type == current_type:
             # current_type is only set right after a B-/I- token, so the open
             # span always ends at idx - 1 and no adjacency check is needed.
             spans[-1].append(idx)
         else:
             print("Incorrect labeling", idx, sentence)
             # quit() is injected by the `site` module and exits with status 0;
             # SystemExit is always available and 1 signals the failure.
             raise SystemExit(1)

     spans_all.append(spans)
     types_all.append(types)

 with open("sentences.txt", "w", encoding="utf-8") as wf:

     for sentence, spans, types in zip(sentences, spans_all, types_all):

         tokens = [token for token, _label in sentence]
         new_sentence, last_idx = [], 0

         for span, tipe in zip(spans, types):
             start, end = span[0], span[-1] + 1
             # Unlabeled tokens since the previous span, then the bracketed span.
             new_sentence.extend(tokens[last_idx:start])
             new_sentence.append("[")
             new_sentence.extend(tokens[start:end])
             new_sentence.append("]" + tipe)
             last_idx = end

         # Unlabeled tokens after the last span.
         new_sentence.extend(tokens[last_idx:])

         wf.write(" ".join(new_sentence) + "\n")
	# coding: utf-8
	#
	# Converts a BIO-tagged token file into bracketed plain-text sentences.
	#
	# Input  ("test.txt"):      the first line is skipped; every other line is split
	#                           on single spaces, the first field being the token and
	#                           the last field its label ("O", "B-<type>", "I-<type>").
	#                           A line with a single field (e.g. a blank line) starts
	#                           a new sentence.
	# Output ("sentences.txt"): one sentence per line, tokens joined by spaces, each
	#                           labeled span rendered as "[ tok ... ]<type>".

	sentences = []

	# `with` closes the input file deterministically; iterating the handle avoids
	# loading every line into memory just to drop the first one.
	with open("test.txt", "r", encoding="utf-8") as rf:

	    next(rf, None)  # skip the first line; harmless on an empty file

	    for line in rf:

	        seq = line.strip().split(" ")

	        if len(seq) == 1:
	            # Single-field row (blank lines included): sentence boundary.
	            sentences.append([])
	            continue

	        if not sentences:
	            # A token row arrived before any boundary row: open the first
	            # sentence implicitly instead of failing on sentences[-1].
	            sentences.append([])

	        sentences[-1].append((seq[0], seq[-1]))

	# For every sentence: the token indices of each labeled span, and the span types.
	spans_all, types_all = [], []

	for sentence in sentences:

	    spans, types = [], []
	    current_type = None  # type of the span still open at the previous token

	    for idx, (_token, label) in enumerate(sentence):

	        # Split on the first hyphen only, so a type that itself contains "-"
	        # is kept whole; `sep` is empty when the label has no hyphen at all.
	        _prefix, sep, entity_type = label.partition("-")

	        if label == "O":
	            current_type = None
	        elif label.startswith("B") and sep:
	            current_type = entity_type
	            spans.append([idx])
	            types.append(current_type)
	        elif label.startswith("I") and sep and entity_type == current_type:
	            # current_type is only set right after a B-/I- token, so the open
	            # span always ends at idx - 1 and no adjacency check is needed.
	            spans[-1].append(idx)
	        else:
	            print("Incorrect labeling", idx, sentence)
	            # quit() is injected by the `site` module and exits with status 0;
	            # SystemExit is always available and 1 signals the failure.
	            raise SystemExit(1)

	    spans_all.append(spans)
	    types_all.append(types)

	with open("sentences.txt", "w", encoding="utf-8") as wf:

	    for sentence, spans, types in zip(sentences, spans_all, types_all):

	        tokens = [token for token, _label in sentence]
	        new_sentence, last_idx = [], 0

	        for span, tipe in zip(spans, types):
	            start, end = span[0], span[-1] + 1
	            # Unlabeled tokens since the previous span, then the bracketed span.
	            new_sentence.extend(tokens[last_idx:start])
	            new_sentence.append("[")
	            new_sentence.extend(tokens[start:end])
	            new_sentence.append("]" + tipe)
	            last_idx = end

	        # Unlabeled tokens after the last span.
	        new_sentence.extend(tokens[last_idx:])

	        wf.write(" ".join(new_sentence) + "\n")