polm · April 30, 2020 09:18
diff --git a/spacy-convert.py b/spacy-convert.py
 #!/usr/bin/env python3

 """
 Convert GSD conll format to a format the spaCy convert script can use as-is.

 There are two main changes:

 1. POS tags format is changed slightly.

    old: 名詞-普通名詞-一般
    hyphen separated, variable fields

    new: 名詞,普通名詞,一般,*
    comma separated, always four fields, padded with asterisks

 2. The tenth field has a list of | separated attributes. BILOU NER tags are
 included, but O tags are omitted. spaCy looks for just NER data in the 10th
 field, so this script discards the other attributes and adds O tags.
 """

 import fileinput

 def pos_convert(oldpos):
     """Rewrite a hyphen-separated POS tag as four comma-separated fields.

     The tag is split on '-'. Fewer than four parts are padded on the right
     with '*'; anything past the fourth part is dropped.
     """
     pieces = oldpos.split('-')
     # split() always yields at least one part, so three pads are enough
     # to guarantee four entries before slicing.
     pieces += ['*'] * 3
     return ','.join(pieces[:4])

 def ner_convert(nerfield):
     """Pull the NER tag out of a '|'-separated attribute field.

     Returns the text following the 'NE=' prefix of the one attribute that
     carries it, or 'O' when no attribute starts with 'NE='.

     Raises RuntimeError when more than one attribute starts with 'NE='.
     """
     tags = [attr[3:] for attr in nerfield.split('|') if attr.startswith('NE=')]
     if len(tags) > 1:
         raise RuntimeError("Multiple NER annotations?")
     return tags[0] if tags else 'O'

 for raw in fileinput.input():
     row = raw.strip()

     # Blank lines and lines starting with '#' are echoed back (stripped)
     # without any conversion.
     if row == '' or row.startswith('#'):
         print(row)
     else:
         cols = row.split('\t')
         cols[4] = pos_convert(cols[4])  # fifth column: POS tag
         cols[9] = ner_convert(cols[9])  # tenth column: attribute list -> NER tag
         print('\t'.join(cols))
	#!/usr/bin/env python3

	"""
	Convert GSD conll format to a format the spaCy convert script can use as-is.

	There are two main changes:

	1. POS tags format is changed slightly.

	old: 名詞-普通名詞-一般
	hyphen separated, variable fields

	new: 名詞,普通名詞,一般,*
	comma separated, always four fields, padded with asterisks

	2. The tenth field has a list of | separated attributes. BILOU NER tags are
	included, but O tags are omitted. spaCy looks for just NER data in the 10th
	field, so this script discards the other attributes and adds O tags.
	"""

	import fileinput

	def pos_convert(oldpos):
	    """Convert a hyphen-separated POS tag to four comma-separated fields.

	    The tag is split on '-', padded on the right with '*' up to four
	    fields, and truncated to the first four fields if it has more.
	    """
	    parts = oldpos.split('-')
	    # Appending three '*' guarantees at least four entries before slicing.
	    newfields = (parts + list('***'))[:4]
	    return ','.join(newfields)

	def ner_convert(nerfield):
	    """Extract the NER tag from a '|'-separated attribute field.

	    Returns the text after the 'NE=' prefix of the single matching
	    attribute, or 'O' when no attribute starts with 'NE='.

	    Raises:
	        RuntimeError: if more than one attribute starts with 'NE='.
	    """
	    # Split on a bare pipe. The earlier backslash-escaped pipe literal was a
	    # two-character string that never matched the real separator, so every
	    # multi-attribute field was wrongly reported as 'O'.
	    exfields = nerfield.split('|')
	    ner = [xx for xx in exfields if xx.startswith('NE=')]
	    if len(ner) == 1:
	        return ner[0][3:]  # discard the 'NE=' prefix
	    elif not ner:
	        return 'O'
	    else:
	        raise RuntimeError("Multiple NER annotations?")

	for line in fileinput.input():
	    line = line.strip()

	    # Blank lines and lines starting with '#' are printed unchanged
	    # (after stripping surrounding whitespace).
	    if (not line) or line[0] == '#':
	        print(line)
	        continue

	    fields = line.split('\t')
	    fields[4] = pos_convert(fields[4])  # fifth column: POS tag
	    fields[9] = ner_convert(fields[9])  # tenth column: attribute list -> NER tag

	    print(*fields, sep='\t')