Created
March 11, 2019 23:35
-
-
Save erickrf/b3ede1e817c58d0df71dc81ad6f5d8dc to your computer and use it in GitHub Desktop.
Script to find combinations of a word form and its tags (UPOS, XPOS and/or morphological features) that are associated with more than one lemma in the UD (Universal Dependencies) treebanks
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Find word/tag combinations that map to more than one lemma.

Reads a treebank in CoNLL-U format and reports every combination of a
(lower-cased) word form and the selected tags that occurs with two or more
distinct (lower-cased) lemmas.
"""

import argparse
from collections import defaultdict

# Zero-based column positions of the fields used from a CoNLL-U token line.
ID, FORM, LEMMA, UPOS, XPOS, FEATS = 0, 1, 2, 3, 4, 5


def collect_lemmas(lines, upos=False, xpos=False, morph=False):
    """Group the lemmas seen for each form/tag combination.

    Args:
        lines: iterable of CoNLL-U lines (e.g. an open text file).
        upos: include the UPOS column in the combination key.
        xpos: include the XPOS column in the combination key.
        morph: include the morphological features column in the key.

    Returns:
        A ``defaultdict`` mapping tuples ``(form[, upos][, xpos][, morph])``
        to the set of lower-cased lemmas observed for that combination.

    Raises:
        ValueError: if a token line has fewer columns than required by the
            selected options.
    """
    tag_columns = []
    if upos:
        tag_columns.append(UPOS)
    if xpos:
        tag_columns.append(XPOS)
    if morph:
        tag_columns.append(FEATS)
    # Highest column index we will read, plus one.
    needed = max([LEMMA] + tag_columns) + 1

    lemma_dict = defaultdict(set)
    for number, line in enumerate(lines, start=1):
        line = line.strip()
        if not line or line.startswith('#'):
            # Sentence separators and comment/metadata lines carry no tokens.
            continue

        # CoNLL-U columns are tab-separated and FORM/LEMMA may contain
        # spaces, so a plain whitespace split would shift the columns.
        # Fall back to whitespace only for input that has no tabs at all.
        fields = line.split('\t') if '\t' in line else line.split()
        if len(fields) < needed:
            raise ValueError(
                'line %d: expected at least %d columns, found %d'
                % (number, needed, len(fields)))

        if '-' in fields[ID]:
            # Multiword token range (e.g. "1-2"): it has no lemma or tags of
            # its own; the syntactic words follow on separate lines.
            continue

        form = fields[FORM].lower()
        lemma = fields[LEMMA].lower()
        if lemma == '_' and form != '_':
            # "_" marks an unspecified lemma; counting it would report
            # spurious ambiguity. A literal underscore token is kept.
            continue

        comb = (form,) + tuple(fields[column] for column in tag_columns)
        lemma_dict[comb].add(lemma)

    return lemma_dict


def main(argv=None):
    """Parse command-line options, scan the treebank and print the report.

    Args:
        argv: argument list to parse; defaults to ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('input',
                        help='Input file in CoNLLU format')
    parser.add_argument('-u', action='store_true', dest='upos',
                        help='Use UPOS to disambiguate')
    parser.add_argument('-x', action='store_true', dest='xpos',
                        help='Use XPOS to disambiguate')
    parser.add_argument('-m', action='store_true', dest='morph',
                        help='Use morph tags to disambiguate')
    args = parser.parse_args(argv)

    # UD treebanks are distributed as UTF-8; do not rely on the locale.
    with open(args.input, 'r', encoding='utf-8') as f:
        lemma_dict = collect_lemmas(f, upos=args.upos, xpos=args.xpos,
                                    morph=args.morph)

    for comb, lemmas in lemma_dict.items():
        if len(lemmas) > 1:
            print('Combination', comb, 'has the following possible lemmas:', lemmas)


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment