Created
November 19, 2013 17:12
-
-
Save kmike/7548835 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*-
"""
Utilities for measuring parse quality.
"""
import functools | |
from russian_tagsets.ruscorpora import from_opencorpora_int | |
# Cached front end to ``from_opencorpora_int``: results are memoized
# (up to 10000 distinct arguments are kept), which the original author
# reports as giving a 4x speedup.
op2rnc = functools.lru_cache(maxsize=10000)(from_opencorpora_int)
# Functions for matching OpenCorpora grammemes against RNC (НКРЯ) grammemes
def rnc_tags_match(tag1, tag2):
    """
    Return True if RNC ``tag1`` matches ``tag2``,
    assuming that tag2 was converted from OpenCorpora format.

    ``tag1`` is converted with ``str()`` before it is parsed; ``tag2`` is
    used as given and must already be a string.  Two tags match when their
    grammeme sets are equal after ``_simplify_grammemes``, or when what
    still differs is one of the known discrepancies handled below.
    Every rule only ever accepts a pair, so their order is irrelevant.
    """
    full1 = _get_grammemes(str(tag1))
    full2 = _get_grammemes(tag2)
    pos1 = _get_pos(tag1)

    # FIXME? Checked on the unsimplified sets: the digit notation does not
    # reveal whether the numeral is ANUM or NUM.
    if 'ciph' in full1 and full1 ^ full2 == {'ANUM', 'NUM'}:
        return True

    left = _simplify_grammemes(full1)
    right = _simplify_grammemes(full2)
    delta = left ^ right
    if not delta:
        return True

    # Symmetric differences that are accepted as they are.
    accepted_deltas = (
        # the same thing?
        {'persn', 'famn'},
        # anim/inan omitted - it happens
        {'inan'},
        {'anim'},
        # transitivity is sometimes omitted
        {'intr'},
        {'tran'},
        # RNC parenthetical words are often particles in OpenCorpora
        {'PARENTH', 'PART'},
        # predicatives are often particles in OpenCorpora (?)
        {'PRAEDIC', 'PART'},
        # apparently another dictionary difference: RNC predicatives
        # are often adverbs in OpenCorpora
        {'PRAEDIC', 'ADV'},
    )
    if any(delta == known for known in accepted_deltas):
        return True

    # pronouns/predicatives/adverbs are a mess: accept any difference that
    # consists of these grammemes only
    if delta.issubset({'A-PRO', 'S-PRO', 'PRAEDIC-PRO', 'ADV'}):
        return True

    # Comparatives that OpenCorpora tags as adjectives: RNC has them either
    # as PRAEDIC with the same degree, or as ADV
    # (e.g. дальше, скорее for comp; поменьше, побольше for comp2).
    for degree in ('comp', 'comp2'):
        if right == {'A', degree}:
            if left == {'PRAEDIC', degree} or pos1 == 'ADV':
                return True

    # 'ANUM' ?
    # RNC often omits the number for S-PRO and ANUM:
    #     себе 'S-PRO=dat'
    #     одна 'ANUM=f,nom'
    if delta == {'sg'} and pos1 in {'S-PRO', 'ANUM'}:
        return True

    # e.g. никогда, потому, много
    if tag2 == 'ADV' and pos1 in {'ADV-PRO', 'NUM'}:
        return True

    # e.g. их
    if pos1 == 'A-PRO' and right.issuperset({'A-PRO', '0'}):
        return True

    # e.g. млн: an RNC abbreviation against the OpenCorpora '0' grammeme
    return delta == {'0', 'abbr'} and 'abbr' in left and '0' in right
def _get_grammemes_tuple(tag): | |
return tag.replace('=', ',').split(',') | |
def _get_grammemes(tag):
    """
    Return the grammemes of the tag string ``tag`` as a frozenset,
    i.e. without their order and without duplicates.
    """
    parts = _get_grammemes_tuple(tag)
    return frozenset(parts)
def _get_pos(tag):
    """
    Return the part of speech of ``tag``: the first grammeme of its
    string form (``tag`` is converted with ``str()`` first).
    """
    grammemes = _get_grammemes_tuple(str(tag))
    return grammemes[0]
def _simplify_grammemes(gr): | |
gr -= set(['med', 'act']) # med/act сложно с OpenCorpora сравнивать | |
gr -= set(['distort']) # где-то distort, где-то не distort считается | |
gr -= set(['ciph']) # этого в OpenCorpora нет: токены вроде "7-литровый" бьются на 2 | |
gr -= set(['supr']) # почему-то в разметке НКРЯ нет | |
# у причастий plen иногда не указывается | |
# лежавшей 'V,ipf,intr,act=partcp,f,sg,gen,praet' | |
if 'V' in gr: | |
gr -= set(['plen']) | |
return gr | |
def rnc_pos_match(tag1, tag2):
    """
    Return True if the parts of speech of ``tag1`` and ``tag2`` match
    (assuming that tag2 was converted from OpenCorpora format).

    Tags accepted by ``rnc_tags_match`` always count as a match;
    otherwise the first grammemes of the two tags are compared.
    """
    return rnc_tags_match(tag1, tag2) or _get_pos(tag1) == _get_pos(tag2)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment