Skip to content

Instantly share code, notes, and snippets.

@kmike
Created March 25, 2015 01:42
Show Gist options
  • Save kmike/52fb0a9b3ed627310bea to your computer and use it in GitHub Desktop.
Save kmike/52fb0a9b3ed627310bea to your computer and use it in GitHub Desktop.
pymorphy2 - mystem evaluation
Display the source blob
Display the rendered blob
Raw
{"cells": [{"source": "# \u041e\u0446\u0435\u043d\u043a\u0430 \u043a\u0430\u0447\u0435\u0441\u0442\u0432\u0430 \u0440\u0430\u0437\u0431\u043e\u0440\u0430", "metadata": {}, "cell_type": "markdown"}, {"execution_count": 1, "source": "import os\nimport re\nimport glob\nimport json\n\nfrom pymorphy2 import MorphAnalyzer\nfrom pymystem3 import Mystem\n\nimport russian_tagsets", "metadata": {"trusted": true, "collapsed": false}, "cell_type": "code", "outputs": []}, {"source": "## \u0412\u0435\u0440\u0441\u0438\u0438 \u043f\u0440\u043e\u0433\u0440\u0430\u043c\u043c:\n\n* pymorphy2 - https://github.com/kmike/pymorphy2, \u043a\u043e\u043c\u043c\u0438\u0442 7c3bd6d27d23e1b4fa5cfb0f3d4bb604b6f3e68b\n* pymorphy2-dicts-ru == 2.4.393442.3710985\n* mystem v3.0\n* pymystem == 0.1.2\n* russian_tagsets == 0.5.2", "metadata": {}, "cell_type": "markdown"}, {"execution_count": 2, "source": "# pymorphy2\nmorph = MorphAnalyzer(lang='ru')\nmorph_noprob = MorphAnalyzer(lang='ru', probability_estimator_cls=None)\n\n# mystem\nmystem = Mystem(disambiguation=False, grammar_info=True)\nmystem._mystemargs.remove('-gi')\nmystem._mystemargs.remove('-c')\nmystem._mystemargs += ['-i', '--eng-gr']", "metadata": {"trusted": true, "collapsed": false}, "cell_type": "code", "outputs": []}, {"source": "### \u041f\u0440\u0435\u043e\u0431\u0440\u0430\u0437\u043e\u0432\u0430\u043d\u0438\u0435 \u0442\u0435\u0433\u043e\u0432 \u043a \u043e\u0431\u0449\u0435\u043c\u0443 \u0444\u043e\u0440\u043c\u0430\u0442\u0443\n\n\u0423\u0447\u0430\u0441\u0442\u0432\u0443\u0435\u0442 3 \u0442\u0435\u0433\u0441\u0435\u0442\u0430: OpenCorpora (pymorphy2 \u0438 \u0447\u0430\u0441\u0442\u044c \u0440\u0430\u0437\u043c\u0435\u0447\u0435\u043d\u043d\u044b\u0445 \u0434\u0430\u043d\u043d\u044b\u0445), \u041d\u041a\u0420\u042f (\u0447\u0430\u0441\u0442\u044c \u0440\u0430\u0437\u043c\u0435\u0447\u0435\u043d\u043d\u044b\u0445 \u0434\u0430\u043d\u043d\u044b\u0445) \u0438 mystem.\n\n\u0411\u0438\u0431\u043b\u0438\u043e\u0442\u0435\u043a\u0430 russian-tagsets \u0443\u043c\u0435\u0435\u0442 \u043f\u0440\u0435\u043e\u0431\u0440\u0430\u0437\u043e\u0432\u044b\u0432\u0430\u0442\u044c \u0438\u0437 \u0444\u043e\u0440\u043c\u0430\u0442\u0430 OpenCorpora \u0432 \u0444\u043e\u0440\u043c\u0430\u0442 \u041d\u041a\u0420\u042f.\n\u0418\u0437 \u041d\u041a\u0420\u042f \u0432\u0441\u0435 \u043f\u0440\u0438\u0432\u043e\u0434\u0438\u043c \u043a \u0444\u043e\u0440\u043c\u0430\u0442\u0443 mystem.", "metadata": {}, "cell_type": "markdown"}, {"execution_count": 3, "source": "_pym2ruscorpora = russian_tagsets.converters.converter('opencorpora-int', 'ruscorpora')\n\ndef ruscorpora2mystem(tag):\n \"\"\" Convert ruscorpora.ru tag to mystem tag \"\"\"\n tag = tag.replace('-', '').replace('zoon', 'persn')\n tag = tag.replace('loc2', 'LOC').replace('loc', 'abl').replace('LOC', 'loc')\n tag = tag.replace('fut', 'inpraes') # ~sort of\n tag = tag.replace('gen2', 'part')\n tag = tag.replace('PARENTH', 'parenth').replace('PRAEDIC', 'praed')\n return tag\n \n\ndef py2mystem(tag):\n \"\"\" Convert pymorphy2 tag to mystem tag. \"\"\"\n tag = _pym2ruscorpora(str(tag))\n return ruscorpora2mystem(tag)\n\n\n_tag2grammemes = re.compile('[,=]').split\ntag2grammemes = lambda tag: _tag2grammemes(tag)\n\n\ndef mystem_analyze(token):\n \"\"\" \n Analyze a single token using mystem. \n Return None if mystem analyzes the token as multiple tokens.\n \"\"\"\n res = mystem.analyze(token) \n if res[0]['text'] != token:\n return None\n result = res[0]['analysis']\n for p in result:\n if 'gr' in p:\n p['gr'] = p['gr'].rstrip('=')\n return result \n\n\ndef pymorphy2_analyze(token, prob=True):\n m = morph if prob else morph_noprob\n return [\n {'gr': py2mystem(p.tag), 'lex': p.normal_form} \n for p in m.parse(token)\n ]", "metadata": {"trusted": true, "collapsed": false}, "cell_type": "code", "outputs": []}, {"execution_count": 4, "source": "mystem_analyze('\u0434\u0440\u0443\u0437\u044c\u044f\u0445')", "metadata": {"trusted": true, "collapsed": false}, "cell_type": "code", "outputs": [{"output_type": "execute_result", "execution_count": 4, "data": {"text/plain": "[{'gr': 'S,m,anim=abl,pl', 'lex': '\u0434\u0440\u0443\u0433'}]"}, "metadata": {}}]}, {"execution_count": 5, "source": "pymorphy2_analyze('\u0434\u0440\u0443\u0437\u044c\u044f\u0445')", "metadata": {"trusted": true, "collapsed": false}, "cell_type": "code", "outputs": [{"output_type": "execute_result", "execution_count": 5, "data": {"text/plain": "[{'gr': 'S,anim,m=pl,abl', 'lex': '\u0434\u0440\u0443\u0433'}]"}, "metadata": {}}]}, {"source": "### \u0422\u0435\u0441\u0442\u043e\u0432\u044b\u0439 \u043a\u043e\u0440\u043f\u0443\u0441\n\n\u0414\u043b\u044f \u043e\u0446\u0435\u043d\u043a\u0438 \u043a\u0430\u0447\u0435\u0441\u0442\u0432\u0430 \u0441\u043e\u0431\u0440\u0430\u043d \u043a\u043e\u0440\u043f\u0443\u0441 \u0438\u0437 2 \u0447\u0430\u0441\u0442\u0435\u0439: \n\n* https://github.com/kmike/microcorpus - 100 \u0441\u043b\u0443\u0447\u0430\u0439\u043d\u043e \u0432\u044b\u0431\u0440\u0430\u043d\u043d\u044b\u0445 \u043f\u0440\u0435\u0434\u043b\u043e\u0436\u0435\u043d\u0438\u0439 \u0438\u0437 OpenCorpora, \u0440\u0430\u0437\u043c\u0435\u0447\u0435\u043d\u043d\u044b\u0445 \u0432\u0440\u0443\u0447\u043d\u0443\u044e (\u044d\u0442\u043e \u043d\u0435 \u0442\u043e \u0436\u0435 \u0441\u0430\u043c\u043e\u0435, \u0447\u0442\u043e 100 \u0441\u043b\u0443\u0447\u0430\u0439\u043d\u044b\u0445 \u043f\u0440\u0435\u0434\u043b\u043e\u0436\u0435\u043d\u0438\u0435 \u0438\u0437 \u0440\u0430\u0437\u043c\u0435\u0447\u0435\u043d\u043d\u043e\u0439 \u0447\u0430\u0441\u0442\u0438 OpenCorpora!);\n* 100 \u0441\u043b\u0443\u0447\u0430\u0439\u043d\u044b\u0445 \u043f\u0440\u0435\u0434\u043b\u043e\u0436\u0435\u043d\u0438\u0439 \u0441\u043e \u0441\u043d\u044f\u0442\u043e\u0439 \u043d\u0435\u043e\u0434\u043d\u043e\u0437\u043d\u0430\u0447\u043d\u043e\u0441\u0442\u044c\u044e \u0438\u0437 \u043a\u0430\u043a\u043e\u0439-\u0442\u043e \u0441\u0442\u0430\u0440\u043e\u0439 \u0432\u044b\u0433\u0440\u0443\u0437\u043a\u0438 \u041d\u041a\u0420\u042f.", "metadata": {}, "cell_type": "markdown"}, {"execution_count": 6, "source": "def read_microcorpus_file(path): \n with open(path, 'rt', encoding='utf8') as f:\n sent = [line.split(' ', 1) for line in f if line.strip()]\n return [(tok.strip(), tag.strip()) for tok, tag in sent]\n\n\ndef read_ruscorpora_json(path):\n with open(path, 'rt', encoding='utf8') as f:\n return json.load(f)\n \n\nsents_microcorpus_src = [\n read_microcorpus_file(path)\n for path in glob.glob('./microcorpus-done/*.txt')\n]\nsents_ruscorpora_src = read_ruscorpora_json('./ruscorpora-100-fixed.json')\n\nsents_microcorpus = [\n [(tok, py2mystem(tag)) for tok, tag in sent] \n for sent in sents_microcorpus_src\n]\n\nsents_ruscorpora = [\n [(tok, ruscorpora2mystem(tag)) for tok, tag in sent] \n for sent in sents_ruscorpora_src\n]\n\nsents = sents_microcorpus + sents_ruscorpora\n\nprint(\"microcorpus: %d sents; ruscorpora: %d sents\" % (len(sents_microcorpus), len(sents_ruscorpora)))", "metadata": {"trusted": true, "collapsed": false}, "cell_type": "code", "outputs": [{"output_type": "stream", "name": "stdout", "text": "microcorpus: 100 sents; ruscorpora: 100 sents\n"}]}, {"source": "### \u0422\u0438\u043f\u044b \u0442\u043e\u043a\u0435\u043d\u043e\u0432\n\n\u041f\u0443\u043d\u043a\u0442\u0443\u0430\u0446\u0438\u044f, \u0446\u0438\u0444\u0440\u044b \u0438 \u043b\u0430\u0442\u0438\u043d\u0441\u043a\u0438\u0435 \u0441\u043b\u043e\u0432\u0430 \u043d\u0435 \u0443\u0447\u0438\u0442\u044b\u0432\u0430\u044e\u0442\u0441\u044f.", "metadata": {}, "cell_type": "markdown"}, {"execution_count": 7, "source": "def to_tokens(sents):\n tokens = [(tok, tag2grammemes(tag)) for sent in sents for (tok, tag) in sent]\n tokens = [(tok, gr) for tok, gr in tokens if not (set(gr) & {'PNCT', 'NONLEX', 'ciph'})]\n return tokens\n\ntokens = to_tokens(sents)\ntokens_microcorpus = to_tokens(sents_microcorpus)\ntokens_ruscorpora = to_tokens(sents_ruscorpora)", "metadata": {"trusted": true, "collapsed": false}, "cell_type": "code", "outputs": []}, {"execution_count": 8, "source": "print(\n \"Total tokens: %d (%d microcorpus + %d ruscorpora)\" % (\n len(tokens), len(tokens_microcorpus), len(tokens_ruscorpora))\n)", "metadata": {"trusted": true, "collapsed": false}, "cell_type": "code", "outputs": [{"output_type": "stream", "name": "stdout", "text": "Total tokens: 2498 (1405 microcorpus + 1093 ruscorpora)\n"}]}, {"source": "### \u0421\u043e\u043f\u043e\u0441\u0442\u0430\u0432\u043b\u0435\u043d\u0438\u0435 \u0442\u0435\u0433\u043e\u0432\n\n\u0423\u0447\u0438\u0442\u044b\u0432\u0430\u044e\u0442\u0441\u044f \u0442\u0435\u0433\u0438 \u0446\u0435\u043b\u0438\u043a\u043e\u043c - \u043e\u0446\u0435\u043d\u0438\u0432\u0430\u0435\u0442\u0441\u044f \u043a\u0430\u0447\u0435\u0441\u0442\u0432\u043e \u043f\u043e\u043b\u043d\u043e\u0433\u043e \u043c\u043e\u0440\u0444\u043e\u043b\u043e\u0433\u0438\u0447\u0435\u0441\u043a\u043e\u0433\u043e \u0430\u043d\u0430\u043b\u0438\u0437\u0430. \n\n\u041f\u0440\u0438 \u044d\u0442\u043e\u043c \u0438\u0437-\u0437\u0430 \u0442\u043e\u0433\u043e, \u0447\u0442\u043e \u043f\u0440\u0438\u0445\u043e\u0434\u0438\u0442\u0441\u044f \u043f\u0440\u0435\u043e\u0431\u0440\u0430\u0437\u043e\u0432\u044b\u0432\u0430\u0442\u044c 3 \u0440\u0430\u0437\u043b\u0438\u0447\u043d\u044b\u0445 \u043d\u0430\u0431\u043e\u0440\u0430 \u0442\u0435\u0433\u043e\u0432 \u0434\u0440\u0443\u0433 \u0432 \u0434\u0440\u0443\u0433\u0430, \u043d\u0435\u043a\u043e\u0442\u043e\u0440\u044b\u0435 \u0440\u0430\u0437\u043b\u0438\u0447\u0438\u044f \u0440\u0430\u0437\u043b\u0438\u0447\u0438\u044f\u043c\u0438 \u043d\u0435 \u0441\u0447\u0438\u0442\u0430\u044e\u0442\u0441\u044f - \u0440\u0430\u0437\u043b\u0438\u0447\u0438\u044f \u043c\u043e\u0433\u0443\u0442 \u0431\u044b\u0442\u044c \u0432\u044b\u0437\u0432\u0430\u043d\u044b \u043d\u0435\u0442\u043e\u0447\u043d\u043e\u0441\u0442\u044c\u044e \u043f\u0440\u0435\u043e\u0431\u0440\u0430\u0437\u043e\u0432\u0430\u043d\u0438\u044f \u0442\u0435\u0433\u043e\u0432 \u0438\u043b\u0438 \u0440\u0430\u0437\u043d\u044b\u043c\u0438 \u043f\u043e\u0434\u0445\u043e\u0434\u0430\u043c\u0438 \u043a \u0440\u0430\u0437\u043c\u0435\u0442\u043a\u0435. \u0427\u0442\u043e\u0431 \u043f\u043e\u043d\u044f\u0442\u044c, \u0437\u0430\u0447\u0435\u043c \u043a\u0430\u0436\u0434\u043e\u0435 \u0438\u0437 \u0443\u0441\u043b\u043e\u0432\u0438\u0439, \u043c\u043e\u0436\u043d\u043e \u0435\u0433\u043e \u0437\u0430\u043a\u043e\u043c\u043c\u0435\u043d\u0442\u0438\u0440\u043e\u0432\u0430\u0442\u044c \u0438 \u043f\u043e\u0441\u043c\u043e\u0442\u0440\u0435\u0442\u044c, \u043a\u0430\u043a\u0438\u0435 \u043f\u043e\u044f\u0432\u044f\u0442\u0441\u044f \u0434\u043e\u043f\u043e\u043b\u043d\u0438\u0442\u0435\u043b\u044c\u043d\u044b\u0435 \u043d\u0435\u0441\u043e\u043e\u0442\u0432\u0435\u0442\u0441\u0442\u0432\u0438\u044f \u0432 \u0440\u0430\u0437\u0431\u043e\u0440\u0430\u0445.", "metadata": {}, "cell_type": "markdown"}, {"execution_count": 9, "source": "def _gram(p):\n \"\"\" Extract grammemes from a parse result. \"\"\"\n if isinstance(p, dict):\n return tag2grammemes(p['gr'])\n if isinstance(p, str):\n return tag2grammemes(p)\n return p\n \n\ndef tags_diff(t1, t2):\n \"\"\" \n Return a set of grammemes which are different between t1 and t2,\n taking conversion issues in account.\n \"\"\"\n gr1 = set(_gram(t1))\n gr2 = set(_gram(t2))\n diff = gr1 ^ gr2\n comb = gr1 | gr2\n common = gr1 & gr2\n diff -= {'anim', 'inan', 'persn', 'famn', '0', 'obsol', 'geo', 'distort', 'med', 'act', 'plen'}\n \n if diff == {'ADV'} and ({'parenth', 'praed'} & comb):\n return {}\n \n if diff == {'PART'} and 'parenth' in comb:\n return {}\n \n if diff == {'parenth'}:\n return {}\n \n if diff == {'CONJ', 'parenth'}:\n return {} \n \n if diff == {'inpraes', 'praes'} and 'ipf' in comb:\n return {}\n \n if diff == {'fut', 'inpraes', 'ipf'}:\n return {}\n \n if diff == {'tran'} or diff == {'inpraes', 'praes', 'tran'}:\n return {}\n \n if diff == {'ipf'}:\n return {}\n \n if 'S' in diff and 'INIT' in diff and 'abbr' in common:\n return {}\n \n if diff == {'SPRO', 'APRO'}:\n return {}\n \n if 'SPRO' in common:\n return {}\n \n if 'APRO' in common:\n return {}\n \n if diff == {'A', 'NUM'}:\n return {} \n \n if diff == {'praed', 'ADV'}:\n return {} \n \n if diff == {'praed', 'A'}:\n return {} \n \n if diff == {'APRO', 'ANUM', 'sg'}:\n return {}\n \n if diff == {'ADV', 'ADVPRO'}:\n return {}\n \n if diff == {'A', 'ADV'} and 'comp' in common:\n return {} \n \n if diff == {'A', 'pl', 'brev', 'ADV'}:\n return {}\n \n if diff == {'mf', 'm'}:\n return {}\n \n if diff == {'abbr'} or diff == {'abbr', 'f'}:\n return {}\n \n if diff == {'f'} or diff == {'m'} and 'abbr' in common:\n return {}\n \n return diff\n\n\ndef tags_match(t1, t2):\n \"\"\" \n Return True if t1 and t2 tags are the same\n (taking in account tagset conversion issues).\n \"\"\"\n return not tags_diff(t1, t2)\n\n\ndef has_correct(correct, parses):\n if parses is None:\n # mystem can't parse most hyphenated words as a single token;\n # don't consider it an error\n return True\n \n for p in parses:\n if tags_match(p, correct):\n return True\n return False\n\n\ndef is_bad(correct, parses):\n return not has_correct(correct, parses)", "metadata": {"trusted": true, "collapsed": false}, "cell_type": "code", "outputs": []}, {"source": "### \u041f\u043e\u0438\u0441\u043a \u043e\u0448\u0438\u0431\u043e\u043a \u0440\u0430\u0437\u0431\u043e\u0440\u0430\n\n\u0415\u0441\u043b\u0438 \u0441\u0440\u0435\u0434\u0438 \u043f\u0440\u0435\u0434\u043b\u043e\u0436\u0435\u043d\u043d\u044b\u0445 \u043c\u043e\u0440\u0444\u043e\u043b\u043e\u0433\u0438\u0447\u0435\u0441\u043a\u0438\u043c \u0430\u043d\u0430\u043b\u0438\u0437\u0430\u0442\u043e\u0440\u043e\u043c \u0432\u0430\u0440\u0438\u0430\u043d\u0442\u043e\u0432 \u043d\u0435\u0442 \u0441\u043e\u0432\u043f\u0430\u0434\u0430\u044e\u0449\u0435\u0433\u043e \u0441 \u043f\u0440\u0430\u0432\u0438\u043b\u044c\u043d\u044b\u043c, \u0442\u043e \u0440\u0430\u0437\u0431\u043e\u0440 \u0441\u0447\u0438\u0442\u0430\u0435\u0442\u0441\u044f \u043d\u0435\u043f\u0440\u0430\u0432\u0438\u043b\u044c\u043d\u044b\u043c.\n\n\u042d\u0442\u043e \u043f\u0440\u0435\u0434\u0432\u0430\u0440\u0438\u0442\u0435\u043b\u044c\u043d\u044b\u0439 \u0448\u0430\u0433 - \u0432\u0441\u0435 \u043e\u0448\u0438\u0431\u043a\u0438 \u0431\u0443\u0434\u0443\u0442 \u043f\u043e\u0442\u043e\u043c \u0435\u0449\u0435 \u0440\u0430\u0437 \u043f\u0440\u043e\u0432\u0435\u0440\u0435\u043d\u044b \u0432\u0440\u0443\u0447\u043d\u0443\u044e.\n\n\u041f\u0440\u0438\u043c\u0435\u0447\u0430\u043d\u0438\u0435: \u0432 \u0432\u044b\u0431\u043e\u0440\u043a\u0435 \u0438\u0437 \u041d\u041a\u0420\u042f \u0431\u044b\u043b\u043e \u043d\u0430\u0439\u0434\u0435\u043d\u043e 6 \u043e\u0448\u0438\u0431\u043e\u043a \u043d\u0430 100 \u043f\u0440\u0435\u0434\u043b\u043e\u0436\u0435\u043d\u0438\u0439; \u0432\u043e\u0442 \u0447\u0442\u043e \u0443\u0441\u0442\u0440\u0430\u043d\u0435\u043d\u043e:\n\n* \u0440\u0435\u0431\u044f\u0442\u0430 - \u043f\u043e\u0447\u0435\u043c\u0443-\u0442\u043e \u0441\u0440\u0435\u0434\u043d\u0438\u0439 \u0440\u043e\u0434;\n* \u043d\u0430\u043f\u0440\u0430\u0432\u043b\u0435\u043d\u043d\u044b\u0435 - \u043e\u0442\u043a\u0443\u0434\u0430 inan? \u0412\u0438\u0434\u0438\u043c\u043e, \u0438\u043c\u0435\u043b\u043e\u0441\u044c \u0432 \u0432\u0438\u0434\u0443 intr;\n* \u0434\u043e\u0441\u0442\u0438\u0433\u0430\u0435\u0442 - \u043f\u043e\u0447\u0435\u043c\u0443-\u0442\u043e \u043d\u0435\u043f\u0435\u0440\u0435\u0445\u043e\u0434\u043d\u044b\u0439;\n* \u0441\u043b\u0435\u0434\u043e\u0432\u0430\u0442\u0435\u043b\u044c\u043d\u043e - \u044d\u0442\u043e \u0441\u043a\u043e\u0440\u0435\u0435 \u0441\u043e\u044e\u0437/\u0432\u0432\u043e\u0434\u043d\u043e\u0435 \u0441\u043b\u043e\u0432\u043e;\n* \"\u0435\u0449\u0435 \u043f\u043e\u043b\u0432\u0435\u043a\u0430 \u043d\u0430\u0437\u0430\u0434\" \u0438 \"\u0443\u0436\u0435 \u043f\u043e\u043b\u0433\u043e\u0434\u0430 \u043a\u0430\u043a\" - \u0432\u0438\u043d\u0438\u0442\u0435\u043b\u044c\u043d\u044b\u0439 \u043f\u0430\u0434\u0435\u0436, \u0430 \u043d\u0435 \u0440\u043e\u0434\u0438\u0442\u0435\u043b\u044c\u043d\u044b\u0439;", "metadata": {}, "cell_type": "markdown"}, {"execution_count": 10, "source": "pymorphy2_errors = [(tok, gr) for tok, gr in tokens if is_bad(gr, pymorphy2_analyze(tok))]\nmystem_errors = [(tok, gr) for tok, gr in tokens if is_bad(gr, mystem_analyze(tok))]\n\npy_err = len(pymorphy2_errors)\nmy_err = len(mystem_errors)\nprint(\"pymorphy2: %d errors ==> %0.1f%% has correct results\" % (py_err, 100*(1-py_err/len(tokens))))\nprint(\"mystem: %d errors ==> %0.1f%% has correct results\" % (my_err, 100*(1-my_err/len(tokens))))\nprint(\"Note: not all errors are real errors; see below\")", "metadata": {"trusted": true, "collapsed": false}, "cell_type": "code", "outputs": [{"output_type": "stream", "name": "stdout", "text": "pymorphy2: 21 errors ==> 99.2% has correct results\nmystem: 28 errors ==> 98.9% has correct results\nNote: not all errors are real errors; see below\n"}]}, {"execution_count": 11, "source": "pymorphy2_errors", "metadata": {"trusted": true, "collapsed": false}, "cell_type": "code", "outputs": [{"output_type": "execute_result", "execution_count": 11, "data": {"text/plain": "[('\u042e\u043d\u0433', ['S', 'anim', 'm', 'famn', 'sg', 'nom']),\n ('\u0412\u041e\u0412', ['S', 'inan', 'f', '0', 'abbr', 'sg', 'gen']),\n ('\u0442', ['PART', 'abbr']),\n ('\u0433', ['S', 'inan', 'm', '0', 'abbr', 'sg', 'loc']),\n ('\u041c\u0430\u043b\u0445\u043e\u043b\u043b\u0430\u043d\u0434\u0430', ['S', 'anim', 'm', 'famn', 'sg', 'gen']),\n ('\u0441\u0432', ['A', 'plen', 'abbr', 'm', 'sg', 'gen']),\n ('\u0414\u044b\u0430', ['PART', 'distort']),\n ('\u0434\u044b\u0430', ['PART', 'distort']),\n ('\u0414\u0438\u0430\u043d\u0430', ['S', 'anim', 'mf', '0', 'famn', 'sg', 'nom']),\n ('\u0422\u0430-\u0430-\u0430\u043a', ['PART', 'distort']),\n ('\u041f\u0420\u041e', ['S', 'inan', 'f', '0', 'abbr', 'sg', 'nom']),\n ('\u041c\u0411\u041f', ['S', 'n', 'inan', '0', 'sg', 'gen']),\n ('\u043f\u043e\u043b\u0433\u043e\u0434\u0430', ['S', 'm', 'inan', 'sg', 'acc']),\n ('\u041d\u041f\u0424', ['S', 'm', 'inan', '0', 'pl', 'acc']),\n ('\u0442\u0430\u043a\u0436\u0435', ['PART']),\n ('\u0444\u0438\u0442\u043d\u0435\u0441\u0441\u0430', ['S', 'm', 'inan', 'sg', 'gen']),\n ('\u0421\u043b\u0443\u0446\u043a\u0435\u0440', ['S', 'famn', 'f', 'anim', 'sg', 'ins']),\n ('\u043d\u0430\u043f\u0440\u0430\u0432\u043b\u0435\u043d\u043d\u044b\u0435', ['V', 'pf', 'partcp', 'praet', 'pass', 'pl', 'acc', 'intr']),\n ('\u043f\u043e\u043b\u0432\u0435\u043a\u0430', ['S', 'm', 'inan', 'sg', 'acc']),\n ('\u0421\u0411', ['S', 'm', 'inan', '0', 'sg', 'gen']),\n ('\u041e\u041c\u0423', ['S', 'n', 'inan', '0', 'sg', 'gen'])]"}, "metadata": {}}]}, {"execution_count": 12, "source": "mystem_errors", "metadata": {"trusted": true, "collapsed": false}, "cell_type": "code", "outputs": [{"output_type": "execute_result", "execution_count": 12, "data": {"text/plain": "[('\u0442', ['APRO', 'n', 'sg', 'dat', 'abbr']),\n ('\u043f', ['A', 'plen', 'n', 'sg', 'nom', 'abbr']),\n ('\u0442', ['PART', 'abbr']),\n ('\u0435', ['V', 'ipf', 'intr', '0', 'abbr', 'sg', '3p', 'praes', 'indic']),\n ('\u0414\u043e\u043d\u0441\u043a\u043e\u0433\u043e', ['S', 'anim', 'm', 'famn', 'sg', 'gen']),\n ('\u0433', ['S', 'inan', 'm', '0', 'abbr', 'sg', 'loc']),\n ('\u0441\u0432', ['A', 'plen', 'abbr', 'm', 'sg', 'gen']),\n ('\u043c\u043b\u043d', ['S', 'inan', 'm', '0', 'abbr', 'pl', 'gen']),\n ('\u041f\u0440\u043e\u0436\u0435\u043a\u0442\u043e\u0440\u043f\u0435\u0440\u0438\u0441\u0445\u0438\u043b\u0442\u043e\u043d', ['S', 'inan', 'm', 'sg', 'acc']),\n ('\u0414\u044b\u0430', ['PART', 'distort']),\n ('\u0434\u044b\u0430', ['PART', 'distort']),\n ('\u0414\u0438\u0430\u043d\u0430', ['S', 'anim', 'mf', '0', 'famn', 'sg', 'nom']),\n ('\u0441\u043a\u043e\u0440\u0435\u0435', ['parenth']),\n ('\u0441\u043d\u0430\u0440\u044f\u0433\u0443', ['S', 'inan', 'f', 'sg', 'acc']),\n ('\u041f\u0420\u041e', ['S', 'inan', 'f', '0', 'abbr', 'sg', 'nom']),\n ('\u043f\u0440\u043e\u0436\u0438\u0432\u0430\u044e\u0449\u0438\u0435',\n ['V', 'partcp', 'plen', 'ipf', 'intr', 'praes', 'act', 'pl', 'nom']),\n ('\u043f\u0440\u043e\u0436\u0438\u0432\u0430\u044e\u0449\u0435\u0433\u043e',\n ['V', 'partcp', 'plen', 'ipf', 'intr', 'praes', 'act', 'n', 'sg', 'gen']),\n ('\u043f\u0440\u043e\u0436\u0438\u0432\u0430\u044e\u0449\u0438\u0435',\n ['V', 'partcp', 'plen', 'ipf', 'intr', 'praes', 'act', 'pl', 'nom']),\n ('\u041c\u0411\u041f', ['S', 'n', 'inan', '0', 'sg', 'gen']),\n ('\u0442', ['ADVPRO', 'abbr']),\n ('\u0434', ['ADV', 'abbr']),\n ('\u043f\u043e\u043b\u0433\u043e\u0434\u0430', ['S', 'm', 'inan', 'sg', 'acc']),\n ('\u041d\u041f\u0424', ['S', 'm', 'inan', '0', 'pl', 'acc']),\n ('\u0438\u0437\u043c\u0435\u043d\u044f\u0435\u0442', ['V', 'ipf', 'intr', 'act', 'sg', 'praes', '3p', 'indic']),\n ('\u043d\u0430\u043f\u0440\u0430\u0432\u043b\u0435\u043d\u043d\u044b\u0435', ['V', 'pf', 'partcp', 'praet', 'pass', 'pl', 'acc', 'intr']),\n ('\u043f\u043e\u043b\u0432\u0435\u043a\u0430', ['S', 'm', 'inan', 'sg', 'acc']),\n ('\u0421\u0411', ['S', 'm', 'inan', '0', 'sg', 'gen']),\n ('\u041e\u041c\u0423', ['S', 'n', 'inan', '0', 'sg', 'gen'])]"}, "metadata": {}}]}, {"source": "## \u0420\u0435\u0437\u0443\u043b\u044c\u0442\u0430\u0442\u044b \u0440\u0443\u0447\u043d\u043e\u0439 \u043f\u0440\u043e\u0432\u0435\u0440\u043a\u0438\n\n### \u041d\u0435 \u0443\u0447\u0438\u0442\u044b\u0432\u0430\u0435\u043c \u043a\u0430\u043a \u043e\u0448\u0438\u0431\u043a\u0438\n\npymorphy2: \n\n* \u043e\u0434\u0438\u043d \u0438\u0437 \"\u0434\u044b\u0430\", \u0442.\u043a. \u044d\u0442\u043e \u043e\u0434\u043d\u0430 \u0438 \u0442\u0430 \u0436\u0435 \u043e\u0448\u0438\u0431\u043a\u0430 2 \u0440\u0430\u0437\u0430;\n* \u0441\u043b\u043e\u0432\u043e \"\u0442\u0430\u043a\u0436\u0435\". \u0412 \u0440\u0430\u0437\u043c\u0435\u0442\u043a\u0435 - \u0447\u0430\u0441\u0442\u0438\u0446\u0430, pymorphy2 \u0433\u043e\u0432\u043e\u0440\u0438\u0442, \u0447\u0442\u043e \u0441\u043e\u044e\u0437.\n\nmystem: \n\n* \"\u0414\u043e\u043d\u0441\u043a\u043e\u0433\u043e\" - \u043d\u0435\u043f\u043e\u043d\u044f\u0442\u043d\u043e, \u0441\u0443\u0449\u0435\u0441\u0442\u0432\u0438\u0442\u0435\u043b\u044c\u043d\u043e\u0435 \u044d\u0442\u043e (\u0444\u0430\u043c\u0438\u043b\u0438\u044f), \u0438\u043b\u0438 \u043f\u0440\u0438\u043b\u0430\u0433\u0430\u0442\u0435\u043b\u044c\u043d\u043e\u0435. mystem \u0441\u0447\u0438\u0442\u0430\u0435\u0442, \u0447\u0442\u043e \u043f\u0440\u0438\u043b\u0430\u0433\u0430\u0442\u0435\u043b\u044c\u043d\u043e\u0435, \u0432 \u0440\u0430\u0437\u043c\u0435\u0442\u043a\u0435 \u0438 \u0443 pymorphy2 - \u0441\u0443\u0449\u0435\u0441\u0442\u0432\u0438\u0442\u0435\u043b\u044c\u043d\u043e\u0435. \n* \u043e\u0434\u0438\u043d \u0438\u0437 \"\u0434\u044b\u0430\", \u0442.\u043a. \u044d\u0442\u043e \u043e\u0434\u043d\u0430 \u0438 \u0442\u0430 \u0436\u0435 \u043e\u0448\u0438\u0431\u043a\u0430 2 \u0440\u0430\u0437\u0430;\n* \u0434\u0432\u0435 \u0438\u0437 \u0442\u0440\u0435\u0445 \u0444\u043e\u0440\u043c \"\u043f\u0440\u043e\u0436\u0438\u0432\u0430\u044e\u0449\u0438\u0435\", \u0442.\u043a. \u044d\u0442\u043e \u043e\u0434\u043d\u0430 \u0438 \u0442\u0430 \u0436\u0435 \u043e\u0448\u0438\u0431\u043a\u0430, \u043d\u043e 3 \u0440\u0430\u0437\u0430;\n* \u0442\u043e, \u0447\u0442\u043e mystem \u043e\u0442\u043a\u0430\u0437\u0430\u043b\u0441\u044f \u0440\u0430\u0437\u0431\u0438\u0440\u0430\u0442\u044c \u043a\u0430\u043a \u0435\u0434\u0438\u043d\u044b\u0435 \u0442\u043e\u043a\u0435\u043d\u044b: \u0438\u043d\u0442\u0435\u0440\u043d\u0435\u0442-\u043f\u043e\u0440\u0442\u0430\u043b\u043e\u043c, \u0440\u043e\u0441\u0441\u0438\u0439\u0441\u043a\u043e-\u0430\u0437\u0435\u0440\u0431\u0430\u0439\u0434\u0436\u0430\u043d\u0441\u043a\u0438\u043c, \u041e\u2019\u0425\u0430\u0440\u0430, \u0432\u0443\u043b\u043a\u0430\u043d\u043e\u0433\u0435\u043d\u043d\u043e-\u043e\u0441\u0430\u0434\u043e\u0447\u043d\u044b\u0435, \u0438\u043d\u043d\u043e\u0432\u0430\u0446\u0438\u043e\u043d\u043d\u043e-\u043e\u0431\u0440\u0430\u0437\u043e\u0432\u0430\u0442\u0435\u043b\u044c\u043d\u043e\u0439, \u0438\u0441\u0441\u0438\u043d\u044f-\u0431\u0435\u043b\u043e\u0433\u043e, \u042d\u043a\u043e\u043d\u043e\u043c\u0438\u043a\u043e-\u0440\u0430\u0441\u0441\u0435\u043b\u0435\u043d\u0447\u0435\u0441\u043a\u0438\u0439, \u0444\u0438\u0442\u043d\u0435\u0441\u0441-\u043c\u0435\u0440\u043e\u043f\u0440\u0438\u044f\u0442\u0438\u0435\u043c, \u043e\u043d\u043b\u0430\u0439\u043d-\u043c\u0430\u0433\u0430\u0437\u0438\u043d, \u041c-\u0412\u0438\u0434\u0435\u043e, \u0438\u043d\u0442\u0435\u0440\u043d\u0435\u0442-\u0443\u043d\u0438\u0432\u0435\u0440\u043c\u0430\u0433\u043e\u043c - \u0432\u0441\u0435\u0433\u043e 11 \u0441\u043b\u0443\u0447\u0430\u0435\u0432. \u0418\u0441\u043a\u043b\u044e\u0447\u0435\u043d\u0438\u0435 - \u0443\u0447\u0438\u0442\u044b\u0432\u0430\u0435\u0442\u0441\u044f \u0442\u043e\u043a\u0435\u043d \"\u0422\u0430-\u0430-\u0430\u043a\", \u043a\u043e\u0442\u043e\u0440\u044b\u0439 \u0437\u0430\u0441\u0447\u0438\u0442\u0430\u043b\u0441\u044f \u043a\u0430\u043a \u043e\u0448\u0438\u0431\u043a\u0430 \u0432 pymorphy2.\n* \u043f\u043e\u043b\u0432\u0435\u043a\u0430, \u043f\u043e\u043b\u0433\u043e\u0434\u0430 - mystem \u0441\u0447\u0438\u0442\u0430\u0435\u0442, \u0447\u0442\u043e \u044d\u0442\u043e \u043c\u043d\u043e\u0436\u0435\u0441\u0442\u0432\u0435\u043d\u043d\u043e\u0435 \u0447\u0438\u0441\u043b\u043e, \u0447\u0442\u043e \u0434\u043e\u043f\u0443\u0441\u0442\u0438\u043c\u043e.\n\n### \u0418\u0442\u043e\u0433\u043e\u0432\u044b\u0439 \u043d\u0430\u0431\u043e\u0440 \u043e\u0448\u0438\u0431\u043e\u043a (microcorpus+\u041d\u041a\u0420\u042f=\u0438\u0442\u043e\u0433\u043e):\n\npymorphy2: **10+9=19** (\u0438\u043b\u0438 **7+9=16** \u0431\u0435\u0437 \u0443\u0447\u0435\u0442\u0430 \u0441\u043e\u043a\u0440\u0430\u0449\u0435\u043d\u0438\u0439)\n\n* 3+0 - \u0441\u043e\u043a\u0440\u0430\u0449\u0435\u043d\u0438\u044f (\u0442.; \u0433; \u0441\u0432.)\n* 2+4 - \u0430\u0431\u0431\u0440\u0438\u0432\u0435\u0430\u0442\u0443\u0440\u044b (\u041f\u0420\u041e; \u0412\u041e\u0412; \u041c\u0411\u041f; \u041d\u041f\u0424; \u0421\u0411; \u041e\u041c\u0423)\n* 0+1 - \u043f\u0440\u0435\u0434\u0441\u043a\u0430\u0437\u0430\u0442\u0435\u043b\u0438 (\u0444\u0438\u0442\u043d\u0435\u0441\u0441\u0430);\n* 0+2 - \u0441\u043e\u043c\u043d\u0438\u0442\u0435\u043b\u044c\u043d\u044b\u0435 \u0440\u0430\u0437\u0431\u043e\u0440\u044b \u0434\u043b\u044f \u0441\u043b\u043e\u0432\u0430\u0440\u043d\u044b\u0445 \u0441\u043b\u043e\u0432 (\u043f\u043e\u043b\u0433\u043e\u0434\u0430, \u043f\u043e\u043b\u0432\u0435\u043a\u0430) - \u0443 \u043d\u0438\u0445 \u0441\u0442\u043e\u0438\u0442 \u0441\u0440\u0435\u0434\u043d\u0438\u0439 \u0440\u043e\u0434\n* 0+1 - \u043e\u0448\u0438\u0431\u043a\u0438 \u043d\u0430 \u043f\u0435\u0440\u0435\u0445\u043e\u0434\u043d\u043e\u0441\u0442\u044c (\u043d\u0430\u043f\u0440\u0430\u0432\u043b\u0435\u043d\u043d\u044b\u0435)\n* 3+1 - \u0438\u043c\u0435\u043d\u0430/\u0444\u0430\u043c\u0438\u043b\u0438\u0438 (\u042e\u043d\u0433, \u041c\u0430\u043b\u0445\u043e\u043b\u043b\u0430\u043d\u0434\u0430, \u043e\u0442\u0435\u0446 \u0414\u0438\u0430\u043d\u0430, \u0421\u043b\u0443\u0446\u043a\u0435\u0440)\n* 2+0 - \u0438\u0441\u043a\u0430\u0436\u0435\u043d\u043d\u044b\u0435 \u0441\u043b\u043e\u0432\u0430 (\u0414\u044b\u0430, \u0422\u0430-\u0430-\u0430\u043a)\n\nmystem: **15+8=23** (\u0438\u043b\u0438 **8+6=14** \u0431\u0435\u0437 \u0443\u0447\u0435\u0442\u0430 \u0441\u043e\u043a\u0440\u0430\u0449\u0435\u043d\u0438\u0439)\n\n* 7+2 - \u0441\u043e\u043a\u0440\u0430\u0449\u0435\u043d\u0438\u044f (\u0442.\u043f.; \u0442.\u0435.; \u0442.\u0434.; \u0433; \u0441\u0432.; \u043c\u043b\u043d;)\n* 1+4 - \u0430\u0431\u0431\u0440\u0438\u0432\u0435\u0430\u0442\u0443\u0440\u044b (\u041f\u0420\u041e; \u041c\u0411\u041f; \u041d\u041f\u0424; \u0421\u0411; \u041e\u041c\u0423)\n* 2+0 - \u043f\u0440\u0435\u0434\u0441\u043a\u0430\u0437\u0430\u0442\u0435\u043b\u0438 (\u041f\u0440\u043e\u0436\u0435\u043a\u0442\u043e\u0440\u043f\u0435\u0440\u0438\u0441\u0445\u0438\u043b\u0442\u043e\u043d, \u0441\u043d\u0430\u0440\u044f\u0433\u0443)\n* 1+0 - \u0441\u043e\u043c\u043d\u0438\u0442\u0435\u043b\u044c\u043d\u044b\u0435 \u0440\u0430\u0437\u0431\u043e\u0440\u044b \u0434\u043b\u044f \u0441\u043b\u043e\u0432\u0430\u0440\u043d\u044b\u0445 \u0441\u043b\u043e\u0432 (\u0441\u043a\u043e\u0440\u0435\u0435)\n* 1+2 - \u043e\u0448\u0438\u0431\u043a\u0438 \u043d\u0430 \u043f\u0435\u0440\u0435\u0445\u043e\u0434\u043d\u043e\u0441\u0442\u044c (\u043f\u0440\u043e\u0436\u0438\u0432\u0430\u044e\u0449\u0438\u0435, \u0438\u0437\u043c\u0435\u043d\u044f\u0435\u0442, \u043d\u0430\u043f\u0440\u0430\u0432\u043b\u0435\u043d\u043d\u044b\u0435)\n* 1+0 - \u0438\u043c\u0435\u043d\u0430/\u0444\u0430\u043c\u0438\u043b\u0438\u0438 (\u043e\u0442\u0435\u0446 \u0414\u0438\u0430\u043d\u0430)\n* 2+0 - \u0438\u0441\u043a\u0430\u0436\u0435\u043d\u043d\u044b\u0435 \u0441\u043b\u043e\u0432\u0430 (\u0414\u044b\u0430, \u0422\u0430-\u0430-\u0430\u043a)\n\n\u0412 \u0440\u0443\u0447\u043d\u043e\u0439 \u0440\u0430\u0437\u043c\u0435\u0442\u043a\u0435 \u041d\u041a\u0420\u042f (\u0441\u043b\u0443\u0447\u0430\u0439\u043d\u0430\u044f \u0432\u044b\u0431\u043e\u0440\u043a\u0430 \u0438\u0437 100 \u043f\u0440\u0435\u0434\u043b\u043e\u0436\u0435\u043d\u0438\u0439, \u043a\u0430\u043a\u0430\u044f-\u0442\u043e \u0441\u0442\u0430\u0440\u0430\u044f \u0432\u044b\u0433\u0440\u0443\u0437\u043a\u0430) \u0431\u044b\u043b\u043e 6 \u043e\u0448\u0438\u0431\u043e\u043a.", "metadata": {}, "cell_type": "markdown"}, {"execution_count": null, "source": "", "metadata": {"trusted": true, "collapsed": true}, "cell_type": "code", "outputs": []}, {"source": "## \u0421\u043d\u044f\u0442\u0438\u0435 \u043d\u0435\u043e\u0434\u043d\u043e\u0437\u043d\u0430\u0447\u043d\u043e\u0441\u0442\u0438\n\n** ==\u042d\u0422\u041e \u0422\u041e\u041b\u042c\u041a\u041e \u041d\u0410\u0411\u0420\u041e\u0421\u041e\u041a== **\n\npymorphy2 \u0443\u043c\u0435\u0435\u0442 \u0441\u043d\u0438\u043c\u0430\u0442\u044c \u043d\u0435\u043e\u0434\u043d\u043e\u0437\u043d\u0430\u0447\u043d\u043e\u0441\u0442\u044c \u043d\u0430 \u0443\u0440\u043e\u0432\u043d\u0435 \u043e\u0442\u0434\u0435\u043b\u044c\u043d\u044b\u0445 \u0441\u043b\u043e\u0432 (\u0431\u0435\u0437 \u0443\u0447\u0435\u0442\u0430 \u043a\u043e\u043d\u0442\u0435\u043a\u0441\u0442\u0430). \n\u041d\u0430\u0431\u0440\u043e\u0441\u043e\u043a \u043a\u043e\u0434\u0430 \u0434\u043b\u044f \u043e\u0446\u0435\u043d\u043a\u0438 \u043a\u0430\u0447\u0435\u0441\u0442\u0432\u0430 (\u043d\u0430\u0431\u0440\u043e\u0441\u043e\u043a, \u0442.\u043a. \u0442\u0430\u043c, \u0441\u043a\u043e\u0440\u0435\u0435 \u0432\u0441\u0435\u0433\u043e, \u0432\u0441\u044f\u043a\u0438\u0435 \u043e\u0448\u0438\u0431\u043a\u0438 \u0438\u0437-\u0437\u0430 \u043f\u0440\u0435\u043e\u0431\u0440\u0430\u0437\u043e\u0432\u0430\u043d\u0438\u044f \u0442\u0435\u0433\u043e\u0432 \u0438\u0437 \u043e\u0434\u043d\u043e\u0433\u043e \u0442\u0435\u0433\u0441\u0435\u0442\u0430 \u0432 \u0434\u0440\u0443\u0433\u043e\u0439):", "metadata": {}, "cell_type": "markdown"}, {"execution_count": 13, "source": "def POS_match(t1, t2):\n # FIXME: code is a copy-paste of tags_diff with minor variations\n gr1, gr2 = _gram(t1), _gram(t2)\n pos1, pos2 = gr1[0], gr2[0]\n if pos1 == pos2:\n return True \n \n diff = set(gr1) ^ set(gr2)\n comb = set(gr1) | set(gr2)\n common = set(gr1) & set(gr2)\n diff -= {'anim', 'inan', 'persn', 'famn', '0', 'obsol', 'geo', 'distort', 'med', 'act', 'plen'}\n \n if not diff:\n return True\n \n if diff == {'ADV'} and ({'parenth', 'praed'} & comb):\n return True\n \n if diff == {'PART'} and 'parenth' in comb:\n return True\n \n if 'S' in diff and 'INIT' in diff and 'abbr' in common:\n return True\n \n if diff == {'CONJ', 'parenth'}:\n return True\n \n if diff == {'SPRO', 'APRO'}:\n return True\n \n if diff == {'A', 'NUM'}:\n return True\n\n if diff == {'APRO', 'ANUM', 'sg'}:\n return True\n\n if diff == {'A', 'ADV'} and 'comp' in common:\n return True\n \n if 'SPRO' in common:\n return True\n \n if diff == {'ADV', 'ADVPRO'}:\n return True\n \n if diff == {'A', 'pl', 'brev', 'ADV'}:\n return True\n \n if diff == {'praed', 'ADV'}:\n return True\n \n if diff == {'praed', 'A'}:\n return True\n \n return False\n \n\ndef first_correct(correct, parses):\n if parses is None:\n return True\n \n if not parses:\n return False\n\n return tags_match(parses[0], correct)\n\n\ndef first_POS_correct(correct, parses):\n if parses is None:\n return True\n \n if not parses:\n return False\n\n return POS_match(parses[0], correct)", "metadata": {"trusted": true, "collapsed": true}, "cell_type": "code", "outputs": []}, {"execution_count": 15, "source": "pymorphy2_disambig_errors = [\n (tok, gr) for tok, gr in tokens \n if not first_correct(gr, pymorphy2_analyze(tok))\n]\npymorphy2_noprob_disambig_errors = [\n (tok, gr) for tok, gr in tokens \n if not first_correct(gr, pymorphy2_analyze(tok, prob=False))\n]\npymorphy2_POS_disambig_errors = [\n (tok, gr) for tok, gr in tokens \n if not first_POS_correct(gr, pymorphy2_analyze(tok))\n]\npymorphy2_noprob_POS_disambig_errors = [\n (tok, gr) for tok, gr in tokens \n if not first_POS_correct(gr, pymorphy2_analyze(tok, prob=False))\n]\n\ndef perc_txt(errors):\n percent = 100 - (len(errors) / len(tokens) * 100)\n return \"%0.1f%%\" % percent\n\nprint(\"pymorphy2 context-unaware disambiguation, % of correct analyses\\n\")\n\nprint(\"no P(tag|word): %s (full tagset), %s (POS only)\" % (\n perc_txt(pymorphy2_noprob_disambig_errors), \n perc_txt(pymorphy2_noprob_POS_disambig_errors)))\n\nprint(\"with P(tag|word): %s (full tagset), %s (POS only)\" % (\n perc_txt(pymorphy2_disambig_errors), \n perc_txt(pymorphy2_POS_disambig_errors)))", "metadata": {"trusted": true, "collapsed": false}, "cell_type": "code", "outputs": [{"output_type": "stream", "name": "stdout", "text": "pymorphy2 context-unaware disambiguation, % of correct analyses\n\nno P(tag|word): 72.5% (full tagset), 86.1% (POS only)\nwith P(tag|word): 81.7% (full tagset), 93.7% (POS only)\n"}]}, {"execution_count": null, "source": "", "metadata": {"trusted": true, "collapsed": true}, "cell_type": "code", "outputs": []}], "nbformat": 4, "nbformat_minor": 0, "metadata": {"kernelspec": {"language": "python", "name": "python3", "display_name": "Python 3"}, "language_info": {"name": "python", "codemirror_mode": {"name": "ipython", "version": 3}, "pygments_lexer": "ipython3", "nbconvert_exporter": "python", "file_extension": ".py", "mimetype": "text/x-python", "version": "3.4.2"}}}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment