Skip to content

Instantly share code, notes, and snippets.

@radixvinni
Last active December 11, 2015 12:48
Show Gist options
  • Save radixvinni/4602908 to your computer and use it in GitHub Desktop.
Save radixvinni/4602908 to your computer and use it in GitHub Desktop.
# Grammeme groups: maps a category name to the list of grammeme tags that
# belong to that category.
# NOTE(review): the "anim" key is lower-case while its list contains "ANim";
# every other key follows the capitalised pattern ("GNdr", "NMbr", ...).
# Left untouched because callers may look the key up as written - confirm.
groups = {
    "POST": [
        "NOUN", "ADJF", "ADJS", "COMP", "VERB", "INFN", "PRTF", "PRTS",
        "GRND", "NUMR", "ADVB", "NPRO", "PRED", "PREP", "CONJ", "PRCL",
        "INTJ",
    ],
    "anim": ["ANim", "anim", "inan", "Inmx"],
    "GNdr": ["masc", "femn", "neut", "Ms-f"],
    "NMbr": ["sing", "plur", "Sgtm", "Pltm", "Fixd"],
    # "ablt" used to be listed twice here; each case now appears once.
    "CAse": [
        "nomn", "gent", "datv", "accs", "ablt", "loct", "voct",
        "gen1", "gen2", "acc2", "loc1", "loc2",
    ],
    "ASpc": ["perf", "impf"],
    "TRns": ["tran", "intr"],
    "PErs": ["1per", "2per", "3per"],
    "TEns": ["pres", "past", "futr"],
    "MOod": ["indc", "impr"],
    "INvl": ["incl", "excl"],
    "VOic": ["actv", "pssv"],
}
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
from funcparserlib.parser import some, a, many, skip, finished, maybe
import re
import codecs
import sys
from morph import MorphDict, MorphInfo
# Morphological dictionary instance; tokenize() looks each word up in it
# via ru[word].
ru = MorphDict()
# Matches a string that consists entirely of word characters (Unicode-aware).
# NOTE(review): not referenced anywhere in the visible part of this file.
splitter = re.compile(r'^\w+$', flags=re.U)
# One token per match: optional leading punctuation, the word itself and
# optional trailing punctuation, terminated by a whitespace character or by
# the end of the line (so the last word is kept even without a newline).
_TOKEN_RE = re.compile(r'(\W*)(\S*?)(\W*?)(?:\s|$)', flags=re.U)


def tokenize(test):
    """Split lines of text into a flat list of morphological symbols.

    Args:
        test: iterable of text lines (e.g. an open file or a list of strings).

    Returns:
        A list with one entry per token, in reading order. Punctuation
        becomes ``MorphInfo(["PUNCT"], text)``; every word becomes whatever
        ``ru[word]`` returns.
    """
    symbols = []
    for line in test:
        for lead, word, trail in _TOKEN_RE.findall(line):
            # The leading group is greedy over \W and therefore may carry
            # surrounding whitespace; store only the punctuation itself.
            lead = lead.strip()
            if lead:
                symbols.append(MorphInfo(["PUNCT"], lead))
            if word:
                symbols.append(ru[word])
            if trail:
                symbols.append(MorphInfo(["PUNCT"], trail))
    return symbols
def tok(t):
    """Return a predicate that calls ``x.isA(t)`` on its argument."""
    def has_tag(x):
        return x.isA(t)
    return has_tag


def tok2(t, tt):
    """Return a predicate that calls ``x.isBoth([t, tt])`` on its argument."""
    def has_both(x):
        return x.isBoth([t, tt])
    return has_both


def const(t):
    """Return a one-argument function that ignores its input and yields *t*."""
    def always(_):
        return t
    return always


# NOTE: the name `punct` is rebound further down by `punct = tok('PUNCT')`,
# so after the module is loaded this word-matching factory is no longer
# reachable under this name.
def punct(t):
    """Return a predicate that is true when ``x.word`` equals *t*."""
    def same_word(x):
        return x.word == t
    return same_word


def concord(t):
    """Return a function that calls ``x.concord(t)`` on its argument."""
    def agree(x):
        return x.concord(t)
    return agree
# Part-of-speech predicates: each one is tok(<tag>), i.e. a function that
# calls x.isA(<tag>) on the token it is given.
noun = tok('NOUN')  # noun
unknown = tok('UNKNOWN')  # unknown word
# NOTE: this rebinds `punct`, replacing the word-matching factory of the same
# name defined just above.
punct = tok('PUNCT')  # punctuation
verb = tok('VERB')  # verb (finite form)
adjf = tok('ADJF')  # adjective (full form)
adjs = tok('ADJS')  # adjective (short form)
comp = tok('COMP')  # comparative
infn = tok('INFN')  # verb (infinitive)
prtf = tok('PRTF')  # participle (full form)
prts = tok('PRTS')  # participle (short form)
prfs = prts  # misspelled original name, kept so existing references still work
grnd = tok('GRND')  # gerund (adverbial participle)
numr = tok('NUMR')  # numeral
advb = tok('ADVB')  # adverb
npro = tok('NPRO')  # pronoun (noun-like)
pred = tok('PRED')  # predicative
prep = tok('PREP')  # preposition
conj = tok('CONJ')  # conjunction
prcl = tok('PRCL')  # particle
intj = tok('INTJ')  # interjection
# Agreement checks: each one is concord(<group name>), i.e. a function that
# calls x.concord(<group name>) on the value it is given.
case = concord('CAse') # agreement in case
nmbr = concord('NMbr') # agreement in number
gndr = concord('GNdr') # agreement in gender
pers = concord('PErs') # agreement in person
tens = concord('TEns') # agreement in tense
mood = concord('MOod') # agreement in mood
voic = concord('VOic') # agreement in voice
class mkplural(MorphInfo):
    """MorphInfo subclass whose class-level ``tags`` list is ``['plural']``."""

    tags = ["plural"]
# Predicate for a token that is both NOUN and accs (accusative noun).
np_accs = tok2('NOUN', 'accs')
# Verb phrase: a verb followed by an accusative noun. `verb` and `np_accs`
# are plain predicate functions, so they must be wrapped in some() to become
# parsers before `+` can sequence them (function + function is a TypeError).
vp = some(verb) + some(np_accs)
# Sentence: a noun followed by a verb phrase; the parsed value is then passed
# through the number, gender and person agreement functions in turn.
# NOTE(review): each `>>` hands the parsed value to x.concord(...); whether
# that value exposes concord() cannot be seen from here - confirm.
sentance = some(noun) + vp >> nmbr >> gndr >> pers
# Parser that accepts any single token.
smth = some(const(True))
# Note: there is no definition of a sentence separator (no lookahead).
# Sentence starts will be recognised at the next stage of analysis.
text = many(smth)
def parser_test():
    """Hold the parser doctest; the function body itself does nothing.

    NOTE(review): ``s`` is not defined in the visible part of this file.

    >>> s('Мама мыла раму')
    <s>
    <np>
    <n g="NOUN,...">Мама</n>
    </np>
    <vp>
    <v g="...">мыла</v>
    <n g="...">раму</n>
    </vp>
    </s>
    """
    return None
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment