Last active
December 11, 2015 12:48
-
-
Save radixvinni/4602908 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Grammeme categories mapped to the grammeme tags that belong to each one.
# NOTE(review): the tags look like the OpenCorpora tagset -- confirm.
# NOTE(review): the "anim" key is lowercase while its first value is "ANim";
# every other key uses the mixed-case category spelling. Left as-is because
# callers may look the key up by its current spelling.
groups = {
    "POST": [
        "NOUN", "ADJF", "ADJS", "COMP", "VERB", "INFN", "PRTF", "PRTS",
        "GRND", "NUMR", "ADVB", "NPRO", "PRED", "PREP", "CONJ", "PRCL",
        "INTJ",
    ],
    "anim": ["ANim", "anim", "inan", "Inmx"],
    "GNdr": ["masc", "femn", "neut", "Ms-f"],
    "NMbr": ["sing", "plur", "Sgtm", "Pltm", "Fixd"],
    # "ablt" used to be listed twice here; each tag now appears once.
    "CAse": [
        "nomn", "gent", "datv", "accs", "ablt", "loct", "voct",
        "gen1", "gen2", "acc2", "loc1", "loc2",
    ],
    "ASpc": ["perf", "impf"],
    "TRns": ["tran", "intr"],
    "PErs": ["1per", "2per", "3per"],
    "TEns": ["pres", "past", "futr"],
    "MOod": ["indc", "impr"],
    "INvl": ["incl", "excl"],
    "VOic": ["actv", "pssv"],
}
# -*- coding: utf-8 -*- | |
from __future__ import unicode_literals | |
from funcparserlib.parser import some, a, many, skip, finished, maybe | |
import re | |
import codecs | |
import sys | |
from morph import MorphDict, MorphInfo | |
# Module-wide morphological dictionary; tokenize() indexes it by word.
ru = MorphDict()
# Pattern for a string made up solely of Unicode word characters.
splitter = re.compile(r'^\w+$', re.UNICODE)
# One token: optional leading non-word characters, a word, optional trailing
# non-word characters, terminated by a whitespace character or the end of the
# line. Accepting the end of the line keeps the final token of a line that
# has no trailing newline (previously it was silently dropped).
_TOKEN_RE = re.compile(r'(\W*)(\S*?)(\W*?)(?:\s|\Z)', flags=re.U)


def tokenize(test):
    """Turn lines of text into a flat list of morphological symbols.

    Args:
        test: iterable of text lines (each item is scanned separately).

    Returns:
        A list with one entry per token, in reading order: words are looked
        up in the module-level ``ru`` dictionary, punctuation runs become
        ``MorphInfo(["PUNCT"], text)``.
    """
    symbols = []
    for line in test:
        for leading, word, trailing in _TOKEN_RE.findall(line):
            # Leading runs that are pure whitespace are not tokens.
            if leading.lstrip():
                symbols.append(MorphInfo(["PUNCT"], leading))
            if word:
                symbols.append(ru[word])
            if trailing:
                symbols.append(MorphInfo(["PUNCT"], trailing))
    return symbols
def tok(t):
    """Return a predicate that calls ``x.isA(t)`` on its argument."""
    return lambda x: x.isA(t)


def tok2(t, tt):
    """Return a predicate that calls ``x.isBoth([t, tt])`` on its argument."""
    return lambda x: x.isBoth([t, tt])


def const(t):
    """Return a one-argument function that ignores its input and yields ``t``."""
    return lambda _: t


def punct_word(t):
    """Return a predicate that is true when ``x.word`` equals ``t``.

    This factory used to be bound to the name ``punct``, but that name is
    rebound to ``tok('PUNCT')`` further down in this module, which made the
    factory unreachable. It now lives under a name of its own.
    """
    return lambda x: x.word == t


def concord(t):
    """Return a function that calls ``x.concord(t)`` on its argument."""
    return lambda x: x.concord(t)
# Part-of-speech predicates: each one is tok(<tag>) bound to the lowercased tag.
noun = tok('NOUN')  # noun
unknown = tok('UNKNOWN')  # unknown
punct = tok('PUNCT')  # punctuation
verb = tok('VERB')  # verb (finite form)
adjf = tok('ADJF')  # adjective (full form)
adjs = tok('ADJS')  # adjective (short form)
comp = tok('COMP')  # comparative
infn = tok('INFN')  # verb (infinitive)
prtf = tok('PRTF')  # participle (full form)
prts = tok('PRTS')  # participle (short form)
prfs = prts  # original misspelled name, kept for backward compatibility
grnd = tok('GRND')  # gerund (adverbial participle)
numr = tok('NUMR')  # numeral
advb = tok('ADVB')  # adverb
npro = tok('NPRO')  # pronoun-noun
pred = tok('PRED')  # predicative
prep = tok('PREP')  # preposition
conj = tok('CONJ')  # conjunction
prcl = tok('PRCL')  # particle
intj = tok('INTJ')  # interjection

# Agreement helpers: each one is concord(<category>).
case = concord('CAse')  # agreement in case
nmbr = concord('NMbr')  # agreement in number
gndr = concord('GNdr')  # agreement in gender
pers = concord('PErs')  # agreement in person
tens = concord('TEns')  # agreement in tense
mood = concord('MOod')  # agreement in mood
voic = concord('VOic')  # agreement in voice
class mkplural(MorphInfo):
    """MorphInfo subclass whose class-level ``tags`` list is ``['plural']``."""

    # NOTE(review): how MorphInfo consumes ``tags`` is not visible here.
    tags = ["plural"]
# Predicate: a noun in the accusative case.
np_accs = tok2('NOUN', 'accs')

# Verb phrase: a verb followed by an accusative noun. The predicates are plain
# functions, so they must be wrapped in some() to become parsers before they
# can be combined with `+` (adding two functions raises TypeError).
vp = some(verb) + some(np_accs)

# Sentence: a noun followed by a verb phrase, with the parse result passed
# through the number, gender and person agreement helpers.
# NOTE(review): the helpers call .concord() on whatever the parser yields;
# confirm that the parsed result supports it.
sentance = some(noun) + vp >> nmbr >> gndr >> pers
sentence = sentance  # correctly spelled alias; the original name is kept

# Parser that accepts any single token.
smth = some(const(True))

# Note: there is no sentence-separator definition (no lookahead).
# Sentence starts are recognized at the next stage of the analysis.
text = many(smth)
# NOTE(review): the doctest below calls ``s``, which is not defined in the
# visible part of this file, and the original indentation of the expected
# output was lost -- confirm both before running it.
def parser_test():
    """
    Parser tests.
    >>> s('Мама мыла раму')
    <s>
    <np>
    <n g="NOUN,...">Мама</n>
    </np>
    <vp>
    <v g="...">мыла</v>
    <n g="...">раму</n>
    </vp>
    </s>
    """
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment