Last active
December 25, 2015 20:39
-
-
Save menzenski/7036964 to your computer and use it in GitHub Desktop.
Define a Python class and methods for locating diminutive nouns in a Russian text.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #! /usr/bin/env python | |
| # -*- coding: utf-8 -*- | |
| import nltk | |
| from nltk.stem import SnowballStemmer | |
| from nltk import FreqDist | |
| import codecs | |
def print_list(mylist):
    """Print a list of (unicode) strings in a readable ``[a, b, c]`` form.

    Printing the list object itself would show escaped code points for
    non-ASCII items; this prints the items themselves, separated by ", "
    and wrapped in square brackets.

    Args:
        mylist: iterable of text strings.
    """
    line = u'[' + u', '.join(mylist) + u']'
    # On Python 2 (where ``str`` is the byte-string type) encode once at the
    # output boundary, as the original did, so the output does not depend on
    # the encoding of stdout. On Python 3 ``print`` handles text natively.
    if str is bytes:
        line = line.encode('utf8')
    # A single parenthesised argument is valid for both the Python 2 print
    # statement and the Python 3 print function.
    print(line)
class DiminutiveFinder(object):
    """Find candidate diminutive words in a Russian text.

    The text is tokenised with ``nltk.word_tokenize`` and every token is
    stemmed with NLTK's Russian Snowball stemmer. A stem counts as a
    candidate diminutive when it ends with one of ``diminutive_endings``;
    it counts as a verified diminutive when it is a member of
    ``verified_dims``.

    Attributes:
        text: the text handed to the constructor; used by the ``find_*``
            methods when they are called without an explicit text.
        tokens: tokens of the most recently analysed text.
        stemlist: stems of the most recently analysed text, one per token.
        diminutives: stems matched by the last ``find_diminutives`` call.
        verified_diminutives: stems matched by the last
            ``find_verified_diminutives`` call.
        dim_fd: ``FreqDist`` over ``diminutives`` (``None`` until
            ``find_diminutives`` has run).
        ver_dim_fd: ``FreqDist`` over ``verified_diminutives`` (``None``
            until ``find_verified_diminutives`` has run).
        verified_dims: collection of stems accepted as verified diminutives.
        diminutive_endings: tuple of stem endings treated as diminutive.
    """

    # Possible endings of diminutive stems. Exact duplicates between groups
    # are listed only once; that does not change what ``endswith`` matches.
    # NOTE(review): the bare endings "к", "ек" and "ц" were disabled in the
    # original list -- presumably because they match too many ordinary
    # stems; confirm before re-enabling.
    DIMINUTIVE_ENDINGS = (
        # first degree of expressiveness
        u"ик",
        u"чик",
        u"ок",
        u"ец", u"иц",
        u"енок", u"онок", u"еныш",
        u"инк", u"инок",
        u"ыш",
        # second degree of expressiveness
        # (also used by diminutive adjectives: "еньк", "оньк")
        u"еньк",
        u"оньк",
        u"ушк", u"юшк", u"ушек", u"юшек",
        u"уш", u"юш",
        u"ышк", u"ышек",
        u"ишк", u"ишек",
        u"ашк", u"ашек",
        u"онка", u"енка",
        # third degree of expressiveness
        u"очк", u"ечк", u"очок", u"ечок",
        u"ичк", u"ичок",
        u"очек", u"ечек",
        u"оночек", u"еночек", u"оночк", u"еночк",
        u"иночк", u"иночек", u"иночок",
        u"ишечк", u"ишечек", u"ишечок", u"ушечк", u"ушечек", u"ушечок",
        # the last entry used to read "юшечшк", a typo for "юшечок"
        u"юшечк", u"юшечек", u"юшечок",
        # diminutive adjectives
        u"ехоньк", u"охоньк", u"ешеньк", u"ошеньк",
        u"юсеньк", u"усеньк",
        u"оват", u"еват",
        u"оватеньк", u"еватеньк",
    )

    def __init__(self, text, verified_dims=None):
        """Store the text and set up empty result containers.

        Args:
            text: the (unicode) text to analyse.
            verified_dims: optional iterable of stems that are known to be
                diminutives. Defaults to an empty set; the attribute may
                also be assigned after construction.
        """
        self.text = text
        # Previously this attribute was never created, so
        # find_verified_diminutives() always failed with AttributeError.
        self.verified_dims = (
            set(verified_dims) if verified_dims is not None else set()
        )
        self.verified_diminutives = []
        self.diminutives = []
        self.stemlist = []
        self.tokens = []
        self.dim_fd = None
        self.ver_dim_fd = None
        self.diminutive_endings = self.DIMINUTIVE_ENDINGS

    def _stem_text(self, text):
        """Tokenise and stem ``text``; refresh ``tokens`` and ``stemlist``.

        Falls back to ``self.text`` when ``text`` is ``None``. The lists are
        rebuilt rather than extended so that repeated calls do not
        double-count stems from earlier calls.
        """
        if text is None:
            text = self.text
        self.tokens = nltk.word_tokenize(text)
        stemmer = SnowballStemmer("russian")
        # NOTE(review): the original wrapped each token in unicode(), which
        # exists only on Python 2; tokens are assumed to be text already.
        self.stemlist = [stemmer.stem(token) for token in self.tokens]
        return self.stemlist

    def find_diminutives(self, text=None):
        """Collect stems that end with one of the diminutive endings.

        Results are stored in ``self.diminutives`` and their frequency
        distribution in ``self.dim_fd``.

        Args:
            text: text to analyse; defaults to the text given to the
                constructor.
        """
        # str.endswith requires a tuple, so tolerate a list assigned later.
        endings = tuple(self.diminutive_endings)
        self.diminutives = [
            stem for stem in self._stem_text(text) if stem.endswith(endings)
        ]
        self.dim_fd = FreqDist(self.diminutives)

    def find_verified_diminutives(self, text=None):
        """Collect stems that are members of ``self.verified_dims``.

        Results are stored in ``self.verified_diminutives`` and their
        frequency distribution in ``self.ver_dim_fd``.

        Args:
            text: text to analyse; defaults to the text given to the
                constructor.
        """
        verified = self.verified_dims
        self.verified_diminutives = [
            stem for stem in self._stem_text(text) if stem in verified
        ]
        self.ver_dim_fd = FreqDist(self.verified_diminutives)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment