Skip to content

Instantly share code, notes, and snippets.

@menzenski
Last active December 25, 2015 20:39
Show Gist options
  • Select an option

  • Save menzenski/7036964 to your computer and use it in GitHub Desktop.

Select an option

Save menzenski/7036964 to your computer and use it in GitHub Desktop.
Define a Python class and methods for locating diminutive nouns in a Russian text.
#! /usr/bin/env python
# -*- coding: utf-8 -*-
import nltk
from nltk.stem import SnowballStemmer
from nltk import FreqDist
import codecs
def print_list(mylist):
    """Print a list of strings in readable form, e.g. ``[кот, дом]``.

    Printing a list directly on Python 2 shows unicode elements as escape
    sequences; this prints the characters themselves, comma-separated and
    wrapped in square brackets.

    Args:
        mylist: iterable of text strings. Byte strings are also accepted
            and are decoded as UTF-8 (the original implicitly decoded them
            as ASCII and raised UnicodeDecodeError on non-ASCII bytes).
    """
    words = []
    for word in mylist:
        # On Python 2 ``bytes`` is ``str``; on Python 3 it is the real bytes
        # type. Either way, normalise to text before joining.
        if isinstance(word, bytes):
            word = word.decode('utf8')
        words.append(word)
    line = u'[' + u', '.join(words) + u']'
    if str is bytes:
        # Python 2 only: emit UTF-8 bytes, exactly as the original did, so
        # the output does not depend on the terminal's default codec.
        line = line.encode('utf8')
    # Parenthesised single argument is valid as both the Python 2 print
    # statement and the Python 3 print function.
    print(line)
class DiminutiveFinder(object):
    """Collect candidate diminutive word stems from a Russian text.

    A text is tokenized with ``nltk.word_tokenize`` and every token is
    stemmed with NLTK's Russian Snowball stemmer. The stems are then either
    matched against a tuple of suffixes (``find_diminutives``) or looked up
    in a caller-supplied collection of known stems
    (``find_verified_diminutives``).

    Attributes:
        text: the text passed to the constructor; used by the ``find_*``
            methods when they are called without an explicit text.
        verified_dims: set of stems accepted by
            ``find_verified_diminutives``.
        diminutive_endings: tuple of suffixes tested by ``find_diminutives``.
        tokens: tokens of the most recently processed text.
        stemlist: stems of ``tokens``, in the same order.
        diminutives: stems matched by the last ``find_diminutives`` call.
        verified_diminutives: stems matched by the last
            ``find_verified_diminutives`` call.
        dim_fd: ``FreqDist`` of ``diminutives`` (``None`` until computed).
        ver_dim_fd: ``FreqDist`` of ``verified_diminutives`` (``None`` until
            computed).
    """

    def __init__(self, text, verified_dims=None):
        """Store the text and set up empty result containers.

        Args:
            text: the text to analyse.
            verified_dims: optional iterable of stems to be recognised by
                ``find_verified_diminutives``. They are compared against
                stemmer output, so they must be stems rather than full word
                forms. Defaults to an empty set. (The original never
                assigned this attribute, so the lookup method always raised
                AttributeError.)
        """
        self.text = text
        if verified_dims is None:
            verified_dims = ()
        # A set gives constant-time membership tests in the lookup loop.
        self.verified_dims = set(
            self._to_text(stem) for stem in verified_dims)
        self.verified_diminutives = []
        self.diminutives = []
        self.stemlist = []
        self.tokens = []
        # Defined up front so reading them before a find_* call yields None
        # instead of raising AttributeError.
        self.dim_fd = None
        self.ver_dim_fd = None
        # possible endings of diminutive stems
        # (a few suffixes appear in more than one group; repeats are
        # harmless because str.endswith only needs one match)
        self.diminutive_endings = (
            # first degree of expressiveness
            # u"к",  # disabled in the original; presumably too broad - confirm
            u"ик",
            u"чик",
            u"ок",  # u"ек" disabled in the original
            u"ец", u"иц",
            u"енок", u"онок", u"еныш",
            u"инк", u"инок",
            u"ыш",
            # u"ц",  # disabled in the original
            # second degree of expressiveness
            u"еньк",
            u"оньк",
            u"ушк", u"юшк", u"ушек", u"юшек",
            u"уш", u"юш",
            u"ышк", u"ышек",
            u"ишк", u"ишек",
            u"ашк", u"ашек",
            u"онка", u"енка", u"онок", u"енок",
            # third degree of expressiveness
            u"очк", u"ечк", u"очок", u"ечок",
            u"ичк", u"ичок",
            u"очек", u"ечек",
            u"оночек", u"еночек", u"оночк", u"еночк",
            u"иночк", u"иночек", u"иночок",
            u"ишечк", u"ишечек", u"ишечок", u"ушечк", u"ушечек", u"ушечок",
            # last entry was u"юшечшк" in the original: a typo, corrected to
            # follow the parallel -ишечок / -ушечок pattern above
            u"юшечк", u"юшечек", u"юшечок",
            # diminutive adjectives
            u"еньк", u"оньк",
            u"ехоньк", u"охоньк", u"ешеньк", u"ошеньк",
            u"юсеньк", u"усеньк",
            u"оват", u"еват",
            u"оватеньк", u"еватеньк",
        )

    @staticmethod
    def _to_text(value):
        """Return *value* as a text string, decoding byte strings as UTF-8."""
        # ``bytes`` is ``str`` on Python 2, so this also covers plain
        # Python 2 byte strings there; text strings pass through unchanged.
        if isinstance(value, bytes):
            return value.decode('utf8')
        return value

    def _stem_text(self, text):
        """Tokenize and stem *text*, replacing ``tokens`` and ``stemlist``."""
        if text is None:
            text = self.text
        self.tokens = nltk.word_tokenize(self._to_text(text))
        stemmer = SnowballStemmer("russian")
        # Rebuilt from scratch on every call: the original appended to the
        # old list, so stems from earlier calls were scanned (and counted)
        # again.
        self.stemlist = [
            stemmer.stem(self._to_text(token)) for token in self.tokens]

    def find_diminutives(self, text=None):
        """Collect stems that end in one of ``diminutive_endings``.

        Results are stored in ``self.diminutives`` (matching stems in text
        order, repeats included) and ``self.dim_fd`` (a ``FreqDist`` built
        from that list). Results of any earlier call are replaced.

        Args:
            text: text to analyse; defaults to the text given to the
                constructor.
        """
        self._stem_text(text)
        # str.endswith requires a tuple; coerce in case a caller replaced
        # the attribute with another kind of sequence.
        endings = tuple(self.diminutive_endings)
        self.diminutives = [
            stem for stem in self.stemlist if stem.endswith(endings)]
        self.dim_fd = FreqDist(self.diminutives)

    def find_verified_diminutives(self, text=None):
        """Collect stems that are present in ``verified_dims``.

        Results are stored in ``self.verified_diminutives`` (matching stems
        in text order, repeats included) and ``self.ver_dim_fd`` (a
        ``FreqDist`` built from that list). Results of any earlier call are
        replaced.

        Args:
            text: text to analyse; defaults to the text given to the
                constructor.
        """
        self._stem_text(text)
        known = self.verified_dims
        self.verified_diminutives = [
            stem for stem in self.stemlist if stem in known]
        self.ver_dim_fd = FreqDist(self.verified_diminutives)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment