Skip to content

Instantly share code, notes, and snippets.

@menzenski
Last active December 25, 2015 20:39
Show Gist options
  • Select an option

  • Save menzenski/7036775 to your computer and use it in GitHub Desktop.

Select an option

Save menzenski/7036775 to your computer and use it in GitHub Desktop.
Find specific diminutive nouns in a Russian text.
#! /usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import division
import nltk
from nltk.stem import SnowballStemmer
from nltk import FreqDist
import codecs
import glob
from diminutivefinder_04_asclass import DiminutiveFinder
import matplotlib.pyplot as plt
from collections import OrderedDict
def save_results_as_latex_table(dictionary, filename):
    """
    Write a stem -> frequency mapping as the rows of a LaTeX table.

    Every entry becomes one line holding the stem, its frequency and an
    empty trailing cell, separated by ``&`` and terminated by a LaTeX row
    break, so the file can be pasted into a ``tabular`` body. Rows are
    written in the mapping's own iteration order; pass an ordered mapping
    if a particular order is wanted.

    dictionary -- mapping of stem (text) to its frequency
    filename   -- output path without extension; '.txt' is appended
    """
    # generate a file name for the saved list of diminutive stems
    destination_file = filename + '.txt'
    # write one table row per stem; the file is UTF-8 because the stems
    # are Cyrillic text
    with codecs.open(destination_file, "w", encoding="utf-8") as stream:
        # items() yields each stem with its count directly, avoiding a
        # second lookup per key
        for word, freq in dictionary.items():
            stream.write("%s & %r & \\\\ \n" % (word, freq))
# Stems accepted as genuine diminutives. main() compares Snowball-stemmed
# tokens against this tuple, so the entries are stems rather than full
# word forms.
# NOTE(review): u'«лампочк' carries a leading guillemet, presumably because
# the tokenizer leaves the opening quote attached to the word; confirm.
verified_dims = (
    u'уголок', u'кусочк', u'столик', u'лампочк',
    u'чемоданчик', u'ленточк', u'вещиц', u'чертик',
    u'палочк', u'юбочк', u'кружок', u'книжечк',
    u'пузыречк', u'ложечк', u'стеклышк', u'домик',
    u'звездочк', u'спинк', u'старичок', u'«лампочк',
    u'подвальчик', u'шуточк', u'кусочек', u'дождик',
    u'чашечк', u'пятнышк', u'гвоздик', u'кучечк',
    u'близехоньк', u'окошечк', u'хвостик', u'котик',
    u'колесик', u'глазок', u'мешочк', u'волосок',
    u'шляпочк', u'клеточк', u'мышк', u'ниточк',
    u'домишк', u'бантик', u'коробочк',
)
def main(textfile='TMaM_CompleteText.txt',
         output_name='actual_diminutives_total'):
    """
    Count the verified diminutive stems in a Russian text and save them.

    The text is read as UTF-8, tokenized with NLTK, and every token is
    reduced to its stem with the Russian Snowball stemmer. Stems listed in
    ``verified_dims`` are tallied, ordered by stem, and written out through
    ``save_results_as_latex_table``.

    textfile    -- path of the UTF-8 text to analyse
    output_name -- output path without extension (the table writer
                   appends '.txt')
    """
    # Use a context manager so the input file is closed as soon as it has
    # been read instead of being left open for the rest of the run.
    with codecs.open(textfile, encoding="utf8") as data:
        novel_text = data.read()

    stemmer = SnowballStemmer("russian")
    # The text was decoded on read, so tokens are already unicode text and
    # need no further conversion before stemming.
    stemlist = [stemmer.stem(token) for token in nltk.word_tokenize(novel_text)]

    # Build the lookup set once; membership is tested for every token.
    known_dims = frozenset(verified_dims)
    dim_freq = FreqDist([stem for stem in stemlist if stem in known_dims])

    # OrderedDict needs (key, value) pairs: sort the (stem, count) items by
    # stem. Sorting only the keys would pass bare strings and raise
    # ValueError, losing the counts.
    sorted_dims = OrderedDict(
        sorted(dim_freq.items(), key=lambda pair: pair[0])
    )
    save_results_as_latex_table(sorted_dims, output_name)


# Run the analysis only when the file is executed as a script, not when it
# is imported.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment