Last active
December 25, 2015 20:39
-
-
Save menzenski/7036775 to your computer and use it in GitHub Desktop.
Find specific diminutive nouns in a Russian text.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #! /usr/bin/env python | |
| # -*- coding: utf-8 -*- | |
| from __future__ import division | |
| import nltk | |
| from nltk.stem import SnowballStemmer | |
| from nltk import FreqDist | |
| import codecs | |
| import glob | |
| from diminutivefinder_04_asclass import DiminutiveFinder | |
| import matplotlib.pyplot as plt | |
| from collections import OrderedDict | |
def save_results_as_latex_table(dictionary, filename):
    """
    Write every entry of ``dictionary`` as one LaTeX table row.

    The output goes to ``filename`` + '.txt', encoded as UTF-8. Each
    row holds the key, the repr of its value and an empty third cell,
    and ends with the LaTeX row terminator.
    """
    # the output file is named after the argument, with '.txt' appended
    target_path = filename + '.txt'
    row_template = "%s & %r & \\\\ \n"
    with codecs.open(target_path, "w", encoding="utf-8") as out:
        # rows follow the mapping's own iteration order
        for stem, count in dictionary.items():
            out.write(row_template % (stem, count))
# Stems that main() counts when they occur in the stemmed text.
# NOTE(review): u'«лампочк' carries a leading guillemet; presumably it
# matches tokens where the opening quote stayed attached -- confirm.
verified_dims = (
    u'уголок', u'кусочк', u'столик', u'лампочк',
    u'чемоданчик', u'ленточк', u'вещиц', u'чертик',
    u'палочк', u'юбочк', u'кружок', u'книжечк',
    u'пузыречк', u'ложечк', u'стеклышк', u'домик',
    u'звездочк', u'спинк', u'старичок', u'«лампочк',
    u'подвальчик', u'шуточк', u'кусочек', u'дождик',
    u'чашечк', u'пятнышк', u'гвоздик', u'кучечк',
    u'близехоньк', u'окошечк', u'хвостик', u'котик',
    u'колесик', u'глазок', u'мешочк', u'волосок',
    u'шляпочк', u'клеточк', u'мышк', u'ниточк',
    u'домишк', u'бантик', u'коробочк',
)
def main():
    """
    Count the verified diminutive stems in the novel and save the counts.

    Reads 'TMaM_CompleteText.txt' (UTF-8), tokenizes it with NLTK,
    reduces every token to its Russian Snowball stem, keeps the stems
    listed in ``verified_dims`` and passes their frequencies, ordered by
    stem, to ``save_results_as_latex_table`` under the base name
    'actual_diminutives_total'.
    """
    textfile = 'TMaM_CompleteText.txt'
    # the context manager closes the file handle once the text is read
    with codecs.open(textfile, encoding="utf8") as data:
        novel_text = data.read()

    tokens = nltk.word_tokenize(novel_text)
    stemmer = SnowballStemmer("russian")
    # codecs.open already decoded the text, so the tokens need no
    # further conversion before stemming
    stemlist = [stemmer.stem(item) for item in tokens]

    # build the lookup set once instead of scanning the tuple per token
    known_dims = frozenset(verified_dims)
    list_of_diminutives = [word for word in stemlist if word in known_dims]
    dim_freq = FreqDist(list_of_diminutives)

    # items(), not keys(): OrderedDict needs (stem, count) pairs, and
    # bare stem strings make it raise ValueError
    sorted_dims = OrderedDict(sorted(dim_freq.items(), key=lambda t: t[0]))
    save_results_as_latex_table(sorted_dims, 'actual_diminutives_total')


# run the analysis only when the file is executed as a script
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment