Created
October 18, 2013 05:15
-
-
Save menzenski/7036795 to your computer and use it in GitHub Desktop.
Automate a search of slovari.yandex.ru
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #! /usr/bin/env python | |
| # -*- coding: utf-8 -*- | |
| import nltk | |
| from nltk.stem import SnowballStemmer | |
| from nltk import FreqDist | |
| import codecs | |
| import glob | |
| from diminutivefinder_04_asclass import DiminutiveFinder | |
| import matplotlib.pyplot as plt | |
| from collections import OrderedDict | |
| import urllib | |
| import sgmllib | |
| import lxml.html | |
def print_list(mylist):
    """Write the items of ``mylist`` to stdout as one bracketed line.

    Every item is encoded to UTF-8, the encoded items are joined with
    ", " and the result is printed between "[" and "]".
    """
    encoded_items = [item.encode('utf8') for item in mylist]
    # A single parenthesised expression, so this is still the plain
    # print statement used throughout this Python 2 script.
    print('[' + ', '.join(encoded_items) + ']')
# Search terms whose slovari.yandex.ru results page contained the
# diminutive marker; yandex_search() appends to this list.
verified_dims = []


def yandex_search(search_term):
    """Look up ``search_term`` on slovari.yandex.ru and record a match.

    The results page for the term is downloaded and scanned for a byte
    marker. When the marker is present, ``search_term`` is appended to
    the module-level ``verified_dims`` list.

    Args:
        search_term: the word to look up. It is encoded to UTF-8 before
            being placed in the URL, so it is expected to be a unicode
            string.

    Returns:
        None. The result is reported through ``verified_dims``.

    Raises:
        Whatever ``urllib.urlopen`` / ``read`` raise on a failed request
        (not caught here).
    """
    # Percent-encode the UTF-8 bytes so the request URL is plain ASCII;
    # safe='' also escapes any '/' inside the term itself.
    quoted_term = urllib.quote(search_term.encode('utf8'), safe='')
    search_url = 'http://slovari.yandex.ru/' + quoted_term + '/en/#lingvo/'

    results_page = urllib.urlopen(search_url)
    try:
        results_html = results_page.read()
    finally:
        # Release the connection even if reading the body fails.
        results_page.close()

    # The page is not parsed as HTML; a raw substring check is enough.
    # These are the UTF-8 bytes of the Cyrillic stem "уменьш" (the start
    # of "уменьшительная форма", i.e. "diminutive form").
    dim_tag = '\xd1\x83\xd0\xbc\xd0\xb5\xd0\xbd\xd1\x8c\xd1\x88'
    if dim_tag in results_html:
        verified_dims.append(search_term)
def main():
    """Find diminutive candidates in the corpus and verify them online.

    Reads "TMaM_CompleteText.txt" as UTF-8, runs ``DiminutiveFinder``
    over the text, calls ``yandex_search`` once for every distinct entry
    of ``dims.diminutives`` and finally prints ``verified_dims``.
    """
    # The with-block closes the corpus file as soon as it has been read;
    # previously the handle was left open for the rest of the run.
    with codecs.open("TMaM_CompleteText.txt", encoding="utf8") as data:
        textfile = data.read()

    dims = DiminutiveFinder(textfile)
    dims.find_diminutives(textfile)

    # A set avoids issuing the same web request twice for a repeated stem.
    for stem in set(dims.diminutives):
        yandex_search(stem)

    print_list(verified_dims)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment