@menzenski
Created October 21, 2014 18:05
This Python script takes a list of words and returns the frequency with which each word occurs in the Russian National Corpus.
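Usage (a sketch, not part of the gist): run the script with Python 2 (lxml must be installed) from a directory containing wordlist.txt; it appends one word;frequency line per word to search_results.txt. The corpus_search() function defined below can also be called on a single word, which must be passed as a UTF-8 encoded byte string, just as main() passes the words it reads from wordlist.txt:

# -*- coding: utf-8 -*-
# Hypothetical single-word check (not in the original gist): assumes the
# script below is saved as rncfrequencyfinder.py in the same directory.
from rncfrequencyfinder import corpus_search

print corpus_search("яблоко")  # total token count for the lemma, or 0 if none

The script itself follows.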
#! /usr/bin/env python
# -*- coding: utf-8 -*-
##########
## rncfrequencyfinder.py Version 1.0 (2014-10-20)
##
## Original author: Matthew Menzenski ([email protected])
##
## License: CC-BY-4.0 ( https://creativecommons.org/licenses/by/4.0/ )
##########
import urllib
import sgmllib
import lxml.html
import codecs
## designate a file with the words you want to search for
## (One word per line)
input_file = "wordlist.txt"
## designate a file for saving the results
## (results are appended to this file each time this script is run)
results_file = 'search_results.txt'

def corpus_search(search_term):
    '''Return the number of occurrences of a lemma (search_term) in the
    Russian National Corpus.'''
    # assemble a url including the search term
    search_url_begin = (
        "http://search.ruscorpora.ru/search.xml?mycorp="
        "&mysent=&mysize=&dpp=&spp=&spd=&text=lexgramm"
        "&mode=main&sort=gr_tagging&lang=en&parent1=0"
        "&level1=0&lex1="
    )
    search_url_end = (
        "&gramm1=&sem1=&flags1=&sem-mod1=sem&sem-mod1=sem2"
        "&parent2=0&level2=0&min2=1&max2=1&lex2=&gramm2="
        "&sem2=&flags2=&sem-mod2=sem&sem-mod2=sem2"
    )
    search_url_whole = search_url_begin + search_term + search_url_end
    # load the page
    results_page = urllib.urlopen(search_url_whole)
    # read the results page html source
    results_html = results_page.read()
    # find the numbers in the results page html
    results_summary = lxml.html.find_class(results_html, "stat-number")
    # pull that text out
    result_numbers = [number.text for number in results_summary]
    # remove the spaces
    results_no_space = [number.replace(" ", "") for number in result_numbers]
    # convert that number from a (text) string to an integer
    result_integers = [int(number) for number in results_no_space]
    # make it printable in utf-8 encoding
    word = search_term.decode('utf8')
    # if there are any tokens of the search term, there will be several numbers
    # on the page. The fifth such number is the total tokens; that's the one
    # we want
    if len(result_integers) >= 5:
        word_results = result_integers[4]
    # if there are no results, there won't be any numbers, so we assign zero.
    else:
        word_results = 0
    return word_results

def main():
    with codecs.open(results_file, "a", encoding="utf-8") as stream:
        with open(input_file, 'r') as words:
            for line in words:
                for word in line.split():
                    freq = corpus_search(word)
                    stream.write("%s;%r\n" % (word.decode("utf-8"), freq))


if __name__ == '__main__':
    main()
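Each run appends lines of the form word;frequency to search_results.txt. A minimal post-processing sketch (not part of the gist), assuming the script has already been run, that reads those results back into a dictionary:

# -*- coding: utf-8 -*-
# Hypothetical post-processing (not in the original gist): load the
# semicolon-separated word;frequency lines written by the script above.
import codecs

frequencies = {}
with codecs.open("search_results.txt", "r", encoding="utf-8") as results:
    for line in results:
        word, count = line.strip().split(";")
        frequencies[word] = int(count)

The sample word list below shows the input format that wordlist.txt expects: one word per line.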
яблоко
апельсин
банан
ягоды
ягода
молоко
стол
стул