iwouldnot · November 5, 2017 19:20
diff --git a/lab5.py b/lab5.py
 from nltk.stem.snowball import SnowballStemmer
 from nltk.tokenize import word_tokenize
 import io
 from functools import reduce
 from pprint import pprint as pp
 from glob import glob


 def parse_texts(fileglob='*.txt'):
     """Read, tokenize and stem every UTF-8 text file matching *fileglob*.

     Each file is split into tokens with ``word_tokenize``; tokens that are
     not purely alphabetic (``str.isalpha``) are dropped and the remaining
     ones are stemmed with the Russian Snowball stemmer.

     Args:
         fileglob: Glob pattern selecting the files to read.

     Returns:
         A ``(texts, words)`` tuple. ``texts`` maps each file's base name to
         the list of its stems in document order; ``words`` is the set of
         all stems seen in any file.
     """
     # Imported locally so the function does not rely on a module-level
     # ``import os`` being present.
     import os

     stemmer = SnowballStemmer("russian")
     texts, words = {}, set()
     for txtfile in glob(fileglob):
         # Only the read needs the file handle; close it before processing.
         with io.open(txtfile, 'r', encoding='utf-8') as f:
             tokens = word_tokenize(f.read())
         stems = [stemmer.stem(tok) for tok in tokens if tok.isalpha()]
         words.update(stems)
         # basename() strips the directory part with the platform's own
         # separators; splitting on a hard-coded backslash only worked for
         # Windows-style paths.
         # NOTE: files in different directories that share a base name
         # overwrite each other's entry in ``texts``.
         texts[os.path.basename(txtfile)] = stems
     return texts, words


 def term_search(terms):  # Searches simple inverted index
     """Return the names of the texts that contain every term in *terms*.

     The terms are stemmed with the Russian Snowball stemmer, the stemmed
     list is printed, and the posting sets stored in the module-level
     ``inverted_index`` are intersected, starting from the set of all keys
     of the module-level ``texts``.

     Args:
         terms: Iterable of query words.

     Returns:
         A set of text names; every known text when *terms* is empty.

     Raises:
         KeyError: If a stemmed term is not a key of ``inverted_index``.
     """
     stemmer = SnowballStemmer("russian")
     stemmed = list(map(stemmer.stem, terms))
     print(stemmed)

     # Start from every known text and narrow the candidates term by term.
     matches = set(texts.keys())
     for stem in stemmed:
         matches = matches.intersection(inverted_index[stem])
     return matches


 # Parse the corpus once; term_search() reads ``texts`` and
 # ``inverted_index`` as module-level globals.
 texts, words = parse_texts()
 # print('\nTexts')
 # pp(texts)
 # print('\nWords')
 # pp(sorted(words))

 # Inverted index: stem -> set of text names whose stem list contains it.
 # Every known word gets an entry up front (possibly an empty set), then a
 # single pass over the texts fills the sets. This avoids testing each word
 # against each text's token list, which scanned every list once per word.
 # A plain dict is used on purpose: looking up an unknown stem must still
 # raise KeyError, which the search below reports as "Not found".
 inverted_index = {word: set() for word in words}
 for txt, wrds in texts.items():
     for word in set(wrds):
         inverted_index[word].add(txt)

 # print('\nInverted Index')
 # pp({k: sorted(v) for k, v in inverted_index.items()})

 terms = ["я", "дедушка"]
 print('\nTerm Search for: ' + repr(terms))
 try:
     # Only the lookup can raise KeyError, so keep the try body minimal.
     found = term_search(terms)
 except KeyError:
     print("Not found")
 else:
     pp(sorted(found))
	from nltk.stem.snowball import SnowballStemmer
	from nltk.tokenize import word_tokenize
	import io
	from functools import reduce
	from pprint import pprint as pp
	from glob import glob


	def parse_texts(fileglob='*.txt'):
	    """Read, tokenize and stem every UTF-8 text file matching *fileglob*.

	    Each file is split into tokens with ``word_tokenize``; tokens that are
	    not purely alphabetic (``str.isalpha``) are dropped and the remaining
	    ones are stemmed with the Russian Snowball stemmer.

	    Args:
	        fileglob: Glob pattern selecting the files to read.

	    Returns:
	        A ``(texts, words)`` tuple. ``texts`` maps each file's base name
	        to the list of its stems in document order; ``words`` is the set
	        of all stems seen in any file.
	    """
	    # Imported locally so the function does not rely on a module-level
	    # ``import os`` being present.
	    import os

	    stemmer = SnowballStemmer("russian")
	    texts, words = {}, set()
	    for txtfile in glob(fileglob):
	        # Only the read needs the file handle; close it before processing.
	        with io.open(txtfile, 'r', encoding='utf-8') as f:
	            tokens = word_tokenize(f.read())
	        stems = [stemmer.stem(tok) for tok in tokens if tok.isalpha()]
	        words |= set(stems)
	        # basename() strips the directory part with the platform's own
	        # separators; splitting on a hard-coded backslash only worked for
	        # Windows-style paths.
	        texts[os.path.basename(txtfile)] = stems
	    return texts, words


	def term_search(terms):  # Searches simple inverted index
	    """Return the names of the texts that contain every term in *terms*.

	    The terms are stemmed with the Russian Snowball stemmer, the stemmed
	    list is printed, and the posting sets stored in the module-level
	    ``inverted_index`` are intersected, starting from the set of all keys
	    of the module-level ``texts``.

	    Args:
	        terms: Iterable of query words.

	    Returns:
	        A set of text names; every known text when *terms* is empty.

	    Raises:
	        KeyError: If a stemmed term is not a key of ``inverted_index``.
	    """
	    stemmer = SnowballStemmer("russian")
	    terms = [stemmer.stem(word) for word in terms]
	    print(terms)
	    return reduce(set.intersection,
	                  (inverted_index[term] for term in terms),
	                  set(texts.keys()))


	# Parse the corpus once; term_search() reads ``texts`` and
	# ``inverted_index`` as module-level globals.
	texts, words = parse_texts()
	# print('\nTexts')
	# pp(texts)
	# print('\nWords')
	# pp(sorted(words))

	# Inverted index: stem -> set of text names whose stem list contains it.
	# Every known word gets an entry up front (possibly an empty set), then a
	# single pass over the texts fills the sets. A plain dict is used on
	# purpose: looking up an unknown stem must still raise KeyError, which the
	# search below reports as "Not found".
	inverted_index = {word: set() for word in words}
	for txt, wrds in texts.items():
	    for word in set(wrds):
	        inverted_index[word].add(txt)

	# print('\nInverted Index')
	# pp({k: sorted(v) for k, v in inverted_index.items()})

	terms = ["я", "дедушка"]
	print('\nTerm Search for: ' + repr(terms))
	try:
	    # Only the lookup can raise KeyError, so keep the try body minimal.
	    found = term_search(terms)
	except KeyError:
	    print("Not found")
	else:
	    pp(sorted(found))