# Source: GitHub gist 420c82ca05a02c01a5e7089956949853 by @iwouldnot, created November 5, 2017 19:20.
import io
import os
from functools import reduce
from glob import glob
from pprint import pprint as pp

from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize
def parse_texts(fileglob='*.txt'):
    """Tokenize and stem every text file matching ``fileglob``.

    Each file is read as UTF-8, split into tokens with ``word_tokenize``,
    reduced to its purely alphabetic tokens, and stemmed with the Russian
    Snowball stemmer.

    Args:
        fileglob: Glob pattern selecting the files to read.

    Returns:
        A ``(texts, words)`` tuple. ``texts`` maps each file's base name to
        its list of stemmed tokens in document order; ``words`` is the set
        of all distinct stems seen across the files.
    """
    stemmer = SnowballStemmer("russian")
    texts, words = {}, set()
    for txtfile in glob(fileglob):
        with io.open(txtfile, 'r', encoding='utf-8') as f:
            tokens = word_tokenize(f.read())
        # Drop punctuation/number tokens, then stem what is left.
        stems = [stemmer.stem(token) for token in tokens if token.isalpha()]
        words.update(stems)
        # basename() strips the directory using the platform's own
        # separator(s); splitting on '\\' only did so on Windows.
        # NOTE: files with the same base name in different directories
        # still share one key (the last one read wins).
        texts[os.path.basename(txtfile)] = stems
    return texts, words
def term_search(terms):
    """Return the names of the texts that contain every one of ``terms``.

    Searches the simple inverted index: each term is stemmed with the
    Russian Snowball stemmer and looked up in the module-level
    ``inverted_index``. The result is the intersection of those sets,
    starting from the full set of keys of the module-level ``texts``.
    The list of stemmed terms is printed before the lookup.

    Args:
        terms: Iterable of query words.

    Returns:
        A set of text names; every name in ``texts`` when ``terms`` is empty.

    Raises:
        KeyError: If a stemmed term is not a key of ``inverted_index``.
    """
    russian = SnowballStemmer("russian")
    stems = list(map(russian.stem, terms))
    print(stems)
    # Start from every known text and narrow the candidates term by term.
    matches = set(texts.keys())
    for stem in stems:
        matches = matches.intersection(inverted_index[stem])
    return matches
# Load the corpus once at module level; term_search() reads `texts` and
# `inverted_index` as module globals.
texts, words = parse_texts()

# Inverted index: stem -> set of file names whose text contains that stem.
# Built in one pass over each text's distinct stems rather than scanning every
# token list once per vocabulary word. Kept as a plain dict so that looking up
# an unknown stem still raises KeyError.
inverted_index = {}
for txt, wrds in texts.items():
    for word in set(wrds):
        inverted_index.setdefault(word, set()).add(txt)

terms = ["я", "дедушка"]
print('\nTerm Search for: ' + repr(terms))
try:
    matches = term_search(terms)
except KeyError:
    # Raised by term_search when a stemmed term is missing from the index.
    print("Not found")
else:
    pp(sorted(matches))
# Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment