import plac |
import pymorphy2 |
import numpy as np |
from navec import Navec |
from spacy.language import Language |
from spacy.vectors import Vectors |
from spacy.vocab import Vocab |
from pathlib import Path |
class RussianLanguage(Language):
    """spaCy ``Language`` subclass whose ``lang`` code is ``'ru'``."""

    # Language code attached to pipelines created from this class.
    lang = 'ru'
def tags_from(word):
    """Return the text of ``str(word.tag)`` that precedes the first space.

    When the stringified tag contains no space, the whole string is returned.

    Args:
        word: Any object exposing a ``tag`` attribute (here: a morphological
            parse or lexeme form).

    Returns:
        The leading, space-delimited portion of the tag string.
    """
    tag_text = str(word.tag)
    leading_group, _, _ = tag_text.partition(' ')
    return leading_group
@plac.annotations(
    model=("Navec model archive.", "option", "m", str),
    output_dir=("Model output directory", "option", "o", Path)
)
def main(model='navec_hudlit_v1_12B_500K_300d_100q.tar', output_dir='./model'):
    """Build a spaCy vocabulary from Navec vectors and write it to disk.

    For every word in the Navec vocabulary the first morphological parse is
    taken, and the forms of its lexeme that share the word's leading tag
    group are collected. If at least one of those forms has no row in the
    vocabulary yet, one new vector is created: the mean of the distinct Navec
    vectors found for the forms, followed by one 0/1 flag per known grammeme.
    All forms that had no row are mapped onto that single row.

    Args:
        model: Path to the Navec model archive to load.
        output_dir: Directory the resulting spaCy pipeline is saved to.
    """
    morph_analyzer = pymorphy2.MorphAnalyzer()
    navec_model = Navec.load(model)
    print('Loaded model')

    # Sort the grammemes so the flag columns appended to every vector have a
    # reproducible order. Iterating the collection directly (presumably a set
    # in pymorphy2 - confirm) would let the column layout change from one
    # interpreter run to the next because of string hash randomisation.
    known_tags = sorted(morph_analyzer.TagClass.KNOWN_GRAMMEMES)
    vectors_dims = 300 + len(known_tags)
    words = navec_model.vocab.words

    vocabulary = Vocab()
    vocabulary.vectors = Vectors(shape=(len(words), vectors_dims), name='navec_lex')

    added_vectors = 0
    added_lexemes = 0
    for word in words:
        first_parse = morph_analyzer.parse(word)[0]
        word_tags = tags_from(first_parse)

        # Unique lexeme forms whose leading tag group equals the word's.
        # Materialised as a list so each form can be paired with its row.
        lexemes = list({
            form.word
            for form in first_parse.lexeme
            if tags_from(form) == word_tags
        })

        # The lookup yields -1 for every key that has no row yet.
        rows = vocabulary.vectors.find(keys=lexemes)
        missing_lexemes = [
            lexeme for lexeme, row in zip(lexemes, rows) if row == -1
        ]
        if not missing_lexemes:
            continue

        # Collect the Navec vector of each form that has one.
        vectors = [
            vector
            for vector in (navec_model.get(lexeme) for lexeme in lexemes)
            if vector is not None
        ]
        if not vectors:
            continue

        # Test membership against individual grammemes rather than against
        # the raw tag string, where ``in`` would be a substring match.
        # Assumes grammemes in the tag string are comma-separated - confirm.
        word_grammemes = set(word_tags.split(','))
        tags_cipher = [int(tag in word_grammemes) for tag in known_tags]

        # Mean of the distinct vectors, extended with the grammeme flags.
        mean_vector = np.append(
            np.unique(vectors, axis=0).mean(axis=0), tags_cipher, axis=0
        )

        # The first missing form creates the row; the rest are aliased to it.
        vector_row = vocabulary.vectors.add(missing_lexemes[0], vector=mean_vector)
        for lexeme in missing_lexemes[1:]:
            vocabulary.vectors.add(lexeme, row=vector_row)

        # Collect stats.
        added_lexemes += len(missing_lexemes)
        added_vectors += 1

    print('Vectors:', added_vectors, 'Lexemes:', added_lexemes)

    # The table was allocated with one row per word; trim it to the rows used.
    removed_vectors = vocabulary.vectors.resize(shape=(added_vectors, vectors_dims))
    print('Resized vocabulary to', added_vectors)
    print('Removed vectors:', removed_vectors)

    nlp = RussianLanguage(vocabulary)
    nlp.to_disk(output_dir)
    print('Saved model to disk')
if __name__ == "__main__":
    # Hand main() to plac, which invokes it when the file is run as a script.
    plac.call(main)