Skip to content

Instantly share code, notes, and snippets.

@buriy
Forked from sskorol/navec2spacy.py
Last active October 26, 2020 18:29
Show Gist options
  • Save buriy/f81330ccd5f35e503f957e96844540e3 to your computer and use it in GitHub Desktop.
Save buriy/f81330ccd5f35e503f957e96844540e3 to your computer and use it in GitHub Desktop.
Convert a Navec model into spaCy format for further POS/DEP training
import plac
import pymorphy2
import numpy as np
from navec import Navec
from spacy.language import Language
from spacy.vocab import Vocab
from pathlib import Path
from collections import defaultdict
@plac.annotations(
    model=("Navec model archive.", "option", "m", str),
    output_dir=("Model output directory", "option", "o", Path),
)
def main(model='navec_hudlit_v1_12B_500K_300d_100q.tar', output_dir='./model'):
    """Build a spaCy pipeline whose vocab holds lemma-averaged Navec vectors.

    Every word in the Navec vocabulary is reduced to the normal form of its
    first pymorphy2 parse. The vectors of all words that share a normal form
    are averaged, and the mean is stored in a fresh spaCy ``Vocab`` under
    that normal form. The resulting ``Language`` object is written to disk.

    Args:
        model: Path to the Navec archive handed to ``Navec.load``.
        output_dir: Destination handed to ``Language.to_disk``.
    """
    analyzer = pymorphy2.MorphAnalyzer()
    navec = Navec.load(model)

    # Lexeme -> lemma, taking the first parse pymorphy2 returns for each word.
    normal_forms = {
        token: analyzer.parse(token)[0].normal_form
        for token in navec.vocab.words
    }

    # Collect the vectors of all lexemes that collapse to the same lemma.
    grouped = defaultdict(list)
    for token, normal_form in normal_forms.items():
        embedding = navec.get(token)
        if embedding is None:
            # No vector available for this lexeme; nothing to contribute.
            continue
        grouped[normal_form].append(embedding)

    # One averaged vector per lemma goes into the spaCy vocabulary.
    vocab = Vocab()
    for normal_form, embeddings in grouped.items():
        centroid = np.array(embeddings).mean(axis=0)
        vocab.set_vector(normal_form, centroid)

    pipeline = Language(vocab)
    pipeline.to_disk(output_dir)
if __name__ == "__main__":
    # Let plac turn the command-line options into main()'s keyword arguments.
    plac.call(main)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment