Multilingual word vectors for SpaCy (based on https://github.com/Babylonpartners/fastText_multilingual)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# A SpaCy implementation of
# https://github.com/Babylonpartners/fastText_multilingual
#
# heavily based on
# https://github.com/Babylonpartners/fastText_multilingual/blob/master/align_your_own.ipynb
import numpy as np
import spacy
import sys
import argparse
from distutils.sysconfig import get_python_lib


# from https://stackoverflow.com/questions/21030391/how-to-normalize-array-numpy
def normalized(a, axis=-1, order=2):
    """Utility function to normalize the rows of a numpy array."""
    l2 = np.atleast_1d(np.linalg.norm(a, order, axis))
    l2[l2 == 0] = 1
    return a / np.expand_dims(l2, axis)

def make_training_matrices(src_vocab, trg_vocab, bilingual_dictionary):
    """
    src_vocab and trg_vocab are the spaCy Vocab objects of the
    source/target languages. bilingual_dictionary is a list of
    translation pair tuples [(src_word, trg_word), ...].
    """
    src_matrix = []
    trg_matrix = []
    for (src, trg) in bilingual_dictionary:
        if src in src_vocab and trg in trg_vocab:
            src_matrix.append(src_vocab[src].vector)
            trg_matrix.append(trg_vocab[trg].vector)
    # return training matrices
    return np.array(src_matrix), np.array(trg_matrix)

def learn_transformation(source_matrix, target_matrix, normalize_vectors=True):
    """
    Source and target matrices are numpy arrays, shape
    (dictionary_length, embedding_dimension). These contain paired
    word vectors from the bilingual dictionary.
    """
    # optionally normalize the training vectors
    if normalize_vectors:
        source_matrix = normalized(source_matrix)
        target_matrix = normalized(target_matrix)
    # perform the SVD; note that np.linalg.svd returns V already transposed
    product = np.matmul(source_matrix.transpose(), target_matrix)
    U, s, Vt = np.linalg.svd(product)
    # return the orthogonal transformation which aligns the source
    # language to the target
    return np.matmul(U, Vt)

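# Why the SVD works: for matrices X (source) and Y (target) of paired,
# normalized word vectors, the orthogonal W minimizing ||XW - Y||_F is
# W = U V^T, where U s V^T is the SVD of X^T Y (the orthogonal Procrustes
# solution). Using an orthogonal W rotates the source space onto the
# target space while preserving distances within the source space.
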
def apply_transform(embed, transform):
    """
    Apply the given transformation to the vector space.

    Right-multiplies the given transform with embeddings E:
        E = E * transform
    """
    return np.matmul(embed, transform)


def cosine_similarity(vec_a, vec_b):
    """Compute cosine similarity between vec_a and vec_b"""
    return np.dot(vec_a, vec_b) / \
        (np.linalg.norm(vec_a) * np.linalg.norm(vec_b))

def compare_before_after(src_vectors, trg_vectors, dictionary, src_vectordata_transformed):
    for (src_word, trg_word) in dictionary:
        src_row = src_vectors.find(key=src_word)
        trg_row = trg_vectors.find(key=trg_word)
        src_vector = src_vectors.data[src_row]
        trg_vector = trg_vectors.data[trg_row]
        src_vector_transformed = src_vectordata_transformed[src_row]
        print("Similarity before transform: %.3f\tafter: %.3f\tfor word-pair (%s, %s)" % (
            cosine_similarity(src_vector, trg_vector),
            cosine_similarity(src_vector_transformed, trg_vector),
            src_word,
            trg_word))

def translate_vec_nearest_neighbour(trg_vocab, src_vector):
    """Obtain a translation of src_vector by nearest-neighbour retrieval
    among the target-language vectors."""
    # dot product of src_vector with every row-normalized target vector;
    # the argmax is the nearest neighbour by cosine similarity
    similarity_vector = np.matmul(normalized(trg_vocab.vectors.data), src_vector)
    target_id = np.argmax(similarity_vector)
    vector_key = trg_vocab.vectors.find(row=target_id)
    try:
        return [trg_vocab.strings[h] for h in vector_key]
    except KeyError:
        return []


def translate_str_nearest_neighbour(src_vocab, src_vectors, trg_vocab, src_word):
    """Obtain a translation of src_word by nearest-neighbour retrieval;
    src_vectors is the (transformed) source vector data."""
    src_vector = src_vectors[src_vocab.vectors.find(key=src_word)]
    return translate_vec_nearest_neighbour(trg_vocab, src_vector)

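# e.g. (hypothetical call, using the names defined under __main__ below):
#   translate_str_nearest_neighbour(sp_src.vocab, src_vectors_transformed,
#                                   sp_trg.vocab, 'chat')
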
def intervec(src_vocab, trg_vocab, bilingual_dictionary):
    src_matrix, trg_matrix = make_training_matrices(
        src_vocab, trg_vocab, bilingual_dictionary)
    transform = learn_transformation(src_matrix, trg_matrix)
    src_vectors_transformed = apply_transform(src_vocab.vectors.data, transform)
    return src_vectors_transformed

def naive_overlap_bidictionary(src_vocab, trg_vocab):
    src_words = {w.text for w in src_vocab}
    trg_words = {w.text for w in trg_vocab}
    return [(e, e) for e in src_words & trg_words]

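# Identical strings across the two vocabularies (numbers, names, loanwords)
# make a cheap seed dictionary when no curated --bidict file is available.
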
def read_bidictionary(path):
    with open(path, 'r') as f:
        lines = (l.strip().split("\t") for l in f)
        return [(l[0], l[1])
                for l in lines
                if len(l) == 2]

def make_argparser():
    parser = argparse.ArgumentParser(description='Align vector spaces by bilingual dictionary')
    parser.add_argument('-b', '--bidict',
                        help='input bilingual dictionary from source to target (one pair per line, tab-separated);'
                        + ' by default we also include as pairs strings that are equal in both languages')
    parser.add_argument('-n', '--no-overlap-bidict',
                        action='store_true',
                        help='by default, we also include overlapping strings from the vocabularies in the'
                        + ' bilingual dictionary; use this option to only use the --bidict file')
    parser.add_argument('-s', '--lang-src',
                        default='fr',
                        help='source language (default: fr)')
    parser.add_argument('-t', '--lang-trg',
                        default='en',
                        help='target language (default: en)')
    parser.add_argument('vec_src',
                        help='input directory of source language vectors, e.g. "'
                        + get_python_lib() + '/fr_core_news_md/fr_core_news_md-2.0.0/vocab"')
    parser.add_argument('vec_trg',
                        help='input directory of target language vectors, e.g. "'
                        + get_python_lib() + '/en_core_web_md/en_core_web_md-2.0.0/vocab"')
    parser.add_argument('vec_src_transformed',
                        help='output directory where we store the source language vectors'
                        + ' transformed/aligned to the target language')
    return parser

if __name__ == "__main__":
    args = make_argparser().parse_args()
    if args.bidict is None and args.no_overlap_bidict:
        print("Either supply a --bidict file or don't use --no-overlap-bidict", file=sys.stderr)
        sys.exit(1)
    # Load the smallest model, then add vectors from the bigger model,
    # otherwise we get crazy memory usage
    sp_src = spacy.load(args.lang_src, disable=['tagger', 'parser', 'ner'])
    sp_src.vocab.vectors.from_disk(args.vec_src)
    sp_trg = spacy.load(args.lang_trg, disable=['tagger', 'parser', 'ner'])
    sp_trg.vocab.vectors.from_disk(args.vec_trg)
    given_bidict = [] if args.bidict is None else read_bidictionary(args.bidict)
    overlap_bidict = [] if args.no_overlap_bidict else naive_overlap_bidictionary(sp_src.vocab, sp_trg.vocab)
    bidict = given_bidict + overlap_bidict
    src_vectors_transformed = intervec(sp_src.vocab, sp_trg.vocab, bidict)
    compare_before_after(sp_src.vocab.vectors,
                         sp_trg.vocab.vectors,
                         bidict[:10],
                         src_vectors_transformed)
    sp_src.vocab.vectors.data = src_vectors_transformed
    sp_src.vocab.vectors.to_disk(args.vec_src_transformed)
    # and if you run the output as input a second time, similarity
    # scores are even higher =P
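
# A minimal usage sketch; the script filename, output directory and word
# pair below are assumptions, not part of this gist. After running e.g.
#
#   python3 intervec.py $SITE/fr_core_news_md/fr_core_news_md-2.0.0/vocab \
#                       $SITE/en_core_web_md/en_core_web_md-2.0.0/vocab \
#                       fr_vocab_aligned
#
# (with $SITE the site-packages directory printed by get_python_lib()),
# the aligned vectors can be loaded back and compared across languages:
#
#   sp_fr = spacy.load('fr', disable=['tagger', 'parser', 'ner'])
#   sp_fr.vocab.vectors.from_disk('fr_vocab_aligned')
#   sp_en = spacy.load('en', disable=['tagger', 'parser', 'ner'])
#   print(cosine_similarity(sp_fr.vocab['chat'].vector,
#                           sp_en.vocab['cat'].vector))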