@unhammer
Last active July 27, 2021 15:59
Multilingual word vectors for SpaCy (based on https://github.com/Babylonpartners/fastText_multilingual)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# A SpaCy implementation of
# https://github.com/Babylonpartners/fastText_multilingual
#
# heavily based on
# https://github.com/Babylonpartners/fastText_multilingual/blob/master/align_your_own.ipynb
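#
# In short: given a bilingual dictionary of (src_word, trg_word) pairs, we
# build two matrices of paired word vectors, learn an orthogonal matrix W that
# maps the source vector space onto the target space, and apply W to *all*
# source vectors. Because W is orthogonal, similarities between source words
# are unchanged, but source and target vectors now share a space in which
# cross-lingual cosine similarity is meaningful.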
import numpy as np
import spacy
import sys
import argparse
from distutils.sysconfig import get_python_lib


# from https://stackoverflow.com/questions/21030391/how-to-normalize-array-numpy
def normalized(a, axis=-1, order=2):
    """Utility function to normalize the rows of a numpy array."""
    l2 = np.atleast_1d(np.linalg.norm(a, order, axis))
    l2[l2 == 0] = 1
    return a / np.expand_dims(l2, axis)


def make_training_matrices(src_vocab, trg_vocab, bilingual_dictionary):
    """
    src_vocab and trg_vocab are the spaCy Vocab objects of the source
    and target languages. bilingual_dictionary is a list of translation
    pair tuples [(src_word, trg_word), ...].
    """
    src_matrix = []
    trg_matrix = []
    for (src, trg) in bilingual_dictionary:
        if src in src_vocab and trg in trg_vocab:
            src_matrix.append(src_vocab[src].vector)
            trg_matrix.append(trg_vocab[trg].vector)
    # return training matrices
    return np.array(src_matrix), np.array(trg_matrix)
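

# learn_transformation below is the standard solution to the orthogonal
# Procrustes problem: find an orthogonal W minimising ||X W - Y||_F, where X
# and Y are the paired source/target matrices built above. With
# U, s, V = svd(X^T Y) (numpy returns the right factor already transposed),
# the minimiser is W = U V, which is what the function returns.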
def learn_transformation(source_matrix, target_matrix, normalize_vectors=True):
    """
    Source and target matrices are numpy arrays, shape
    (dictionary_length, embedding_dimension). These contain paired
    word vectors from the bilingual dictionary.
    """
    # optionally normalize the training vectors
    if normalize_vectors:
        source_matrix = normalized(source_matrix)
        target_matrix = normalized(target_matrix)
    # perform the SVD
    product = np.matmul(source_matrix.transpose(), target_matrix)
    U, s, V = np.linalg.svd(product)
    # return orthogonal transformation which aligns source language to the target
    return np.matmul(U, V)


def apply_transform(embed, transform):
    """
    Apply the given transformation to the vector space.

    Right-multiplies given transform with embeddings E:
        E = E * transform
    """
    return np.matmul(embed, transform)


def cosine_similarity(vec_a, vec_b):
    """Compute cosine similarity between vec_a and vec_b"""
    return np.dot(vec_a, vec_b) / \
        (np.linalg.norm(vec_a) * np.linalg.norm(vec_b))


def compare_before_after(src_vectors, trg_vectors, dictionary, src_vectordata_transformed):
    for (src_word, trg_word) in dictionary:
        src_row = src_vectors.find(key=src_word)
        trg_row = trg_vectors.find(key=trg_word)
        src_vector = src_vectors.data[src_row]
        trg_vector = trg_vectors.data[trg_row]
        src_vector_transformed = src_vectordata_transformed[src_row]
        print("Similarity before transform: %.3f\tafter: %.3f\tfor word-pair (%s, %s)" % (
            cosine_similarity(src_vector, trg_vector),
            cosine_similarity(src_vector_transformed, trg_vector),
            src_word,
            trg_word))


def normalised(mat, axis=-1, order=2):
    """Utility function to normalise the rows of a numpy array."""
    norm = np.linalg.norm(
        mat, axis=axis, ord=order, keepdims=True)
    norm[norm == 0] = 1
    return mat / norm


def translate_vec_nearest_neighbour(trg_vocab, src_vector):
    """Obtain translation of src_vector using nearest neighbour retrieval.

    src_vector should already be transformed into the target space; we
    compare it against every target vector and pick the most similar row.
    """
    similarity_vector = np.matmul(normalised(trg_vocab.vectors.data), src_vector)
    target_id = np.argmax(similarity_vector)
    vector_key = trg_vocab.vectors.find(row=target_id)
    try:
        return [trg_vocab.strings[h] for h in vector_key]
    except KeyError:
        return []


def translate_str_nearest_neighbour(src_vocab, src_vectors_transformed, trg_vocab, src_word):
    """Obtain translation of src_word using nearest neighbour retrieval."""
    src_vector = src_vectors_transformed[src_vocab.vectors.find(key=src_word)]
    return translate_vec_nearest_neighbour(trg_vocab, src_vector)
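

# Hypothetical usage of the nearest-neighbour lookup above (neither function
# is called from __main__; the word and result are only illustrative, assuming
# a fr source and en target model as in the defaults below):
#   translate_str_nearest_neighbour(sp_src.vocab, src_vectors_transformed,
#                                   sp_trg.vocab, "chat")
#   # might return something like ['cat'], depending on the loaded vectors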


def intervec(src_vocab, trg_vocab, bilingual_dictionary):
    src_matrix, trg_matrix = make_training_matrices(
        src_vocab, trg_vocab, bilingual_dictionary)
    transform = learn_transformation(src_matrix, trg_matrix)
    src_vectors_transformed = apply_transform(src_vocab.vectors.data, transform)
    return src_vectors_transformed


def naive_overlap_bidictionary(src_vocab, trg_vocab):
    src_words = {w.text for w in src_vocab}
    trg_words = {w.text for w in trg_vocab}
    return [(e, e) for e in set(src_words & trg_words)]
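

# Identically spelled strings (numbers, punctuation, proper names, loanwords)
# act as a cheap seed dictionary when no --bidict file is given, or as extra
# training pairs on top of it.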


def read_bidictionary(path):
    lines = (l.strip().split("\t")
             for l in open(path, 'r').readlines())
    return [(l[0], l[1])
            for l in lines
            if len(l) == 2]
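

# The --bidict file is expected to contain one tab-separated pair per line,
# source word first, e.g. "chien\tdog" (illustrative entry); lines that do
# not have exactly two fields are silently skipped.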


def make_argparser():
    parser = argparse.ArgumentParser(description='Align vector spaces by bilingual dictionary')
    parser.add_argument('-b', '--bidict',
                        help='input bilingual dictionary from source to target (one pair per line, tab separated);'
                        + ' by default we include as pairs strings that are equal in both languages')
    parser.add_argument('-n', '--no-overlap-bidict',
                        action='store_true',
                        help='by default, we also include overlapping strings from the vocabulary in the'
                        + ' bilingual dictionary; use this option to only use the --bidict file')
    parser.add_argument('-s', '--lang-src',
                        default='fr',
                        help='source language (default: fr)')
    parser.add_argument('-t', '--lang-trg',
                        default='en',
                        help='target language (default: en)')
    parser.add_argument('vec_src',
                        help='input directory of source language vectors, e.g. "'
                        + get_python_lib() + '/fr_core_news_md/fr_core_news_md-2.0.0/vocab"')
    parser.add_argument('vec_trg',
                        help='input directory of target language vectors, e.g. "'
                        + get_python_lib() + '/en_core_web_md/en_core_web_md-2.0.0/vocab"')
    parser.add_argument('vec_src_transformed',
                        help='output directory where we store source language vectors'
                        + ' transformed/aligned to target language')
    return parser


if __name__ == "__main__":
    args = make_argparser().parse_args()
    if args.bidict is None and args.no_overlap_bidict:
        print("Either supply a --bidict file or don't use --no-overlap-bidict", file=sys.stderr)
        sys.exit(1)
    # Load the smallest model, then add vectors from the bigger model,
    # otherwise we get crazy memory usage
    sp_src = spacy.load(args.lang_src, disable=['tagger', 'parser', 'ner'])
    sp_src.vocab.vectors.from_disk(args.vec_src)
    sp_trg = spacy.load(args.lang_trg, disable=['tagger', 'parser', 'ner'])
    sp_trg.vocab.vectors.from_disk(args.vec_trg)
    given_bidict = [] if args.bidict is None else read_bidictionary(args.bidict)
    overlap_bidict = [] if args.no_overlap_bidict else naive_overlap_bidictionary(sp_src.vocab, sp_trg.vocab)
    bidict = given_bidict + overlap_bidict
    src_vectors_transformed = intervec(sp_src.vocab, sp_trg.vocab, bidict)
    compare_before_after(sp_src.vocab.vectors,
                         sp_trg.vocab.vectors,
                         bidict[:10],
                         src_vectors_transformed)
    sp_src.vocab.vectors.data = src_vectors_transformed
    sp_src.vocab.vectors.to_disk(args.vec_src_transformed)
    # and if you run the output as input the second time, similarity
    # scores are even higher =P
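

# Example invocation (the script name, dictionary file, vocab paths and output
# directory are only illustrative; the vocab directories follow the examples
# shown in --help):
#   python3 align_spacy_vectors.py \
#       -b fr-en.dict.tsv \
#       -s fr -t en \
#       /usr/lib/python3/dist-packages/fr_core_news_md/fr_core_news_md-2.0.0/vocab \
#       /usr/lib/python3/dist-packages/en_core_web_md/en_core_web_md-2.0.0/vocab \
#       ./fr_vocab_aligned_to_en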