Multilingual word vectors for SpaCy (based on https://github.com/Babylonpartners/fastText_multilingual)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# A SpaCy implementation of
# https://github.com/Babylonpartners/fastText_multilingual
#
# heavily based on
# https://github.com/Babylonpartners/fastText_multilingual/blob/master/align_your_own.ipynb
import numpy as np
import spacy
import sys
import argparse
from distutils.sysconfig import get_python_lib


# from https://stackoverflow.com/questions/21030391/how-to-normalize-array-numpy
def normalized(a, axis=-1, order=2):
    """Utility function to normalize the rows of a numpy array."""
    l2 = np.atleast_1d(np.linalg.norm(a, order, axis))
    l2[l2 == 0] = 1
    return a / np.expand_dims(l2, axis)

def make_training_matrices(src_vocab, trg_vocab, bilingual_dictionary):
    """
    src_vocab and trg_vocab are the spaCy Vocab objects of the
    source/target languages. bilingual_dictionary is a list of
    translation pair tuples [(src_word, trg_word), ...].
    """
    src_matrix = []
    trg_matrix = []
    for (src, trg) in bilingual_dictionary:
        if src in src_vocab and trg in trg_vocab:
            src_matrix.append(src_vocab[src].vector)
            trg_matrix.append(trg_vocab[trg].vector)
    # return training matrices
    return np.array(src_matrix), np.array(trg_matrix)

def learn_transformation(source_matrix, target_matrix, normalize_vectors=True):
    """
    Source and target matrices are numpy arrays, shape
    (dictionary_length, embedding_dimension). These contain paired
    word vectors from the bilingual dictionary.
    """
    # optionally normalize the training vectors
    if normalize_vectors:
        source_matrix = normalized(source_matrix)
        target_matrix = normalized(target_matrix)
    # perform the SVD; note that np.linalg.svd returns V already transposed
    product = np.matmul(source_matrix.transpose(), target_matrix)
    U, s, Vt = np.linalg.svd(product)
    # return the orthogonal transformation which aligns the source
    # language to the target
    return np.matmul(U, Vt)

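# Why the SVD works: for matrices X (source) and Y (target) of paired,
# normalized word vectors, the orthogonal W minimizing ||XW - Y||_F is
# W = U V^T, where U s V^T is the SVD of X^T Y (the orthogonal Procrustes
# solution). Using an orthogonal W rotates the source space onto the
# target space while preserving distances within the source space.
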
def apply_transform(embed, transform):
    """
    Apply the given transformation to the vector space.

    Right-multiplies the given transform with embeddings E:
        E = E * transform
    """
    return np.matmul(embed, transform)


def cosine_similarity(vec_a, vec_b):
    """Compute cosine similarity between vec_a and vec_b"""
    return np.dot(vec_a, vec_b) / \
        (np.linalg.norm(vec_a) * np.linalg.norm(vec_b))

def compare_before_after(src_vectors, trg_vectors, dictionary, src_vectordata_transformed):
    for (src_word, trg_word) in dictionary:
        src_row = src_vectors.find(key=src_word)
        trg_row = trg_vectors.find(key=trg_word)
        src_vector = src_vectors.data[src_row]
        trg_vector = trg_vectors.data[trg_row]
        src_vector_transformed = src_vectordata_transformed[src_row]
        print("Similarity before transform: %.3f\tafter: %.3f\tfor word-pair (%s, %s)" % (
            cosine_similarity(src_vector, trg_vector),
            cosine_similarity(src_vector_transformed, trg_vector),
            src_word,
            trg_word))

def translate_vec_nearest_neighbour(trg_vocab, src_vector):
    """Obtain a translation of src_vector by nearest-neighbour retrieval
    among the target-language vectors."""
    # dot product of src_vector with every row-normalized target vector;
    # the argmax is the nearest neighbour by cosine similarity
    similarity_vector = np.matmul(normalized(trg_vocab.vectors.data), src_vector)
    target_id = np.argmax(similarity_vector)
    vector_key = trg_vocab.vectors.find(row=target_id)
    try:
        return [trg_vocab.strings[h] for h in vector_key]
    except KeyError:
        return []


def translate_str_nearest_neighbour(src_vocab, src_vectors, trg_vocab, src_word):
    """Obtain a translation of src_word by nearest-neighbour retrieval;
    src_vectors is the (transformed) source vector data."""
    src_vector = src_vectors[src_vocab.vectors.find(key=src_word)]
    return translate_vec_nearest_neighbour(trg_vocab, src_vector)

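# e.g. (hypothetical call, using the names defined under __main__ below):
#   translate_str_nearest_neighbour(sp_src.vocab, src_vectors_transformed,
#                                   sp_trg.vocab, 'chat')
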
def intervec(src_vocab, trg_vocab, bilingual_dictionary):
    src_matrix, trg_matrix = make_training_matrices(
        src_vocab, trg_vocab, bilingual_dictionary)
    transform = learn_transformation(src_matrix, trg_matrix)
    src_vectors_transformed = apply_transform(src_vocab.vectors.data, transform)
    return src_vectors_transformed

def naive_overlap_bidictionary(src_vocab, trg_vocab):
    src_words = {w.text for w in src_vocab}
    trg_words = {w.text for w in trg_vocab}
    return [(e, e) for e in src_words & trg_words]

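# Identical strings across the two vocabularies (numbers, names, loanwords)
# make a cheap seed dictionary when no curated --bidict file is available.
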
def read_bidictionary(path):
    with open(path, 'r') as f:
        lines = (l.strip().split("\t") for l in f)
        return [(l[0], l[1])
                for l in lines
                if len(l) == 2]

def make_argparser():
    parser = argparse.ArgumentParser(description='Align vector spaces by bilingual dictionary')
    parser.add_argument('-b', '--bidict',
                        help='input bilingual dictionary from source to target (one pair per line, tab-separated);'
                        + ' by default we also include as pairs strings that are equal in both languages')
    parser.add_argument('-n', '--no-overlap-bidict',
                        action='store_true',
                        help='by default, we also include overlapping strings from the vocabularies in the'
                        + ' bilingual dictionary; use this option to only use the --bidict file')
    parser.add_argument('-s', '--lang-src',
                        default='fr',
                        help='source language (default: fr)')
    parser.add_argument('-t', '--lang-trg',
                        default='en',
                        help='target language (default: en)')
    parser.add_argument('vec_src',
                        help='input directory of source language vectors, e.g. "'
                        + get_python_lib() + '/fr_core_news_md/fr_core_news_md-2.0.0/vocab"')
    parser.add_argument('vec_trg',
                        help='input directory of target language vectors, e.g. "'
                        + get_python_lib() + '/en_core_web_md/en_core_web_md-2.0.0/vocab"')
    parser.add_argument('vec_src_transformed',
                        help='output directory where we store the source language vectors'
                        + ' transformed/aligned to the target language')
    return parser

if __name__ == "__main__":
    args = make_argparser().parse_args()
    if args.bidict is None and args.no_overlap_bidict:
        print("Either supply a --bidict file or don't use --no-overlap-bidict", file=sys.stderr)
        sys.exit(1)
    # Load the smallest model, then add vectors from the bigger model,
    # otherwise we get crazy memory usage
    sp_src = spacy.load(args.lang_src, disable=['tagger', 'parser', 'ner'])
    sp_src.vocab.vectors.from_disk(args.vec_src)
    sp_trg = spacy.load(args.lang_trg, disable=['tagger', 'parser', 'ner'])
    sp_trg.vocab.vectors.from_disk(args.vec_trg)
    given_bidict = [] if args.bidict is None else read_bidictionary(args.bidict)
    overlap_bidict = [] if args.no_overlap_bidict else naive_overlap_bidictionary(sp_src.vocab, sp_trg.vocab)
    bidict = given_bidict + overlap_bidict
    src_vectors_transformed = intervec(sp_src.vocab, sp_trg.vocab, bidict)
    compare_before_after(sp_src.vocab.vectors,
                         sp_trg.vocab.vectors,
                         bidict[:10],
                         src_vectors_transformed)
    sp_src.vocab.vectors.data = src_vectors_transformed
    sp_src.vocab.vectors.to_disk(args.vec_src_transformed)
    # and if you run the output as input a second time, similarity
    # scores are even higher =P
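
# A minimal usage sketch; the script filename, output directory and word
# pair below are assumptions, not part of this gist. After running e.g.
#
#   python3 intervec.py $SITE/fr_core_news_md/fr_core_news_md-2.0.0/vocab \
#                       $SITE/en_core_web_md/en_core_web_md-2.0.0/vocab \
#                       fr_vocab_aligned
#
# (with $SITE the site-packages directory printed by get_python_lib()),
# the aligned vectors can be loaded back and compared across languages:
#
#   sp_fr = spacy.load('fr', disable=['tagger', 'parser', 'ner'])
#   sp_fr.vocab.vectors.from_disk('fr_vocab_aligned')
#   sp_en = spacy.load('en', disable=['tagger', 'parser', 'ner'])
#   print(cosine_similarity(sp_fr.vocab['chat'].vector,
#                           sp_en.vocab['cat'].vector))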