Alignment to English of fastText embeddings (using alignment matrices from Babylonpartners/fastText_multilingual)
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Align vector spaces to English (Multilingual word vectors)!"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"from fasttext import FastVector\n",
"\n",
"# from https://stackoverflow.com/questions/21030391/how-to-normalize-array-numpy\n",
"def normalized(a, axis=-1, order=2):\n",
"    \"\"\"Utility function to normalize the rows of a numpy array.\"\"\"\n",
"    l2 = np.atleast_1d(np.linalg.norm(a, order, axis))\n",
"    l2[l2 == 0] = 1  # avoid division by zero for all-zero rows\n",
"    return a / np.expand_dims(l2, axis)\n",
"\n",
"def make_training_matrices(source_dictionary, target_dictionary, bilingual_dictionary):\n",
"    \"\"\"\n",
"    Source and target dictionaries are the FastVector objects of\n",
"    the source/target languages. bilingual_dictionary is a list of\n",
"    translation pair tuples [(source_word, target_word), ...].\n",
"    \"\"\"\n",
"    source_matrix = []\n",
"    target_matrix = []\n",
"\n",
"    for (source, target) in bilingual_dictionary:\n",
"        if source in source_dictionary and target in target_dictionary:\n",
"            source_matrix.append(source_dictionary[source])\n",
"            target_matrix.append(target_dictionary[target])\n",
"\n",
"    # return training matrices\n",
"    return np.array(source_matrix), np.array(target_matrix)\n",
"\n",
"def learn_transformation(source_matrix, target_matrix, normalize_vectors=True):\n",
"    \"\"\"\n",
"    Source and target matrices are numpy arrays, shape\n",
"    (dictionary_length, embedding_dimension). These contain paired\n",
"    word vectors from the bilingual dictionary.\n",
"    \"\"\"\n",
"    # optionally normalize the training vectors\n",
"    if normalize_vectors:\n",
"        source_matrix = normalized(source_matrix)\n",
"        target_matrix = normalized(target_matrix)\n",
"\n",
"    # perform the SVD of the cross-covariance matrix\n",
"    product = np.matmul(source_matrix.transpose(), target_matrix)\n",
"    U, s, V = np.linalg.svd(product)\n",
"\n",
"    # np.linalg.svd returns V already transposed (V here is really V^T),\n",
"    # so U @ V is the orthogonal Procrustes solution U V^T, which\n",
"    # aligns the source language to the target\n",
"    return np.matmul(U, V)"
]
},
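{
"cell_type": "markdown",
"metadata": {},
"source": [
"`learn_transformation` solves the orthogonal Procrustes problem: find the orthogonal matrix $W$ minimising $\\|SW - T\\|_F$ over paired source/target vectors. A quick sanity check (illustrative, not part of the original notebook; numpy only): rotate random \"source\" vectors by a known orthogonal matrix and confirm the learned transform recovers it."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustrative sanity check: recover a known rotation with learn_transformation.\n",
"rng = np.random.RandomState(0)\n",
"source = rng.randn(1000, 300)\n",
"Q, _ = np.linalg.qr(rng.randn(300, 300))  # random orthogonal matrix\n",
"target = source.dot(Q)  # fake 'translations': exact rotations of the source\n",
"W = learn_transformation(source, target, normalize_vectors=False)\n",
"print(np.allclose(W, Q, atol=1e-8))  # expected: True"
]
},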
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"import subprocess\n",
"\n",
"class Downloader():\n",
"    def download_manager(self, url, destination='', try_number=\"10\", time_out=\"60\"):\n",
"        # returns True iff wget exited with status 0 (successful download)\n",
"        return self._wget_dl(url, destination, try_number, time_out) == 0\n",
"\n",
"    def _wget_dl(self, url, destination, try_number, time_out):\n",
"        # -c resumes partial downloads; -t/-T set retry count and timeout\n",
"        command = [\"wget\", \"-c\", \"-P\", destination, \"-t\", try_number, \"-T\", time_out, url]\n",
"        try:\n",
"            download_state = subprocess.call(command)\n",
"        except Exception as e:\n",
"            print(e)\n",
"            download_state = -1\n",
"        # download_state == 0 => successful download\n",
"        return download_state"
]
},
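{
"cell_type": "markdown",
"metadata": {},
"source": [
"The class above shells out to `wget`, which must be on the PATH. A standard-library-only alternative is sketched below (illustrative, not part of the original notebook; `download_file` is a hypothetical helper, and unlike `wget -c` it cannot resume partial downloads)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Stdlib-only alternative to the wget wrapper (illustrative sketch;\n",
"# no resume support, unlike wget -c).\n",
"import os\n",
"import urllib.request\n",
"\n",
"def download_file(url, destination=''):\n",
"    # save under the URL's basename inside the destination folder\n",
"    filename = os.path.join(destination, url.rsplit('/', 1)[-1])\n",
"    try:\n",
"        urllib.request.urlretrieve(url, filename)\n",
"        return True\n",
"    except Exception as e:\n",
"        print(e)\n",
"        return False"
]
},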
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"16\n",
"reading word vectors from /mnt/hdd_disk/dan/datasets/embeddings_fastText/ar/wiki.ar.vec\n",
"reading word vectors from /mnt/hdd_disk/dan/datasets/embeddings_fastText/da/wiki.da.vec\n",
"reading word vectors from /mnt/hdd_disk/dan/datasets/embeddings_fastText/de/wiki.de.vec\n",
"reading word vectors from /mnt/hdd_disk/dan/datasets/embeddings_fastText/el/wiki.el.vec\n",
"reading word vectors from /mnt/hdd_disk/dan/datasets/embeddings_fastText/en/wiki.en.vec\n",
"reading word vectors from /mnt/hdd_disk/dan/datasets/embeddings_fastText/es/wiki.es.vec\n",
"reading word vectors from /mnt/hdd_disk/dan/datasets/embeddings_fastText/fa/wiki.fa.vec\n",
"reading word vectors from /mnt/hdd_disk/dan/datasets/embeddings_fastText/hu/wiki.hu.vec\n",
"reading word vectors from /mnt/hdd_disk/dan/datasets/embeddings_fastText/fr/wiki.fr.vec\n",
"reading word vectors from /mnt/hdd_disk/dan/datasets/embeddings_fastText/it/wiki.it.vec\n",
"reading word vectors from /mnt/hdd_disk/dan/datasets/embeddings_fastText/ja/wiki.ja.vec\n",
"reading word vectors from /mnt/hdd_disk/dan/datasets/embeddings_fastText/lv/wiki.lv.vec\n",
"reading word vectors from /mnt/hdd_disk/dan/datasets/embeddings_fastText/pt/wiki.pt.vec\n",
"reading word vectors from /mnt/hdd_disk/dan/datasets/embeddings_fastText/ru/wiki.ru.vec\n",
"reading word vectors from /mnt/hdd_disk/dan/datasets/embeddings_fastText/tr/wiki.tr.vec\n",
"reading word vectors from /mnt/hdd_disk/dan/datasets/embeddings_fastText/uk/wiki.uk.vec\n",
"16\n"
]
}
],
"source": [
"import os\n",
"\n",
"languages = [\"ar\", \"da\", \"de\", \"el\", \"en\", \"es\", \"fa\", \"hu\",\n",
"             \"fr\", \"it\", \"ja\", \"lv\", \"pt\", \"ru\", \"tr\", \"uk\"]\n",
"print(len(languages))\n",
"\n",
"languages_dictionary = {}\n",
"\n",
"path_prefix = \"/mnt/hdd_disk/dan/datasets/embeddings_fastText/\"\n",
"for language in languages:\n",
"    folder_path = path_prefix + language + \"/\"\n",
"    file_path = folder_path + \"wiki.\" + language + \".vec\"\n",
"    if not os.path.isfile(file_path):\n",
"        # create the per-language folder if it does not exist yet\n",
"        if not os.path.isdir(folder_path):\n",
"            os.makedirs(folder_path)\n",
"        downloader = Downloader()\n",
"        downloader.download_manager(\"https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.\" + language + \".vec\", folder_path)\n",
"    languages_dictionary[language] = FastVector(vector_file=file_path)\n",
"\n",
"print(len(languages_dictionary))"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"-0.014731081984914952\n"
]
}
],
"source": [
"fa_vector = languages_dictionary[\"fa\"][\"گربه\"]\n",
"uk_vector = languages_dictionary[\"uk\"][\"кіт\"]\n",
"print(FastVector.cosine_similarity(fa_vector, uk_vector))"
]
},
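{
"cell_type": "markdown",
"metadata": {},
"source": [
"Cosine similarity is the dot product of the two vectors divided by the product of their norms; values near 1 mean the directions agree. A minimal numpy version for reference (illustrative, not part of the original notebook; it assumes `FastVector.cosine_similarity` implements the same standard formula):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Minimal numpy version of cosine similarity (illustrative).\n",
"def cosine_sim(u, v):\n",
"    return np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v))\n",
"\n",
"print(cosine_sim(fa_vector, uk_vector))  # should match the value above"
]
},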
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\"گربه\" and \"кіт\" both mean \"cat\", so they should be highly similar; clearly the two word vector spaces are not yet aligned. To align them, we apply the precomputed alignment matrices from Babylonpartners/fastText_multilingual to each language's vectors:"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"for language, language_dictionary in languages_dictionary.items():\n",
"    language_dictionary.apply_transform('alignment_matrices/' + language + '.txt')"
]
},
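{
"cell_type": "markdown",
"metadata": {},
"source": [
"Under the hood, this alignment step amounts to a single matrix multiplication: the (n_words, dim) embedding matrix is right-multiplied by the (dim, dim) alignment matrix. A toy numpy sketch of that idea (illustrative only; the real logic lives in FastVector's `apply_transform`, and the identity matrix below merely stands in for an `alignment_matrices/<lang>.txt` file):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustrative sketch of the alignment step: map every word vector\n",
"# into the shared space with one matrix multiplication.\n",
"dim = 300\n",
"toy_embeddings = np.random.randn(5, dim)  # stand-in for a language's vectors\n",
"toy_transform = np.eye(dim)               # stand-in for a loaded alignment matrix\n",
"aligned = np.matmul(toy_embeddings, toy_transform)\n",
"print(aligned.shape)  # (5, 300)"
]
},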
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.500099345287656\n"
]
}
],
"source": [
"fa_vector = languages_dictionary[\"fa\"][\"گربه\"]\n",
"uk_vector = languages_dictionary[\"uk\"][\"кіт\"]\n",
"\n",
"print(FastVector.cosine_similarity(fa_vector, uk_vector))"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"for language, language_dictionary in languages_dictionary.items():\n",
"    target_path = path_prefix + language + '/multiling_' + language + '.vec'\n",
"    if not os.path.isfile(target_path):\n",
"        language_dictionary.export(target_path)"
]
},
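{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a round-trip check (illustrative, not part of the original notebook; it reuses only calls already used above), the aligned vectors can be reloaded from the exported files and the similarity recomputed:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustrative check: reload an exported file and recompute the similarity.\n",
"fa_aligned = FastVector(vector_file=path_prefix + \"fa/multiling_fa.vec\")\n",
"uk_aligned = FastVector(vector_file=path_prefix + \"uk/multiling_uk.vec\")\n",
"print(FastVector.cosine_similarity(fa_aligned[\"گربه\"], uk_aligned[\"кіт\"]))\n",
"# expected: ~0.5, matching the in-memory result above"
]
},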
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
}
},
"nbformat": 4,
"nbformat_minor": 1
}