Alignment to English of fastText embeddings (using alignment matrices from Babylonpartners/fastText_multilingual)
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Align vector spaces to English (Multilingual word vectors)!"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"from fasttext import FastVector\n",
"\n",
"# from https://stackoverflow.com/questions/21030391/how-to-normalize-array-numpy\n",
"def normalized(a, axis=-1, order=2):\n",
"    \"\"\"Utility function to normalize the rows of a numpy array.\"\"\"\n",
"    l2 = np.atleast_1d(np.linalg.norm(a, order, axis))\n",
"    l2[l2 == 0] = 1  # avoid division by zero for all-zero rows\n",
"    return a / np.expand_dims(l2, axis)\n",
"\n",
"def make_training_matrices(source_dictionary, target_dictionary, bilingual_dictionary):\n",
"    \"\"\"\n",
"    Source and target dictionaries are the FastVector objects of\n",
"    the source/target languages. bilingual_dictionary is a list of\n",
"    translation pair tuples [(source_word, target_word), ...].\n",
"    \"\"\"\n",
"    source_matrix = []\n",
"    target_matrix = []\n",
"\n",
"    for (source, target) in bilingual_dictionary:\n",
"        if source in source_dictionary and target in target_dictionary:\n",
"            source_matrix.append(source_dictionary[source])\n",
"            target_matrix.append(target_dictionary[target])\n",
"\n",
"    # return training matrices\n",
"    return np.array(source_matrix), np.array(target_matrix)\n",
"\n",
"def learn_transformation(source_matrix, target_matrix, normalize_vectors=True):\n",
"    \"\"\"\n",
"    Source and target matrices are numpy arrays, shape\n",
"    (dictionary_length, embedding_dimension). These contain paired\n",
"    word vectors from the bilingual dictionary.\n",
"    \"\"\"\n",
"    # optionally normalize the training vectors\n",
"    if normalize_vectors:\n",
"        source_matrix = normalized(source_matrix)\n",
"        target_matrix = normalized(target_matrix)\n",
"\n",
"    # perform the SVD of the cross-covariance matrix\n",
"    product = np.matmul(source_matrix.transpose(), target_matrix)\n",
"    U, s, V = np.linalg.svd(product)\n",
"\n",
"    # np.linalg.svd returns V already transposed (V here is really V^T),\n",
"    # so U @ V is the orthogonal Procrustes solution U V^T, which\n",
"    # aligns the source language to the target\n",
"    return np.matmul(U, V)"
]
},
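{
"cell_type": "markdown",
"metadata": {},
"source": [
"`learn_transformation` solves the orthogonal Procrustes problem: find the orthogonal matrix $W$ minimising $\\|SW - T\\|_F$ over paired source/target vectors. A quick sanity check (illustrative, not part of the original notebook; numpy only): rotate random \"source\" vectors by a known orthogonal matrix and confirm the learned transform recovers it."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustrative sanity check: recover a known rotation with learn_transformation.\n",
"rng = np.random.RandomState(0)\n",
"source = rng.randn(1000, 300)\n",
"Q, _ = np.linalg.qr(rng.randn(300, 300))  # random orthogonal matrix\n",
"target = source.dot(Q)  # fake 'translations': exact rotations of the source\n",
"W = learn_transformation(source, target, normalize_vectors=False)\n",
"print(np.allclose(W, Q, atol=1e-8))  # expected: True"
]
},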
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"import subprocess\n",
"\n",
"class Downloader():\n",
"    def download_manager(self, url, destination='', try_number=\"10\", time_out=\"60\"):\n",
"        # returns True iff wget exited with status 0 (successful download)\n",
"        return self._wget_dl(url, destination, try_number, time_out) == 0\n",
"\n",
"    def _wget_dl(self, url, destination, try_number, time_out):\n",
"        # -c resumes partial downloads; -t/-T set retry count and timeout\n",
"        command = [\"wget\", \"-c\", \"-P\", destination, \"-t\", try_number, \"-T\", time_out, url]\n",
"        try:\n",
"            download_state = subprocess.call(command)\n",
"        except Exception as e:\n",
"            print(e)\n",
"            download_state = -1\n",
"        # download_state == 0 => successful download\n",
"        return download_state"
]
},
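{
"cell_type": "markdown",
"metadata": {},
"source": [
"The class above shells out to `wget`, which must be on the PATH. A standard-library-only alternative is sketched below (illustrative, not part of the original notebook; `download_file` is a hypothetical helper, and unlike `wget -c` it cannot resume partial downloads)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Stdlib-only alternative to the wget wrapper (illustrative sketch;\n",
"# no resume support, unlike wget -c).\n",
"import os\n",
"import urllib.request\n",
"\n",
"def download_file(url, destination=''):\n",
"    # save under the URL's basename inside the destination folder\n",
"    filename = os.path.join(destination, url.rsplit('/', 1)[-1])\n",
"    try:\n",
"        urllib.request.urlretrieve(url, filename)\n",
"        return True\n",
"    except Exception as e:\n",
"        print(e)\n",
"        return False"
]
},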
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"16\n",
"reading word vectors from /mnt/hdd_disk/dan/datasets/embeddings_fastText/ar/wiki.ar.vec\n",
"reading word vectors from /mnt/hdd_disk/dan/datasets/embeddings_fastText/da/wiki.da.vec\n",
"reading word vectors from /mnt/hdd_disk/dan/datasets/embeddings_fastText/de/wiki.de.vec\n",
"reading word vectors from /mnt/hdd_disk/dan/datasets/embeddings_fastText/el/wiki.el.vec\n",
"reading word vectors from /mnt/hdd_disk/dan/datasets/embeddings_fastText/en/wiki.en.vec\n",
"reading word vectors from /mnt/hdd_disk/dan/datasets/embeddings_fastText/es/wiki.es.vec\n",
"reading word vectors from /mnt/hdd_disk/dan/datasets/embeddings_fastText/fa/wiki.fa.vec\n",
"reading word vectors from /mnt/hdd_disk/dan/datasets/embeddings_fastText/hu/wiki.hu.vec\n",
"reading word vectors from /mnt/hdd_disk/dan/datasets/embeddings_fastText/fr/wiki.fr.vec\n",
"reading word vectors from /mnt/hdd_disk/dan/datasets/embeddings_fastText/it/wiki.it.vec\n",
"reading word vectors from /mnt/hdd_disk/dan/datasets/embeddings_fastText/ja/wiki.ja.vec\n",
"reading word vectors from /mnt/hdd_disk/dan/datasets/embeddings_fastText/lv/wiki.lv.vec\n",
"reading word vectors from /mnt/hdd_disk/dan/datasets/embeddings_fastText/pt/wiki.pt.vec\n",
"reading word vectors from /mnt/hdd_disk/dan/datasets/embeddings_fastText/ru/wiki.ru.vec\n",
"reading word vectors from /mnt/hdd_disk/dan/datasets/embeddings_fastText/tr/wiki.tr.vec\n",
"reading word vectors from /mnt/hdd_disk/dan/datasets/embeddings_fastText/uk/wiki.uk.vec\n",
"16\n"
]
}
],
"source": [
"import os\n",
"\n",
"languages = [\"ar\", \"da\", \"de\", \"el\", \"en\", \"es\", \"fa\", \"hu\",\n",
"             \"fr\", \"it\", \"ja\", \"lv\", \"pt\", \"ru\", \"tr\", \"uk\"]\n",
"print(len(languages))\n",
"\n",
"languages_dictionary = {}\n",
"\n",
"path_prefix = \"/mnt/hdd_disk/dan/datasets/embeddings_fastText/\"\n",
"for language in languages:\n",
"    folder_path = path_prefix + language + \"/\"\n",
"    file_path = folder_path + \"wiki.\" + language + \".vec\"\n",
"    if not os.path.isfile(file_path):\n",
"        # create the per-language folder if it does not exist yet\n",
"        if not os.path.isdir(folder_path):\n",
"            os.makedirs(folder_path)\n",
"        downloader = Downloader()\n",
"        downloader.download_manager(\"https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.\" + language + \".vec\", folder_path)\n",
"    languages_dictionary[language] = FastVector(vector_file=file_path)\n",
"\n",
"print(len(languages_dictionary))"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"-0.014731081984914952\n"
]
}
],
"source": [
"fa_vector = languages_dictionary[\"fa\"][\"گربه\"]\n",
"uk_vector = languages_dictionary[\"uk\"][\"кіт\"]\n",
"print(FastVector.cosine_similarity(fa_vector, uk_vector))"
]
},
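{
"cell_type": "markdown",
"metadata": {},
"source": [
"Cosine similarity is the dot product of the two vectors divided by the product of their norms; values near 1 mean the directions agree. A minimal numpy version for reference (illustrative, not part of the original notebook; it assumes `FastVector.cosine_similarity` implements the same standard formula):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Minimal numpy version of cosine similarity (illustrative).\n",
"def cosine_sim(u, v):\n",
"    return np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v))\n",
"\n",
"print(cosine_sim(fa_vector, uk_vector))  # should match the value above"
]
},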
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\"گربه\" and \"кіт\" both mean \"cat\", so they should be highly similar; clearly the two word vector spaces are not yet aligned. To align them, we apply the precomputed alignment matrices from Babylonpartners/fastText_multilingual to each language's vectors:"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"for language, language_dictionary in languages_dictionary.items():\n",
"    language_dictionary.apply_transform('alignment_matrices/' + language + '.txt')"
]
},
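{
"cell_type": "markdown",
"metadata": {},
"source": [
"Under the hood, this alignment step amounts to a single matrix multiplication: the (n_words, dim) embedding matrix is right-multiplied by the (dim, dim) alignment matrix. A toy numpy sketch of that idea (illustrative only; the real logic lives in FastVector's `apply_transform`, and the identity matrix below merely stands in for an `alignment_matrices/<lang>.txt` file):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustrative sketch of the alignment step: map every word vector\n",
"# into the shared space with one matrix multiplication.\n",
"dim = 300\n",
"toy_embeddings = np.random.randn(5, dim)  # stand-in for a language's vectors\n",
"toy_transform = np.eye(dim)               # stand-in for a loaded alignment matrix\n",
"aligned = np.matmul(toy_embeddings, toy_transform)\n",
"print(aligned.shape)  # (5, 300)"
]
},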
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.500099345287656\n"
]
}
],
"source": [
"fa_vector = languages_dictionary[\"fa\"][\"گربه\"]\n",
"uk_vector = languages_dictionary[\"uk\"][\"кіт\"]\n",
"\n",
"print(FastVector.cosine_similarity(fa_vector, uk_vector))"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"for language, language_dictionary in languages_dictionary.items():\n",
"    target_path = path_prefix + language + '/multiling_' + language + '.vec'\n",
"    if not os.path.isfile(target_path):\n",
"        language_dictionary.export(target_path)"
]
},
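{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a round-trip check (illustrative, not part of the original notebook; it reuses only calls already used above), the aligned vectors can be reloaded from the exported files and the similarity recomputed:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustrative check: reload an exported file and recompute the similarity.\n",
"fa_aligned = FastVector(vector_file=path_prefix + \"fa/multiling_fa.vec\")\n",
"uk_aligned = FastVector(vector_file=path_prefix + \"uk/multiling_uk.vec\")\n",
"print(FastVector.cosine_similarity(fa_aligned[\"گربه\"], uk_aligned[\"кіт\"]))\n",
"# expected: ~0.5, matching the in-memory result above"
]
},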
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
}
},
"nbformat": 4,
"nbformat_minor": 1
}