reflash · February 25, 2019 15:09
diff --git a/cosine-similarity.ipynb b/cosine-similarity.ipynb
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Requirement already satisfied: sklearn in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (0.0)\n",
      "Requirement already satisfied: scikit-learn in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (from sklearn) (0.20.2)\n",
      "Requirement already satisfied: numpy>=1.8.2 in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (from scikit-learn->sklearn) (1.16.0)\n",
      "Requirement already satisfied: scipy>=0.13.3 in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (from scikit-learn->sklearn) (1.2.0)\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "You are using pip version 18.1, however version 19.0.3 is available.\n",
      "You should consider upgrading via the 'python -m pip install --upgrade pip' command.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Requirement already satisfied: gensim in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (3.7.1)\n",
      "Requirement already satisfied: six>=1.5.0 in c:\\users\\daniil_ekzarian\\appdata\\roaming\\python\\python37\\site-packages (from gensim) (1.12.0)\n",
      "Requirement already satisfied: smart-open>=1.7.0 in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (from gensim) (1.8.0)\n",
      "Requirement already satisfied: scipy>=0.18.1 in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (from gensim) (1.2.0)\n",
      "Requirement already satisfied: numpy>=1.11.3 in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (from gensim) (1.16.0)\n",
      "Requirement already satisfied: boto>=2.32 in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (from smart-open>=1.7.0->gensim) (2.49.0)\n",
      "Requirement already satisfied: bz2file in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (from smart-open>=1.7.0->gensim) (0.98)\n",
      "Requirement already satisfied: requests in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (from smart-open>=1.7.0->gensim) (2.21.0)\n",
      "Requirement already satisfied: boto3 in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (from smart-open>=1.7.0->gensim) (1.9.90)\n",
      "Requirement already satisfied: certifi>=2017.4.17 in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (from requests->smart-open>=1.7.0->gensim) (2018.11.29)\n",
      "Requirement already satisfied: idna<2.9,>=2.5 in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (from requests->smart-open>=1.7.0->gensim) (2.8)\n",
      "Requirement already satisfied: chardet<3.1.0,>=3.0.2 in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (from requests->smart-open>=1.7.0->gensim) (3.0.4)\n",
      "Requirement already satisfied: urllib3<1.25,>=1.21.1 in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (from requests->smart-open>=1.7.0->gensim) (1.24.1)\n",
      "Requirement already satisfied: s3transfer<0.3.0,>=0.2.0 in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (from boto3->smart-open>=1.7.0->gensim) (0.2.0)\n",
      "Requirement already satisfied: botocore<1.13.0,>=1.12.90 in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (from boto3->smart-open>=1.7.0->gensim) (1.12.90)\n",
      "Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (from boto3->smart-open>=1.7.0->gensim) (0.9.3)\n",
      "Requirement already satisfied: python-dateutil<3.0.0,>=2.1; python_version >= \"2.7\" in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (from botocore<1.13.0,>=1.12.90->boto3->smart-open>=1.7.0->gensim) (2.7.5)\n",
      "Requirement already satisfied: docutils>=0.10 in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (from botocore<1.13.0,>=1.12.90->boto3->smart-open>=1.7.0->gensim) (0.14)\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "You are using pip version 18.1, however version 19.0.3 is available.\n",
      "You should consider upgrading via the 'python -m pip install --upgrade pip' command.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Requirement already satisfied: pandas in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (0.23.4)\n",
      "Requirement already satisfied: pytz>=2011k in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (from pandas) (2018.9)\n",
      "Requirement already satisfied: python-dateutil>=2.5.0 in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (from pandas) (2.7.5)\n",
      "Requirement already satisfied: numpy>=1.9.0 in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (from pandas) (1.16.0)\n",
      "Requirement already satisfied: six>=1.5 in c:\\users\\daniil_ekzarian\\appdata\\roaming\\python\\python37\\site-packages (from python-dateutil>=2.5.0->pandas) (1.12.0)\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "You are using pip version 18.1, however version 19.0.3 is available.\n",
      "You should consider upgrading via the 'python -m pip install --upgrade pip' command.\n"
     ]
    }
   ],
   "source": [
    "import sys\n",
    "!{sys.executable} -m pip install sklearn\n",
    "!{sys.executable} -m pip install gensim\n",
    "!{sys.executable} -m pip install pandas"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Define the documents\n",
    "doc_trump = \"Mr. Trump became president after winning the political election. Though he lost the support of some republican friends, Trump is friends with President Putin\"\n",
    "\n",
    "doc_election = \"President Trump says Putin had no political interference is the election outcome. He says it was a witchhunt by political parties. He claimed President Putin is a friend who had nothing to do with the election\"\n",
    "\n",
    "doc_putin = \"Post elections, Vladimir Putin became President of Russia. President Putin had served as the Prime Minister earlier in his political career\"\n",
    "\n",
    "documents = [doc_trump, doc_election, doc_putin]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>after</th>\n",
       "      <th>as</th>\n",
       "      <th>became</th>\n",
       "      <th>by</th>\n",
       "      <th>career</th>\n",
       "      <th>claimed</th>\n",
       "      <th>do</th>\n",
       "      <th>earlier</th>\n",
       "      <th>election</th>\n",
       "      <th>elections</th>\n",
       "      <th>...</th>\n",
       "      <th>the</th>\n",
       "      <th>though</th>\n",
       "      <th>to</th>\n",
       "      <th>trump</th>\n",
       "      <th>vladimir</th>\n",
       "      <th>was</th>\n",
       "      <th>who</th>\n",
       "      <th>winning</th>\n",
       "      <th>witchhunt</th>\n",
       "      <th>with</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>doc_trump</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>doc_election</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>doc_putin</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>...</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>3 rows × 48 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "              after  as  became  by  career  claimed  do  earlier  election  \\\n",
       "doc_trump         1   0       1   0       0        0   0        0         1   \n",
       "doc_election      0   0       0   1       0        1   1        0         2   \n",
       "doc_putin         0   1       1   0       1        0   0        1         0   \n",
       "\n",
       "              elections  ...   the  though  to  trump  vladimir  was  who  \\\n",
       "doc_trump             0  ...     2       1   0      2         0    0    0   \n",
       "doc_election          0  ...     2       0   1      1         0    1    1   \n",
       "doc_putin             1  ...     1       0   0      0         1    0    0   \n",
       "\n",
       "              winning  witchhunt  with  \n",
       "doc_trump           1          0     1  \n",
       "doc_election        0          1     1  \n",
       "doc_putin           0          0     0  \n",
       "\n",
       "[3 rows x 48 columns]"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Scikit Learn\n",
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "import pandas as pd\n",
    "\n",
    "# Create the Document Term Matrix\n",
    "count_vectorizer = CountVectorizer(stop_words='english')\n",
    "count_vectorizer = CountVectorizer()\n",
    "sparse_matrix = count_vectorizer.fit_transform(documents)\n",
    "\n",
    "# OPTIONAL: Convert Sparse Matrix to Pandas Dataframe if you want to see the word frequencies.\n",
    "doc_term_matrix = sparse_matrix.todense()\n",
    "df = pd.DataFrame(doc_term_matrix, \n",
    "                  columns=count_vectorizer.get_feature_names(), \n",
    "                  index=['doc_trump', 'doc_election', 'doc_putin'])\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[1.         0.51480485 0.38890873]\n",
      " [0.51480485 1.         0.38829014]\n",
      " [0.38890873 0.38829014 1.        ]]\n"
     ]
    }
   ],
   "source": [
    "# Compute Cosine Similarity\n",
    "from sklearn.metrics.pairwise import cosine_similarity\n",
    "print(cosine_similarity(df, df))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Define the documents\n",
    "doc_soup = \"Soup is a primarily liquid food, generally served warm or hot (but may be cool or cold), that is made by combining ingredients of meat or vegetables with stock, juice, water, or another liquid. \"\n",
    "\n",
    "doc_noodles = \"Noodles are a staple food in many cultures. They are made from unleavened dough which is stretched, extruded, or rolled flat and cut into one of a variety of shapes.\"\n",
    "\n",
    "doc_dosa = \"Dosa is a type of pancake from the Indian subcontinent, made from a fermented batter. It is somewhat similar to a crepe in appearance. Its main ingredients are rice and black gram.\"\n",
    "\n",
    "documents = [doc_trump, doc_election, doc_putin, doc_soup, doc_noodles, doc_dosa]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "3.7.1\n"
     ]
    },
    {
     "ename": "MemoryError",
     "evalue": "",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mMemoryError\u001b[0m                               Traceback (most recent call last)",
      "\u001b[1;32m<ipython-input-13-f2f4ba676c0c>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[0;32m     10\u001b[0m \u001b[1;31m# Download the FastText model\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     11\u001b[0m \u001b[1;31m# WARNING! the 3.7.1 model is about 1GB\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 12\u001b[1;33m \u001b[0mfasttext_model300\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mapi\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mload\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'fasttext-wiki-news-subwords-300'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[1;32mc:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages\\gensim\\downloader.py\u001b[0m in \u001b[0;36mload\u001b[1;34m(name, return_path)\u001b[0m\n\u001b[0;32m    440\u001b[0m         \u001b[0msys\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mpath\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0minsert\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mbase_dir\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    441\u001b[0m         \u001b[0mmodule\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0m__import__\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mname\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 442\u001b[1;33m         \u001b[1;32mreturn\u001b[0m \u001b[0mmodule\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mload_data\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    443\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    444\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32m~\\gensim-data\\fasttext-wiki-news-subwords-300\\__init__.py\u001b[0m in \u001b[0;36mload_data\u001b[1;34m()\u001b[0m\n\u001b[0;32m      6\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mload_data\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m      7\u001b[0m     \u001b[0mpath\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mos\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mpath\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mbase_dir\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m'fasttext-wiki-news-subwords-300'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m\"fasttext-wiki-news-subwords-300.gz\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 8\u001b[1;33m     \u001b[0mmodel\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mKeyedVectors\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mload_word2vec_format\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mpath\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mbinary\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mFalse\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m      9\u001b[0m     \u001b[1;32mreturn\u001b[0m \u001b[0mmodel\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mc:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages\\gensim\\models\\keyedvectors.py\u001b[0m in \u001b[0;36mload_word2vec_format\u001b[1;34m(cls, fname, fvocab, binary, encoding, unicode_errors, limit, datatype)\u001b[0m\n\u001b[0;32m   1474\u001b[0m         return _load_word2vec_format(\n\u001b[0;32m   1475\u001b[0m             \u001b[0mcls\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mfname\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mfvocab\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mfvocab\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mbinary\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mbinary\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mencoding\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mencoding\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0municode_errors\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0municode_errors\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1476\u001b[1;33m             limit=limit, datatype=datatype)\n\u001b[0m\u001b[0;32m   1477\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   1478\u001b[0m     \u001b[1;32mdef\u001b[0m \u001b[0mget_keras_embedding\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mtrain_embeddings\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mFalse\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mc:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages\\gensim\\models\\utils_any2vec.py\u001b[0m in \u001b[0;36m_load_word2vec_format\u001b[1;34m(cls, fname, fvocab, binary, encoding, unicode_errors, limit, datatype)\u001b[0m\n\u001b[0;32m    347\u001b[0m         \u001b[0mresult\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mcls\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mvector_size\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    348\u001b[0m         \u001b[0mresult\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mvector_size\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mvector_size\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 349\u001b[1;33m         \u001b[0mresult\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mvectors\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mzeros\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mvocab_size\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mvector_size\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mdatatype\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    350\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    351\u001b[0m         \u001b[1;32mdef\u001b[0m \u001b[0madd_word\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mword\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mweights\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;31mMemoryError\u001b[0m: "
     ]
    }
   ],
   "source": [
    "import gensim\n",
    "# upgrade gensim if you can't import softcossim\n",
    "from gensim.matutils import softcossim \n",
    "from gensim import corpora\n",
    "import gensim.downloader as api\n",
    "from gensim.utils import simple_preprocess\n",
    "print(gensim.__version__)\n",
    "#> '3.6.0'\n",
    "\n",
    "# Download the FastText model\n",
    "# WARNING! the 3.7.1 model is about 1GB\n",
    "fasttext_model300 = api.load('fasttext-wiki-news-subwords-300')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "ename": "NameError",
     "evalue": "name 'fasttext_model300' is not defined",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mNameError\u001b[0m                                 Traceback (most recent call last)",
      "\u001b[1;32m<ipython-input-14-70ef5acc1290>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[0;32m      3\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m      4\u001b[0m \u001b[1;31m# Prepare the similarity matrix\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 5\u001b[1;33m \u001b[0msimilarity_matrix\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mfasttext_model300\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msimilarity_matrix\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdictionary\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mtfidf\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mNone\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mthreshold\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;36m0.0\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mexponent\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;36m2.0\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mnonzero_limit\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;36m100\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m      6\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m      7\u001b[0m \u001b[1;31m# Convert the sentences into bag-of-words vectors.\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;31mNameError\u001b[0m: name 'fasttext_model300' is not defined"
     ]
    }
   ],
   "source": [
    "# Prepare a dictionary and a corpus.\n",
    "dictionary = corpora.Dictionary([simple_preprocess(doc) for doc in documents])\n",
    "\n",
    "# Prepare the similarity matrix\n",
    "similarity_matrix = fasttext_model300.similarity_matrix(dictionary, tfidf=None, threshold=0.0, exponent=2.0, nonzero_limit=100)\n",
    "\n",
    "# Convert the sentences into bag-of-words vectors.\n",
    "sent_1 = dictionary.doc2bow(simple_preprocess(doc_trump))\n",
    "sent_2 = dictionary.doc2bow(simple_preprocess(doc_election))\n",
    "sent_3 = dictionary.doc2bow(simple_preprocess(doc_putin))\n",
    "sent_4 = dictionary.doc2bow(simple_preprocess(doc_soup))\n",
    "sent_5 = dictionary.doc2bow(simple_preprocess(doc_noodles))\n",
    "sent_6 = dictionary.doc2bow(simple_preprocess(doc_dosa))\n",
    "\n",
    "sentences = [sent_1, sent_2, sent_3, sent_4, sent_5, sent_6]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "ename": "NameError",
     "evalue": "name 'sent_1' is not defined",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mNameError\u001b[0m                                 Traceback (most recent call last)",
      "\u001b[1;32m<ipython-input-11-5b2ee7b32951>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[0;32m      1\u001b[0m \u001b[1;31m# Compute soft cosine similarity\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 2\u001b[1;33m \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0msoftcossim\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0msent_1\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0msent_2\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0msimilarity_matrix\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m      3\u001b[0m \u001b[1;31m#> 0.567228632589\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;31mNameError\u001b[0m: name 'sent_1' is not defined"
     ]
    }
   ],
   "source": [
    "# Compute soft cosine similarity\n",
    "print(softcossim(sent_1, sent_2, similarity_matrix))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
 }
No results found