Created
February 25, 2019 15:09
-
-
Save reflash/f0a089b493346c7f0099ccbe8c08417d to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "code", | |
| "execution_count": 7, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "Requirement already satisfied: sklearn in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (0.0)\n", | |
| "Requirement already satisfied: scikit-learn in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (from sklearn) (0.20.2)\n", | |
| "Requirement already satisfied: numpy>=1.8.2 in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (from scikit-learn->sklearn) (1.16.0)\n", | |
| "Requirement already satisfied: scipy>=0.13.3 in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (from scikit-learn->sklearn) (1.2.0)\n" | |
| ] | |
| }, | |
| { | |
| "name": "stderr", | |
| "output_type": "stream", | |
| "text": [ | |
| "You are using pip version 18.1, however version 19.0.3 is available.\n", | |
| "You should consider upgrading via the 'python -m pip install --upgrade pip' command.\n" | |
| ] | |
| }, | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "Requirement already satisfied: gensim in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (3.7.1)\n", | |
| "Requirement already satisfied: six>=1.5.0 in c:\\users\\daniil_ekzarian\\appdata\\roaming\\python\\python37\\site-packages (from gensim) (1.12.0)\n", | |
| "Requirement already satisfied: smart-open>=1.7.0 in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (from gensim) (1.8.0)\n", | |
| "Requirement already satisfied: scipy>=0.18.1 in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (from gensim) (1.2.0)\n", | |
| "Requirement already satisfied: numpy>=1.11.3 in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (from gensim) (1.16.0)\n", | |
| "Requirement already satisfied: boto>=2.32 in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (from smart-open>=1.7.0->gensim) (2.49.0)\n", | |
| "Requirement already satisfied: bz2file in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (from smart-open>=1.7.0->gensim) (0.98)\n", | |
| "Requirement already satisfied: requests in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (from smart-open>=1.7.0->gensim) (2.21.0)\n", | |
| "Requirement already satisfied: boto3 in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (from smart-open>=1.7.0->gensim) (1.9.90)\n", | |
| "Requirement already satisfied: certifi>=2017.4.17 in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (from requests->smart-open>=1.7.0->gensim) (2018.11.29)\n", | |
| "Requirement already satisfied: idna<2.9,>=2.5 in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (from requests->smart-open>=1.7.0->gensim) (2.8)\n", | |
| "Requirement already satisfied: chardet<3.1.0,>=3.0.2 in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (from requests->smart-open>=1.7.0->gensim) (3.0.4)\n", | |
| "Requirement already satisfied: urllib3<1.25,>=1.21.1 in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (from requests->smart-open>=1.7.0->gensim) (1.24.1)\n", | |
| "Requirement already satisfied: s3transfer<0.3.0,>=0.2.0 in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (from boto3->smart-open>=1.7.0->gensim) (0.2.0)\n", | |
| "Requirement already satisfied: botocore<1.13.0,>=1.12.90 in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (from boto3->smart-open>=1.7.0->gensim) (1.12.90)\n", | |
| "Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (from boto3->smart-open>=1.7.0->gensim) (0.9.3)\n", | |
| "Requirement already satisfied: python-dateutil<3.0.0,>=2.1; python_version >= \"2.7\" in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (from botocore<1.13.0,>=1.12.90->boto3->smart-open>=1.7.0->gensim) (2.7.5)\n", | |
| "Requirement already satisfied: docutils>=0.10 in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (from botocore<1.13.0,>=1.12.90->boto3->smart-open>=1.7.0->gensim) (0.14)\n" | |
| ] | |
| }, | |
| { | |
| "name": "stderr", | |
| "output_type": "stream", | |
| "text": [ | |
| "You are using pip version 18.1, however version 19.0.3 is available.\n", | |
| "You should consider upgrading via the 'python -m pip install --upgrade pip' command.\n" | |
| ] | |
| }, | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "Requirement already satisfied: pandas in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (0.23.4)\n", | |
| "Requirement already satisfied: pytz>=2011k in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (from pandas) (2018.9)\n", | |
| "Requirement already satisfied: python-dateutil>=2.5.0 in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (from pandas) (2.7.5)\n", | |
| "Requirement already satisfied: numpy>=1.9.0 in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (from pandas) (1.16.0)\n", | |
| "Requirement already satisfied: six>=1.5 in c:\\users\\daniil_ekzarian\\appdata\\roaming\\python\\python37\\site-packages (from python-dateutil>=2.5.0->pandas) (1.12.0)\n" | |
| ] | |
| }, | |
| { | |
| "name": "stderr", | |
| "output_type": "stream", | |
| "text": [ | |
| "You are using pip version 18.1, however version 19.0.3 is available.\n", | |
| "You should consider upgrading via the 'python -m pip install --upgrade pip' command.\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "import sys\n", | |
| "!{sys.executable} -m pip install sklearn\n", | |
| "!{sys.executable} -m pip install gensim\n", | |
| "!{sys.executable} -m pip install pandas" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 3, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "# Define the documents\n", | |
| "doc_trump = \"Mr. Trump became president after winning the political election. Though he lost the support of some republican friends, Trump is friends with President Putin\"\n", | |
| "\n", | |
| "doc_election = \"President Trump says Putin had no political interference is the election outcome. He says it was a witchhunt by political parties. He claimed President Putin is a friend who had nothing to do with the election\"\n", | |
| "\n", | |
| "doc_putin = \"Post elections, Vladimir Putin became President of Russia. President Putin had served as the Prime Minister earlier in his political career\"\n", | |
| "\n", | |
| "documents = [doc_trump, doc_election, doc_putin]" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 4, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>after</th>\n", | |
| " <th>as</th>\n", | |
| " <th>became</th>\n", | |
| " <th>by</th>\n", | |
| " <th>career</th>\n", | |
| " <th>claimed</th>\n", | |
| " <th>do</th>\n", | |
| " <th>earlier</th>\n", | |
| " <th>election</th>\n", | |
| " <th>elections</th>\n", | |
| " <th>...</th>\n", | |
| " <th>the</th>\n", | |
| " <th>though</th>\n", | |
| " <th>to</th>\n", | |
| " <th>trump</th>\n", | |
| " <th>vladimir</th>\n", | |
| " <th>was</th>\n", | |
| " <th>who</th>\n", | |
| " <th>winning</th>\n", | |
| " <th>witchhunt</th>\n", | |
| " <th>with</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>doc_trump</th>\n", | |
| " <td>1</td>\n", | |
| " <td>0</td>\n", | |
| " <td>1</td>\n", | |
| " <td>0</td>\n", | |
| " <td>0</td>\n", | |
| " <td>0</td>\n", | |
| " <td>0</td>\n", | |
| " <td>0</td>\n", | |
| " <td>1</td>\n", | |
| " <td>0</td>\n", | |
| " <td>...</td>\n", | |
| " <td>2</td>\n", | |
| " <td>1</td>\n", | |
| " <td>0</td>\n", | |
| " <td>2</td>\n", | |
| " <td>0</td>\n", | |
| " <td>0</td>\n", | |
| " <td>0</td>\n", | |
| " <td>1</td>\n", | |
| " <td>0</td>\n", | |
| " <td>1</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>doc_election</th>\n", | |
| " <td>0</td>\n", | |
| " <td>0</td>\n", | |
| " <td>0</td>\n", | |
| " <td>1</td>\n", | |
| " <td>0</td>\n", | |
| " <td>1</td>\n", | |
| " <td>1</td>\n", | |
| " <td>0</td>\n", | |
| " <td>2</td>\n", | |
| " <td>0</td>\n", | |
| " <td>...</td>\n", | |
| " <td>2</td>\n", | |
| " <td>0</td>\n", | |
| " <td>1</td>\n", | |
| " <td>1</td>\n", | |
| " <td>0</td>\n", | |
| " <td>1</td>\n", | |
| " <td>1</td>\n", | |
| " <td>0</td>\n", | |
| " <td>1</td>\n", | |
| " <td>1</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>doc_putin</th>\n", | |
| " <td>0</td>\n", | |
| " <td>1</td>\n", | |
| " <td>1</td>\n", | |
| " <td>0</td>\n", | |
| " <td>1</td>\n", | |
| " <td>0</td>\n", | |
| " <td>0</td>\n", | |
| " <td>1</td>\n", | |
| " <td>0</td>\n", | |
| " <td>1</td>\n", | |
| " <td>...</td>\n", | |
| " <td>1</td>\n", | |
| " <td>0</td>\n", | |
| " <td>0</td>\n", | |
| " <td>0</td>\n", | |
| " <td>1</td>\n", | |
| " <td>0</td>\n", | |
| " <td>0</td>\n", | |
| " <td>0</td>\n", | |
| " <td>0</td>\n", | |
| " <td>0</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "<p>3 rows × 48 columns</p>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " after as became by career claimed do earlier election \\\n", | |
| "doc_trump 1 0 1 0 0 0 0 0 1 \n", | |
| "doc_election 0 0 0 1 0 1 1 0 2 \n", | |
| "doc_putin 0 1 1 0 1 0 0 1 0 \n", | |
| "\n", | |
| " elections ... the though to trump vladimir was who \\\n", | |
| "doc_trump 0 ... 2 1 0 2 0 0 0 \n", | |
| "doc_election 0 ... 2 0 1 1 0 1 1 \n", | |
| "doc_putin 1 ... 1 0 0 0 1 0 0 \n", | |
| "\n", | |
| " winning witchhunt with \n", | |
| "doc_trump 1 0 1 \n", | |
| "doc_election 0 1 1 \n", | |
| "doc_putin 0 0 0 \n", | |
| "\n", | |
| "[3 rows x 48 columns]" | |
| ] | |
| }, | |
| "execution_count": 4, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "# Scikit Learn\n", | |
| "from sklearn.feature_extraction.text import CountVectorizer\n", | |
| "import pandas as pd\n", | |
| "\n", | |
| "# Create the Document Term Matrix\n", | |
| "count_vectorizer = CountVectorizer(stop_words='english')\n", | |
| "count_vectorizer = CountVectorizer()\n", | |
| "sparse_matrix = count_vectorizer.fit_transform(documents)\n", | |
| "\n", | |
| "# OPTIONAL: Convert Sparse Matrix to Pandas Dataframe if you want to see the word frequencies.\n", | |
| "doc_term_matrix = sparse_matrix.todense()\n", | |
| "df = pd.DataFrame(doc_term_matrix, \n", | |
| " columns=count_vectorizer.get_feature_names(), \n", | |
| " index=['doc_trump', 'doc_election', 'doc_putin'])\n", | |
| "df" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 5, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "[[1. 0.51480485 0.38890873]\n", | |
| " [0.51480485 1. 0.38829014]\n", | |
| " [0.38890873 0.38829014 1. ]]\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "# Compute Cosine Similarity\n", | |
| "from sklearn.metrics.pairwise import cosine_similarity\n", | |
| "print(cosine_similarity(df, df))" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 8, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "# Define the documents\n", | |
| "doc_soup = \"Soup is a primarily liquid food, generally served warm or hot (but may be cool or cold), that is made by combining ingredients of meat or vegetables with stock, juice, water, or another liquid. \"\n", | |
| "\n", | |
| "doc_noodles = \"Noodles are a staple food in many cultures. They are made from unleavened dough which is stretched, extruded, or rolled flat and cut into one of a variety of shapes.\"\n", | |
| "\n", | |
| "doc_dosa = \"Dosa is a type of pancake from the Indian subcontinent, made from a fermented batter. It is somewhat similar to a crepe in appearance. Its main ingredients are rice and black gram.\"\n", | |
| "\n", | |
| "documents = [doc_trump, doc_election, doc_putin, doc_soup, doc_noodles, doc_dosa]" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 13, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "3.7.1\n" | |
| ] | |
| }, | |
| { | |
| "ename": "MemoryError", | |
| "evalue": "", | |
| "output_type": "error", | |
| "traceback": [ | |
| "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", | |
| "\u001b[1;31mMemoryError\u001b[0m Traceback (most recent call last)", | |
| "\u001b[1;32m<ipython-input-13-f2f4ba676c0c>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[0;32m 10\u001b[0m \u001b[1;31m# Download the FastText model\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 11\u001b[0m \u001b[1;31m# WARNING! the 3.7.1 model is about 1GB\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 12\u001b[1;33m \u001b[0mfasttext_model300\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mapi\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mload\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'fasttext-wiki-news-subwords-300'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", | |
| "\u001b[1;32mc:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages\\gensim\\downloader.py\u001b[0m in \u001b[0;36mload\u001b[1;34m(name, return_path)\u001b[0m\n\u001b[0;32m 440\u001b[0m \u001b[0msys\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mpath\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0minsert\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mbase_dir\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 441\u001b[0m \u001b[0mmodule\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0m__import__\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mname\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 442\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mmodule\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mload_data\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 443\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 444\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", | |
| "\u001b[1;32m~\\gensim-data\\fasttext-wiki-news-subwords-300\\__init__.py\u001b[0m in \u001b[0;36mload_data\u001b[1;34m()\u001b[0m\n\u001b[0;32m 6\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mload_data\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 7\u001b[0m \u001b[0mpath\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mos\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mpath\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mbase_dir\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m'fasttext-wiki-news-subwords-300'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m\"fasttext-wiki-news-subwords-300.gz\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 8\u001b[1;33m \u001b[0mmodel\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mKeyedVectors\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mload_word2vec_format\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mpath\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mbinary\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mFalse\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 9\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mmodel\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", | |
| "\u001b[1;32mc:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages\\gensim\\models\\keyedvectors.py\u001b[0m in \u001b[0;36mload_word2vec_format\u001b[1;34m(cls, fname, fvocab, binary, encoding, unicode_errors, limit, datatype)\u001b[0m\n\u001b[0;32m 1474\u001b[0m return _load_word2vec_format(\n\u001b[0;32m 1475\u001b[0m \u001b[0mcls\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mfname\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mfvocab\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mfvocab\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mbinary\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mbinary\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mencoding\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mencoding\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0municode_errors\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0municode_errors\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1476\u001b[1;33m limit=limit, datatype=datatype)\n\u001b[0m\u001b[0;32m 1477\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1478\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mget_keras_embedding\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mtrain_embeddings\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mFalse\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", | |
| "\u001b[1;32mc:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages\\gensim\\models\\utils_any2vec.py\u001b[0m in \u001b[0;36m_load_word2vec_format\u001b[1;34m(cls, fname, fvocab, binary, encoding, unicode_errors, limit, datatype)\u001b[0m\n\u001b[0;32m 347\u001b[0m \u001b[0mresult\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mcls\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mvector_size\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 348\u001b[0m \u001b[0mresult\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mvector_size\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mvector_size\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 349\u001b[1;33m \u001b[0mresult\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mvectors\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mzeros\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mvocab_size\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mvector_size\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mdatatype\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 350\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 351\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0madd_word\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mword\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mweights\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", | |
| "\u001b[1;31mMemoryError\u001b[0m: " | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "import gensim\n", | |
| "# upgrade gensim if you can't import softcossim\n", | |
| "from gensim.matutils import softcossim \n", | |
| "from gensim import corpora\n", | |
| "import gensim.downloader as api\n", | |
| "from gensim.utils import simple_preprocess\n", | |
| "print(gensim.__version__)\n", | |
| "#> '3.6.0'\n", | |
| "\n", | |
| "# Download the FastText model\n", | |
| "# WARNING! the 3.7.1 model is about 1GB\n", | |
| "fasttext_model300 = api.load('fasttext-wiki-news-subwords-300')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 14, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "ename": "NameError", | |
| "evalue": "name 'fasttext_model300' is not defined", | |
| "output_type": "error", | |
| "traceback": [ | |
| "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", | |
| "\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)", | |
| "\u001b[1;32m<ipython-input-14-70ef5acc1290>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[0;32m 3\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 4\u001b[0m \u001b[1;31m# Prepare the similarity matrix\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 5\u001b[1;33m \u001b[0msimilarity_matrix\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mfasttext_model300\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msimilarity_matrix\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdictionary\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mtfidf\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mNone\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mthreshold\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;36m0.0\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mexponent\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;36m2.0\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mnonzero_limit\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;36m100\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 6\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 7\u001b[0m \u001b[1;31m# Convert the sentences into bag-of-words vectors.\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", | |
| "\u001b[1;31mNameError\u001b[0m: name 'fasttext_model300' is not defined" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "# Prepare a dictionary and a corpus.\n", | |
| "dictionary = corpora.Dictionary([simple_preprocess(doc) for doc in documents])\n", | |
| "\n", | |
| "# Prepare the similarity matrix\n", | |
| "similarity_matrix = fasttext_model300.similarity_matrix(dictionary, tfidf=None, threshold=0.0, exponent=2.0, nonzero_limit=100)\n", | |
| "\n", | |
| "# Convert the sentences into bag-of-words vectors.\n", | |
| "sent_1 = dictionary.doc2bow(simple_preprocess(doc_trump))\n", | |
| "sent_2 = dictionary.doc2bow(simple_preprocess(doc_election))\n", | |
| "sent_3 = dictionary.doc2bow(simple_preprocess(doc_putin))\n", | |
| "sent_4 = dictionary.doc2bow(simple_preprocess(doc_soup))\n", | |
| "sent_5 = dictionary.doc2bow(simple_preprocess(doc_noodles))\n", | |
| "sent_6 = dictionary.doc2bow(simple_preprocess(doc_dosa))\n", | |
| "\n", | |
| "sentences = [sent_1, sent_2, sent_3, sent_4, sent_5, sent_6]" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 11, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "ename": "NameError", | |
| "evalue": "name 'sent_1' is not defined", | |
| "output_type": "error", | |
| "traceback": [ | |
| "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", | |
| "\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)", | |
| "\u001b[1;32m<ipython-input-11-5b2ee7b32951>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[1;31m# Compute soft cosine similarity\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 2\u001b[1;33m \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0msoftcossim\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0msent_1\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0msent_2\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0msimilarity_matrix\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 3\u001b[0m \u001b[1;31m#> 0.567228632589\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", | |
| "\u001b[1;31mNameError\u001b[0m: name 'sent_1' is not defined" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "# Compute soft cosine similarity\n", | |
| "print(softcossim(sent_1, sent_2, similarity_matrix))" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "Python 3", | |
| "language": "python", | |
| "name": "python3" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.7.2" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 2 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment