AashishTiwari · March 26, 2018 13:16
diff --git a/radar_charts.ipynb b/radar_charts.ipynb
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<script>requirejs.config({paths: { 'plotly': ['https://cdn.plot.ly/plotly-latest.min']},});if(!window.Plotly) {{require(['plotly'],function(plotly) {window.Plotly=plotly;});}}</script>"
      ],
      "text/vnd.plotly.v1+html": [
       "<script>requirejs.config({paths: { 'plotly': ['https://cdn.plot.ly/plotly-latest.min']},});if(!window.Plotly) {{require(['plotly'],function(plotly) {window.Plotly=plotly;});}}</script>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import plotly.plotly as py\n",
    "import plotly.graph_objs as go\n",
    "from plotly.offline import init_notebook_mode, plot, iplot\n",
    "init_notebook_mode(connected=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Load the cleaned intent sheet; the listed placeholder strings are read as NaN.\n",
    "df_intents = pd.read_excel('hlb_intents_cleaned.xlsx', header=0, names=['intents','variations'],\n",
    "                          na_values=[\"NA\", \"TBC\", \"Not Clear\", \"Not In Corpus\", \"\", \" \"])\n",
    "\n",
    "# Normalise both text columns: lower-case, drop punctuation, trim whitespace.\n",
    "# The pattern is a raw string so \\w and \\s reach the regex engine verbatim\n",
    "# instead of relying on Python's deprecated invalid-escape fallback.\n",
    "# NOTE(review): pandas >= 2.0 treats the pattern literally unless regex=True\n",
    "# is passed (keyword available from pandas 0.23) -- confirm the target version.\n",
    "df_intents.variations = df_intents.variations.str.lower().str.replace(r'[^\\w\\s]', '').str.strip()\n",
    "df_intents.intents = df_intents.intents.str.lower().str.replace(r'[^\\w\\s]', '').str.strip()\n",
    "\n",
    "# Drop exact duplicate (intent, variation) rows.\n",
    "df_intents = df_intents.drop_duplicates()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "Int64Index: 6723 entries, 0 to 6727\n",
      "Data columns (total 2 columns):\n",
      "intents       6723 non-null object\n",
      "variations    6723 non-null object\n",
      "dtypes: object(2)\n",
      "memory usage: 157.6+ KB\n"
     ]
    }
   ],
   "source": [
    "df_intents.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Load the manually marked results from sheet \"ManualMarkingRaw\" (columns C-G and V).\n",
    "# NOTE(review): `sheetname`/`parse_cols` are the older pandas spellings of\n",
    "# `sheet_name`/`usecols`; kept unchanged here -- confirm before upgrading pandas.\n",
    "df_man1 = pd.read_excel('WatsonAccuracyTrend.xlsx', sheetname=\"ManualMarkingRaw\", parse_cols=\"C,D,E,F,G,V\",\n",
    "                        header=0, names=['question','answer', 'key_question', 'confidence', 'ux', 'expected_kq'],\n",
    "                        na_values=[\"NA\", \"TBC\", \"Not Clear\", \"Not In Corpus\", \"\", \" \"]\n",
    "                       )\n",
    "\n",
    "# Normalise the text columns: lower-case, drop punctuation, trim whitespace.\n",
    "# Raw-string pattern: \\w and \\s are regex classes, not Python string escapes.\n",
    "# NOTE(review): pandas >= 2.0 needs regex=True here -- confirm the target version.\n",
    "df_man1.question = df_man1.question.str.lower().str.replace(r'[^\\w\\s]', '').str.strip()\n",
    "df_man1.expected_kq = df_man1.expected_kq.str.lower().str.replace(r'[^\\w\\s]', '').str.strip()\n",
    "\n",
    "# Keep (expected key question, question) pairs, de-duplicated, under the same\n",
    "# column names as df_intents, then drop rows with a missing value.\n",
    "df_man1 = df_man1[[\"expected_kq\", \"question\"]].drop_duplicates()\n",
    "df_man1.columns = ['intents', 'variations']\n",
    "df_man1 = df_man1.dropna()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "Int64Index: 10635 entries, 0 to 21958\n",
      "Data columns (total 2 columns):\n",
      "intents       10635 non-null object\n",
      "variations    10635 non-null object\n",
      "dtypes: object(2)\n",
      "memory usage: 249.3+ KB\n"
     ]
    }
   ],
   "source": [
    "df_man1.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "Int64Index: 16503 entries, 0 to 21958\n",
      "Data columns (total 2 columns):\n",
      "intents       16503 non-null object\n",
      "variations    16503 non-null object\n",
      "dtypes: object(2)\n",
      "memory usage: 386.8+ KB\n"
     ]
    }
   ],
   "source": [
    "# Intents left out of the analysis.\n",
    "err_intents = [\n",
    "    'thanks',\n",
    "    'add variation in loan',\n",
    "    'you are helpful',\n",
    "    'will you be my friend',\n",
    "    'it was nice talking to you',\n",
    "    'you are not very intelligent',\n",
    "    'complaints to hong leong bank',\n",
    "    'strong insult',\n",
    "    'add variation to loan',\n",
    "    'add as variation to loan',\n",
    "    'add as variations to loan',\n",
    "    'loan',\n",
    "    'you are stupid',\n",
    "    'compliment to hong leong bank',\n",
    "    'are you a real person',\n",
    "    'my name is',\n",
    "]\n",
    "\n",
    "# Stack both sources, discard exact duplicate rows, then remove the excluded intents.\n",
    "df = pd.concat([df_intents, df_man1]).drop_duplicates()\n",
    "is_excluded = df.intents.isin(err_intents)\n",
    "df = df[~is_excluded]\n",
    "df.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from sklearn import preprocessing\n",
    "\n",
    "# Fit the encoder on the distinct intent strings, then tag every row with its integer id.\n",
    "le = preprocessing.LabelEncoder()\n",
    "distinct_intents = df.intents.unique()\n",
    "Y = le.fit(distinct_intents).transform(distinct_intents)\n",
    "klasses = le.transform(df.intents)\n",
    "df['klass'] = klasses"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "5"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Minimum number of rows an intent needs in order to be kept.\n",
    "# TODO(review): record why 317 was chosen.\n",
    "MIN_ROWS_PER_INTENT = 317\n",
    "\n",
    "new_df = df.groupby('klass').filter(lambda grp: len(grp) >= MIN_ROWS_PER_INTENT)\n",
    "len(new_df.intents.unique())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn.pipeline import FeatureUnion\n",
    "\n",
    "# Word-level (1-3 gram) and character-level (2-3 gram) TF-IDF, joined into one feature space.\n",
    "word_tfidf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), max_features=5000, min_df=2)\n",
    "char_tfidf = TfidfVectorizer(analyzer='char', ngram_range=(2, 3), max_features=2000, min_df=2)\n",
    "combined_features = FeatureUnion([\n",
    "    ('tfidf_word', word_tfidf),\n",
    "    ('tfidf_char', char_tfidf),\n",
    "])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# NOTE(review): the vectorizers are fitted on the distinct intent names only,\n",
    "# not on the variations text -- confirm this is intended.\n",
    "intent_names = new_df.intents.unique()\n",
    "feats = combined_features.fit_transform(intent_names)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from sklearn.decomposition import TruncatedSVD\n",
    "\n",
    "# Project the TF-IDF features onto 5 SVD components (seed fixed at 0).\n",
    "svd = TruncatedSVD(random_state=0, n_components=5)\n",
    "svd_tfidf = svd.fit_transform(feats)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(5, 5)"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "svd_tfidf.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[t-SNE] Computing pairwise distances...\n",
      "[t-SNE] Computing 4 nearest neighbors...\n",
      "[t-SNE] Computed conditional probabilities for sample 5 / 5\n",
      "[t-SNE] Mean sigma: 1125899906842624.000000\n",
      "[t-SNE] KL divergence after 50 iterations with early exaggeration: 0.000000\n",
      "[t-SNE] Error after 75 iterations: 0.000000\n"
     ]
    }
   ],
   "source": [
    "from sklearn.manifold import TSNE\n",
    "\n",
    "# t-SNE's perplexity must stay below the number of samples. The default (30)\n",
    "# is far above the handful of rows in svd_tfidf, which makes the fit degenerate\n",
    "# and is rejected outright by newer scikit-learn releases, so scale it to the\n",
    "# data. Inputs with 91 or more rows still get the default of 30.\n",
    "n_samples = svd_tfidf.shape[0]\n",
    "perplexity = min(30.0, max(1.0, (n_samples - 1) / 3.0))\n",
    "tsne_model = TSNE(n_components=2, perplexity=perplexity, verbose=1, random_state=0)\n",
    "tsne_tfidf = tsne_model.fit_transform(svd_tfidf)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([  1.25864699e+00,  -5.27237063e-01,   5.63028098e-04,\n",
       "        -4.01380466e-02,   3.69076473e-01])"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "svd_tfidf[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "application/vnd.plotly.v1+json": {
       "data": [
        {
         "line": {
          "color": "peru"
         },
         "mode": "lines",
         "name": "Class0",
         "r": [
          1.258646994466373,
          -0.5272370631161158,
          0.0005630280977832204,
          -0.040138046581806364,
          0.36907647285274403,
          1.258646994466373
         ],
         "theta": [
          "A",
          "B",
          "C",
          "D",
          "E",
          "A"
         ],
         "type": "scatterpolar"
        },
        {
         "line": {
          "color": "darkviolet"
         },
         "mode": "lines",
         "name": "Class1",
         "r": [
          1.0924325581213414,
          -0.6051586209432138,
          -0.5654095049116606,
          -0.22005983412438146,
          -0.2688120358452568,
          1.0924325581213414
         ],
         "theta": [
          "A",
          "B",
          "C",
          "D",
          "E",
          "A"
         ],
         "type": "scatterpolar"
        },
        {
         "line": {
          "color": "deepskyblue"
         },
         "mode": "lines",
         "name": "Class2",
         "r": [
          0.7147397700108102,
          1.0648266100070478,
          -0.536169680109551,
          -0.24808075634355758,
          0.07917931705328059,
          0.7147397700108102
         ],
         "theta": [
          "A",
          "B",
          "C",
          "D",
          "E",
          "A"
         ],
         "type": "scatterpolar"
        },
        {
         "line": {
          "color": "orangered"
         },
         "mode": "lines",
         "name": "Class3",
         "r": [
          1.1969437099725184,
          0.27771654946833846,
          0.08524409660422959,
          0.688492701360346,
          -0.0943955373748415,
          1.1969437099725184
         ],
         "theta": [
          "A",
          "B",
          "C",
          "D",
          "E",
          "A"
         ],
         "type": "scatterpolar"
        },
        {
         "line": {
          "color": "green"
         },
         "mode": "lines",
         "name": "Class4",
         "r": [
          1.0055306877678314,
          0.22994357570786872,
          0.8932124277708919,
          -0.3538963759384709,
          -0.11385474044182772,
          1.0055306877678314
         ],
         "theta": [
          "A",
          "B",
          "C",
          "D",
          "E",
          "A"
         ],
         "type": "scatterpolar"
        }
       ],
       "layout": {
        "font": {
         "color": "#000",
         "family": "Arial, sans-serif;",
         "size": 12
        },
        "showlegend": false,
        "title": "TFIDF Visualization of Intents"
       }
      },
      "text/html": [
       "<div id=\"e8506ac5-b6c3-43a2-8f42-d5f6abcbe33a\" style=\"height: 525px; width: 100%;\" class=\"plotly-graph-div\"></div><script type=\"text/javascript\">require([\"plotly\"], function(Plotly) { window.PLOTLYENV=window.PLOTLYENV || {};window.PLOTLYENV.BASE_URL=\"https://plot.ly\";Plotly.newPlot(\"e8506ac5-b6c3-43a2-8f42-d5f6abcbe33a\", [{\"theta\": [\"A\", \"B\", \"C\", \"D\", \"E\", \"A\"], \"type\": \"scatterpolar\", \"line\": {\"color\": \"peru\"}, \"r\": [1.258646994466373, -0.5272370631161158, 0.0005630280977832204, -0.040138046581806364, 0.36907647285274403, 1.258646994466373], \"name\": \"Class0\", \"mode\": \"lines\"}, {\"theta\": [\"A\", \"B\", \"C\", \"D\", \"E\", \"A\"], \"type\": \"scatterpolar\", \"line\": {\"color\": \"darkviolet\"}, \"r\": [1.0924325581213414, -0.6051586209432138, -0.5654095049116606, -0.22005983412438146, -0.2688120358452568, 1.0924325581213414], \"name\": \"Class1\", \"mode\": \"lines\"}, {\"theta\": [\"A\", \"B\", \"C\", \"D\", \"E\", \"A\"], \"type\": \"scatterpolar\", \"line\": {\"color\": \"deepskyblue\"}, \"r\": [0.7147397700108102, 1.0648266100070478, -0.536169680109551, -0.24808075634355758, 0.07917931705328059, 0.7147397700108102], \"name\": \"Class2\", \"mode\": \"lines\"}, {\"theta\": [\"A\", \"B\", \"C\", \"D\", \"E\", \"A\"], \"type\": \"scatterpolar\", \"line\": {\"color\": \"orangered\"}, \"r\": [1.1969437099725184, 0.27771654946833846, 0.08524409660422959, 0.688492701360346, -0.0943955373748415, 1.1969437099725184], \"name\": \"Class3\", \"mode\": \"lines\"}, {\"theta\": [\"A\", \"B\", \"C\", \"D\", \"E\", \"A\"], \"type\": \"scatterpolar\", \"line\": {\"color\": \"green\"}, \"r\": [1.0055306877678314, 0.22994357570786872, 0.8932124277708919, -0.3538963759384709, -0.11385474044182772, 1.0055306877678314], \"name\": \"Class4\", \"mode\": \"lines\"}], {\"title\": \"TFIDF Visualization of Intents\", \"showlegend\": false, \"font\": {\"family\": \"Arial, sans-serif;\", \"size\": 12, \"color\": \"#000\"}}, 
{\"linkText\": \"Export to plot.ly\", \"showLink\": true})});</script>"
      ],
      "text/vnd.plotly.v1+html": [
       "<div id=\"e8506ac5-b6c3-43a2-8f42-d5f6abcbe33a\" style=\"height: 525px; width: 100%;\" class=\"plotly-graph-div\"></div><script type=\"text/javascript\">require([\"plotly\"], function(Plotly) { window.PLOTLYENV=window.PLOTLYENV || {};window.PLOTLYENV.BASE_URL=\"https://plot.ly\";Plotly.newPlot(\"e8506ac5-b6c3-43a2-8f42-d5f6abcbe33a\", [{\"theta\": [\"A\", \"B\", \"C\", \"D\", \"E\", \"A\"], \"type\": \"scatterpolar\", \"line\": {\"color\": \"peru\"}, \"r\": [1.258646994466373, -0.5272370631161158, 0.0005630280977832204, -0.040138046581806364, 0.36907647285274403, 1.258646994466373], \"name\": \"Class0\", \"mode\": \"lines\"}, {\"theta\": [\"A\", \"B\", \"C\", \"D\", \"E\", \"A\"], \"type\": \"scatterpolar\", \"line\": {\"color\": \"darkviolet\"}, \"r\": [1.0924325581213414, -0.6051586209432138, -0.5654095049116606, -0.22005983412438146, -0.2688120358452568, 1.0924325581213414], \"name\": \"Class1\", \"mode\": \"lines\"}, {\"theta\": [\"A\", \"B\", \"C\", \"D\", \"E\", \"A\"], \"type\": \"scatterpolar\", \"line\": {\"color\": \"deepskyblue\"}, \"r\": [0.7147397700108102, 1.0648266100070478, -0.536169680109551, -0.24808075634355758, 0.07917931705328059, 0.7147397700108102], \"name\": \"Class2\", \"mode\": \"lines\"}, {\"theta\": [\"A\", \"B\", \"C\", \"D\", \"E\", \"A\"], \"type\": \"scatterpolar\", \"line\": {\"color\": \"orangered\"}, \"r\": [1.1969437099725184, 0.27771654946833846, 0.08524409660422959, 0.688492701360346, -0.0943955373748415, 1.1969437099725184], \"name\": \"Class3\", \"mode\": \"lines\"}, {\"theta\": [\"A\", \"B\", \"C\", \"D\", \"E\", \"A\"], \"type\": \"scatterpolar\", \"line\": {\"color\": \"green\"}, \"r\": [1.0055306877678314, 0.22994357570786872, 0.8932124277708919, -0.3538963759384709, -0.11385474044182772, 1.0055306877678314], \"name\": \"Class4\", \"mode\": \"lines\"}], {\"title\": \"TFIDF Visualization of Intents\", \"showlegend\": false, \"font\": {\"family\": \"Arial, sans-serif;\", \"size\": 12, \"color\": \"#000\"}}, 
{\"linkText\": \"Export to plot.ly\", \"showLink\": true})});</script>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Radar chart: one closed line per row of svd_tfidf, one axis per SVD component.\n",
    "theta_list = ['A', 'B', 'C', 'D', 'E', 'A']  # first label repeated to close the polygon\n",
    "trace_colors = ['peru', 'darkviolet', 'deepskyblue', 'orangered', 'green']\n",
    "\n",
    "# Build the traces in one comprehension instead of five copy-pasted blocks.\n",
    "# zip() stops at the shorter input, so fewer than five rows no longer raises\n",
    "# IndexError and at most len(trace_colors) rows are drawn.\n",
    "data = [\n",
    "    go.Scatterpolar(\n",
    "        r=np.append(row, row[0]),  # repeat the first value to close the line\n",
    "        theta=theta_list,\n",
    "        mode='lines',\n",
    "        name='Class{}'.format(idx),\n",
    "        line=dict(color=color),\n",
    "    )\n",
    "    for idx, (row, color) in enumerate(zip(svd_tfidf, trace_colors))\n",
    "]\n",
    "\n",
    "layout = go.Layout(\n",
    "    title='TFIDF Visualization of Intents',\n",
    "    font=dict(family='Arial, sans-serif;', size=12, color='#000'),\n",
    "    showlegend=False,\n",
    ")\n",
    "\n",
    "fig = go.Figure(data=data, layout=layout)\n",
    "iplot(fig, filename='radar/tfidf')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python3 (QnA)",
   "language": "python",
   "name": "tensorflow"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
 }
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<script>requirejs.config({paths: { 'plotly': ['https://cdn.plot.ly/plotly-latest.min']},});if(!window.Plotly) {{require(['plotly'],function(plotly) {window.Plotly=plotly;});}}</script>"
	],
	"text/vnd.plotly.v1+html": [
	"<script>requirejs.config({paths: { 'plotly': ['https://cdn.plot.ly/plotly-latest.min']},});if(!window.Plotly) {{require(['plotly'],function(plotly) {window.Plotly=plotly;});}}</script>"
	]
	},
	"metadata": {},
	"output_type": "display_data"
	}
	],
	"source": [
	"import pandas as pd\n",
	"import numpy as np\n",
	"import plotly.plotly as py\n",
	"import plotly.graph_objs as go\n",
	"from plotly.offline import init_notebook_mode, plot, iplot\n",
	"init_notebook_mode(connected=True)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"# Load the cleaned intent sheet; the listed placeholder strings are read as NaN.\n",
	"df_intents = pd.read_excel('hlb_intents_cleaned.xlsx', header=0, names=['intents','variations'],\n",
	"                          na_values=[\"NA\", \"TBC\", \"Not Clear\", \"Not In Corpus\", \"\", \" \"])\n",
	"\n",
	"# Normalise both text columns: lower-case, drop punctuation, trim whitespace.\n",
	"# The pattern is a raw string so \\w and \\s reach the regex engine verbatim\n",
	"# instead of relying on Python's deprecated invalid-escape fallback.\n",
	"# NOTE(review): pandas >= 2.0 treats the pattern literally unless regex=True\n",
	"# is passed (keyword available from pandas 0.23) -- confirm the target version.\n",
	"df_intents.variations = df_intents.variations.str.lower().str.replace(r'[^\\w\\s]', '').str.strip()\n",
	"df_intents.intents = df_intents.intents.str.lower().str.replace(r'[^\\w\\s]', '').str.strip()\n",
	"\n",
	"# Drop exact duplicate (intent, variation) rows.\n",
	"df_intents = df_intents.drop_duplicates()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"<class 'pandas.core.frame.DataFrame'>\n",
	"Int64Index: 6723 entries, 0 to 6727\n",
	"Data columns (total 2 columns):\n",
	"intents 6723 non-null object\n",
	"variations 6723 non-null object\n",
	"dtypes: object(2)\n",
	"memory usage: 157.6+ KB\n"
	]
	}
	],
	"source": [
	"df_intents.info()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"# Load the manually marked results from sheet \"ManualMarkingRaw\" (columns C-G and V).\n",
	"# NOTE(review): `sheetname`/`parse_cols` are the older pandas spellings of\n",
	"# `sheet_name`/`usecols`; kept unchanged here -- confirm before upgrading pandas.\n",
	"df_man1 = pd.read_excel('WatsonAccuracyTrend.xlsx', sheetname=\"ManualMarkingRaw\", parse_cols=\"C,D,E,F,G,V\",\n",
	"                        header=0, names=['question','answer', 'key_question', 'confidence', 'ux', 'expected_kq'],\n",
	"                        na_values=[\"NA\", \"TBC\", \"Not Clear\", \"Not In Corpus\", \"\", \" \"]\n",
	"                       )\n",
	"\n",
	"# Normalise the text columns: lower-case, drop punctuation, trim whitespace.\n",
	"# Raw-string pattern: \\w and \\s are regex classes, not Python string escapes.\n",
	"# NOTE(review): pandas >= 2.0 needs regex=True here -- confirm the target version.\n",
	"df_man1.question = df_man1.question.str.lower().str.replace(r'[^\\w\\s]', '').str.strip()\n",
	"df_man1.expected_kq = df_man1.expected_kq.str.lower().str.replace(r'[^\\w\\s]', '').str.strip()\n",
	"\n",
	"# Keep (expected key question, question) pairs, de-duplicated, under the same\n",
	"# column names as df_intents, then drop rows with a missing value.\n",
	"df_man1 = df_man1[[\"expected_kq\", \"question\"]].drop_duplicates()\n",
	"df_man1.columns = ['intents', 'variations']\n",
	"df_man1 = df_man1.dropna()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"<class 'pandas.core.frame.DataFrame'>\n",
	"Int64Index: 10635 entries, 0 to 21958\n",
	"Data columns (total 2 columns):\n",
	"intents 10635 non-null object\n",
	"variations 10635 non-null object\n",
	"dtypes: object(2)\n",
	"memory usage: 249.3+ KB\n"
	]
	}
	],
	"source": [
	"df_man1.info()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"<class 'pandas.core.frame.DataFrame'>\n",
	"Int64Index: 16503 entries, 0 to 21958\n",
	"Data columns (total 2 columns):\n",
	"intents 16503 non-null object\n",
	"variations 16503 non-null object\n",
	"dtypes: object(2)\n",
	"memory usage: 386.8+ KB\n"
	]
	}
	],
	"source": [
	"# Intents left out of the analysis.\n",
	"err_intents = [\n",
	"    'thanks',\n",
	"    'add variation in loan',\n",
	"    'you are helpful',\n",
	"    'will you be my friend',\n",
	"    'it was nice talking to you',\n",
	"    'you are not very intelligent',\n",
	"    'complaints to hong leong bank',\n",
	"    'strong insult',\n",
	"    'add variation to loan',\n",
	"    'add as variation to loan',\n",
	"    'add as variations to loan',\n",
	"    'loan',\n",
	"    'you are stupid',\n",
	"    'compliment to hong leong bank',\n",
	"    'are you a real person',\n",
	"    'my name is',\n",
	"]\n",
	"\n",
	"# Stack both sources, discard exact duplicate rows, then remove the excluded intents.\n",
	"df = pd.concat([df_intents, df_man1]).drop_duplicates()\n",
	"is_excluded = df.intents.isin(err_intents)\n",
	"df = df[~is_excluded]\n",
	"df.info()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 7,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"from sklearn import preprocessing\n",
	"\n",
	"# Fit the encoder on the distinct intent strings, then tag every row with its integer id.\n",
	"le = preprocessing.LabelEncoder()\n",
	"distinct_intents = df.intents.unique()\n",
	"Y = le.fit(distinct_intents).transform(distinct_intents)\n",
	"klasses = le.transform(df.intents)\n",
	"df['klass'] = klasses"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 8,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"5"
	]
	},
	"execution_count": 8,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# Minimum number of rows an intent needs in order to be kept.\n",
	"# TODO(review): record why 317 was chosen.\n",
	"MIN_ROWS_PER_INTENT = 317\n",
	"\n",
	"new_df = df.groupby('klass').filter(lambda grp: len(grp) >= MIN_ROWS_PER_INTENT)\n",
	"len(new_df.intents.unique())"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 9,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"from sklearn.feature_extraction.text import TfidfVectorizer\n",
	"from sklearn.pipeline import FeatureUnion\n",
	"\n",
	"# Word-level (1-3 gram) and character-level (2-3 gram) TF-IDF, joined into one feature space.\n",
	"word_tfidf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), max_features=5000, min_df=2)\n",
	"char_tfidf = TfidfVectorizer(analyzer='char', ngram_range=(2, 3), max_features=2000, min_df=2)\n",
	"combined_features = FeatureUnion([\n",
	"    ('tfidf_word', word_tfidf),\n",
	"    ('tfidf_char', char_tfidf),\n",
	"])"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 10,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"# NOTE(review): the vectorizers are fitted on the distinct intent names only,\n",
	"# not on the variations text -- confirm this is intended.\n",
	"intent_names = new_df.intents.unique()\n",
	"feats = combined_features.fit_transform(intent_names)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 11,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"from sklearn.decomposition import TruncatedSVD\n",
	"\n",
	"# Project the TF-IDF features onto 5 SVD components (seed fixed at 0).\n",
	"svd = TruncatedSVD(random_state=0, n_components=5)\n",
	"svd_tfidf = svd.fit_transform(feats)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 12,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"(5, 5)"
	]
	},
	"execution_count": 12,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"svd_tfidf.shape"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 13,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"[t-SNE] Computing pairwise distances...\n",
	"[t-SNE] Computing 4 nearest neighbors...\n",
	"[t-SNE] Computed conditional probabilities for sample 5 / 5\n",
	"[t-SNE] Mean sigma: 1125899906842624.000000\n",
	"[t-SNE] KL divergence after 50 iterations with early exaggeration: 0.000000\n",
	"[t-SNE] Error after 75 iterations: 0.000000\n"
	]
	}
	],
	"source": [
	"from sklearn.manifold import TSNE\n",
	"\n",
	"# t-SNE's perplexity must stay below the number of samples. The default (30)\n",
	"# is far above the handful of rows in svd_tfidf, which makes the fit degenerate\n",
	"# and is rejected outright by newer scikit-learn releases, so scale it to the\n",
	"# data. Inputs with 91 or more rows still get the default of 30.\n",
	"n_samples = svd_tfidf.shape[0]\n",
	"perplexity = min(30.0, max(1.0, (n_samples - 1) / 3.0))\n",
	"tsne_model = TSNE(n_components=2, perplexity=perplexity, verbose=1, random_state=0)\n",
	"tsne_tfidf = tsne_model.fit_transform(svd_tfidf)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 14,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"array([ 1.25864699e+00, -5.27237063e-01, 5.63028098e-04,\n",
	" -4.01380466e-02, 3.69076473e-01])"
	]
	},
	"execution_count": 14,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"svd_tfidf[0]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 21,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"application/vnd.plotly.v1+json": {
	"data": [
	{
	"line": {
	"color": "peru"
	},
	"mode": "lines",
	"name": "Class0",
	"r": [
	1.258646994466373,
	-0.5272370631161158,
	0.0005630280977832204,
	-0.040138046581806364,
	0.36907647285274403,
	1.258646994466373
	],
	"theta": [
	"A",
	"B",
	"C",
	"D",
	"E",
	"A"
	],
	"type": "scatterpolar"
	},
	{
	"line": {
	"color": "darkviolet"
	},
	"mode": "lines",
	"name": "Class1",
	"r": [
	1.0924325581213414,
	-0.6051586209432138,
	-0.5654095049116606,
	-0.22005983412438146,
	-0.2688120358452568,
	1.0924325581213414
	],
	"theta": [
	"A",
	"B",
	"C",
	"D",
	"E",
	"A"
	],
	"type": "scatterpolar"
	},
	{
	"line": {
	"color": "deepskyblue"
	},
	"mode": "lines",
	"name": "Class2",
	"r": [
	0.7147397700108102,
	1.0648266100070478,
	-0.536169680109551,
	-0.24808075634355758,
	0.07917931705328059,
	0.7147397700108102
	],
	"theta": [
	"A",
	"B",
	"C",
	"D",
	"E",
	"A"
	],
	"type": "scatterpolar"
	},
	{
	"line": {
	"color": "orangered"
	},
	"mode": "lines",
	"name": "Class3",
	"r": [
	1.1969437099725184,
	0.27771654946833846,
	0.08524409660422959,
	0.688492701360346,
	-0.0943955373748415,
	1.1969437099725184
	],
	"theta": [
	"A",
	"B",
	"C",
	"D",
	"E",
	"A"
	],
	"type": "scatterpolar"
	},
	{
	"line": {
	"color": "green"
	},
	"mode": "lines",
	"name": "Class4",
	"r": [
	1.0055306877678314,
	0.22994357570786872,
	0.8932124277708919,
	-0.3538963759384709,
	-0.11385474044182772,
	1.0055306877678314
	],
	"theta": [
	"A",
	"B",
	"C",
	"D",
	"E",
	"A"
	],
	"type": "scatterpolar"
	}
	],
	"layout": {
	"font": {
	"color": "#000",
	"family": "Arial, sans-serif;",
	"size": 12
	},
	"showlegend": false,
	"title": "TFIDF Visualization of Intents"
	}
	},
	"text/html": [
	"<div id=\"e8506ac5-b6c3-43a2-8f42-d5f6abcbe33a\" style=\"height: 525px; width: 100%;\" class=\"plotly-graph-div\"></div><script type=\"text/javascript\">require([\"plotly\"], function(Plotly) { window.PLOTLYENV=window.PLOTLYENV \|\| {};window.PLOTLYENV.BASE_URL=\"https://plot.ly\";Plotly.newPlot(\"e8506ac5-b6c3-43a2-8f42-d5f6abcbe33a\", [{\"theta\": [\"A\", \"B\", \"C\", \"D\", \"E\", \"A\"], \"type\": \"scatterpolar\", \"line\": {\"color\": \"peru\"}, \"r\": [1.258646994466373, -0.5272370631161158, 0.0005630280977832204, -0.040138046581806364, 0.36907647285274403, 1.258646994466373], \"name\": \"Class0\", \"mode\": \"lines\"}, {\"theta\": [\"A\", \"B\", \"C\", \"D\", \"E\", \"A\"], \"type\": \"scatterpolar\", \"line\": {\"color\": \"darkviolet\"}, \"r\": [1.0924325581213414, -0.6051586209432138, -0.5654095049116606, -0.22005983412438146, -0.2688120358452568, 1.0924325581213414], \"name\": \"Class1\", \"mode\": \"lines\"}, {\"theta\": [\"A\", \"B\", \"C\", \"D\", \"E\", \"A\"], \"type\": \"scatterpolar\", \"line\": {\"color\": \"deepskyblue\"}, \"r\": [0.7147397700108102, 1.0648266100070478, -0.536169680109551, -0.24808075634355758, 0.07917931705328059, 0.7147397700108102], \"name\": \"Class2\", \"mode\": \"lines\"}, {\"theta\": [\"A\", \"B\", \"C\", \"D\", \"E\", \"A\"], \"type\": \"scatterpolar\", \"line\": {\"color\": \"orangered\"}, \"r\": [1.1969437099725184, 0.27771654946833846, 0.08524409660422959, 0.688492701360346, -0.0943955373748415, 1.1969437099725184], \"name\": \"Class3\", \"mode\": \"lines\"}, {\"theta\": [\"A\", \"B\", \"C\", \"D\", \"E\", \"A\"], \"type\": \"scatterpolar\", \"line\": {\"color\": \"green\"}, \"r\": [1.0055306877678314, 0.22994357570786872, 0.8932124277708919, -0.3538963759384709, -0.11385474044182772, 1.0055306877678314], \"name\": \"Class4\", \"mode\": \"lines\"}], {\"title\": \"TFIDF Visualization of Intents\", \"showlegend\": false, \"font\": {\"family\": \"Arial, sans-serif;\", \"size\": 12, \"color\": \"#000\"}}, 
{\"linkText\": \"Export to plot.ly\", \"showLink\": true})});</script>"
	],
	"text/vnd.plotly.v1+html": [
	"<div id=\"e8506ac5-b6c3-43a2-8f42-d5f6abcbe33a\" style=\"height: 525px; width: 100%;\" class=\"plotly-graph-div\"></div><script type=\"text/javascript\">require([\"plotly\"], function(Plotly) { window.PLOTLYENV=window.PLOTLYENV \|\| {};window.PLOTLYENV.BASE_URL=\"https://plot.ly\";Plotly.newPlot(\"e8506ac5-b6c3-43a2-8f42-d5f6abcbe33a\", [{\"theta\": [\"A\", \"B\", \"C\", \"D\", \"E\", \"A\"], \"type\": \"scatterpolar\", \"line\": {\"color\": \"peru\"}, \"r\": [1.258646994466373, -0.5272370631161158, 0.0005630280977832204, -0.040138046581806364, 0.36907647285274403, 1.258646994466373], \"name\": \"Class0\", \"mode\": \"lines\"}, {\"theta\": [\"A\", \"B\", \"C\", \"D\", \"E\", \"A\"], \"type\": \"scatterpolar\", \"line\": {\"color\": \"darkviolet\"}, \"r\": [1.0924325581213414, -0.6051586209432138, -0.5654095049116606, -0.22005983412438146, -0.2688120358452568, 1.0924325581213414], \"name\": \"Class1\", \"mode\": \"lines\"}, {\"theta\": [\"A\", \"B\", \"C\", \"D\", \"E\", \"A\"], \"type\": \"scatterpolar\", \"line\": {\"color\": \"deepskyblue\"}, \"r\": [0.7147397700108102, 1.0648266100070478, -0.536169680109551, -0.24808075634355758, 0.07917931705328059, 0.7147397700108102], \"name\": \"Class2\", \"mode\": \"lines\"}, {\"theta\": [\"A\", \"B\", \"C\", \"D\", \"E\", \"A\"], \"type\": \"scatterpolar\", \"line\": {\"color\": \"orangered\"}, \"r\": [1.1969437099725184, 0.27771654946833846, 0.08524409660422959, 0.688492701360346, -0.0943955373748415, 1.1969437099725184], \"name\": \"Class3\", \"mode\": \"lines\"}, {\"theta\": [\"A\", \"B\", \"C\", \"D\", \"E\", \"A\"], \"type\": \"scatterpolar\", \"line\": {\"color\": \"green\"}, \"r\": [1.0055306877678314, 0.22994357570786872, 0.8932124277708919, -0.3538963759384709, -0.11385474044182772, 1.0055306877678314], \"name\": \"Class4\", \"mode\": \"lines\"}], {\"title\": \"TFIDF Visualization of Intents\", \"showlegend\": false, \"font\": {\"family\": \"Arial, sans-serif;\", \"size\": 12, \"color\": \"#000\"}}, 
{\"linkText\": \"Export to plot.ly\", \"showLink\": true})});</script>"
	]
	},
	"metadata": {},
	"output_type": "display_data"
	}
	],
	"source": [
	"# Radar chart: one closed line per row of svd_tfidf, one axis per SVD component.\n",
	"theta_list = ['A', 'B', 'C', 'D', 'E', 'A']  # first label repeated to close the polygon\n",
	"trace_colors = ['peru', 'darkviolet', 'deepskyblue', 'orangered', 'green']\n",
	"\n",
	"# Build the traces in one comprehension instead of five copy-pasted blocks.\n",
	"# zip() stops at the shorter input, so fewer than five rows no longer raises\n",
	"# IndexError and at most len(trace_colors) rows are drawn.\n",
	"data = [\n",
	"    go.Scatterpolar(\n",
	"        r=np.append(row, row[0]),  # repeat the first value to close the line\n",
	"        theta=theta_list,\n",
	"        mode='lines',\n",
	"        name='Class{}'.format(idx),\n",
	"        line=dict(color=color),\n",
	"    )\n",
	"    for idx, (row, color) in enumerate(zip(svd_tfidf, trace_colors))\n",
	"]\n",
	"\n",
	"layout = go.Layout(\n",
	"    title='TFIDF Visualization of Intents',\n",
	"    font=dict(family='Arial, sans-serif;', size=12, color='#000'),\n",
	"    showlegend=False,\n",
	")\n",
	"\n",
	"fig = go.Figure(data=data, layout=layout)\n",
	"iplot(fig, filename='radar/tfidf')"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": []
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python3 (QnA)",
	"language": "python",
	"name": "tensorflow"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.5.2"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 2
	}