Last active
March 26, 2018 13:16
-
-
Save AashishTiwari/f1fa37ef3a282fff055570b3f81f468e to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<script>requirejs.config({paths: { 'plotly': ['https://cdn.plot.ly/plotly-latest.min']},});if(!window.Plotly) {{require(['plotly'],function(plotly) {window.Plotly=plotly;});}}</script>" | |
], | |
"text/vnd.plotly.v1+html": [ | |
"<script>requirejs.config({paths: { 'plotly': ['https://cdn.plot.ly/plotly-latest.min']},});if(!window.Plotly) {{require(['plotly'],function(plotly) {window.Plotly=plotly;});}}</script>" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"import pandas as pd\n", | |
"import numpy as np\n", | |
"import plotly.plotly as py\n", | |
"import plotly.graph_objs as go\n", | |
"from plotly.offline import init_notebook_mode, plot, iplot\n", | |
"init_notebook_mode(connected=True)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# Load the intent/variation sheet; the placeholder markers below are read as NaN.\n", | |
"df_intents = pd.read_excel('hlb_intents_cleaned.xlsx', header=0, names=['intents', 'variations'],\n", | |
"                           na_values=[\"NA\", \"TBC\", \"Not Clear\", \"Not In Corpus\", \"\", \" \"])\n", | |
"\n", | |
"# Normalise both text columns: lower-case, drop punctuation, trim whitespace.\n", | |
"# The pattern is a raw string (no invalid-escape warnings) and regex=True is explicit,\n", | |
"# because pandas 2.0+ treats the pattern as a literal string by default and would\n", | |
"# silently leave the punctuation in place.\n", | |
"for col in ('variations', 'intents'):\n", | |
"    df_intents[col] = (df_intents[col].str.lower()\n", | |
"                       .str.replace(r'[^\\w\\s]', '', regex=True)\n", | |
"                       .str.strip())\n", | |
"\n", | |
"# Rebind instead of inplace=True so the cell reads as a plain data-flow step.\n", | |
"df_intents = df_intents.drop_duplicates()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"<class 'pandas.core.frame.DataFrame'>\n", | |
"Int64Index: 6723 entries, 0 to 6727\n", | |
"Data columns (total 2 columns):\n", | |
"intents 6723 non-null object\n", | |
"variations 6723 non-null object\n", | |
"dtypes: object(2)\n", | |
"memory usage: 157.6+ KB\n" | |
] | |
} | |
], | |
"source": [ | |
"df_intents.info()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# Read columns C-G and V of the manual-marking sheet.\n", | |
"# `sheet_name` / `usecols` replace the removed `sheetname` / `parse_cols` keywords.\n", | |
"df_man1 = pd.read_excel('WatsonAccuracyTrend.xlsx', sheet_name=\"ManualMarkingRaw\", usecols=\"C,D,E,F,G,V\",\n", | |
"                        header=0, names=['question', 'answer', 'key_question', 'confidence', 'ux', 'expected_kq'],\n", | |
"                        na_values=[\"NA\", \"TBC\", \"Not Clear\", \"Not In Corpus\", \"\", \" \"])\n", | |
"\n", | |
"# Normalise the two text columns used downstream: lower-case, drop punctuation, trim.\n", | |
"# regex=True is explicit because pandas 2.0+ matches the pattern literally by default.\n", | |
"for col in ('question', 'expected_kq'):\n", | |
"    df_man1[col] = (df_man1[col].str.lower()\n", | |
"                    .str.replace(r'[^\\w\\s]', '', regex=True)\n", | |
"                    .str.strip())\n", | |
"\n", | |
"# Chain the reshaping steps rather than mutating a column slice in place, which\n", | |
"# avoids pandas' SettingWithCopyWarning. Order matches the original: de-duplicate,\n", | |
"# rename to the (intents, variations) schema, then drop rows with missing values.\n", | |
"df_man1 = (df_man1[['expected_kq', 'question']]\n", | |
"           .drop_duplicates()\n", | |
"           .rename(columns={'expected_kq': 'intents', 'question': 'variations'})\n", | |
"           .dropna())" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"<class 'pandas.core.frame.DataFrame'>\n", | |
"Int64Index: 10635 entries, 0 to 21958\n", | |
"Data columns (total 2 columns):\n", | |
"intents 10635 non-null object\n", | |
"variations 10635 non-null object\n", | |
"dtypes: object(2)\n", | |
"memory usage: 249.3+ KB\n" | |
] | |
} | |
], | |
"source": [ | |
"df_man1.info()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"<class 'pandas.core.frame.DataFrame'>\n", | |
"Int64Index: 16503 entries, 0 to 21958\n", | |
"Data columns (total 2 columns):\n", | |
"intents 16503 non-null object\n", | |
"variations 16503 non-null object\n", | |
"dtypes: object(2)\n", | |
"memory usage: 386.8+ KB\n" | |
] | |
} | |
], | |
"source": [ | |
"# Intent labels that are excluded from the combined corpus.\n", | |
"err_intents = [\n", | |
"    'thanks',\n", | |
"    'add variation in loan',\n", | |
"    'you are helpful',\n", | |
"    'will you be my friend',\n", | |
"    'it was nice talking to you',\n", | |
"    'you are not very intelligent',\n", | |
"    'complaints to hong leong bank',\n", | |
"    'strong insult',\n", | |
"    'add variation to loan',\n", | |
"    'add as variation to loan',\n", | |
"    'add as variations to loan',\n", | |
"    'loan',\n", | |
"    'you are stupid',\n", | |
"    'compliment to hong leong bank',\n", | |
"    'are you a real person',\n", | |
"    'my name is',\n", | |
"]\n", | |
"\n", | |
"# Stack both sources, drop exact duplicate rows, then remove the excluded intents.\n", | |
"df = pd.concat([df_intents, df_man1]).drop_duplicates()\n", | |
"df = df[~df['intents'].isin(err_intents)]\n", | |
"df.info()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"from sklearn import preprocessing\n", | |
"\n", | |
"le = preprocessing.LabelEncoder()\n", | |
"# Fit on the distinct intent labels; Y holds their integer codes.\n", | |
"Y = le.fit_transform(df.intents.unique())\n", | |
"klasses = le.transform(df.intents)\n", | |
"# `df` was produced by boolean filtering, so `df['klass'] = ...` would emit pandas'\n", | |
"# SettingWithCopyWarning; assign() builds a new frame with the extra column instead.\n", | |
"df = df.assign(klass=klasses)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"5" | |
] | |
}, | |
"execution_count": 8, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Keep only the classes that have at least this many rows.\n", | |
"MIN_ROWS_PER_CLASS = 317\n", | |
"new_df = df.groupby('klass').filter(lambda grp: len(grp) >= MIN_ROWS_PER_CLASS)\n", | |
"# Number of distinct intents that survive the filter.\n", | |
"len(new_df.intents.unique())" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"from sklearn.feature_extraction.text import TfidfVectorizer\n", | |
"from sklearn.pipeline import FeatureUnion\n", | |
"\n", | |
"# Word-level TF-IDF over 1- to 3-grams (max 5000 features, min_df=2).\n", | |
"word_tfidf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), max_features=5000, min_df=2)\n", | |
"# Character-level TF-IDF over 2- and 3-grams (max 2000 features, min_df=2).\n", | |
"char_tfidf = TfidfVectorizer(analyzer='char', ngram_range=(2, 3), max_features=2000, min_df=2)\n", | |
"\n", | |
"# Concatenate both feature spaces side by side.\n", | |
"combined_features = FeatureUnion([\n", | |
"    (\"tfidf_word\", word_tfidf),\n", | |
"    (\"tfidf_char\", char_tfidf),\n", | |
"])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# Fit the combined TF-IDF extractors on the distinct intent labels and vectorise them.\n", | |
"intent_labels = new_df.intents.unique()\n", | |
"feats = combined_features.fit_transform(intent_labels)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"from sklearn.decomposition import TruncatedSVD\n", | |
"\n", | |
"# Number of latent components kept by the SVD projection.\n", | |
"N_SVD_COMPONENTS = 5\n", | |
"\n", | |
"# Seeded so the projection is reproducible across runs.\n", | |
"svd = TruncatedSVD(n_components=N_SVD_COMPONENTS, random_state=0)\n", | |
"svd_tfidf = svd.fit_transform(feats)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"(5, 5)" | |
] | |
}, | |
"execution_count": 12, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"svd_tfidf.shape" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"[t-SNE] Computing pairwise distances...\n", | |
"[t-SNE] Computing 4 nearest neighbors...\n", | |
"[t-SNE] Computed conditional probabilities for sample 5 / 5\n", | |
"[t-SNE] Mean sigma: 1125899906842624.000000\n", | |
"[t-SNE] KL divergence after 50 iterations with early exaggeration: 0.000000\n", | |
"[t-SNE] Error after 75 iterations: 0.000000\n" | |
] | |
} | |
], | |
"source": [ | |
"from sklearn.manifold import TSNE\n", | |
"\n", | |
"# The default perplexity (30) is larger than the number of rows when only a handful of\n", | |
"# intents are embedded: older scikit-learn then yields a degenerate fit and newer\n", | |
"# releases reject it outright. Cap it just below the sample count.\n", | |
"tsne_perplexity = min(30, max(1, svd_tfidf.shape[0] - 1))\n", | |
"tsne_model = TSNE(n_components=2, perplexity=tsne_perplexity, verbose=1, random_state=0)\n", | |
"tsne_tfidf = tsne_model.fit_transform(svd_tfidf)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"array([ 1.25864699e+00, -5.27237063e-01, 5.63028098e-04,\n", | |
" -4.01380466e-02, 3.69076473e-01])" | |
] | |
}, | |
"execution_count": 14, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"svd_tfidf[0]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 21, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"application/vnd.plotly.v1+json": { | |
"data": [ | |
{ | |
"line": { | |
"color": "peru" | |
}, | |
"mode": "lines", | |
"name": "Class0", | |
"r": [ | |
1.258646994466373, | |
-0.5272370631161158, | |
0.0005630280977832204, | |
-0.040138046581806364, | |
0.36907647285274403, | |
1.258646994466373 | |
], | |
"theta": [ | |
"A", | |
"B", | |
"C", | |
"D", | |
"E", | |
"A" | |
], | |
"type": "scatterpolar" | |
}, | |
{ | |
"line": { | |
"color": "darkviolet" | |
}, | |
"mode": "lines", | |
"name": "Class1", | |
"r": [ | |
1.0924325581213414, | |
-0.6051586209432138, | |
-0.5654095049116606, | |
-0.22005983412438146, | |
-0.2688120358452568, | |
1.0924325581213414 | |
], | |
"theta": [ | |
"A", | |
"B", | |
"C", | |
"D", | |
"E", | |
"A" | |
], | |
"type": "scatterpolar" | |
}, | |
{ | |
"line": { | |
"color": "deepskyblue" | |
}, | |
"mode": "lines", | |
"name": "Class2", | |
"r": [ | |
0.7147397700108102, | |
1.0648266100070478, | |
-0.536169680109551, | |
-0.24808075634355758, | |
0.07917931705328059, | |
0.7147397700108102 | |
], | |
"theta": [ | |
"A", | |
"B", | |
"C", | |
"D", | |
"E", | |
"A" | |
], | |
"type": "scatterpolar" | |
}, | |
{ | |
"line": { | |
"color": "orangered" | |
}, | |
"mode": "lines", | |
"name": "Class3", | |
"r": [ | |
1.1969437099725184, | |
0.27771654946833846, | |
0.08524409660422959, | |
0.688492701360346, | |
-0.0943955373748415, | |
1.1969437099725184 | |
], | |
"theta": [ | |
"A", | |
"B", | |
"C", | |
"D", | |
"E", | |
"A" | |
], | |
"type": "scatterpolar" | |
}, | |
{ | |
"line": { | |
"color": "green" | |
}, | |
"mode": "lines", | |
"name": "Class4", | |
"r": [ | |
1.0055306877678314, | |
0.22994357570786872, | |
0.8932124277708919, | |
-0.3538963759384709, | |
-0.11385474044182772, | |
1.0055306877678314 | |
], | |
"theta": [ | |
"A", | |
"B", | |
"C", | |
"D", | |
"E", | |
"A" | |
], | |
"type": "scatterpolar" | |
} | |
], | |
"layout": { | |
"font": { | |
"color": "#000", | |
"family": "Arial, sans-serif;", | |
"size": 12 | |
}, | |
"showlegend": false, | |
"title": "TFIDF Visualization of Intents" | |
} | |
}, | |
"text/html": [ | |
"<div id=\"e8506ac5-b6c3-43a2-8f42-d5f6abcbe33a\" style=\"height: 525px; width: 100%;\" class=\"plotly-graph-div\"></div><script type=\"text/javascript\">require([\"plotly\"], function(Plotly) { window.PLOTLYENV=window.PLOTLYENV || {};window.PLOTLYENV.BASE_URL=\"https://plot.ly\";Plotly.newPlot(\"e8506ac5-b6c3-43a2-8f42-d5f6abcbe33a\", [{\"theta\": [\"A\", \"B\", \"C\", \"D\", \"E\", \"A\"], \"type\": \"scatterpolar\", \"line\": {\"color\": \"peru\"}, \"r\": [1.258646994466373, -0.5272370631161158, 0.0005630280977832204, -0.040138046581806364, 0.36907647285274403, 1.258646994466373], \"name\": \"Class0\", \"mode\": \"lines\"}, {\"theta\": [\"A\", \"B\", \"C\", \"D\", \"E\", \"A\"], \"type\": \"scatterpolar\", \"line\": {\"color\": \"darkviolet\"}, \"r\": [1.0924325581213414, -0.6051586209432138, -0.5654095049116606, -0.22005983412438146, -0.2688120358452568, 1.0924325581213414], \"name\": \"Class1\", \"mode\": \"lines\"}, {\"theta\": [\"A\", \"B\", \"C\", \"D\", \"E\", \"A\"], \"type\": \"scatterpolar\", \"line\": {\"color\": \"deepskyblue\"}, \"r\": [0.7147397700108102, 1.0648266100070478, -0.536169680109551, -0.24808075634355758, 0.07917931705328059, 0.7147397700108102], \"name\": \"Class2\", \"mode\": \"lines\"}, {\"theta\": [\"A\", \"B\", \"C\", \"D\", \"E\", \"A\"], \"type\": \"scatterpolar\", \"line\": {\"color\": \"orangered\"}, \"r\": [1.1969437099725184, 0.27771654946833846, 0.08524409660422959, 0.688492701360346, -0.0943955373748415, 1.1969437099725184], \"name\": \"Class3\", \"mode\": \"lines\"}, {\"theta\": [\"A\", \"B\", \"C\", \"D\", \"E\", \"A\"], \"type\": \"scatterpolar\", \"line\": {\"color\": \"green\"}, \"r\": [1.0055306877678314, 0.22994357570786872, 0.8932124277708919, -0.3538963759384709, -0.11385474044182772, 1.0055306877678314], \"name\": \"Class4\", \"mode\": \"lines\"}], {\"title\": \"TFIDF Visualization of Intents\", \"showlegend\": false, \"font\": {\"family\": \"Arial, sans-serif;\", \"size\": 12, \"color\": \"#000\"}}, {\"linkText\": 
\"Export to plot.ly\", \"showLink\": true})});</script>" | |
], | |
"text/vnd.plotly.v1+html": [ | |
"<div id=\"e8506ac5-b6c3-43a2-8f42-d5f6abcbe33a\" style=\"height: 525px; width: 100%;\" class=\"plotly-graph-div\"></div><script type=\"text/javascript\">require([\"plotly\"], function(Plotly) { window.PLOTLYENV=window.PLOTLYENV || {};window.PLOTLYENV.BASE_URL=\"https://plot.ly\";Plotly.newPlot(\"e8506ac5-b6c3-43a2-8f42-d5f6abcbe33a\", [{\"theta\": [\"A\", \"B\", \"C\", \"D\", \"E\", \"A\"], \"type\": \"scatterpolar\", \"line\": {\"color\": \"peru\"}, \"r\": [1.258646994466373, -0.5272370631161158, 0.0005630280977832204, -0.040138046581806364, 0.36907647285274403, 1.258646994466373], \"name\": \"Class0\", \"mode\": \"lines\"}, {\"theta\": [\"A\", \"B\", \"C\", \"D\", \"E\", \"A\"], \"type\": \"scatterpolar\", \"line\": {\"color\": \"darkviolet\"}, \"r\": [1.0924325581213414, -0.6051586209432138, -0.5654095049116606, -0.22005983412438146, -0.2688120358452568, 1.0924325581213414], \"name\": \"Class1\", \"mode\": \"lines\"}, {\"theta\": [\"A\", \"B\", \"C\", \"D\", \"E\", \"A\"], \"type\": \"scatterpolar\", \"line\": {\"color\": \"deepskyblue\"}, \"r\": [0.7147397700108102, 1.0648266100070478, -0.536169680109551, -0.24808075634355758, 0.07917931705328059, 0.7147397700108102], \"name\": \"Class2\", \"mode\": \"lines\"}, {\"theta\": [\"A\", \"B\", \"C\", \"D\", \"E\", \"A\"], \"type\": \"scatterpolar\", \"line\": {\"color\": \"orangered\"}, \"r\": [1.1969437099725184, 0.27771654946833846, 0.08524409660422959, 0.688492701360346, -0.0943955373748415, 1.1969437099725184], \"name\": \"Class3\", \"mode\": \"lines\"}, {\"theta\": [\"A\", \"B\", \"C\", \"D\", \"E\", \"A\"], \"type\": \"scatterpolar\", \"line\": {\"color\": \"green\"}, \"r\": [1.0055306877678314, 0.22994357570786872, 0.8932124277708919, -0.3538963759384709, -0.11385474044182772, 1.0055306877678314], \"name\": \"Class4\", \"mode\": \"lines\"}], {\"title\": \"TFIDF Visualization of Intents\", \"showlegend\": false, \"font\": {\"family\": \"Arial, sans-serif;\", \"size\": 12, \"color\": \"#000\"}}, {\"linkText\": 
\"Export to plot.ly\", \"showLink\": true})});</script>" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"# Line colours, reused cyclically if there are more rows than colours.\n", | |
"trace_colors = ['peru', 'darkviolet', 'deepskyblue', 'orangered', 'green']\n", | |
"\n", | |
"# One angular label per SVD component ('A', 'B', ...); the first label is repeated at\n", | |
"# the end so that every polygon is drawn closed.\n", | |
"axis_labels = [chr(ord('A') + i) for i in range(svd_tfidf.shape[1])]\n", | |
"theta_list = axis_labels + axis_labels[:1]\n", | |
"\n", | |
"# One closed radar trace per row of `svd_tfidf`, named Class0, Class1, ...\n", | |
"data = [\n", | |
"    go.Scatterpolar(\n", | |
"        r=np.append(row, row[0]),\n", | |
"        theta=theta_list,\n", | |
"        mode='lines',\n", | |
"        name='Class{}'.format(idx),\n", | |
"        line=dict(color=trace_colors[idx % len(trace_colors)])\n", | |
"    )\n", | |
"    for idx, row in enumerate(svd_tfidf)\n", | |
"]\n", | |
"\n", | |
"layout = go.Layout(\n", | |
"    title='TFIDF Visualization of Intents',\n", | |
"    font=dict(\n", | |
"        # Font list only: the original value ended in a stray ';'.\n", | |
"        family='Arial, sans-serif',\n", | |
"        size=12,\n", | |
"        color='#000'\n", | |
"    ),\n", | |
"    showlegend=False\n", | |
")\n", | |
"\n", | |
"fig = go.Figure(data=data, layout=layout)\n", | |
"iplot(fig, filename=\"radar/tfidf\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python3 (QnA)", | |
"language": "python", | |
"name": "tensorflow" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.5.2" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment