Skip to content

Instantly share code, notes, and snippets.

@AashishTiwari
Last active March 26, 2018 13:16
Show Gist options
  • Save AashishTiwari/f1fa37ef3a282fff055570b3f81f468e to your computer and use it in GitHub Desktop.
Save AashishTiwari/f1fa37ef3a282fff055570b3f81f468e to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<script>requirejs.config({paths: { 'plotly': ['https://cdn.plot.ly/plotly-latest.min']},});if(!window.Plotly) {{require(['plotly'],function(plotly) {window.Plotly=plotly;});}}</script>"
],
"text/vnd.plotly.v1+html": [
"<script>requirejs.config({paths: { 'plotly': ['https://cdn.plot.ly/plotly-latest.min']},});if(!window.Plotly) {{require(['plotly'],function(plotly) {window.Plotly=plotly;});}}</script>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import plotly.plotly as py\n",
"import plotly.graph_objs as go\n",
"from plotly.offline import init_notebook_mode, plot, iplot\n",
"init_notebook_mode(connected=True)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"df_intents = pd.read_excel('hlb_intents_cleaned.xlsx', header=0, names=['intents','variations'],\n",
" na_values=[\"NA\", \"TBC\", \"Not Clear\", \"Not In Corpus\", \"\", \" \"])\n",
"\n",
"\n",
"df_intents.variations = df_intents.variations.str.lower()\n",
"df_intents.variations = df_intents.variations.str.replace('[^\\w\\s]','')\n",
"df_intents.variations = df_intents.variations.str.strip()\n",
"\n",
"df_intents.intents = df_intents.intents.str.lower()\n",
"df_intents.intents = df_intents.intents.str.replace('[^\\w\\s]','')\n",
"df_intents.intents = df_intents.intents.str.strip()\n",
"\n",
"df_intents.drop_duplicates(inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"Int64Index: 6723 entries, 0 to 6727\n",
"Data columns (total 2 columns):\n",
"intents 6723 non-null object\n",
"variations 6723 non-null object\n",
"dtypes: object(2)\n",
"memory usage: 157.6+ KB\n"
]
}
],
"source": [
"df_intents.info()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"df_man1 = pd.read_excel('WatsonAccuracyTrend.xlsx', sheetname=\"ManualMarkingRaw\", parse_cols=\"C,D,E,F,G,V\",\n",
" header=0, names=['question','answer', 'key_question', 'confidence', 'ux', 'expected_kq'],\n",
" na_values=[\"NA\", \"TBC\", \"Not Clear\", \"Not In Corpus\", \"\", \" \"]\n",
" )\n",
"\n",
"df_man1.question = df_man1.question.str.lower()\n",
"df_man1.question = df_man1.question.str.replace('[^\\w\\s]','')\n",
"df_man1.question = df_man1.question.str.strip()\n",
"\n",
"df_man1.expected_kq = df_man1.expected_kq.str.lower()\n",
"df_man1.expected_kq = df_man1.expected_kq.str.replace('[^\\w\\s]','')\n",
"df_man1.expected_kq = df_man1.expected_kq.str.strip()\n",
"\n",
"df_man1 = df_man1[[\"expected_kq\", \"question\"]]\n",
"df_man1.drop_duplicates(inplace=True)\n",
"df_man1.columns = ['intents', 'variations']\n",
"df_man1.dropna(inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"Int64Index: 10635 entries, 0 to 21958\n",
"Data columns (total 2 columns):\n",
"intents 10635 non-null object\n",
"variations 10635 non-null object\n",
"dtypes: object(2)\n",
"memory usage: 249.3+ KB\n"
]
}
],
"source": [
"df_man1.info()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"Int64Index: 16503 entries, 0 to 21958\n",
"Data columns (total 2 columns):\n",
"intents 16503 non-null object\n",
"variations 16503 non-null object\n",
"dtypes: object(2)\n",
"memory usage: 386.8+ KB\n"
]
}
],
"source": [
"df = pd.concat([df_intents, df_man1])\n",
"df.drop_duplicates(inplace=True)\n",
"\n",
"err_intents = ['thanks', 'add variation in loan', 'you are helpful', 'will you be my friend', 'it was nice talking to you',\n",
" 'you are not very intelligent', 'complaints to hong leong bank', 'strong insult', 'add variation to loan',\n",
" 'add as variation to loan', 'add as variations to loan', 'loan', 'you are stupid', 'compliment to hong leong bank',\n",
" 'are you a real person', 'my name is'\n",
" ]\n",
"\n",
"df = df[~df.intents.isin(err_intents)]\n",
"df.info()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from sklearn import preprocessing\n",
"le = preprocessing.LabelEncoder()\n",
"Y = le.fit_transform(df.intents.unique())\n",
"klasses = le.transform(df.intents)\n",
"df['klass'] = klasses"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"5"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"new_df = df.groupby('klass').filter(lambda x: len(x) >= 317)\n",
"len(new_df.intents.unique())"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"from sklearn.pipeline import FeatureUnion\n",
"combined_features = FeatureUnion([\n",
" (\"tfidf_word\", TfidfVectorizer(analyzer='word', ngram_range=(1,3), max_features=5000, min_df=2 )), \n",
" (\"tfidf_char\", TfidfVectorizer(analyzer='char', ngram_range=(2,3), max_features=2000, min_df=2 ))\n",
"])"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"feats = combined_features.fit_transform(new_df.intents.unique())"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from sklearn.decomposition import TruncatedSVD\n",
"svd = TruncatedSVD(n_components=5, random_state=0)\n",
"svd_tfidf = svd.fit_transform(feats)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"(5, 5)"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"svd_tfidf.shape"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[t-SNE] Computing pairwise distances...\n",
"[t-SNE] Computing 4 nearest neighbors...\n",
"[t-SNE] Computed conditional probabilities for sample 5 / 5\n",
"[t-SNE] Mean sigma: 1125899906842624.000000\n",
"[t-SNE] KL divergence after 50 iterations with early exaggeration: 0.000000\n",
"[t-SNE] Error after 75 iterations: 0.000000\n"
]
}
],
"source": [
"from sklearn.manifold import TSNE\n",
"tsne_model = TSNE(n_components=2, verbose=1, random_state=0)\n",
"tsne_tfidf = tsne_model.fit_transform(svd_tfidf)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([ 1.25864699e+00, -5.27237063e-01, 5.63028098e-04,\n",
" -4.01380466e-02, 3.69076473e-01])"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"svd_tfidf[0]"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"application/vnd.plotly.v1+json": {
"data": [
{
"line": {
"color": "peru"
},
"mode": "lines",
"name": "Class0",
"r": [
1.258646994466373,
-0.5272370631161158,
0.0005630280977832204,
-0.040138046581806364,
0.36907647285274403,
1.258646994466373
],
"theta": [
"A",
"B",
"C",
"D",
"E",
"A"
],
"type": "scatterpolar"
},
{
"line": {
"color": "darkviolet"
},
"mode": "lines",
"name": "Class1",
"r": [
1.0924325581213414,
-0.6051586209432138,
-0.5654095049116606,
-0.22005983412438146,
-0.2688120358452568,
1.0924325581213414
],
"theta": [
"A",
"B",
"C",
"D",
"E",
"A"
],
"type": "scatterpolar"
},
{
"line": {
"color": "deepskyblue"
},
"mode": "lines",
"name": "Class2",
"r": [
0.7147397700108102,
1.0648266100070478,
-0.536169680109551,
-0.24808075634355758,
0.07917931705328059,
0.7147397700108102
],
"theta": [
"A",
"B",
"C",
"D",
"E",
"A"
],
"type": "scatterpolar"
},
{
"line": {
"color": "orangered"
},
"mode": "lines",
"name": "Class3",
"r": [
1.1969437099725184,
0.27771654946833846,
0.08524409660422959,
0.688492701360346,
-0.0943955373748415,
1.1969437099725184
],
"theta": [
"A",
"B",
"C",
"D",
"E",
"A"
],
"type": "scatterpolar"
},
{
"line": {
"color": "green"
},
"mode": "lines",
"name": "Class4",
"r": [
1.0055306877678314,
0.22994357570786872,
0.8932124277708919,
-0.3538963759384709,
-0.11385474044182772,
1.0055306877678314
],
"theta": [
"A",
"B",
"C",
"D",
"E",
"A"
],
"type": "scatterpolar"
}
],
"layout": {
"font": {
"color": "#000",
"family": "Arial, sans-serif;",
"size": 12
},
"showlegend": false,
"title": "TFIDF Visualization of Intents"
}
},
"text/html": [
"<div id=\"e8506ac5-b6c3-43a2-8f42-d5f6abcbe33a\" style=\"height: 525px; width: 100%;\" class=\"plotly-graph-div\"></div><script type=\"text/javascript\">require([\"plotly\"], function(Plotly) { window.PLOTLYENV=window.PLOTLYENV || {};window.PLOTLYENV.BASE_URL=\"https://plot.ly\";Plotly.newPlot(\"e8506ac5-b6c3-43a2-8f42-d5f6abcbe33a\", [{\"theta\": [\"A\", \"B\", \"C\", \"D\", \"E\", \"A\"], \"type\": \"scatterpolar\", \"line\": {\"color\": \"peru\"}, \"r\": [1.258646994466373, -0.5272370631161158, 0.0005630280977832204, -0.040138046581806364, 0.36907647285274403, 1.258646994466373], \"name\": \"Class0\", \"mode\": \"lines\"}, {\"theta\": [\"A\", \"B\", \"C\", \"D\", \"E\", \"A\"], \"type\": \"scatterpolar\", \"line\": {\"color\": \"darkviolet\"}, \"r\": [1.0924325581213414, -0.6051586209432138, -0.5654095049116606, -0.22005983412438146, -0.2688120358452568, 1.0924325581213414], \"name\": \"Class1\", \"mode\": \"lines\"}, {\"theta\": [\"A\", \"B\", \"C\", \"D\", \"E\", \"A\"], \"type\": \"scatterpolar\", \"line\": {\"color\": \"deepskyblue\"}, \"r\": [0.7147397700108102, 1.0648266100070478, -0.536169680109551, -0.24808075634355758, 0.07917931705328059, 0.7147397700108102], \"name\": \"Class2\", \"mode\": \"lines\"}, {\"theta\": [\"A\", \"B\", \"C\", \"D\", \"E\", \"A\"], \"type\": \"scatterpolar\", \"line\": {\"color\": \"orangered\"}, \"r\": [1.1969437099725184, 0.27771654946833846, 0.08524409660422959, 0.688492701360346, -0.0943955373748415, 1.1969437099725184], \"name\": \"Class3\", \"mode\": \"lines\"}, {\"theta\": [\"A\", \"B\", \"C\", \"D\", \"E\", \"A\"], \"type\": \"scatterpolar\", \"line\": {\"color\": \"green\"}, \"r\": [1.0055306877678314, 0.22994357570786872, 0.8932124277708919, -0.3538963759384709, -0.11385474044182772, 1.0055306877678314], \"name\": \"Class4\", \"mode\": \"lines\"}], {\"title\": \"TFIDF Visualization of Intents\", \"showlegend\": false, \"font\": {\"family\": \"Arial, sans-serif;\", \"size\": 12, \"color\": \"#000\"}}, {\"linkText\": \"Export to plot.ly\", \"showLink\": true})});</script>"
],
"text/vnd.plotly.v1+html": [
"<div id=\"e8506ac5-b6c3-43a2-8f42-d5f6abcbe33a\" style=\"height: 525px; width: 100%;\" class=\"plotly-graph-div\"></div><script type=\"text/javascript\">require([\"plotly\"], function(Plotly) { window.PLOTLYENV=window.PLOTLYENV || {};window.PLOTLYENV.BASE_URL=\"https://plot.ly\";Plotly.newPlot(\"e8506ac5-b6c3-43a2-8f42-d5f6abcbe33a\", [{\"theta\": [\"A\", \"B\", \"C\", \"D\", \"E\", \"A\"], \"type\": \"scatterpolar\", \"line\": {\"color\": \"peru\"}, \"r\": [1.258646994466373, -0.5272370631161158, 0.0005630280977832204, -0.040138046581806364, 0.36907647285274403, 1.258646994466373], \"name\": \"Class0\", \"mode\": \"lines\"}, {\"theta\": [\"A\", \"B\", \"C\", \"D\", \"E\", \"A\"], \"type\": \"scatterpolar\", \"line\": {\"color\": \"darkviolet\"}, \"r\": [1.0924325581213414, -0.6051586209432138, -0.5654095049116606, -0.22005983412438146, -0.2688120358452568, 1.0924325581213414], \"name\": \"Class1\", \"mode\": \"lines\"}, {\"theta\": [\"A\", \"B\", \"C\", \"D\", \"E\", \"A\"], \"type\": \"scatterpolar\", \"line\": {\"color\": \"deepskyblue\"}, \"r\": [0.7147397700108102, 1.0648266100070478, -0.536169680109551, -0.24808075634355758, 0.07917931705328059, 0.7147397700108102], \"name\": \"Class2\", \"mode\": \"lines\"}, {\"theta\": [\"A\", \"B\", \"C\", \"D\", \"E\", \"A\"], \"type\": \"scatterpolar\", \"line\": {\"color\": \"orangered\"}, \"r\": [1.1969437099725184, 0.27771654946833846, 0.08524409660422959, 0.688492701360346, -0.0943955373748415, 1.1969437099725184], \"name\": \"Class3\", \"mode\": \"lines\"}, {\"theta\": [\"A\", \"B\", \"C\", \"D\", \"E\", \"A\"], \"type\": \"scatterpolar\", \"line\": {\"color\": \"green\"}, \"r\": [1.0055306877678314, 0.22994357570786872, 0.8932124277708919, -0.3538963759384709, -0.11385474044182772, 1.0055306877678314], \"name\": \"Class4\", \"mode\": \"lines\"}], {\"title\": \"TFIDF Visualization of Intents\", \"showlegend\": false, \"font\": {\"family\": \"Arial, sans-serif;\", \"size\": 12, \"color\": \"#000\"}}, {\"linkText\": \"Export to plot.ly\", \"showLink\": true})});</script>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"theta_list=['A','B','C', 'D', 'E', 'A']\n",
"\n",
"data = [\n",
"\n",
" go.Scatterpolar(\n",
" r = np.append(svd_tfidf[0], svd_tfidf[0][0]),\n",
" theta = theta_list,\n",
" mode = 'lines',\n",
" name = 'Class0',\n",
" line = dict(\n",
" color = 'peru'\n",
" ) \n",
" ),\n",
" \n",
" go.Scatterpolar(\n",
" r = np.append(svd_tfidf[1], svd_tfidf[1][0]),\n",
" theta = theta_list,\n",
" mode = 'lines',\n",
" name = 'Class1',\n",
" line = dict(\n",
" color = 'darkviolet'\n",
" ) \n",
" ),\n",
" \n",
" go.Scatterpolar(\n",
" r = np.append(svd_tfidf[2], svd_tfidf[2][0]),\n",
" theta = theta_list,\n",
" mode = 'lines',\n",
" name = 'Class2',\n",
" line = dict(\n",
" color = 'deepskyblue'\n",
" ) \n",
" ),\n",
" \n",
" go.Scatterpolar(\n",
" r = np.append(svd_tfidf[3], svd_tfidf[3][0]),\n",
" theta = theta_list,\n",
" mode = 'lines',\n",
" name = 'Class3',\n",
" line = dict(\n",
" color = 'orangered'\n",
" ) \n",
" ),\n",
" \n",
" go.Scatterpolar(\n",
" r = np.append(svd_tfidf[4], svd_tfidf[4][0]),\n",
" theta = theta_list,\n",
" mode = 'lines',\n",
" name = 'Class4',\n",
" line = dict(\n",
" color = 'green'\n",
" ) \n",
" ),\n",
" \n",
"]\n",
"\n",
"\n",
"\n",
"layout = go.Layout(\n",
" title = 'TFIDF Visualization of Intents',\n",
" font = dict(\n",
" family = 'Arial, sans-serif;',\n",
" size = 12,\n",
" color = '#000'\n",
" ),\n",
" showlegend = False\n",
")\n",
"\n",
"\n",
"\n",
"fig = go.Figure(data=data, layout=layout)\n",
"iplot(fig, filename = \"radar/tfidf\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python3 (QnA)",
"language": "python",
"name": "tensorflow"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment