Last active
August 29, 2015 13:57
-
-
Save seekshreyas/9813037 to your computer and use it in GitHub Desktop.
mds plot for obidroid
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "metadata": { | |
| "name": "" | |
| }, | |
| "nbformat": 3, | |
| "nbformat_minor": 0, | |
| "worksheets": [ | |
| { | |
| "cells": [ | |
| { | |
| "cell_type": "heading", | |
| "level": 1, | |
| "metadata": {}, | |
| "source": [ | |
| "Obidroid Multidimensional Scaling (MDS) Notebook" | |
| ] | |
| }, | |
| { | |
| "cell_type": "heading", | |
| "level": 2, | |
| "metadata": {}, | |
| "source": [ | |
| "Loading App Data" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "This notebook uses the previously exported `exports/appFeatures.csv` feature data." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "%pylab --no-import-all inline" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "Populating the interactive namespace from numpy and matplotlib\n" | |
| ] | |
| } | |
| ], | |
| "prompt_number": 14 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "%doctest_mode" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "Exception reporting mode: Plain\n", | |
| "Doctest mode is: ON\n" | |
| ] | |
| } | |
| ], | |
| "prompt_number": 15 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "import pandas as pd\n", | |
| "import numpy as np\n", | |
| "from pandas import Series, DataFrame\n", | |
| "\n", | |
| "## Load the appFeatures file\n", | |
| "main_appData = pd.read_csv('exports/appFeatures.csv')" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [], | |
| "prompt_number": 16 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "# Build a class-balanced sample: every unfair app plus an equally sized random draw of fair apps.\n", | |
| "# TODO - Create a method that implements k-fold validation for app selection\n", | |
| "import random\n", | |
| "\n", | |
| "# Seed both generators so the random draw and the shuffle below are reproducible\n", | |
| "SAMPLE_SEED = 42\n", | |
| "random.seed(SAMPLE_SEED)\n", | |
| "np.random.seed(SAMPLE_SEED)\n", | |
| "\n", | |
| "## split the apps by label (each filter is evaluated only once)\n", | |
| "df_unfair = main_appData[main_appData.appLabel == 'unfair']\n", | |
| "df_fair = main_appData[main_appData.appLabel == 'fair']\n", | |
| "unfair_count = len(df_unfair)\n", | |
| "\n", | |
| "## get same number of random fair apps\n", | |
| "# - list(...) because random.sample needs a real sequence, not a pandas Index\n", | |
| "# - min(...) so the draw cannot ask for more fair apps than exist\n", | |
| "fair_sample_index = random.sample(list(df_fair.index), min(unfair_count, len(df_fair)))\n", | |
| "df_randomly_fair = df_fair.loc[fair_sample_index]\n", | |
| "#print df_randomly_fair.appLabel.count()\n", | |
| "\n", | |
| "\n", | |
| "# stack the sampled fair apps and the unfair apps (unfair * 2 rows)\n", | |
| "appData = pd.concat([df_randomly_fair, df_unfair])\n", | |
| "\n", | |
| "# shuffle the dataframe (label-based .loc replaces the deprecated .ix indexer)\n", | |
| "appData = appData.loc[np.random.permutation(appData.index)]\n", | |
| "\n", | |
| "main_appData.columns" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "metadata": {}, | |
| "output_type": "pyout", | |
| "prompt_number": 17, | |
| "text": [ | |
| "Index([u'appName', u'adjectiveCount', u'avgRating', u'countCapital', u'exclamationCount', u'hasDeveloperEmail', u'hasDeveloperWebsite', u'Unnamed: 7', u'hasPrivacy', u'installs', u'price', u'revSent', u'revLength', u'appLabel'], dtype='object')" | |
| ] | |
| } | |
| ], | |
| "prompt_number": 17 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "\n", | |
| "\n", | |
| "def trimDataFrame(df):\n", | |
| "    \"\"\"\n", | |
| "    Build the app feature frame: drop the non-feature columns and cast the rest.\n", | |
| "\n", | |
| "    Parameters\n", | |
| "    ----------\n", | |
| "    df : DataFrame holding the app columns (appName, appLabel, the feature columns, ...)\n", | |
| "\n", | |
| "    Returns\n", | |
| "    -------\n", | |
| "    A new DataFrame (the input is left untouched) that keeps the feature columns in\n", | |
| "    their original order, with boolean, integer and float dtypes set explicitly.\n", | |
| "\n", | |
| "    Raises\n", | |
| "    ------\n", | |
| "    KeyError if one of the expected feature columns is missing from `df`.\n", | |
| "    \"\"\"\n", | |
| "\n", | |
| "    ## columns that are not used as features\n", | |
| "    dropCols = set([\n", | |
| "        'appName',           # remove app Names column\n", | |
| "        'Unnamed: 7',        # removing a weird unnamed column\n", | |
| "        'appLabel',          # removing the label column\n", | |
| "        'price',             # removing price since most of the apps are free\n", | |
| "        'exclamationCount',  # remove exclamation count from features, as all values seemed to be 0\n", | |
| "    ])\n", | |
| "\n", | |
| "    # keep the original column order (iterating a set would make the order arbitrary)\n", | |
| "    appCols = [col for col in df.columns if col not in dropCols]\n", | |
| "\n", | |
| "    # copy, so the casts below never write into a slice of the caller's frame\n", | |
| "    df_trim = df[appCols].copy()\n", | |
| "\n", | |
| "    colTypes = [\n", | |
| "        # -- boolean\n", | |
| "        ('hasPrivacy', bool),\n", | |
| "        ('hasDeveloperEmail', bool),\n", | |
| "        ('hasDeveloperWebsite', bool),\n", | |
| "        # -- integer\n", | |
| "        ('adjectiveCount', int),\n", | |
| "        ('countCapital', int),\n", | |
| "        ('installs', int),\n", | |
| "        ('revSent', int),\n", | |
| "        ('revLength', int),\n", | |
| "        # -- float\n", | |
| "        ('avgRating', float),\n", | |
| "    ]\n", | |
| "\n", | |
| "    # astype returns a new Series instead of converting in place,\n", | |
| "    # so the result has to be assigned back to the column\n", | |
| "    for col, colType in colTypes:\n", | |
| "        df_trim[col] = df_trim[col].astype(colType)\n", | |
| "\n", | |
| "    return df_trim" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [], | |
| "prompt_number": 18 | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "Next, explicitly set the dtype of every feature column, which is good practice." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "# Explicitly casting column types in appFeatures dataframe\n", | |
| "# (.copy() so the assignments below never write into a slice of main_appData)\n", | |
| "appFeatures = trimDataFrame(main_appData).copy()\n", | |
| "\n", | |
| "featureTypes = [\n", | |
| "    # -- boolean\n", | |
| "    ('hasPrivacy', bool),\n", | |
| "    ('hasDeveloperEmail', bool),\n", | |
| "    ('hasDeveloperWebsite', bool),\n", | |
| "    # -- integer\n", | |
| "    ('adjectiveCount', int),\n", | |
| "    ('countCapital', int),\n", | |
| "    ('installs', int),\n", | |
| "    ('revSent', int),\n", | |
| "    ('revLength', int),\n", | |
| "    # -- float\n", | |
| "    ('avgRating', float),\n", | |
| "]\n", | |
| "\n", | |
| "# astype returns a new Series instead of converting in place,\n", | |
| "# so each cast is assigned back to its column\n", | |
| "for featureName, featureType in featureTypes:\n", | |
| "    appFeatures[featureName] = appFeatures[featureName].astype(featureType)\n", | |
| "\n", | |
| "\n", | |
| "appFeatures" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "metadata": {}, | |
| "output_type": "pyout", | |
| "prompt_number": 19, | |
| "text": [ | |
| " adjectiveCount hasPrivacy revLength countCapital hasDeveloperWebsite \\\n", | |
| "0 4 True 601 1 True \n", | |
| "1 13 True 1139 11 True \n", | |
| "2 23 True 2223 20 True \n", | |
| "3 10 False 804 5 True \n", | |
| "4 22 True 1867 16 True \n", | |
| "5 18 False 1162 6 True \n", | |
| "6 18 True 1522 60 True \n", | |
| "7 13 False 1895 19 True \n", | |
| "8 11 True 1195 10 True \n", | |
| "9 19 True 1488 11 True \n", | |
| "10 18 False 1864 35 True \n", | |
| "11 19 False 2049 14 True \n", | |
| "12 8 False 417 2 True \n", | |
| "13 16 False 1276 11 True \n", | |
| "14 13 False 1210 12 True \n", | |
| "15 20 True 2038 24 True \n", | |
| "16 12 False 1044 10 True \n", | |
| "17 15 True 1245 13 True \n", | |
| "18 2 False 225 2 True \n", | |
| "19 15 False 1120 10 True \n", | |
| "20 22 True 1406 11 True \n", | |
| "21 13 True 1063 10 True \n", | |
| "22 7 False 855 10 True \n", | |
| "23 17 True 2147 38 True \n", | |
| "24 12 True 1189 13 True \n", | |
| "25 19 False 1804 8 True \n", | |
| "26 26 False 1514 5 True \n", | |
| "27 12 True 1272 12 True \n", | |
| "28 17 True 1413 14 True \n", | |
| "29 7 True 610 2 True \n", | |
| "30 14 True 1145 5 True \n", | |
| "31 14 False 1413 14 True \n", | |
| "32 2 True 573 14 True \n", | |
| "33 12 True 1387 16 True \n", | |
| "34 19 True 1336 7 True \n", | |
| "35 7 True 817 5 True \n", | |
| "36 29 True 2205 16 True \n", | |
| "37 7 False 541 3 True \n", | |
| "38 6 False 310 1 True \n", | |
| "39 6 False 880 4 True \n", | |
| "40 5 False 583 4 True \n", | |
| "41 19 False 1888 20 True \n", | |
| "42 13 False 1122 4 True \n", | |
| "43 15 True 1613 21 True \n", | |
| "44 5 False 1014 11 False \n", | |
| "45 8 False 1115 6 False \n", | |
| "46 6 True 437 8 True \n", | |
| "47 2 True 307 0 True \n", | |
| "48 13 True 991 4 True \n", | |
| "49 7 False 578 6 True \n", | |
| "50 10 True 962 7 True \n", | |
| "51 13 False 1267 6 True \n", | |
| "52 9 False 1300 8 True \n", | |
| "53 16 False 1051 12 True \n", | |
| "54 16 True 1822 10 True \n", | |
| "55 7 False 535 0 True \n", | |
| "56 11 False 1075 8 True \n", | |
| "57 6 True 691 4 True \n", | |
| "58 7 True 991 5 True \n", | |
| "59 7 False 805 14 True \n", | |
| " ... ... ... ... ... \n", | |
| "\n", | |
| " installs hasDeveloperEmail avgRating revSent \n", | |
| "0 30000000 True 4.051 -3 \n", | |
| "1 30000000 True 4.351 2 \n", | |
| "2 3000000 False 4.555 -4 \n", | |
| "3 30000000 True 4.623 8 \n", | |
| "4 7500000 False 4.046 -11 \n", | |
| "5 30000000 True 4.595 1 \n", | |
| "6 30000000 True 4.526 -4 \n", | |
| "7 30000000 False 4.039 -5 \n", | |
| "8 3000000 True 4.400 -2 \n", | |
| "9 300000 True 3.935 -4 \n", | |
| "10 3000000 True 4.075 -5 \n", | |
| "11 750000 False 3.983 -2 \n", | |
| "12 30000000 True 4.238 1 \n", | |
| "13 3000000 True 3.915 -3 \n", | |
| "14 750000 True 4.050 -3 \n", | |
| "15 750000 True 3.795 -7 \n", | |
| "16 7500000 True 3.997 1 \n", | |
| "17 3000000 True 3.212 -5 \n", | |
| "18 3000000 True 2.611 -1 \n", | |
| "19 30000000 True 4.547 -3 \n", | |
| "20 3000000 True 2.671 4 \n", | |
| "21 750000 False 4.045 -3 \n", | |
| "22 750000 True 3.555 -9 \n", | |
| "23 3000000 True 4.590 -5 \n", | |
| "24 7500000 True 4.258 -9 \n", | |
| "25 750000 True 4.428 -10 \n", | |
| "26 3000000 True 4.401 5 \n", | |
| "27 3000000 True 4.275 -6 \n", | |
| "28 750000 True 4.149 -8 \n", | |
| "29 300000 True 4.396 -3 \n", | |
| "30 3000000 True 4.113 2 \n", | |
| "31 30000 True 4.240 2 \n", | |
| "32 300000 True 4.241 -4 \n", | |
| "33 3000000 True 3.989 -6 \n", | |
| "34 75000 True 4.310 -7 \n", | |
| "35 3000000 True 4.451 5 \n", | |
| "36 300000 True 3.916 -9 \n", | |
| "37 300000 True 4.761 1 \n", | |
| "38 300000 True 4.158 0 \n", | |
| "39 300000 True 2.972 -4 \n", | |
| "40 300000 True 3.903 -6 \n", | |
| "41 3000000 True 3.433 -9 \n", | |
| "42 3000000 True 4.412 -3 \n", | |
| "43 3000000 False 4.461 15 \n", | |
| "44 300000 True 3.564 -2 \n", | |
| "45 3000000 True 4.131 5 \n", | |
| "46 30000 True 3.550 -1 \n", | |
| "47 3000000 True 4.435 1 \n", | |
| "48 300000 True 4.233 -3 \n", | |
| "49 3000000 True 3.975 1 \n", | |
| "50 300000 True 3.926 4 \n", | |
| "51 3000000 True 4.590 2 \n", | |
| "52 300000 True 3.601 -9 \n", | |
| "53 300000 True 3.701 3 \n", | |
| "54 300000 True 2.931 -13 \n", | |
| "55 3000000 True 4.564 0 \n", | |
| "56 7500000 True 4.179 -7 \n", | |
| "57 3000000 True 4.466 7 \n", | |
| "58 750000 False 4.340 -3 \n", | |
| "59 300000 True 4.539 3 \n", | |
| " ... ... ... ... \n", | |
| "\n", | |
| "[323 rows x 9 columns]" | |
| ] | |
| } | |
| ], | |
| "prompt_number": 19 | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "---" | |
| ] | |
| }, | |
| { | |
| "cell_type": "heading", | |
| "level": 2, | |
| "metadata": {}, | |
| "source": [ | |
| "Unsupervised Learning" | |
| ] | |
| }, | |
| { | |
| "cell_type": "heading", | |
| "level": 4, | |
| "metadata": {}, | |
| "source": [ | |
| "Scaling the feature vector" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "# Feature frame for unsupervised learning: built from every row of main_appData\n", | |
| "# (not from the balanced sample in appData)\n", | |
| "appData_all = trimDataFrame(main_appData)\n", | |
| "\n", | |
| "appData_all.head()\n", | |
| "\n" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "metadata": {}, | |
| "output_type": "pyout", | |
| "prompt_number": 20, | |
| "text": [ | |
| " adjectiveCount hasPrivacy revLength countCapital hasDeveloperWebsite \\\n", | |
| "0 4 True 601 1 True \n", | |
| "1 13 True 1139 11 True \n", | |
| "2 23 True 2223 20 True \n", | |
| "3 10 False 804 5 True \n", | |
| "4 22 True 1867 16 True \n", | |
| "\n", | |
| " installs hasDeveloperEmail avgRating revSent \n", | |
| "0 30000000 True 4.051 -3 \n", | |
| "1 30000000 True 4.351 2 \n", | |
| "2 3000000 False 4.555 -4 \n", | |
| "3 30000000 True 4.623 8 \n", | |
| "4 7500000 False 4.046 -11 \n", | |
| "\n", | |
| "[5 rows x 9 columns]" | |
| ] | |
| } | |
| ], | |
| "prompt_number": 20 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "from sklearn.preprocessing import MinMaxScaler\n", | |
| "min_max_scaler = MinMaxScaler()\n", | |
| "\n", | |
| "# show which feature columns are passed to the scaler\n", | |
| "print(appData_all.columns)\n", | |
| "#print appData_all\n", | |
| "\n", | |
| "# fit the min-max scaler on the feature frame and transform it; the result is stored in X_scaled\n", | |
| "X_scaled = min_max_scaler.fit_transform(appData_all)" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "Index([u'adjectiveCount', u'hasPrivacy', u'revLength', u'countCapital', u'hasDeveloperWebsite', u'installs', u'hasDeveloperEmail', u'avgRating', u'revSent'], dtype='object')\n" | |
| ] | |
| } | |
| ], | |
| "prompt_number": 21 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "from time import time\n", | |
| "import pylab as pl\n", | |
| "from matplotlib import offsetbox\n", | |
| "from sklearn import (manifold, datasets, decomposition, ensemble, lda,\n", | |
| " random_projection)\n", | |
| "from mpld3 import enable_notebook\n", | |
| "from mpld3 import plugins\n", | |
| "enable_notebook()" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [], | |
| "prompt_number": 22 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "# Plot D3 scatter\n", | |
| "def plot_d3(X, title=None):\n", | |
| "    \"\"\"\n", | |
| "    Draw a 2-D embedding as an interactive mpld3 scatter plot.\n", | |
| "\n", | |
| "    X     -- array whose columns 0 and 1 are plotted; row i is coloured and labelled\n", | |
| "             with row i of the global `main_appData`\n", | |
| "    title -- figure title; defaults to 'Scatter Plot of Unfair/Fair Apps'\n", | |
| "\n", | |
| "    Points are red for apps labelled 'unfair' and green otherwise, and every point\n", | |
| "    gets a tooltip showing the app name.\n", | |
| "    \"\"\"\n", | |
| "    if title is None:\n", | |
| "        title = 'Scatter Plot of Unfair/Fair Apps'\n", | |
| "\n", | |
| "    # rescale each axis by its own min/max; an axis with zero span is divided by 1\n", | |
| "    # instead of 0 so its coordinates do not turn into NaN\n", | |
| "    x_min, x_max = np.min(X, 0), np.max(X, 0)\n", | |
| "    span = x_max - x_min\n", | |
| "    X = (X - x_min) / np.where(span == 0, 1, span)\n", | |
| "\n", | |
| "    # NOTE(review): `axisbg` was removed in matplotlib 2.2 -- switch to `facecolor` when upgrading\n", | |
| "    fig, ax = plt.subplots(subplot_kw=dict(axisbg='#EEEEEE'))\n", | |
| "    fig.set_figwidth(12)\n", | |
| "    fig.set_figheight(8)\n", | |
| "\n", | |
| "    # read the label/name columns once instead of building a row object per point\n", | |
| "    n_points = X.shape[0]\n", | |
| "    app_labels = main_appData['appLabel'].values[:n_points]\n", | |
| "    app_names = main_appData['appName'].values[:n_points]\n", | |
| "\n", | |
| "    # explicit colour names are used as-is, so no colormap is passed\n", | |
| "    colors = ['red' if label == 'unfair' else 'green' for label in app_labels]\n", | |
| "    scatter = ax.scatter(X[:, 0],\n", | |
| "                         X[:, 1],\n", | |
| "                         c=colors,\n", | |
| "                         s=75,\n", | |
| "                         alpha=0.2)\n", | |
| "    ax.grid(color='white', linestyle='solid')\n", | |
| "\n", | |
| "    ax.set_title(title, size=24)\n", | |
| "\n", | |
| "    # byte strings are decoded to text with non-ASCII bytes replaced; text passes through unchanged\n", | |
| "    labels = [name.decode('ascii', 'replace') if isinstance(name, bytes) else name\n", | |
| "              for name in app_names]\n", | |
| "    tooltip = plugins.PointLabelTooltip(scatter, labels=labels)\n", | |
| "    plugins.connect(fig, tooltip)" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [], | |
| "prompt_number": 23 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "print(\"Computing MDS embedding and plotting with D3\")\n", | |
| "# random_state pins the random starting configuration so the embedding is reproducible\n", | |
| "clf = manifold.MDS(n_components=2, n_init=1, max_iter=100, verbose=1, random_state=0)\n", | |
| "t0 = time()\n", | |
| "\n", | |
| "X_mds = clf.fit_transform(X_scaled)\n", | |
| "print(\"Done. Stress: %f\" % clf.stress_)\n", | |
| "# the title is now shown by plot_d3, so it describes the app data and the MDS run time\n", | |
| "plot_d3(X_mds,\n", | |
| "        \"MDS embedding of the apps (time %.2fs)\" %\n", | |
| "        (time() - t0))" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "Computing MDS embedding and plotting with D3\n", | |
| "breaking at iteration 99 with stress 810.282757972" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "Done. Stress: 810.282758\n" | |
| ] | |
| }, | |
| { | |
| "metadata": {}, | |
| "output_type": "display_data", | |
| "text": [ | |
| "<matplotlib.figure.Figure object at 0x10b08dc90>" | |
| ] | |
| } | |
| ], | |
| "prompt_number": 24 | |
| } | |
| ], | |
| "metadata": {} | |
| } | |
| ] | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment