seekshreyas · August 29, 2015 13:57
diff --git a/ObidroidMDSD3 b/ObidroidMDSD3
 {
 "metadata": {
  "name": ""
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "heading",
     "level": 1,
     "metadata": {},
     "source": [
      "Obidroid Multidimensional Scaling (MDS) Notebook"
     ]
    },
    {
     "cell_type": "heading",
     "level": 2,
     "metadata": {},
     "source": [
      "Loading App Data"
     ]
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "This notebook uses the previously exported `exports/appFeatures.csv` file, which holds per-app features together with an `appLabel` column (`fair` or `unfair`)."
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "%pylab --no-import-all inline"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "Populating the interactive namespace from numpy and matplotlib\n"
       ]
      }
     ],
     "prompt_number": 14
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "%doctest_mode"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "Exception reporting mode: Plain\n",
        "Doctest mode is: ON\n"
       ]
      }
     ],
     "prompt_number": 15
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "import pandas as pd\n",
      "import numpy as np\n",
      "from pandas import Series, DataFrame\n",
      "\n",
      "## Load the appFeatures file\n",
      "main_appData = pd.read_csv('exports/appFeatures.csv')"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 16
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# This cell creates a dataframe with the same amount of unfair and fair apps, where the fair apps are randomly selected\n",
      "# TODO - Create a method that implements k-fold validation for app selection\n",
      "import random\n",
      "\n",
      "## create unfair apps dataframe and count\n",
      "df_unfair = main_appData[main_appData.appLabel == 'unfair']\n",
      "unfair_count = df_unfair.appLabel.count()\n",
      "\n",
      "## get same number of random fair apps\n",
      "df_randomly_fair = main_appData[main_appData.appLabel == 'fair'].ix[random.sample(main_appData[main_appData.appLabel == 'fair'].index, unfair_count)]\n",
      "#print df_randomly_fair.appLabel.count()\n",
      "\n",
      "\n",
      "# append the newly created dataframe of unfair * 2 rows\n",
      "appData = df_randomly_fair.append(df_unfair)\n",
      "\n",
      "# shuffle the dataframe\n",
      "appData = appData.ix[np.random.permutation(appData.index)]\n",
      "\n",
      "main_appData.columns"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 17,
       "text": [
        "Index([u'appName', u'adjectiveCount', u'avgRating', u'countCapital', u'exclamationCount', u'hasDeveloperEmail', u'hasDeveloperWebsite', u'Unnamed: 7', u'hasPrivacy', u'installs', u'price', u'revSent', u'revLength', u'appLabel'], dtype='object')"
       ]
      }
     ],
     "prompt_number": 17
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "\n",
      "\n",
      "def trimDataFrame(df):\n",
      "    \"\"\"\n",
      "    Lets create a new dataframe for appFeatures and appLabels\n",
      "    \"\"\"\n",
      "\n",
      "    ## for App Features\n",
      "    appCols = set(df.columns)\n",
      "    appCols.remove('appName') # remove app Names column\n",
      "    appCols.remove('Unnamed: 7') # removing a weird unnamed column\n",
      "    appCols.remove('appLabel') # removing the label column\n",
      "    appCols.remove('price') # removing price since most of the apps are free\n",
      "    appCols.remove('exclamationCount') # remove exclamation count from features, as all values seemed to be 0\n",
      "    \n",
      "    df_trim = df[list(appCols)]\n",
      "    \n",
      "    # -- boolean\n",
      "    df_trim['hasPrivacy'].astype(bool)\n",
      "    df_trim['hasDeveloperEmail'].astype(bool)\n",
      "    df_trim['hasDeveloperWebsite'].astype(bool)\n",
      "    \n",
      "    # -- integer\n",
      "    df_trim['adjectiveCount'].astype(int)\n",
      "    df_trim['countCapital'].astype(int)\n",
      "    df_trim['installs'].astype(int)\n",
      "    df_trim['revSent'].astype(int)\n",
      "    df_trim['revLength'].astype(int)\n",
      "    \n",
      "    # -- float\n",
      "    df_trim['avgRating'].astype(float)\n",
      "    \n",
      "    return df_trim"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 18
    },
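    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "Next, explicitly set the type of every feature column, as a matter of good practice. Note that `astype` returns a new Series rather than converting the column in place, so each cast takes effect only when its result is assigned back to the DataFrame."
     ]
    },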
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Explicitly casting column types in appFeatures dataframe\n",
      "appFeatures = trimDataFrame(main_appData)\n",
      "\n",
      "\n",
      "# -- boolean\n",
      "appFeatures['hasPrivacy'].astype(bool)\n",
      "appFeatures['hasDeveloperEmail'].astype(bool)\n",
      "appFeatures['hasDeveloperWebsite'].astype(bool)\n",
      "\n",
      "# -- integer\n",
      "appFeatures['adjectiveCount'].astype(int)\n",
      "appFeatures['countCapital'].astype(int)\n",
      "appFeatures['installs'].astype(int)\n",
      "appFeatures['revSent'].astype(int)\n",
      "appFeatures['revLength'].astype(int)\n",
      "\n",
      "# -- float\n",
      "appFeatures['avgRating'].astype(float)\n",
      "\n",
      "\n",
      "appFeatures"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 19,
       "text": [
        "    adjectiveCount hasPrivacy  revLength  countCapital hasDeveloperWebsite  \\\n",
        "0                4       True        601             1                True   \n",
        "1               13       True       1139            11                True   \n",
        "2               23       True       2223            20                True   \n",
        "3               10      False        804             5                True   \n",
        "4               22       True       1867            16                True   \n",
        "5               18      False       1162             6                True   \n",
        "6               18       True       1522            60                True   \n",
        "7               13      False       1895            19                True   \n",
        "8               11       True       1195            10                True   \n",
        "9               19       True       1488            11                True   \n",
        "10              18      False       1864            35                True   \n",
        "11              19      False       2049            14                True   \n",
        "12               8      False        417             2                True   \n",
        "13              16      False       1276            11                True   \n",
        "14              13      False       1210            12                True   \n",
        "15              20       True       2038            24                True   \n",
        "16              12      False       1044            10                True   \n",
        "17              15       True       1245            13                True   \n",
        "18               2      False        225             2                True   \n",
        "19              15      False       1120            10                True   \n",
        "20              22       True       1406            11                True   \n",
        "21              13       True       1063            10                True   \n",
        "22               7      False        855            10                True   \n",
        "23              17       True       2147            38                True   \n",
        "24              12       True       1189            13                True   \n",
        "25              19      False       1804             8                True   \n",
        "26              26      False       1514             5                True   \n",
        "27              12       True       1272            12                True   \n",
        "28              17       True       1413            14                True   \n",
        "29               7       True        610             2                True   \n",
        "30              14       True       1145             5                True   \n",
        "31              14      False       1413            14                True   \n",
        "32               2       True        573            14                True   \n",
        "33              12       True       1387            16                True   \n",
        "34              19       True       1336             7                True   \n",
        "35               7       True        817             5                True   \n",
        "36              29       True       2205            16                True   \n",
        "37               7      False        541             3                True   \n",
        "38               6      False        310             1                True   \n",
        "39               6      False        880             4                True   \n",
        "40               5      False        583             4                True   \n",
        "41              19      False       1888            20                True   \n",
        "42              13      False       1122             4                True   \n",
        "43              15       True       1613            21                True   \n",
        "44               5      False       1014            11               False   \n",
        "45               8      False       1115             6               False   \n",
        "46               6       True        437             8                True   \n",
        "47               2       True        307             0                True   \n",
        "48              13       True        991             4                True   \n",
        "49               7      False        578             6                True   \n",
        "50              10       True        962             7                True   \n",
        "51              13      False       1267             6                True   \n",
        "52               9      False       1300             8                True   \n",
        "53              16      False       1051            12                True   \n",
        "54              16       True       1822            10                True   \n",
        "55               7      False        535             0                True   \n",
        "56              11      False       1075             8                True   \n",
        "57               6       True        691             4                True   \n",
        "58               7       True        991             5                True   \n",
        "59               7      False        805            14                True   \n",
        "               ...        ...        ...           ...                 ...   \n",
        "\n",
        "    installs hasDeveloperEmail  avgRating  revSent  \n",
        "0   30000000              True      4.051       -3  \n",
        "1   30000000              True      4.351        2  \n",
        "2    3000000             False      4.555       -4  \n",
        "3   30000000              True      4.623        8  \n",
        "4    7500000             False      4.046      -11  \n",
        "5   30000000              True      4.595        1  \n",
        "6   30000000              True      4.526       -4  \n",
        "7   30000000             False      4.039       -5  \n",
        "8    3000000              True      4.400       -2  \n",
        "9     300000              True      3.935       -4  \n",
        "10   3000000              True      4.075       -5  \n",
        "11    750000             False      3.983       -2  \n",
        "12  30000000              True      4.238        1  \n",
        "13   3000000              True      3.915       -3  \n",
        "14    750000              True      4.050       -3  \n",
        "15    750000              True      3.795       -7  \n",
        "16   7500000              True      3.997        1  \n",
        "17   3000000              True      3.212       -5  \n",
        "18   3000000              True      2.611       -1  \n",
        "19  30000000              True      4.547       -3  \n",
        "20   3000000              True      2.671        4  \n",
        "21    750000             False      4.045       -3  \n",
        "22    750000              True      3.555       -9  \n",
        "23   3000000              True      4.590       -5  \n",
        "24   7500000              True      4.258       -9  \n",
        "25    750000              True      4.428      -10  \n",
        "26   3000000              True      4.401        5  \n",
        "27   3000000              True      4.275       -6  \n",
        "28    750000              True      4.149       -8  \n",
        "29    300000              True      4.396       -3  \n",
        "30   3000000              True      4.113        2  \n",
        "31     30000              True      4.240        2  \n",
        "32    300000              True      4.241       -4  \n",
        "33   3000000              True      3.989       -6  \n",
        "34     75000              True      4.310       -7  \n",
        "35   3000000              True      4.451        5  \n",
        "36    300000              True      3.916       -9  \n",
        "37    300000              True      4.761        1  \n",
        "38    300000              True      4.158        0  \n",
        "39    300000              True      2.972       -4  \n",
        "40    300000              True      3.903       -6  \n",
        "41   3000000              True      3.433       -9  \n",
        "42   3000000              True      4.412       -3  \n",
        "43   3000000             False      4.461       15  \n",
        "44    300000              True      3.564       -2  \n",
        "45   3000000              True      4.131        5  \n",
        "46     30000              True      3.550       -1  \n",
        "47   3000000              True      4.435        1  \n",
        "48    300000              True      4.233       -3  \n",
        "49   3000000              True      3.975        1  \n",
        "50    300000              True      3.926        4  \n",
        "51   3000000              True      4.590        2  \n",
        "52    300000              True      3.601       -9  \n",
        "53    300000              True      3.701        3  \n",
        "54    300000              True      2.931      -13  \n",
        "55   3000000              True      4.564        0  \n",
        "56   7500000              True      4.179       -7  \n",
        "57   3000000              True      4.466        7  \n",
        "58    750000             False      4.340       -3  \n",
        "59    300000              True      4.539        3  \n",
        "         ...               ...        ...      ...  \n",
        "\n",
        "[323 rows x 9 columns]"
       ]
      }
     ],
     "prompt_number": 19
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "---"
     ]
    },
    {
     "cell_type": "heading",
     "level": 2,
     "metadata": {},
     "source": [
      "Unsupervised Learning"
     ]
    },
    {
     "cell_type": "heading",
     "level": 4,
     "metadata": {},
     "source": [
      "Scaling the feature vector"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# get all the rows for unsupervised learning\n",
      "appData_all = trimDataFrame(main_appData)\n",
      "\n",
      "appData_all.head()\n",
      "\n"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 20,
       "text": [
        "   adjectiveCount hasPrivacy  revLength  countCapital hasDeveloperWebsite  \\\n",
        "0               4       True        601             1                True   \n",
        "1              13       True       1139            11                True   \n",
        "2              23       True       2223            20                True   \n",
        "3              10      False        804             5                True   \n",
        "4              22       True       1867            16                True   \n",
        "\n",
        "   installs hasDeveloperEmail  avgRating  revSent  \n",
        "0  30000000              True      4.051       -3  \n",
        "1  30000000              True      4.351        2  \n",
        "2   3000000             False      4.555       -4  \n",
        "3  30000000              True      4.623        8  \n",
        "4   7500000             False      4.046      -11  \n",
        "\n",
        "[5 rows x 9 columns]"
       ]
      }
     ],
     "prompt_number": 20
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "from sklearn.preprocessing import MinMaxScaler\n",
      "min_max_scaler = MinMaxScaler()\n",
      "\n",
      "print(appData_all.columns)\n",
      "#print appData_all\n",
      "\n",
      "# scale the dataframe\n",
      "X_scaled = min_max_scaler.fit_transform(appData_all)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "Index([u'adjectiveCount', u'hasPrivacy', u'revLength', u'countCapital', u'hasDeveloperWebsite', u'installs', u'hasDeveloperEmail', u'avgRating', u'revSent'], dtype='object')\n"
       ]
      }
     ],
     "prompt_number": 21
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "from time import time\n",
      "import pylab as pl\n",
      "from matplotlib import offsetbox\n",
      "from sklearn import (manifold, datasets, decomposition, ensemble, lda,\n",
      "                     random_projection)\n",
      "from mpld3 import enable_notebook\n",
      "from mpld3 import plugins\n",
      "enable_notebook()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 22
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Plot D3 scatter\n",
      "def plot_d3(X, title=None):\n",
      "    x_min, x_max = np.min(X, 0), np.max(X, 0)\n",
      "    X = (X - x_min) / (x_max - x_min)\n",
      "    \n",
      "    fig, ax = plt.subplots(subplot_kw=dict(axisbg='#EEEEEE'))\n",
      "    fig.set_figwidth(12)\n",
      "    fig.set_figheight(8)\n",
      "\n",
      "    scatter = ax.scatter(X[:,0],\n",
      "                         X[:,1],\n",
      "                         c=['red' if main_appData.iloc[i]['appLabel'] == 'unfair' else 'green' for i in range(X.shape[0])],\n",
      "                         s=75,\n",
      "                         alpha=0.2,\n",
      "                         cmap=plt.cm.jet)\n",
      "    ax.grid(color='white', linestyle='solid')\n",
      "\n",
      "    ax.set_title(\"Scatter Plot of Unfair/Fair Apps\", size=24)\n",
      "\n",
      "    labels = [main_appData.iloc[i]['appName'].decode('ascii', 'replace') for i in range(X.shape[0])]\n",
      "    tooltip = plugins.PointLabelTooltip(scatter, labels=labels)\n",
      "    plugins.connect(fig, tooltip)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 23
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "print(\"Computing MDS embedding and plotting with D3\")\n",
      "clf = manifold.MDS(n_components=2, n_init=1, max_iter=100, verbose=1)\n",
      "t0 = time()\n",
      "\n",
      "X_mds = clf.fit_transform(X_scaled)\n",
      "print(\"Done. Stress: %f\" % clf.stress_)\n",
      "plot_d3(X_mds,\n",
      "               \"MDS embedding of the digits (time %.2fs)\" %\n",
      "               (time() - t0))"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "Computing MDS embedding and plotting with D3\n",
        "breaking at iteration 99 with stress 810.282757972"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "\n",
        "Done. Stress: 810.282758\n"
       ]
      },
      {
       "metadata": {},
       "output_type": "display_data",
       "text": [
        "<matplotlib.figure.Figure object at 0x10b08dc90>"
       ]
      }
     ],
     "prompt_number": 24
    }
   ],
   "metadata": {}
  }
 ]
 }
	{
	"metadata": {
	"name": ""
	},
	"nbformat": 3,
	"nbformat_minor": 0,
	"worksheets": [
	{
	"cells": [
	{
	"cell_type": "heading",
	"level": 1,
	"metadata": {},
	"source": [
	"Obidroid Multidimensional Scaling (MDS) Notebook"
	]
	},
	{
	"cell_type": "heading",
	"level": 2,
	"metadata": {},
	"source": [
	"Loading App Data"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"This notebook uses the previously exported `exports/appFeatures.csv` file, which holds per-app features together with an `appLabel` column (`fair` or `unfair`)."
	]
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"%pylab --no-import-all inline"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	"Populating the interactive namespace from numpy and matplotlib\n"
	]
	}
	],
	"prompt_number": 14
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"%doctest_mode"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	"Exception reporting mode: Plain\n",
	"Doctest mode is: ON\n"
	]
	}
	],
	"prompt_number": 15
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"import pandas as pd\n",
	"import numpy as np\n",
	"from pandas import Series, DataFrame\n",
	"\n",
	"## Load the appFeatures file\n",
	"main_appData = pd.read_csv('exports/appFeatures.csv')"
	],
	"language": "python",
	"metadata": {},
	"outputs": [],
	"prompt_number": 16
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"# This cell creates a dataframe with the same amount of unfair and fair apps, where the fair apps are randomly selected\n",
	"# TODO - Create a method that implements k-fold validation for app selection\n",
	"import random\n",
	"\n",
	"## create unfair apps dataframe and count\n",
	"df_unfair = main_appData[main_appData.appLabel == 'unfair']\n",
	"unfair_count = df_unfair.appLabel.count()\n",
	"\n",
	"## get same number of random fair apps\n",
	"df_randomly_fair = main_appData[main_appData.appLabel == 'fair'].ix[random.sample(main_appData[main_appData.appLabel == 'fair'].index, unfair_count)]\n",
	"#print df_randomly_fair.appLabel.count()\n",
	"\n",
	"\n",
	"# append the newly created dataframe of unfair * 2 rows\n",
	"appData = df_randomly_fair.append(df_unfair)\n",
	"\n",
	"# shuffle the dataframe\n",
	"appData = appData.ix[np.random.permutation(appData.index)]\n",
	"\n",
	"main_appData.columns"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"metadata": {},
	"output_type": "pyout",
	"prompt_number": 17,
	"text": [
	"Index([u'appName', u'adjectiveCount', u'avgRating', u'countCapital', u'exclamationCount', u'hasDeveloperEmail', u'hasDeveloperWebsite', u'Unnamed: 7', u'hasPrivacy', u'installs', u'price', u'revSent', u'revLength', u'appLabel'], dtype='object')"
	]
	}
	],
	"prompt_number": 17
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"\n",
	"\n",
	"def trimDataFrame(df):\n",
	" \"\"\"\n",
	" Lets create a new dataframe for appFeatures and appLabels\n",
	" \"\"\"\n",
	"\n",
	" ## for App Features\n",
	" appCols = set(df.columns)\n",
	" appCols.remove('appName') # remove app Names column\n",
	" appCols.remove('Unnamed: 7') # removing a weird unnamed column\n",
	" appCols.remove('appLabel') # removing the label column\n",
	" appCols.remove('price') # removing price since most of the apps are free\n",
	" appCols.remove('exclamationCount') # remove exclamation count from features, as all values seemed to be 0\n",
	" \n",
	" df_trim = df[list(appCols)]\n",
	" \n",
	" # -- boolean\n",
	" df_trim['hasPrivacy'].astype(bool)\n",
	" df_trim['hasDeveloperEmail'].astype(bool)\n",
	" df_trim['hasDeveloperWebsite'].astype(bool)\n",
	" \n",
	" # -- integer\n",
	" df_trim['adjectiveCount'].astype(int)\n",
	" df_trim['countCapital'].astype(int)\n",
	" df_trim['installs'].astype(int)\n",
	" df_trim['revSent'].astype(int)\n",
	" df_trim['revLength'].astype(int)\n",
	" \n",
	" # -- float\n",
	" df_trim['avgRating'].astype(float)\n",
	" \n",
	" return df_trim"
	],
	"language": "python",
	"metadata": {},
	"outputs": [],
	"prompt_number": 18
	},
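	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Next, explicitly set the type of every feature column, as a matter of good practice. Note that `astype` returns a new Series rather than converting the column in place, so each cast takes effect only when its result is assigned back to the DataFrame."
	]
	},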
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"# Explicitly casting column types in appFeatures dataframe\n",
	"appFeatures = trimDataFrame(main_appData)\n",
	"\n",
	"\n",
	"# -- boolean\n",
	"appFeatures['hasPrivacy'].astype(bool)\n",
	"appFeatures['hasDeveloperEmail'].astype(bool)\n",
	"appFeatures['hasDeveloperWebsite'].astype(bool)\n",
	"\n",
	"# -- integer\n",
	"appFeatures['adjectiveCount'].astype(int)\n",
	"appFeatures['countCapital'].astype(int)\n",
	"appFeatures['installs'].astype(int)\n",
	"appFeatures['revSent'].astype(int)\n",
	"appFeatures['revLength'].astype(int)\n",
	"\n",
	"# -- float\n",
	"appFeatures['avgRating'].astype(float)\n",
	"\n",
	"\n",
	"appFeatures"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"metadata": {},
	"output_type": "pyout",
	"prompt_number": 19,
	"text": [
	" adjectiveCount hasPrivacy revLength countCapital hasDeveloperWebsite \\\n",
	"0 4 True 601 1 True \n",
	"1 13 True 1139 11 True \n",
	"2 23 True 2223 20 True \n",
	"3 10 False 804 5 True \n",
	"4 22 True 1867 16 True \n",
	"5 18 False 1162 6 True \n",
	"6 18 True 1522 60 True \n",
	"7 13 False 1895 19 True \n",
	"8 11 True 1195 10 True \n",
	"9 19 True 1488 11 True \n",
	"10 18 False 1864 35 True \n",
	"11 19 False 2049 14 True \n",
	"12 8 False 417 2 True \n",
	"13 16 False 1276 11 True \n",
	"14 13 False 1210 12 True \n",
	"15 20 True 2038 24 True \n",
	"16 12 False 1044 10 True \n",
	"17 15 True 1245 13 True \n",
	"18 2 False 225 2 True \n",
	"19 15 False 1120 10 True \n",
	"20 22 True 1406 11 True \n",
	"21 13 True 1063 10 True \n",
	"22 7 False 855 10 True \n",
	"23 17 True 2147 38 True \n",
	"24 12 True 1189 13 True \n",
	"25 19 False 1804 8 True \n",
	"26 26 False 1514 5 True \n",
	"27 12 True 1272 12 True \n",
	"28 17 True 1413 14 True \n",
	"29 7 True 610 2 True \n",
	"30 14 True 1145 5 True \n",
	"31 14 False 1413 14 True \n",
	"32 2 True 573 14 True \n",
	"33 12 True 1387 16 True \n",
	"34 19 True 1336 7 True \n",
	"35 7 True 817 5 True \n",
	"36 29 True 2205 16 True \n",
	"37 7 False 541 3 True \n",
	"38 6 False 310 1 True \n",
	"39 6 False 880 4 True \n",
	"40 5 False 583 4 True \n",
	"41 19 False 1888 20 True \n",
	"42 13 False 1122 4 True \n",
	"43 15 True 1613 21 True \n",
	"44 5 False 1014 11 False \n",
	"45 8 False 1115 6 False \n",
	"46 6 True 437 8 True \n",
	"47 2 True 307 0 True \n",
	"48 13 True 991 4 True \n",
	"49 7 False 578 6 True \n",
	"50 10 True 962 7 True \n",
	"51 13 False 1267 6 True \n",
	"52 9 False 1300 8 True \n",
	"53 16 False 1051 12 True \n",
	"54 16 True 1822 10 True \n",
	"55 7 False 535 0 True \n",
	"56 11 False 1075 8 True \n",
	"57 6 True 691 4 True \n",
	"58 7 True 991 5 True \n",
	"59 7 False 805 14 True \n",
	" ... ... ... ... ... \n",
	"\n",
	" installs hasDeveloperEmail avgRating revSent \n",
	"0 30000000 True 4.051 -3 \n",
	"1 30000000 True 4.351 2 \n",
	"2 3000000 False 4.555 -4 \n",
	"3 30000000 True 4.623 8 \n",
	"4 7500000 False 4.046 -11 \n",
	"5 30000000 True 4.595 1 \n",
	"6 30000000 True 4.526 -4 \n",
	"7 30000000 False 4.039 -5 \n",
	"8 3000000 True 4.400 -2 \n",
	"9 300000 True 3.935 -4 \n",
	"10 3000000 True 4.075 -5 \n",
	"11 750000 False 3.983 -2 \n",
	"12 30000000 True 4.238 1 \n",
	"13 3000000 True 3.915 -3 \n",
	"14 750000 True 4.050 -3 \n",
	"15 750000 True 3.795 -7 \n",
	"16 7500000 True 3.997 1 \n",
	"17 3000000 True 3.212 -5 \n",
	"18 3000000 True 2.611 -1 \n",
	"19 30000000 True 4.547 -3 \n",
	"20 3000000 True 2.671 4 \n",
	"21 750000 False 4.045 -3 \n",
	"22 750000 True 3.555 -9 \n",
	"23 3000000 True 4.590 -5 \n",
	"24 7500000 True 4.258 -9 \n",
	"25 750000 True 4.428 -10 \n",
	"26 3000000 True 4.401 5 \n",
	"27 3000000 True 4.275 -6 \n",
	"28 750000 True 4.149 -8 \n",
	"29 300000 True 4.396 -3 \n",
	"30 3000000 True 4.113 2 \n",
	"31 30000 True 4.240 2 \n",
	"32 300000 True 4.241 -4 \n",
	"33 3000000 True 3.989 -6 \n",
	"34 75000 True 4.310 -7 \n",
	"35 3000000 True 4.451 5 \n",
	"36 300000 True 3.916 -9 \n",
	"37 300000 True 4.761 1 \n",
	"38 300000 True 4.158 0 \n",
	"39 300000 True 2.972 -4 \n",
	"40 300000 True 3.903 -6 \n",
	"41 3000000 True 3.433 -9 \n",
	"42 3000000 True 4.412 -3 \n",
	"43 3000000 False 4.461 15 \n",
	"44 300000 True 3.564 -2 \n",
	"45 3000000 True 4.131 5 \n",
	"46 30000 True 3.550 -1 \n",
	"47 3000000 True 4.435 1 \n",
	"48 300000 True 4.233 -3 \n",
	"49 3000000 True 3.975 1 \n",
	"50 300000 True 3.926 4 \n",
	"51 3000000 True 4.590 2 \n",
	"52 300000 True 3.601 -9 \n",
	"53 300000 True 3.701 3 \n",
	"54 300000 True 2.931 -13 \n",
	"55 3000000 True 4.564 0 \n",
	"56 7500000 True 4.179 -7 \n",
	"57 3000000 True 4.466 7 \n",
	"58 750000 False 4.340 -3 \n",
	"59 300000 True 4.539 3 \n",
	" ... ... ... ... \n",
	"\n",
	"[323 rows x 9 columns]"
	]
	}
	],
	"prompt_number": 19
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"---"
	]
	},
	{
	"cell_type": "heading",
	"level": 2,
	"metadata": {},
	"source": [
	"Unsupervised Learning"
	]
	},
	{
	"cell_type": "heading",
	"level": 4,
	"metadata": {},
	"source": [
	"Scaling the feature vector"
	]
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"# get all the rows for unsupervised learning\n",
	"appData_all = trimDataFrame(main_appData)\n",
	"\n",
	"appData_all.head()\n",
	"\n"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"metadata": {},
	"output_type": "pyout",
	"prompt_number": 20,
	"text": [
	" adjectiveCount hasPrivacy revLength countCapital hasDeveloperWebsite \\\n",
	"0 4 True 601 1 True \n",
	"1 13 True 1139 11 True \n",
	"2 23 True 2223 20 True \n",
	"3 10 False 804 5 True \n",
	"4 22 True 1867 16 True \n",
	"\n",
	" installs hasDeveloperEmail avgRating revSent \n",
	"0 30000000 True 4.051 -3 \n",
	"1 30000000 True 4.351 2 \n",
	"2 3000000 False 4.555 -4 \n",
	"3 30000000 True 4.623 8 \n",
	"4 7500000 False 4.046 -11 \n",
	"\n",
	"[5 rows x 9 columns]"
	]
	}
	],
	"prompt_number": 20
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"from sklearn.preprocessing import MinMaxScaler\n",
	"min_max_scaler = MinMaxScaler()\n",
	"\n",
	"print(appData_all.columns)\n",
	"#print appData_all\n",
	"\n",
	"# scale the dataframe\n",
	"X_scaled = min_max_scaler.fit_transform(appData_all)"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	"Index([u'adjectiveCount', u'hasPrivacy', u'revLength', u'countCapital', u'hasDeveloperWebsite', u'installs', u'hasDeveloperEmail', u'avgRating', u'revSent'], dtype='object')\n"
	]
	}
	],
	"prompt_number": 21
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"from time import time\n",
	"import pylab as pl\n",
	"from matplotlib import offsetbox\n",
	"from sklearn import (manifold, datasets, decomposition, ensemble, lda,\n",
	" random_projection)\n",
	"from mpld3 import enable_notebook\n",
	"from mpld3 import plugins\n",
	"enable_notebook()"
	],
	"language": "python",
	"metadata": {},
	"outputs": [],
	"prompt_number": 22
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"# Plot D3 scatter\n",
	"def plot_d3(X, title=None):\n",
	" x_min, x_max = np.min(X, 0), np.max(X, 0)\n",
	" X = (X - x_min) / (x_max - x_min)\n",
	" \n",
	" fig, ax = plt.subplots(subplot_kw=dict(axisbg='#EEEEEE'))\n",
	" fig.set_figwidth(12)\n",
	" fig.set_figheight(8)\n",
	"\n",
	" scatter = ax.scatter(X[:,0],\n",
	" X[:,1],\n",
	" c=['red' if main_appData.iloc[i]['appLabel'] == 'unfair' else 'green' for i in range(X.shape[0])],\n",
	" s=75,\n",
	" alpha=0.2,\n",
	" cmap=plt.cm.jet)\n",
	" ax.grid(color='white', linestyle='solid')\n",
	"\n",
	" ax.set_title(\"Scatter Plot of Unfair/Fair Apps\", size=24)\n",
	"\n",
	" labels = [main_appData.iloc[i]['appName'].decode('ascii', 'replace') for i in range(X.shape[0])]\n",
	" tooltip = plugins.PointLabelTooltip(scatter, labels=labels)\n",
	" plugins.connect(fig, tooltip)"
	],
	"language": "python",
	"metadata": {},
	"outputs": [],
	"prompt_number": 23
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"print(\"Computing MDS embedding and plotting with D3\")\n",
	"clf = manifold.MDS(n_components=2, n_init=1, max_iter=100, verbose=1)\n",
	"t0 = time()\n",
	"\n",
	"X_mds = clf.fit_transform(X_scaled)\n",
	"print(\"Done. Stress: %f\" % clf.stress_)\n",
	"plot_d3(X_mds,\n",
	" \"MDS embedding of the digits (time %.2fs)\" %\n",
	" (time() - t0))"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	"Computing MDS embedding and plotting with D3\n",
	"breaking at iteration 99 with stress 810.282757972"
	]
	},
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	"\n",
	"Done. Stress: 810.282758\n"
	]
	},
	{
	"metadata": {},
	"output_type": "display_data",
	"text": [
	"<matplotlib.figure.Figure object at 0x10b08dc90>"
	]
	}
	],
	"prompt_number": 24
	}
	],
	"metadata": {}
	}
	]
	}
No results found