lschwetlick · February 28, 2020 17:19
diff --git a/sk1.ipynb b/sk1.ipynb
 {
  "cells": [
    {
      "cell_type": "markdown",
      "source": [
        "# Problems with PCA"
      ],
      "metadata": {
        "nteract": {
          "transient": {
            "deleting": false
          }
        }
      }
    },
    {
      "cell_type": "code",
      "source": [
        "import numpy as np\n",
        "from sklearn.decomposition import PCA\n",
        "import scipy as sp\n",
        "from sklearn.decomposition.pca import _assess_dimension\n",
        "from sklearn.decomposition.pca import _infer_dimension"
      ],
      "outputs": [],
      "execution_count": 104,
      "metadata": {
        "collapsed": false,
        "jupyter": {
          "source_hidden": false,
          "outputs_hidden": false
        },
        "nteract": {
          "transient": {
            "deleting": false
          }
        },
        "execution": {
          "iopub.status.busy": "2020-02-28T16:58:58.071Z",
          "iopub.execute_input": "2020-02-28T16:58:58.077Z",
          "iopub.status.idle": "2020-02-28T16:58:58.085Z",
          "shell.execute_reply": "2020-02-28T16:58:58.089Z"
        }
      }
    },
    {
      "cell_type": "markdown",
      "source": [
        "First, let's check out what `_assess_dimension` does. It uses the output of the singular value decomposition as input."
      ],
      "metadata": {
        "nteract": {
          "transient": {
            "deleting": false
          }
        }
      }
    },
    {
      "cell_type": "code",
      "source": [
        "a = np.random.randn(9, 6)\n",
        "b = np.ones((9, 6))\n",
        "c = np.zeros((9, 6))"
      ],
      "outputs": [],
      "execution_count": 105,
      "metadata": {
        "collapsed": false,
        "jupyter": {
          "source_hidden": false,
          "outputs_hidden": false
        },
        "nteract": {
          "transient": {
            "deleting": false
          }
        },
        "execution": {
          "iopub.status.busy": "2020-02-28T16:58:58.824Z",
          "iopub.execute_input": "2020-02-28T16:58:58.831Z",
          "iopub.status.idle": "2020-02-28T16:58:58.839Z",
          "shell.execute_reply": "2020-02-28T16:58:58.844Z"
        }
      }
    },
    {
      "cell_type": "markdown",
      "source": [
        "The first matrix is of full rank and gives us a list of 6 singular values."
      ],
      "metadata": {
        "nteract": {
          "transient": {
            "deleting": false
          }
        }
      }
    },
    {
      "cell_type": "code",
      "source": [
        "print(\"rank=\", np.linalg.matrix_rank(a))\n",
        "u1, s1, vh1 = np.linalg.svd(a, full_matrices=True)\n",
        "s1"
      ],
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "rank= 6\n"
          ]
        },
        {
          "output_type": "execute_result",
          "execution_count": 106,
          "data": {
            "text/plain": [
              "array([4.58644552, 4.27596537, 2.93635641, 2.26594033, 1.66861791,\n",
              "       1.0130635 ])"
            ]
          },
          "metadata": {}
        }
      ],
      "execution_count": 106,
      "metadata": {
        "collapsed": false,
        "jupyter": {
          "source_hidden": false,
          "outputs_hidden": false
        },
        "nteract": {
          "transient": {
            "deleting": false
          }
        },
        "execution": {
          "iopub.status.busy": "2020-02-28T16:58:59.976Z",
          "iopub.execute_input": "2020-02-28T16:58:59.981Z",
          "iopub.status.idle": "2020-02-28T16:58:59.994Z",
          "shell.execute_reply": "2020-02-28T16:58:59.998Z"
        }
      }
    },
    {
      "cell_type": "markdown",
      "source": [
        "The second matrix has no variance but is of rank 1. It still gives a list of 6 values, but 5 of the 6 elements are 0."
      ],
      "metadata": {
        "nteract": {
          "transient": {
            "deleting": false
          }
        }
      }
    },
    {
      "cell_type": "code",
      "source": [
        "print(\"rank=\", np.linalg.matrix_rank(b))\n",
        "u2, s2, vh2 = np.linalg.svd(b, full_matrices=True)\n",
        "s2"
      ],
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "rank= 1\n"
          ]
        },
        {
          "output_type": "execute_result",
          "execution_count": 107,
          "data": {
            "text/plain": [
              "array([7.34846923, 0.        , 0.        , 0.        , 0.        ,\n",
              "       0.        ])"
            ]
          },
          "metadata": {}
        }
      ],
      "execution_count": 107,
      "metadata": {
        "collapsed": false,
        "jupyter": {
          "source_hidden": false,
          "outputs_hidden": false
        },
        "nteract": {
          "transient": {
            "deleting": false
          }
        },
        "execution": {
          "iopub.status.busy": "2020-02-28T16:59:04.754Z",
          "iopub.execute_input": "2020-02-28T16:59:04.758Z",
          "iopub.status.idle": "2020-02-28T16:59:04.771Z",
          "shell.execute_reply": "2020-02-28T16:59:04.776Z"
        }
      }
    },
    {
      "cell_type": "markdown",
      "source": [
        "The third matrix has no variance and is of rank 0. It still gives a list of 6 values, but all 6 elements are 0."
      ],
      "metadata": {
        "nteract": {
          "transient": {
            "deleting": false
          }
        }
      }
    },
    {
      "cell_type": "code",
      "source": [
        "print(\"rank=\", np.linalg.matrix_rank(c))\n",
        "u3, s3, vh3 = np.linalg.svd(c, full_matrices=True)\n",
        "s3"
      ],
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "rank= 0\n"
          ]
        },
        {
          "output_type": "execute_result",
          "execution_count": 108,
          "data": {
            "text/plain": [
              "array([0., 0., 0., 0., 0., 0.])"
            ]
          },
          "metadata": {}
        }
      ],
      "execution_count": 108,
      "metadata": {
        "collapsed": false,
        "jupyter": {
          "source_hidden": false,
          "outputs_hidden": false
        },
        "nteract": {
          "transient": {
            "deleting": false
          }
        },
        "execution": {
          "iopub.status.busy": "2020-02-28T16:59:10.343Z",
          "iopub.execute_input": "2020-02-28T16:59:10.349Z",
          "iopub.status.idle": "2020-02-28T16:59:10.365Z",
          "shell.execute_reply": "2020-02-28T16:59:10.371Z"
        }
      }
    },
    {
      "cell_type": "markdown",
      "source": [
        "What does PCA's `_assess_dimension` do with this? It is capable of assessing the likelihood of each rank: 0, 1, 2, 3, 4, 5, and 6."
      ],
      "metadata": {
        "nteract": {
          "transient": {
            "deleting": false
          }
        }
      }
    },
    {
      "cell_type": "code",
      "source": [
        "for r in range(7):\n",
        "    print(r)\n",
        "    print(_assess_dimension(np.asarray(s1), r, 9, 6))"
      ],
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "0\n",
            "-26.794487496983052\n",
            "1\n",
            "-27.59120266903175\n",
            "2\n",
            "-29.04631199386823\n",
            "3\n",
            "-30.397847556807104\n",
            "4\n",
            "-31.69273382607098\n",
            "5\n",
            "-32.95111461773282\n",
            "6\n",
            "-33.8239355537562\n"
          ]
        }
      ],
      "execution_count": 109,
      "metadata": {
        "collapsed": false,
        "jupyter": {
          "source_hidden": false,
          "outputs_hidden": false
        },
        "nteract": {
          "transient": {
            "deleting": false
          }
        },
        "execution": {
          "iopub.status.busy": "2020-02-28T16:59:20.017Z",
          "iopub.execute_input": "2020-02-28T16:59:20.023Z",
          "iopub.status.idle": "2020-02-28T16:59:20.034Z",
          "shell.execute_reply": "2020-02-28T16:59:20.039Z"
        }
      }
    },
    {
      "cell_type": "code",
      "source": [
        "for r in range(7):\n",
        "    print(r)\n",
        "    print(_assess_dimension(np.asarray(s2), r, 9, 6))"
      ],
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "0\n",
            "-4.554840426255549\n",
            "1\n",
            "-inf\n",
            "2\n",
            "-inf\n",
            "3\n",
            "-inf\n",
            "4\n",
            "-inf\n",
            "5\n",
            "-inf\n",
            "6\n",
            "nan\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "/Users/lisa/Documents/Projects/scikit-learn/sklearn/decomposition/_pca.py:68: RuntimeWarning: divide by zero encountered in log\n",
            "  pl = np.sum(np.log(spectrum[:rank]))\n",
            "/Users/lisa/Documents/Projects/scikit-learn/sklearn/decomposition/_pca.py:98: RuntimeWarning: divide by zero encountered in double_scalars\n",
            "  (1. / spectrum_[j] - 1. / spectrum_[i])) + log(n_samples)\n",
            "/Users/lisa/Documents/Projects/scikit-learn/sklearn/decomposition/_pca.py:100: RuntimeWarning: invalid value encountered in double_scalars\n",
            "  ll = pu + pl + pv + pp - pa / 2. - rank * log(n_samples) / 2.\n"
          ]
        }
      ],
      "execution_count": 110,
      "metadata": {
        "collapsed": false,
        "jupyter": {
          "source_hidden": false,
          "outputs_hidden": false
        },
        "nteract": {
          "transient": {
            "deleting": false
          }
        },
        "execution": {
          "iopub.status.busy": "2020-02-28T16:59:20.781Z",
          "iopub.execute_input": "2020-02-28T16:59:20.786Z",
          "iopub.status.idle": "2020-02-28T16:59:20.799Z",
          "shell.execute_reply": "2020-02-28T16:59:20.804Z"
        }
      }
    },
    {
      "cell_type": "code",
      "source": [
        "for r in range(7):\n",
        "    print(r)\n",
        "    print(_assess_dimension(np.asarray(s3), r, 9, 6))"
      ],
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "0\n",
            "-inf\n",
            "1\n",
            "-inf\n",
            "2\n",
            "-inf\n",
            "3\n",
            "-inf\n",
            "4\n",
            "-inf\n",
            "5\n",
            "-inf\n",
            "6\n",
            "inf\n"
          ]
        }
      ],
      "execution_count": 111,
      "metadata": {
        "collapsed": false,
        "jupyter": {
          "source_hidden": false,
          "outputs_hidden": false
        },
        "nteract": {
          "transient": {
            "deleting": false
          }
        },
        "execution": {
          "iopub.status.busy": "2020-02-28T16:59:27.300Z",
          "iopub.execute_input": "2020-02-28T16:59:27.306Z",
          "iopub.status.idle": "2020-02-28T16:59:27.317Z",
          "shell.execute_reply": "2020-02-28T16:59:27.323Z"
        }
      }
    },
    {
      "cell_type": "markdown",
      "source": [
        "But it does show strange behavior at full rank for both of the edge-case matrices (all ones and all zeros), with the 7th value (rank 6) being nan and +inf, respectively."
      ],
      "metadata": {
        "nteract": {
          "transient": {
            "deleting": false
          }
        }
      }
    },
    {
      "cell_type": "markdown",
      "source": [
        "Here are some more interesting cases that don't work."
      ],
      "metadata": {
        "nteract": {
          "transient": {
            "deleting": false
          }
        }
      }
    },
    {
      "cell_type": "code",
      "source": [
        "d = np.zeros((9,6))\n",
        "d[:,2]=1\n",
        "d[:,5]=5\n",
        "print(\"rank=\", np.linalg.matrix_rank(d))\n",
        "u4, s4, vh4 = np.linalg.svd(d, full_matrices=True)\n",
        "s4\n",
        "d"
      ],
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "rank= 1\n"
          ]
        },
        {
          "output_type": "execute_result",
          "execution_count": 118,
          "data": {
            "text/plain": [
              "array([[0., 0., 1., 0., 0., 5.],\n",
              "       [0., 0., 1., 0., 0., 5.],\n",
              "       [0., 0., 1., 0., 0., 5.],\n",
              "       [0., 0., 1., 0., 0., 5.],\n",
              "       [0., 0., 1., 0., 0., 5.],\n",
              "       [0., 0., 1., 0., 0., 5.],\n",
              "       [0., 0., 1., 0., 0., 5.],\n",
              "       [0., 0., 1., 0., 0., 5.],\n",
              "       [0., 0., 1., 0., 0., 5.]])"
            ]
          },
          "metadata": {}
        }
      ],
      "execution_count": 118,
      "metadata": {
        "collapsed": false,
        "jupyter": {
          "source_hidden": false,
          "outputs_hidden": false
        },
        "nteract": {
          "transient": {
            "deleting": false
          }
        },
        "execution": {
          "iopub.status.busy": "2020-02-28T17:00:47.965Z",
          "iopub.execute_input": "2020-02-28T17:00:47.970Z",
          "iopub.status.idle": "2020-02-28T17:00:47.982Z",
          "shell.execute_reply": "2020-02-28T17:00:47.987Z"
        }
      }
    },
    {
      "cell_type": "code",
      "source": [
        "for r in range(7):\n",
        "    print(r)\n",
        "    print(_assess_dimension(np.asarray(s4), r, 9, 6))"
      ],
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "0\n",
            "-24.35039085496681\n",
            "1\n",
            "-inf\n",
            "2\n",
            "-inf\n",
            "3\n",
            "-inf\n",
            "4\n",
            "-inf\n",
            "5\n",
            "-inf\n",
            "6\n"
          ]
        },
        {
          "output_type": "error",
          "ename": "ValueError",
          "evalue": "math domain error",
          "traceback": [
            "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
            "\u001b[0;31mValueError\u001b[0m                                Traceback (most recent call last)",
            "\u001b[0;32m<ipython-input-113-e282dbdd7bb2>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mr\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mrange\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m7\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      2\u001b[0m     \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mr\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m     \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0m_assess_dimension\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0masarray\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ms4\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mr\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m9\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m6\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
            "\u001b[0;32m~/Documents/Projects/scikit-learn/sklearn/decomposition/_pca.py\u001b[0m in \u001b[0;36m_assess_dimension\u001b[0;34m(spectrum, rank, n_samples, n_features)\u001b[0m\n\u001b[1;32m     96\u001b[0m         \u001b[0;32mfor\u001b[0m \u001b[0mj\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mrange\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mi\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mspectrum\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     97\u001b[0m             pa += log((spectrum[i] - spectrum[j]) *\n\u001b[0;32m---> 98\u001b[0;31m                       (1. / spectrum_[j] - 1. / spectrum_[i])) + log(n_samples)\n\u001b[0m\u001b[1;32m     99\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    100\u001b[0m     \u001b[0mll\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpu\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mpl\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mpv\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mpp\u001b[0m \u001b[0;34m-\u001b[0m \u001b[0mpa\u001b[0m \u001b[0;34m/\u001b[0m \u001b[0;36m2.\u001b[0m \u001b[0;34m-\u001b[0m \u001b[0mrank\u001b[0m \u001b[0;34m*\u001b[0m \u001b[0mlog\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mn_samples\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m/\u001b[0m \u001b[0;36m2.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
            "\u001b[0;31mValueError\u001b[0m: math domain error"
          ]
        }
      ],
      "execution_count": 113,
      "metadata": {
        "collapsed": false,
        "jupyter": {
          "source_hidden": false,
          "outputs_hidden": false
        },
        "nteract": {
          "transient": {
            "deleting": false
          }
        },
        "execution": {
          "iopub.status.busy": "2020-02-28T16:59:33.931Z",
          "iopub.execute_input": "2020-02-28T16:59:33.936Z",
          "iopub.status.idle": "2020-02-28T16:59:33.951Z",
          "shell.execute_reply": "2020-02-28T16:59:33.965Z"
        }
      }
    },
    {
      "cell_type": "markdown",
      "source": [
        "One step higher, `_infer_dimension` is the function that does this iteration. It never tests full rank. It also kind of can't, because it is meant to return an index into the spectrum vector, which is of length 6. If we test full rank, the `ll` vector will be of length 7, so its argmax may no longer be a valid index into the spectrum."
      ],
      "metadata": {
        "nteract": {
          "transient": {
            "deleting": false
          }
        }
      }
    },
    {
      "cell_type": "code",
      "source": [
        "_infer_dimension(s1, 9, 6)"
      ],
      "outputs": [
        {
          "output_type": "execute_result",
          "execution_count": 114,
          "data": {
            "text/plain": [
              "0"
            ]
          },
          "metadata": {}
        }
      ],
      "execution_count": 114,
      "metadata": {
        "collapsed": false,
        "jupyter": {
          "source_hidden": false,
          "outputs_hidden": false
        },
        "nteract": {
          "transient": {
            "deleting": false
          }
        },
        "execution": {
          "iopub.status.busy": "2020-02-28T16:59:41.188Z",
          "iopub.execute_input": "2020-02-28T16:59:41.194Z",
          "iopub.status.idle": "2020-02-28T16:59:41.204Z",
          "shell.execute_reply": "2020-02-28T16:59:41.209Z"
        }
      }
    },
    {
      "cell_type": "code",
      "source": [
        "_infer_dimension(s2, 9, 6)"
      ],
      "outputs": [
        {
          "output_type": "execute_result",
          "execution_count": 115,
          "data": {
            "text/plain": [
              "0"
            ]
          },
          "metadata": {}
        }
      ],
      "execution_count": 115,
      "metadata": {
        "collapsed": false,
        "jupyter": {
          "source_hidden": false,
          "outputs_hidden": false
        },
        "nteract": {
          "transient": {
            "deleting": false
          }
        },
        "execution": {
          "iopub.status.busy": "2020-02-28T16:59:41.718Z",
          "iopub.execute_input": "2020-02-28T16:59:41.723Z",
          "iopub.status.idle": "2020-02-28T16:59:41.732Z",
          "shell.execute_reply": "2020-02-28T16:59:41.737Z"
        }
      }
    },
    {
      "cell_type": "code",
      "source": [
        "_infer_dimension(s3, 9, 6)"
      ],
      "outputs": [
        {
          "output_type": "execute_result",
          "execution_count": 116,
          "data": {
            "text/plain": [
              "0"
            ]
          },
          "metadata": {}
        }
      ],
      "execution_count": 116,
      "metadata": {
        "collapsed": false,
        "jupyter": {
          "source_hidden": false,
          "outputs_hidden": false
        },
        "nteract": {
          "transient": {
            "deleting": false
          }
        },
        "execution": {
          "iopub.status.busy": "2020-02-28T16:59:47.292Z",
          "iopub.execute_input": "2020-02-28T16:59:47.297Z",
          "iopub.status.idle": "2020-02-28T16:59:47.308Z",
          "shell.execute_reply": "2020-02-28T16:59:47.313Z"
        }
      }
    },
    {
      "cell_type": "code",
      "source": [
        "_infer_dimension(s4, 9, 6)"
      ],
      "outputs": [
        {
          "output_type": "execute_result",
          "execution_count": 117,
          "data": {
            "text/plain": [
              "0"
            ]
          },
          "metadata": {}
        }
      ],
      "execution_count": 117,
      "metadata": {
        "collapsed": false,
        "jupyter": {
          "source_hidden": false,
          "outputs_hidden": false
        },
        "nteract": {
          "transient": {
            "deleting": false
          }
        },
        "execution": {
          "iopub.status.busy": "2020-02-28T16:59:48.033Z",
          "iopub.execute_input": "2020-02-28T16:59:48.040Z",
          "iopub.status.idle": "2020-02-28T16:59:48.051Z",
          "shell.execute_reply": "2020-02-28T16:59:48.056Z"
        }
      }
    },
    {
      "cell_type": "markdown",
      "source": [
        "I assume it's always returning zero here because the spectrum comes sorted in descending order (by `np.linalg.svd`) and all of these matrices have rank 0, rank 1, or full rank."
      ],
      "metadata": {
        "nteract": {
          "transient": {
            "deleting": false
          }
        }
      }
    },
    {
      "cell_type": "markdown",
      "source": [
        "So next I'll construct a matrix that has the largest variation in the last column. In my understanding, that would mean that the last column should be the main component."
      ],
      "metadata": {
        "nteract": {
          "transient": {
            "deleting": false
          }
        }
      }
    },
    {
      "cell_type": "code",
      "source": [
        "e = np.zeros((9,6))\n",
        "e[:,0]=1\n",
        "e[:,1]=1\n",
        "e[:,2]=1\n",
        "e[:,3]=2\n",
        "e[:,4]=[1,2,1,1,1,1,1,1,2]\n",
        "e[:,5]=[1,2,3,4,5,6,7,8,9]\n",
        "print(\"rank =\", np.linalg.matrix_rank(e))\n",
        "u5, s5, vh5 = np.linalg.svd(e, full_matrices=True)\n",
        "s5"
      ],
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "rank = 3\n"
          ]
        },
        {
          "output_type": "execute_result",
          "execution_count": 95,
          "data": {
            "text/plain": [
              "array([1.86723970e+01, 3.61449172e+00, 1.13006188e+00, 3.96930995e-16,\n",
              "       4.51401869e-17, 2.22329483e-17])"
            ]
          },
          "metadata": {}
        }
      ],
      "execution_count": 95,
      "metadata": {
        "collapsed": false,
        "jupyter": {
          "source_hidden": false,
          "outputs_hidden": false
        },
        "nteract": {
          "transient": {
            "deleting": false
          }
        },
        "execution": {
          "iopub.status.busy": "2020-02-28T16:55:06.091Z",
          "iopub.execute_input": "2020-02-28T16:55:06.097Z",
          "iopub.status.idle": "2020-02-28T16:55:06.113Z",
          "shell.execute_reply": "2020-02-28T16:55:06.118Z"
        }
      }
    },
    {
      "cell_type": "code",
      "source": [
        "ll = []\n",
        "for r in range(7):\n",
        "    print(r)\n",
        "    ll1 = _assess_dimension(np.asarray(s5), r, 9, 6)\n",
        "    print(ll1)\n",
        "    ll.append(ll1)"
      ],
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "0\n",
            "-35.846979706242\n",
            "1\n",
            "-22.764631367309146\n",
            "2\n",
            "-17.637601080831697\n",
            "3\n",
            "-inf\n",
            "4\n",
            "-inf\n",
            "5\n",
            "-inf\n",
            "6\n",
            "287.3210868655406\n"
          ]
        }
      ],
      "execution_count": 120,
      "metadata": {
        "collapsed": false,
        "jupyter": {
          "source_hidden": false,
          "outputs_hidden": false
        },
        "nteract": {
          "transient": {
            "deleting": false
          }
        },
        "execution": {
          "iopub.status.busy": "2020-02-28T17:17:16.479Z",
          "iopub.execute_input": "2020-02-28T17:17:16.485Z",
          "iopub.status.idle": "2020-02-28T17:17:16.496Z",
          "shell.execute_reply": "2020-02-28T17:17:16.501Z"
        }
      }
    },
    {
      "cell_type": "markdown",
      "source": [
        "Here the last value (the rank we never test) clearly has the biggest number. `_infer_dimension` returns 2, though."
      ],
      "metadata": {
        "nteract": {
          "transient": {
            "deleting": false
          }
        }
      }
    },
    {
      "cell_type": "code",
      "source": [
        "_infer_dimension(s5, 9, 6)"
      ],
      "outputs": [
        {
          "output_type": "execute_result",
          "execution_count": 98,
          "data": {
            "text/plain": [
              "2"
            ]
          },
          "metadata": {}
        }
      ],
      "execution_count": 98,
      "metadata": {
        "collapsed": false,
        "jupyter": {
          "source_hidden": false,
          "outputs_hidden": false
        },
        "nteract": {
          "transient": {
            "deleting": false
          }
        },
        "execution": {
          "iopub.status.busy": "2020-02-28T16:56:30.021Z",
          "iopub.execute_input": "2020-02-28T16:56:30.027Z",
          "iopub.status.idle": "2020-02-28T16:56:30.041Z",
          "shell.execute_reply": "2020-02-28T16:56:30.046Z"
        }
      }
    },
    {
      "cell_type": "markdown",
      "source": [
        "The trouble is that even if we do test for the last rank, we can't really use the argmax as an index into the spectrum, because we are then testing one rank for every value in the spectrum AND rank 0 (7 candidates for 6 values)."
      ],
      "metadata": {
        "nteract": {
          "transient": {
            "deleting": false
          }
        }
      }
    },
    {
      "cell_type": "code",
      "source": [
        "print(np.argmax(ll))\n",
        "print(len(s5))\n",
        "s5[np.argmax(ll)]\n"
      ],
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "6\n",
            "6\n"
          ]
        },
        {
          "output_type": "error",
          "ename": "IndexError",
          "evalue": "index 6 is out of bounds for axis 0 with size 6",
          "traceback": [
            "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
            "\u001b[0;31mIndexError\u001b[0m                                Traceback (most recent call last)",
            "\u001b[0;32m<ipython-input-122-368585a97f86>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0margmax\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mll\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      2\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ms5\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0ms5\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0margmax\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mll\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
            "\u001b[0;31mIndexError\u001b[0m: index 6 is out of bounds for axis 0 with size 6"
          ]
        }
      ],
      "execution_count": 122,
      "metadata": {
        "collapsed": false,
        "jupyter": {
          "source_hidden": false,
          "outputs_hidden": false
        },
        "nteract": {
          "transient": {
            "deleting": false
          }
        },
        "execution": {
          "iopub.status.busy": "2020-02-28T17:17:52.521Z",
          "iopub.execute_input": "2020-02-28T17:17:52.527Z",
          "iopub.status.idle": "2020-02-28T17:17:52.542Z",
          "shell.execute_reply": "2020-02-28T17:17:52.548Z"
        }
      }
    },
    {
      "cell_type": "code",
      "source": [
        "pca = PCA(n_components=\"mle\")\n",
        "my_fit = pca.fit(e)\n",
        "#print(pca.explained_variance_ratio_)\n",
        "#print(pca.singular_values_)\n",
        "print(my_fit)"
      ],
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "PCA(copy=True, iterated_power='auto', n_components='mle', random_state=None,\n",
            "    svd_solver='auto', tol=0.0, whiten=False)\n"
          ]
        }
      ],
      "execution_count": 103,
      "metadata": {
        "collapsed": false,
        "jupyter": {
          "source_hidden": false,
          "outputs_hidden": false
        },
        "nteract": {
          "transient": {
            "deleting": false
          }
        },
        "execution": {
          "iopub.status.busy": "2020-02-28T16:57:30.462Z",
          "iopub.execute_input": "2020-02-28T16:57:30.469Z",
          "iopub.status.idle": "2020-02-28T16:57:30.480Z",
          "shell.execute_reply": "2020-02-28T16:57:30.486Z"
        }
      }
    }
  ],
  "metadata": {
    "kernel_info": {
      "name": "scenewalk"
    },
    "language_info": {
      "name": "python",
      "version": "3.6.3",
      "mimetype": "text/x-python",
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "pygments_lexer": "ipython3",
      "nbconvert_exporter": "python",
      "file_extension": ".py"
    },
    "kernelspec": {
      "argv": [
        "/Users/lisa/Documents/virtual_envs/scenewalk/bin/python3",
        "-m",
        "ipykernel_launcher",
        "-f",
        "{connection_file}"
      ],
      "display_name": "SceneWalk",
      "language": "python",
      "name": "scenewalk"
    },
    "nteract": {
      "version": "0.22.0"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
 }