Last active
February 28, 2020 17:19
-
-
Save lschwetlick/c7683bb5437f0b15e3ca40f5f2674927 to your computer and use it in GitHub Desktop.
Scikit-learn problem with PCA
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"# Problems with PCA" | |
], | |
"metadata": { | |
"nteract": { | |
"transient": { | |
"deleting": false | |
} | |
} | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"import numpy as np\n", | |
"from sklearn.decomposition import PCA\n", | |
"import scipy as sp\n", | |
"from sklearn.decomposition.pca import _assess_dimension\n", | |
"from sklearn.decomposition.pca import _infer_dimension" | |
], | |
"outputs": [], | |
"execution_count": 104, | |
"metadata": { | |
"collapsed": false, | |
"jupyter": { | |
"source_hidden": false, | |
"outputs_hidden": false | |
}, | |
"nteract": { | |
"transient": { | |
"deleting": false | |
} | |
}, | |
"execution": { | |
"iopub.status.busy": "2020-02-28T16:58:58.071Z", | |
"iopub.execute_input": "2020-02-28T16:58:58.077Z", | |
"iopub.status.idle": "2020-02-28T16:58:58.085Z", | |
"shell.execute_reply": "2020-02-28T16:58:58.089Z" | |
} | |
} | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"First, let's check out what `_assess_dimension` does. It uses the output of the singular value decomposition as input." | |
], | |
"metadata": { | |
"nteract": { | |
"transient": { | |
"deleting": false | |
} | |
} | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"a = np.random.randn(9, 6)\n", | |
"b = np.ones((9, 6))\n", | |
"c = np.zeros((9, 6))" | |
], | |
"outputs": [], | |
"execution_count": 105, | |
"metadata": { | |
"collapsed": false, | |
"jupyter": { | |
"source_hidden": false, | |
"outputs_hidden": false | |
}, | |
"nteract": { | |
"transient": { | |
"deleting": false | |
} | |
}, | |
"execution": { | |
"iopub.status.busy": "2020-02-28T16:58:58.824Z", | |
"iopub.execute_input": "2020-02-28T16:58:58.831Z", | |
"iopub.status.idle": "2020-02-28T16:58:58.839Z", | |
"shell.execute_reply": "2020-02-28T16:58:58.844Z" | |
} | |
} | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"The first matrix is of full rank and gives us a list of 6 nonzero singular values." | |
], | |
"metadata": { | |
"nteract": { | |
"transient": { | |
"deleting": false | |
} | |
} | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"print(\"rank=\", np.linalg.matrix_rank(a))\n", | |
"u1, s1, vh1 = np.linalg.svd(a, full_matrices=True)\n", | |
"s1" | |
], | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"rank= 6\n" | |
] | |
}, | |
{ | |
"output_type": "execute_result", | |
"execution_count": 106, | |
"data": { | |
"text/plain": [ | |
"array([4.58644552, 4.27596537, 2.93635641, 2.26594033, 1.66861791,\n", | |
" 1.0130635 ])" | |
] | |
}, | |
"metadata": {} | |
} | |
], | |
"execution_count": 106, | |
"metadata": { | |
"collapsed": false, | |
"jupyter": { | |
"source_hidden": false, | |
"outputs_hidden": false | |
}, | |
"nteract": { | |
"transient": { | |
"deleting": false | |
} | |
}, | |
"execution": { | |
"iopub.status.busy": "2020-02-28T16:58:59.976Z", | |
"iopub.execute_input": "2020-02-28T16:58:59.981Z", | |
"iopub.status.idle": "2020-02-28T16:58:59.994Z", | |
"shell.execute_reply": "2020-02-28T16:58:59.998Z" | |
} | |
} | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"The second matrix (all ones) has no variance but is of rank 1. It still gives a list of 6 singular values, but 5 of the 6 are 0." | |
], | |
"metadata": { | |
"nteract": { | |
"transient": { | |
"deleting": false | |
} | |
} | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"print(\"rank=\", np.linalg.matrix_rank(b))\n", | |
"u2, s2, vh2 = np.linalg.svd(b, full_matrices=True)\n", | |
"s2" | |
], | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"rank= 1\n" | |
] | |
}, | |
{ | |
"output_type": "execute_result", | |
"execution_count": 107, | |
"data": { | |
"text/plain": [ | |
"array([7.34846923, 0. , 0. , 0. , 0. ,\n", | |
" 0. ])" | |
] | |
}, | |
"metadata": {} | |
} | |
], | |
"execution_count": 107, | |
"metadata": { | |
"collapsed": false, | |
"jupyter": { | |
"source_hidden": false, | |
"outputs_hidden": false | |
}, | |
"nteract": { | |
"transient": { | |
"deleting": false | |
} | |
}, | |
"execution": { | |
"iopub.status.busy": "2020-02-28T16:59:04.754Z", | |
"iopub.execute_input": "2020-02-28T16:59:04.758Z", | |
"iopub.status.idle": "2020-02-28T16:59:04.771Z", | |
"shell.execute_reply": "2020-02-28T16:59:04.776Z" | |
} | |
} | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"The third matrix (all zeros) has no variance and is of rank 0. It still gives a list of 6 singular values, but all 6 of them are 0." | |
], | |
"metadata": { | |
"nteract": { | |
"transient": { | |
"deleting": false | |
} | |
} | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"print(\"rank=\", np.linalg.matrix_rank(c))\n", | |
"u3, s3, vh3 = np.linalg.svd(c, full_matrices=True)\n", | |
"s3" | |
], | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"rank= 0\n" | |
] | |
}, | |
{ | |
"output_type": "execute_result", | |
"execution_count": 108, | |
"data": { | |
"text/plain": [ | |
"array([0., 0., 0., 0., 0., 0.])" | |
] | |
}, | |
"metadata": {} | |
} | |
], | |
"execution_count": 108, | |
"metadata": { | |
"collapsed": false, | |
"jupyter": { | |
"source_hidden": false, | |
"outputs_hidden": false | |
}, | |
"nteract": { | |
"transient": { | |
"deleting": false | |
} | |
}, | |
"execution": { | |
"iopub.status.busy": "2020-02-28T16:59:10.343Z", | |
"iopub.execute_input": "2020-02-28T16:59:10.349Z", | |
"iopub.status.idle": "2020-02-28T16:59:10.365Z", | |
"shell.execute_reply": "2020-02-28T16:59:10.371Z" | |
} | |
} | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"What does PCA's `_assess_dimension` do with this? It can assess the log-likelihood of each candidate rank: 0, 1, 2, 3, 4, 5, and 6." | |
], | |
"metadata": { | |
"nteract": { | |
"transient": { | |
"deleting": false | |
} | |
} | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"for r in range(7):\n", | |
" print(r)\n", | |
" print(_assess_dimension(np.asarray(s1), r, 9, 6))" | |
], | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"0\n", | |
"-26.794487496983052\n", | |
"1\n", | |
"-27.59120266903175\n", | |
"2\n", | |
"-29.04631199386823\n", | |
"3\n", | |
"-30.397847556807104\n", | |
"4\n", | |
"-31.69273382607098\n", | |
"5\n", | |
"-32.95111461773282\n", | |
"6\n", | |
"-33.8239355537562\n" | |
] | |
} | |
], | |
"execution_count": 109, | |
"metadata": { | |
"collapsed": false, | |
"jupyter": { | |
"source_hidden": false, | |
"outputs_hidden": false | |
}, | |
"nteract": { | |
"transient": { | |
"deleting": false | |
} | |
}, | |
"execution": { | |
"iopub.status.busy": "2020-02-28T16:59:20.017Z", | |
"iopub.execute_input": "2020-02-28T16:59:20.023Z", | |
"iopub.status.idle": "2020-02-28T16:59:20.034Z", | |
"shell.execute_reply": "2020-02-28T16:59:20.039Z" | |
} | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"for r in range(7):\n", | |
" print(r)\n", | |
" print(_assess_dimension(np.asarray(s2), r, 9, 6))" | |
], | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"0\n", | |
"-4.554840426255549\n", | |
"1\n", | |
"-inf\n", | |
"2\n", | |
"-inf\n", | |
"3\n", | |
"-inf\n", | |
"4\n", | |
"-inf\n", | |
"5\n", | |
"-inf\n", | |
"6\n", | |
"nan\n" | |
] | |
}, | |
{ | |
"output_type": "stream", | |
"name": "stderr", | |
"text": [ | |
"/Users/lisa/Documents/Projects/scikit-learn/sklearn/decomposition/_pca.py:68: RuntimeWarning: divide by zero encountered in log\n", | |
" pl = np.sum(np.log(spectrum[:rank]))\n", | |
"/Users/lisa/Documents/Projects/scikit-learn/sklearn/decomposition/_pca.py:98: RuntimeWarning: divide by zero encountered in double_scalars\n", | |
" (1. / spectrum_[j] - 1. / spectrum_[i])) + log(n_samples)\n", | |
"/Users/lisa/Documents/Projects/scikit-learn/sklearn/decomposition/_pca.py:100: RuntimeWarning: invalid value encountered in double_scalars\n", | |
" ll = pu + pl + pv + pp - pa / 2. - rank * log(n_samples) / 2.\n" | |
] | |
} | |
], | |
"execution_count": 110, | |
"metadata": { | |
"collapsed": false, | |
"jupyter": { | |
"source_hidden": false, | |
"outputs_hidden": false | |
}, | |
"nteract": { | |
"transient": { | |
"deleting": false | |
} | |
}, | |
"execution": { | |
"iopub.status.busy": "2020-02-28T16:59:20.781Z", | |
"iopub.execute_input": "2020-02-28T16:59:20.786Z", | |
"iopub.status.idle": "2020-02-28T16:59:20.799Z", | |
"shell.execute_reply": "2020-02-28T16:59:20.804Z" | |
} | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"for r in range(7):\n", | |
" print(r)\n", | |
" print(_assess_dimension(np.asarray(s3), r, 9, 6))" | |
], | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"0\n", | |
"-inf\n", | |
"1\n", | |
"-inf\n", | |
"2\n", | |
"-inf\n", | |
"3\n", | |
"-inf\n", | |
"4\n", | |
"-inf\n", | |
"5\n", | |
"-inf\n", | |
"6\n", | |
"inf\n" | |
] | |
} | |
], | |
"execution_count": 111, | |
"metadata": { | |
"collapsed": false, | |
"jupyter": { | |
"source_hidden": false, | |
"outputs_hidden": false | |
}, | |
"nteract": { | |
"transient": { | |
"deleting": false | |
} | |
}, | |
"execution": { | |
"iopub.status.busy": "2020-02-28T16:59:27.300Z", | |
"iopub.execute_input": "2020-02-28T16:59:27.306Z", | |
"iopub.status.idle": "2020-02-28T16:59:27.317Z", | |
"shell.execute_reply": "2020-02-28T16:59:27.323Z" | |
} | |
} | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"But it does show strange behavior at full rank on both of the edge-case matrices: the 7th value (rank 6) is nan for the all-ones matrix and +inf for the all-zeros matrix." | |
], | |
"metadata": { | |
"nteract": { | |
"transient": { | |
"deleting": false | |
} | |
} | |
} | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"Here are some more interesting cases that don't work." | |
], | |
"metadata": { | |
"nteract": { | |
"transient": { | |
"deleting": false | |
} | |
} | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"d = np.zeros((9,6))\n", | |
"d[:,2]=1\n", | |
"d[:,5]=5\n", | |
"print(\"rank=\", np.linalg.matrix_rank(d))\n", | |
"u4, s4, vh4 = np.linalg.svd(d, full_matrices=True)\n", | |
"s4\n", | |
"d" | |
], | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"rank= 1\n" | |
] | |
}, | |
{ | |
"output_type": "execute_result", | |
"execution_count": 118, | |
"data": { | |
"text/plain": [ | |
"array([[0., 0., 1., 0., 0., 5.],\n", | |
" [0., 0., 1., 0., 0., 5.],\n", | |
" [0., 0., 1., 0., 0., 5.],\n", | |
" [0., 0., 1., 0., 0., 5.],\n", | |
" [0., 0., 1., 0., 0., 5.],\n", | |
" [0., 0., 1., 0., 0., 5.],\n", | |
" [0., 0., 1., 0., 0., 5.],\n", | |
" [0., 0., 1., 0., 0., 5.],\n", | |
" [0., 0., 1., 0., 0., 5.]])" | |
] | |
}, | |
"metadata": {} | |
} | |
], | |
"execution_count": 118, | |
"metadata": { | |
"collapsed": false, | |
"jupyter": { | |
"source_hidden": false, | |
"outputs_hidden": false | |
}, | |
"nteract": { | |
"transient": { | |
"deleting": false | |
} | |
}, | |
"execution": { | |
"iopub.status.busy": "2020-02-28T17:00:47.965Z", | |
"iopub.execute_input": "2020-02-28T17:00:47.970Z", | |
"iopub.status.idle": "2020-02-28T17:00:47.982Z", | |
"shell.execute_reply": "2020-02-28T17:00:47.987Z" | |
} | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"for r in range(7):\n", | |
" print(r)\n", | |
" print(_assess_dimension(np.asarray(s4), r, 9, 6))" | |
], | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"0\n", | |
"-24.35039085496681\n", | |
"1\n", | |
"-inf\n", | |
"2\n", | |
"-inf\n", | |
"3\n", | |
"-inf\n", | |
"4\n", | |
"-inf\n", | |
"5\n", | |
"-inf\n", | |
"6\n" | |
] | |
}, | |
{ | |
"output_type": "error", | |
"ename": "ValueError", | |
"evalue": "math domain error", | |
"traceback": [ | |
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", | |
"\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", | |
"\u001b[0;32m<ipython-input-113-e282dbdd7bb2>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mr\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mrange\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m7\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mr\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0m_assess_dimension\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0masarray\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ms4\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mr\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m9\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m6\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", | |
"\u001b[0;32m~/Documents/Projects/scikit-learn/sklearn/decomposition/_pca.py\u001b[0m in \u001b[0;36m_assess_dimension\u001b[0;34m(spectrum, rank, n_samples, n_features)\u001b[0m\n\u001b[1;32m 96\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mj\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mrange\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mi\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mspectrum\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 97\u001b[0m pa += log((spectrum[i] - spectrum[j]) *\n\u001b[0;32m---> 98\u001b[0;31m (1. / spectrum_[j] - 1. / spectrum_[i])) + log(n_samples)\n\u001b[0m\u001b[1;32m 99\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 100\u001b[0m \u001b[0mll\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpu\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mpl\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mpv\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mpp\u001b[0m \u001b[0;34m-\u001b[0m \u001b[0mpa\u001b[0m \u001b[0;34m/\u001b[0m \u001b[0;36m2.\u001b[0m \u001b[0;34m-\u001b[0m \u001b[0mrank\u001b[0m \u001b[0;34m*\u001b[0m \u001b[0mlog\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mn_samples\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m/\u001b[0m \u001b[0;36m2.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", | |
"\u001b[0;31mValueError\u001b[0m: math domain error" | |
] | |
} | |
], | |
"execution_count": 113, | |
"metadata": { | |
"collapsed": false, | |
"jupyter": { | |
"source_hidden": false, | |
"outputs_hidden": false | |
}, | |
"nteract": { | |
"transient": { | |
"deleting": false | |
} | |
}, | |
"execution": { | |
"iopub.status.busy": "2020-02-28T16:59:33.931Z", | |
"iopub.execute_input": "2020-02-28T16:59:33.936Z", | |
"iopub.status.idle": "2020-02-28T16:59:33.951Z", | |
"shell.execute_reply": "2020-02-28T16:59:33.965Z" | |
} | |
} | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"One step higher, `_infer_dimension` is the function that does this iteration. It never tests full rank. It also kind of can't, because it is meant to return an index into the spectrum vector, which is of length 6. If we test full rank, the `ll` vector will be of length 7, so its argmax can be 6, which is not a valid index into the spectrum." | |
], | |
"metadata": { | |
"nteract": { | |
"transient": { | |
"deleting": false | |
} | |
} | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"_infer_dimension(s1, 9, 6)" | |
], | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"execution_count": 114, | |
"data": { | |
"text/plain": [ | |
"0" | |
] | |
}, | |
"metadata": {} | |
} | |
], | |
"execution_count": 114, | |
"metadata": { | |
"collapsed": false, | |
"jupyter": { | |
"source_hidden": false, | |
"outputs_hidden": false | |
}, | |
"nteract": { | |
"transient": { | |
"deleting": false | |
} | |
}, | |
"execution": { | |
"iopub.status.busy": "2020-02-28T16:59:41.188Z", | |
"iopub.execute_input": "2020-02-28T16:59:41.194Z", | |
"iopub.status.idle": "2020-02-28T16:59:41.204Z", | |
"shell.execute_reply": "2020-02-28T16:59:41.209Z" | |
} | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"_infer_dimension(s2, 9, 6)" | |
], | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"execution_count": 115, | |
"data": { | |
"text/plain": [ | |
"0" | |
] | |
}, | |
"metadata": {} | |
} | |
], | |
"execution_count": 115, | |
"metadata": { | |
"collapsed": false, | |
"jupyter": { | |
"source_hidden": false, | |
"outputs_hidden": false | |
}, | |
"nteract": { | |
"transient": { | |
"deleting": false | |
} | |
}, | |
"execution": { | |
"iopub.status.busy": "2020-02-28T16:59:41.718Z", | |
"iopub.execute_input": "2020-02-28T16:59:41.723Z", | |
"iopub.status.idle": "2020-02-28T16:59:41.732Z", | |
"shell.execute_reply": "2020-02-28T16:59:41.737Z" | |
} | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"_infer_dimension(s3, 9, 6)" | |
], | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"execution_count": 116, | |
"data": { | |
"text/plain": [ | |
"0" | |
] | |
}, | |
"metadata": {} | |
} | |
], | |
"execution_count": 116, | |
"metadata": { | |
"collapsed": false, | |
"jupyter": { | |
"source_hidden": false, | |
"outputs_hidden": false | |
}, | |
"nteract": { | |
"transient": { | |
"deleting": false | |
} | |
}, | |
"execution": { | |
"iopub.status.busy": "2020-02-28T16:59:47.292Z", | |
"iopub.execute_input": "2020-02-28T16:59:47.297Z", | |
"iopub.status.idle": "2020-02-28T16:59:47.308Z", | |
"shell.execute_reply": "2020-02-28T16:59:47.313Z" | |
} | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"_infer_dimension(s4, 9, 6)" | |
], | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"execution_count": 117, | |
"data": { | |
"text/plain": [ | |
"0" | |
] | |
}, | |
"metadata": {} | |
} | |
], | |
"execution_count": 117, | |
"metadata": { | |
"collapsed": false, | |
"jupyter": { | |
"source_hidden": false, | |
"outputs_hidden": false | |
}, | |
"nteract": { | |
"transient": { | |
"deleting": false | |
} | |
}, | |
"execution": { | |
"iopub.status.busy": "2020-02-28T16:59:48.033Z", | |
"iopub.execute_input": "2020-02-28T16:59:48.040Z", | |
"iopub.status.idle": "2020-02-28T16:59:48.051Z", | |
"shell.execute_reply": "2020-02-28T16:59:48.056Z" | |
} | |
} | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"I assume it always returns zero here because the spectrum comes sorted in descending order from `np.linalg.svd`, and all of these matrices have rank 0, rank 1, or full rank." | |
], | |
"metadata": { | |
"nteract": { | |
"transient": { | |
"deleting": false | |
} | |
} | |
} | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"So next I'll construct a matrix that has the largest variation in the last column. In my understanding, that column should then be the main component." | |
], | |
"metadata": { | |
"nteract": { | |
"transient": { | |
"deleting": false | |
} | |
} | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"e = np.zeros((9,6))\n", | |
"e[:,0]=1\n", | |
"e[:,1]=1\n", | |
"e[:,2]=1\n", | |
"e[:,3]=2\n", | |
"e[:,4]=[1,2,1,1,1,1,1,1,2]\n", | |
"e[:,5]=[1,2,3,4,5,6,7,8,9]\n", | |
"print(\"rank =\", np.linalg.matrix_rank(e))\n", | |
"u5, s5, vh5 = np.linalg.svd(e, full_matrices=True)\n", | |
"s5" | |
], | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"rank = 3\n" | |
] | |
}, | |
{ | |
"output_type": "execute_result", | |
"execution_count": 95, | |
"data": { | |
"text/plain": [ | |
"array([1.86723970e+01, 3.61449172e+00, 1.13006188e+00, 3.96930995e-16,\n", | |
" 4.51401869e-17, 2.22329483e-17])" | |
] | |
}, | |
"metadata": {} | |
} | |
], | |
"execution_count": 95, | |
"metadata": { | |
"collapsed": false, | |
"jupyter": { | |
"source_hidden": false, | |
"outputs_hidden": false | |
}, | |
"nteract": { | |
"transient": { | |
"deleting": false | |
} | |
}, | |
"execution": { | |
"iopub.status.busy": "2020-02-28T16:55:06.091Z", | |
"iopub.execute_input": "2020-02-28T16:55:06.097Z", | |
"iopub.status.idle": "2020-02-28T16:55:06.113Z", | |
"shell.execute_reply": "2020-02-28T16:55:06.118Z" | |
} | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"ll = []\n", | |
"for r in range(7):\n", | |
" print(r)\n", | |
" ll1 = _assess_dimension(np.asarray(s5), r, 9, 6)\n", | |
" print(ll1)\n", | |
" ll.append(ll1)" | |
], | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"0\n", | |
"-35.846979706242\n", | |
"1\n", | |
"-22.764631367309146\n", | |
"2\n", | |
"-17.637601080831697\n", | |
"3\n", | |
"-inf\n", | |
"4\n", | |
"-inf\n", | |
"5\n", | |
"-inf\n", | |
"6\n", | |
"287.3210868655406\n" | |
] | |
} | |
], | |
"execution_count": 120, | |
"metadata": { | |
"collapsed": false, | |
"jupyter": { | |
"source_hidden": false, | |
"outputs_hidden": false | |
}, | |
"nteract": { | |
"transient": { | |
"deleting": false | |
} | |
}, | |
"execution": { | |
"iopub.status.busy": "2020-02-28T17:17:16.479Z", | |
"iopub.execute_input": "2020-02-28T17:17:16.485Z", | |
"iopub.status.idle": "2020-02-28T17:17:16.496Z", | |
"shell.execute_reply": "2020-02-28T17:17:16.501Z" | |
} | |
} | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"Here the last value (the rank we never test) clearly has the biggest number. `_infer_dimension` returns 2, though." | |
], | |
"metadata": { | |
"nteract": { | |
"transient": { | |
"deleting": false | |
} | |
} | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"_infer_dimension(s5, 9, 6)" | |
], | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"execution_count": 98, | |
"data": { | |
"text/plain": [ | |
"2" | |
] | |
}, | |
"metadata": {} | |
} | |
], | |
"execution_count": 98, | |
"metadata": { | |
"collapsed": false, | |
"jupyter": { | |
"source_hidden": false, | |
"outputs_hidden": false | |
}, | |
"nteract": { | |
"transient": { | |
"deleting": false | |
} | |
}, | |
"execution": { | |
"iopub.status.busy": "2020-02-28T16:56:30.021Z", | |
"iopub.execute_input": "2020-02-28T16:56:30.027Z", | |
"iopub.status.idle": "2020-02-28T16:56:30.041Z", | |
"shell.execute_reply": "2020-02-28T16:56:30.046Z" | |
} | |
} | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"The trouble is that even if we do test the last rank, we can't really use the argmax as an index: we would be testing one rank for every value in the spectrum AND rank 0, so there is one more candidate than there are entries in the spectrum." | |
], | |
"metadata": { | |
"nteract": { | |
"transient": { | |
"deleting": false | |
} | |
} | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"print(np.argmax(ll))\n", | |
"print(len(s5))\n", | |
"s5[np.argmax(ll)]\n" | |
], | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"6\n", | |
"6\n" | |
] | |
}, | |
{ | |
"output_type": "error", | |
"ename": "IndexError", | |
"evalue": "index 6 is out of bounds for axis 0 with size 6", | |
"traceback": [ | |
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", | |
"\u001b[0;31mIndexError\u001b[0m Traceback (most recent call last)", | |
"\u001b[0;32m<ipython-input-122-368585a97f86>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0margmax\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mll\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ms5\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0ms5\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0margmax\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mll\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", | |
"\u001b[0;31mIndexError\u001b[0m: index 6 is out of bounds for axis 0 with size 6" | |
] | |
} | |
], | |
"execution_count": 122, | |
"metadata": { | |
"collapsed": false, | |
"jupyter": { | |
"source_hidden": false, | |
"outputs_hidden": false | |
}, | |
"nteract": { | |
"transient": { | |
"deleting": false | |
} | |
}, | |
"execution": { | |
"iopub.status.busy": "2020-02-28T17:17:52.521Z", | |
"iopub.execute_input": "2020-02-28T17:17:52.527Z", | |
"iopub.status.idle": "2020-02-28T17:17:52.542Z", | |
"shell.execute_reply": "2020-02-28T17:17:52.548Z" | |
} | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"pca = PCA(n_components=\"mle\")\n", | |
"my_fit = pca.fit(e)\n", | |
"#print(pca.explained_variance_ratio_)\n", | |
"#print(pca.singular_values_)\n", | |
"print(my_fit)" | |
], | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"PCA(copy=True, iterated_power='auto', n_components='mle', random_state=None,\n", | |
" svd_solver='auto', tol=0.0, whiten=False)\n" | |
] | |
} | |
], | |
"execution_count": 103, | |
"metadata": { | |
"collapsed": false, | |
"jupyter": { | |
"source_hidden": false, | |
"outputs_hidden": false | |
}, | |
"nteract": { | |
"transient": { | |
"deleting": false | |
} | |
}, | |
"execution": { | |
"iopub.status.busy": "2020-02-28T16:57:30.462Z", | |
"iopub.execute_input": "2020-02-28T16:57:30.469Z", | |
"iopub.status.idle": "2020-02-28T16:57:30.480Z", | |
"shell.execute_reply": "2020-02-28T16:57:30.486Z" | |
} | |
} | |
} | |
], | |
"metadata": { | |
"kernel_info": { | |
"name": "scenewalk" | |
}, | |
"language_info": { | |
"name": "python", | |
"version": "3.6.3", | |
"mimetype": "text/x-python", | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"pygments_lexer": "ipython3", | |
"nbconvert_exporter": "python", | |
"file_extension": ".py" | |
}, | |
"kernelspec": { | |
"argv": [ | |
"/Users/lisa/Documents/virtual_envs/scenewalk/bin/python3", | |
"-m", | |
"ipykernel_launcher", | |
"-f", | |
"{connection_file}" | |
], | |
"display_name": "SceneWalk", | |
"language": "python", | |
"name": "scenewalk" | |
}, | |
"nteract": { | |
"version": "0.22.0" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 0 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment