mkolod · May 12, 2016 08:53
diff --git a/PCA_via_SVD.ipynb b/PCA_via_SVD.ipynb
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Imports\n",
    "import numpy as np\n",
    "from sklearn.decomposition import PCA"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The matrix shape is (6, 2)\n",
      "The matrix rank is 2\n"
     ]
    }
   ],
   "source": [
    "# Toy data set: six observations (rows) described by two features (columns)\n",
    "X = np.array([\n",
    "    [-1, -1], [-2, -1], [-3, -2],\n",
    "    [1, 1], [2, 1], [3, 2],\n",
    "])\n",
    "# Report the dimensions and the rank of the data matrix\n",
    "shape_repr = repr(X.shape)\n",
    "rank = np.linalg.matrix_rank(X)\n",
    "print(\"The matrix shape is %s\" % shape_repr)\n",
    "print(\"The matrix rank is %d\" % rank)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Principal components using PCA directly\n",
      "[[ 0.83849224  0.54491354]\n",
      " [ 0.54491354 -0.83849224]]\n"
     ]
    }
   ],
   "source": [
    "# Reference result: fit scikit-learn's PCA on X and show its components.\n",
    "# Two components are requested because the rank of X is 2, so no more\n",
    "# than 2 principal components exist. fit() returns the fitted estimator.\n",
    "pca = PCA(n_components=2).fit(X)\n",
    "print(\"Principal components using PCA directly\")\n",
    "print(pca.components_)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Principal components using SVD\n",
      "[[ 0.83849224  0.54491354]\n",
      " [ 0.54491354 -0.83849224]]\n",
      "Singular values using SVD\n",
      "[ 6.30061232  0.54980396]\n"
     ]
    }
   ],
   "source": [
    "# Get principal components using SVD of the data matrix itself.\n",
    "# This agrees with PCA only because the columns of X already have zero\n",
    "# mean; otherwise X would have to be centered first.\n",
    "# np.linalg.svd returns the right singular vectors as the ROWS of its\n",
    "# third output (V here holds V transposed), one component per row,\n",
    "# which is the same layout as pca.components_.\n",
    "U, s, V = np.linalg.svd(X, full_matrices=True)\n",
    "print(\"Principal components using SVD\\n%s\" % V)\n",
    "# s holds the singular values of X, not eigenvalues: the eigenvalues of\n",
    "# X^T X are the squares of these numbers.\n",
    "print(\"Singular values using SVD\\n%s\" % s)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Variance explained by consecutive principal components (via PCA)\n",
      "[ 0.99244289  0.00755711]\n",
      "Variance explained by consecutive principal components (via SVD)\n",
      "[ 0.99244289  0.00755711]\n"
     ]
    }
   ],
   "source": [
    "# Share of the total variance carried by each component.\n",
    "# First the reference numbers reported by scikit-learn ...\n",
    "print(\"Variance explained by consecutive principal components (via PCA)\")\n",
    "print(pca.explained_variance_ratio_)\n",
    "# ... then the same ratios from the SVD: square the singular values\n",
    "# and normalise them so that they sum to one.\n",
    "ssq = np.square(s)\n",
    "total = ssq.sum()\n",
    "explained = ssq / total\n",
    "print(\"Variance explained by consecutive principal components (via SVD)\")\n",
    "print(explained)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Principal components using U from SVD of covariance matrix\n",
      "U\n",
      "[[-0.83849224 -0.54491354]\n",
      " [-0.54491354  0.83849224]]\n",
      "s\n",
      "[ 6.30061232  0.54980396]\n",
      "V\n",
      "[[-0.83849224 -0.54491354]\n",
      " [-0.54491354  0.83849224]]\n"
     ]
    }
   ],
   "source": [
    "# Repeat the decomposition on the 2x2 matrix X^T X. This is the\n",
    "# unnormalised scatter matrix; it is proportional to the covariance\n",
    "# matrix here because the columns of X have zero mean.\n",
    "XX = X.T.dot(X)\n",
    "U2, s2_squared, V2 = np.linalg.svd(XX, full_matrices=True)\n",
    "# The singular values of X^T X are the squared singular values of X,\n",
    "# so take square roots to compare with the SVD of X itself.\n",
    "s2 = np.sqrt(s2_squared)\n",
    "print(\"Principal components using U from SVD of covariance matrix\")\n",
    "print(\"U\\n%s\" % U2)\n",
    "print(\"s\\n%s\" % s2)\n",
    "print(\"V\\n%s\" % V2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Square roots of eigenvalues (singular values of X):\n",
      "[ 6.30061232  0.54980396]\n",
      "\n",
      "Eigenvectors/principal components (one per column):\n",
      "[[ 0.83849224 -0.54491354]\n",
      " [ 0.54491354  0.83849224]]\n"
     ]
    }
   ],
   "source": [
    "# PCA via eigendecomposition\n",
    "# Note: while SVD can be performed on the data matrix, the\n",
    "# eigendecomposition has to be performed on the square matrix X^T X.\n",
    "# X^T X is the unnormalised scatter matrix; because the columns of X have\n",
    "# zero mean it is proportional to the covariance matrix, so both share\n",
    "# the same eigenvectors.\n",
    "\n",
    "cov = np.dot(np.transpose(X), X)\n",
    "[s3, v3] = np.linalg.eig(cov)\n",
    "# np.linalg.eig does not promise any ordering, so sort explicitly from the\n",
    "# largest to the smallest eigenvalue and reorder the eigenvector columns.\n",
    "order = np.argsort(s3)[::-1]\n",
    "s3 = s3[order]\n",
    "v3 = v3[:, order]\n",
    "# Square roots of the eigenvalues of X^T X are the singular values of X.\n",
    "s3 = np.sqrt(s3)\n",
    "print(\"Square roots of eigenvalues (singular values of X):\\n%s\\n\" % s3)\n",
    "# eig returns the eigenvectors as the COLUMNS of v3.\n",
    "print(\"Eigenvectors/principal components (one per column):\\n%s\" % v3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Dimensionality reduction\n",
    "np.random.seed(42)\n",
    "Y = np.random.rand(2, 10)\n",
    "print(\"Original matrix\\n%s\" % Y)\n",
    "# np.linalg.svd returns V transposed: the right singular vectors are the\n",
    "# ROWS of V. Transpose so that each column of the projection is one\n",
    "# singular vector, then keep the leading n-1 of them (they are ordered\n",
    "# from the largest to the smallest singular value). Slicing the columns\n",
    "# of V directly would drop a coordinate from every vector instead.\n",
    "U, s, V = np.linalg.svd(Y, full_matrices=True)\n",
    "projection = np.transpose(V)[:, :V.shape[0] - 1]\n",
    "Y_reduce = np.dot(Y, projection)\n",
    "print(\"\\nAfter dimensionality reduction to n-1\\n%s\" % Y_reduce)\n",
    "Y_approx = np.dot(Y_reduce, np.transpose(projection))\n",
    "print(\"\\nApproximate mxn matrix from reduced\\n%s\" % Y_approx)\n",
    "# Y has only 2 rows, so its rank is at most 2 and the discarded direction\n",
    "# carries no signal: the difference is zero up to floating-point rounding.\n",
    "print(\"\\nDifference between original and approx\\n%s\" % np.subtract(Y, Y_approx))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
 }
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"# Imports\n",
	"import numpy as np\n",
	"from sklearn.decomposition import PCA"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"The matrix shape is (6, 2)\n",
	"The matrix rank is 2\n"
	]
	}
	],
	"source": [
	"# Toy data set: six observations (rows) described by two features (columns)\n",
	"X = np.array([\n",
	"    [-1, -1], [-2, -1], [-3, -2],\n",
	"    [1, 1], [2, 1], [3, 2],\n",
	"])\n",
	"# Report the dimensions and the rank of the data matrix\n",
	"shape_repr = repr(X.shape)\n",
	"rank = np.linalg.matrix_rank(X)\n",
	"print(\"The matrix shape is %s\" % shape_repr)\n",
	"print(\"The matrix rank is %d\" % rank)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Principal components using PCA directly\n",
	"[[ 0.83849224 0.54491354]\n",
	" [ 0.54491354 -0.83849224]]\n"
	]
	}
	],
	"source": [
	"# Reference result: fit scikit-learn's PCA on X and show its components.\n",
	"# Two components are requested because the rank of X is 2, so no more\n",
	"# than 2 principal components exist. fit() returns the fitted estimator.\n",
	"pca = PCA(n_components=2).fit(X)\n",
	"print(\"Principal components using PCA directly\")\n",
	"print(pca.components_)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Principal components using SVD\n",
	"[[ 0.83849224 0.54491354]\n",
	" [ 0.54491354 -0.83849224]]\n",
	"Singular values using SVD\n",
	"[ 6.30061232 0.54980396]\n"
	]
	}
	],
	"source": [
	"# Get principal components using SVD of the data matrix itself.\n",
	"# This agrees with PCA only because the columns of X already have zero\n",
	"# mean; otherwise X would have to be centered first.\n",
	"# np.linalg.svd returns the right singular vectors as the ROWS of its\n",
	"# third output (V here holds V transposed), one component per row,\n",
	"# which is the same layout as pca.components_.\n",
	"U, s, V = np.linalg.svd(X, full_matrices=True)\n",
	"print(\"Principal components using SVD\\n%s\" % V)\n",
	"# s holds the singular values of X, not eigenvalues: the eigenvalues of\n",
	"# X^T X are the squares of these numbers.\n",
	"print(\"Singular values using SVD\\n%s\" % s)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Variance explained by consecutive principal components (via PCA)\n",
	"[ 0.99244289 0.00755711]\n",
	"Variance explained by consecutive principal components (via SVD)\n",
	"[ 0.99244289 0.00755711]\n"
	]
	}
	],
	"source": [
	"# Share of the total variance carried by each component.\n",
	"# First the reference numbers reported by scikit-learn ...\n",
	"print(\"Variance explained by consecutive principal components (via PCA)\")\n",
	"print(pca.explained_variance_ratio_)\n",
	"# ... then the same ratios from the SVD: square the singular values\n",
	"# and normalise them so that they sum to one.\n",
	"ssq = np.square(s)\n",
	"total = ssq.sum()\n",
	"explained = ssq / total\n",
	"print(\"Variance explained by consecutive principal components (via SVD)\")\n",
	"print(explained)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Principal components using U from SVD of covariance matrix\n",
	"U\n",
	"[[-0.83849224 -0.54491354]\n",
	" [-0.54491354 0.83849224]]\n",
	"s\n",
	"[ 6.30061232 0.54980396]\n",
	"V\n",
	"[[-0.83849224 -0.54491354]\n",
	" [-0.54491354 0.83849224]]\n"
	]
	}
	],
	"source": [
	"# Repeat the decomposition on the 2x2 matrix X^T X. This is the\n",
	"# unnormalised scatter matrix; it is proportional to the covariance\n",
	"# matrix here because the columns of X have zero mean.\n",
	"XX = X.T.dot(X)\n",
	"U2, s2_squared, V2 = np.linalg.svd(XX, full_matrices=True)\n",
	"# The singular values of X^T X are the squared singular values of X,\n",
	"# so take square roots to compare with the SVD of X itself.\n",
	"s2 = np.sqrt(s2_squared)\n",
	"print(\"Principal components using U from SVD of covariance matrix\")\n",
	"print(\"U\\n%s\" % U2)\n",
	"print(\"s\\n%s\" % s2)\n",
	"print(\"V\\n%s\" % V2)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 7,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Square roots of eigenvalues (singular values of X):\n",
	"[ 6.30061232 0.54980396]\n",
	"\n",
	"Eigenvectors/principal components (one per column):\n",
	"[[ 0.83849224 -0.54491354]\n",
	" [ 0.54491354 0.83849224]]\n"
	]
	}
	],
	"source": [
	"# PCA via eigendecomposition\n",
	"# Note: while SVD can be performed on the data matrix, the\n",
	"# eigendecomposition has to be performed on the square matrix X^T X.\n",
	"# X^T X is the unnormalised scatter matrix; because the columns of X have\n",
	"# zero mean it is proportional to the covariance matrix, so both share\n",
	"# the same eigenvectors.\n",
	"\n",
	"cov = np.dot(np.transpose(X), X)\n",
	"[s3, v3] = np.linalg.eig(cov)\n",
	"# np.linalg.eig does not promise any ordering, so sort explicitly from the\n",
	"# largest to the smallest eigenvalue and reorder the eigenvector columns.\n",
	"order = np.argsort(s3)[::-1]\n",
	"s3 = s3[order]\n",
	"v3 = v3[:, order]\n",
	"# Square roots of the eigenvalues of X^T X are the singular values of X.\n",
	"s3 = np.sqrt(s3)\n",
	"print(\"Square roots of eigenvalues (singular values of X):\\n%s\\n\" % s3)\n",
	"# eig returns the eigenvectors as the COLUMNS of v3.\n",
	"print(\"Eigenvectors/principal components (one per column):\\n%s\" % v3)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"# Dimensionality reduction\n",
	"np.random.seed(42)\n",
	"Y = np.random.rand(2, 10)\n",
	"print(\"Original matrix\\n%s\" % Y)\n",
	"# np.linalg.svd returns V transposed: the right singular vectors are the\n",
	"# ROWS of V. Transpose so that each column of the projection is one\n",
	"# singular vector, then keep the leading n-1 of them (they are ordered\n",
	"# from the largest to the smallest singular value). Slicing the columns\n",
	"# of V directly would drop a coordinate from every vector instead.\n",
	"U, s, V = np.linalg.svd(Y, full_matrices=True)\n",
	"projection = np.transpose(V)[:, :V.shape[0] - 1]\n",
	"Y_reduce = np.dot(Y, projection)\n",
	"print(\"\\nAfter dimensionality reduction to n-1\\n%s\" % Y_reduce)\n",
	"Y_approx = np.dot(Y_reduce, np.transpose(projection))\n",
	"print(\"\\nApproximate mxn matrix from reduced\\n%s\" % Y_approx)\n",
	"# Y has only 2 rows, so its rank is at most 2 and the discarded direction\n",
	"# carries no signal: the difference is zero up to floating-point rounding.\n",
	"print(\"\\nDifference between original and approx\\n%s\" % np.subtract(Y, Y_approx))"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 2",
	"language": "python",
	"name": "python2"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 2
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython2",
	"version": "2.7.11"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 0
	}