Created
September 12, 2020 16:22
-
-
Save eric-czech/52a04007f4cfb8f05b312079ca770add to your computer and use it in GitHub Desktop.
PCA loadings vs scores
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "code", | |
| "execution_count": 16, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "import xarray as xr\n", | |
| "import numpy as np\n", | |
| "from sklearn.decomposition import PCA" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 17, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "<xarray.DataArray (variants: 1000, samples: 100)>\n", | |
| "array([[0, 1, 0, ..., 0, 1, 2],\n", | |
| " [2, 1, 0, ..., 0, 0, 1],\n", | |
| " [0, 2, 2, ..., 1, 2, 0],\n", | |
| " ...,\n", | |
| " [0, 1, 2, ..., 0, 2, 0],\n", | |
| " [0, 0, 0, ..., 2, 1, 0],\n", | |
| " [0, 1, 0, ..., 0, 0, 2]])\n", | |
| "Dimensions without coordinates: variants, samples\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "np.random.seed(0)\n", | |
| "x = xr.DataArray(np.random.randint(0, 3, size=(1000, 100)), dims=('variants', 'samples'))\n", | |
| "print(x)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 18, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "# Scores: projection of samples into PC space\n", | |
| "\n", | |
| "def pc_scores_numpy(x, k):\n", | |
| " x = x - x.mean(axis=0, keepdims=True)\n", | |
| " u, s, v = np.linalg.svd(x, full_matrices=False)\n", | |
| " r = u * s[np.newaxis]\n", | |
| " return r[:, :k]\n", | |
| "\n", | |
| "def pc_scores_sklearn(x, k):\n", | |
| " return PCA(n_components=k, svd_solver='full').fit_transform(x)\n", | |
| "\n", | |
| "scores_np = pc_scores_numpy(x.T.values, 1)\n", | |
| "scores_sk = pc_scores_sklearn(x.T.values, 1)\n", | |
| "assert scores_np.shape == (x.sizes['samples'], 1)\n", | |
| "np.testing.assert_almost_equal(scores_np, scores_sk)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 15, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "# Loadings: Variance-scaled eigenvectors\n", | |
| "\n", | |
| "def pc_loadings_numpy(x, k):\n", | |
| " x = x - x.mean(axis=0, keepdims=True)\n", | |
| " u, s, v = np.linalg.svd(x, full_matrices=False)\n", | |
| " l = (v.T * s) / np.sqrt(len(x) - 1)\n", | |
| " return l[:, :k]\n", | |
| "\n", | |
| "def pc_loadings_sklearn(x, k):\n", | |
| " pca = PCA(n_components=k, svd_solver='full').fit(x)\n", | |
| " return pca.components_.T * np.sqrt(pca.explained_variance_)\n", | |
| "\n", | |
| "# Loadings of matrix where genetic samples are considered features\n", | |
| "# rather than the other way around as above (i.e. no .T here on x)\n", | |
| "loadings_np = pc_loadings_numpy(x.values, 1)\n", | |
| "loadings_sk = pc_loadings_sklearn(x.values, 1)\n", | |
| "assert loadings_np.shape == scores_np.shape\n", | |
| "np.testing.assert_almost_equal(loadings_np, loadings_sk)" | |
| ] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "Python 3", | |
| "language": "python", | |
| "name": "python3" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.8.5" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 4 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment