Created
July 23, 2020 16:20
-
-
Save ljbelenky/597765a90f8e8c038cc28207a9e8055d to your computer and use it in GitHub Desktop.
Categorical comparison vs. Correlation
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "code", | |
| "execution_count": 1, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "import numpy as np\n", | |
| "import pandas as pd \n", | |
| "import matplotlib.pyplot as plt" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "# T-test works well for comparing categories" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 8, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "cats = np.random.normal(3,2,100)\n", | |
| "dogs = np.random.normal(3.5,2,100)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 9, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXIAAAD4CAYAAADxeG0DAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjMsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+AADFEAAALW0lEQVR4nO3dXYjldR3H8c+n2ZXNp9xlT2i7S2MUMjIFxkEqxfChMBProgsXlB4G5qpNozBtLrQLrwwrLIrBNYiW8UKNIqxUGomBWjqzrrnrWIj5sD7gkSStEGft28XOmo7z8D/7/835z/fs+wUDex729/stDO/98z+///84IgQAyOtdTS8AAFAPIQeA5Ag5ACRHyAEgOUIOAMltaGLSrVu3xvDwcBNTA0Bas7OzL0VEa/HzjYR8eHhYnU6niakBIC3bTy31PKdWACA5Qg4AyRFyAEiOkANAcoQcAJIj5ACQHCEHgOQIOQAk18gFQQAGi+2e/w7fhVAOIQdQ23JRtk2w+4CQJ3EsRzwSRz3A8aDIOXLbX7d90PYB21O2N5UYF/8XEcv+rPQ6gMFXO+S2t0n6mqR2RIxKGpJ0Zd1xAQDVlNq1skHSu21vkHSipOcKjQsAWEXtkEfEs5K+K+lpSc9L+mdE3Fd3XABANSVOrWyW9DlJZ0p6n6STbF+1xPvGbXdsd7rdbt1pAQALSpxauUTS3yOiGxHzku6R9InFb4qIyYhoR0S71XrHF1wAAI5RiZA/Leljtk/0kT1yF0uaKzAuAKCCEufI90q6S9I+SY8sjDlZd1wAQDVFLgiKiBsl3VhiLABAb7hpFgAkR8gBIDlCDgDJEXIASI6QA0ByhBwAkiPkAJAcIQeA5Ag5ACRHyAEgOUIOAMkRcgBIjpADQHKEHACSI+QAkBwhB4DkCDkAJEfIASA5Qg4AyRFyAEiOkANAcoQcAJIj5ACQHCEHgOSKhNz2abbvsv2Y7TnbHy8xLgBgdRsKjfMDSb+NiC/YPkHSiYXGBQCsonbIbZ8q6QJJX5KkiHhd0ut1xwUAVFPi1MoHJHUl/dT2Q7Zvt33S4jfZHrfdsd3pdrsFpgUASGVCvkHSRyX9OCLOkfRvSdcvflNETEZEOyLarVarwLQAAKlMyA9JOhQRexce36UjYQcA9EHtkEfEC5KesX3WwlMXS3q07rgAgGpK7VrZJWnPwo6VJyR9udC4AIBVFAl5ROyX1C4xFgCgN1zZCQDJEXIASI6QA0ByhBwAkiPkAJAcIQeA5Ag5ACRHyAEgOUIOAMkRcgBIjpADQHKEHACSI+QAKtmyZYts9/Qjqee/s2XLlob/pfmUuo0tgAH38ssvKyLWfJ6j/wGgOo7IASA5Qg4AyRFyAEiOkANAcoQcAJIj5ACQHCFfZ/qxV5d9usBgYR/5OtOPvbrs0wUGC0fkAJAcIQeA5IqF3PaQ7Yds/7rUmACA1ZU8Ir9G0lzB8QAAFRQJue3tkj4r6fYS4wEAqit1RP59SddJ+u9yb7A9brtju9PtdgtNCwCoHXLbl0t6MSJmV3pfRExGRDsi2q1Wq+60AIAFJY7Iz5N0he0nJd0p6SLbPy8wLgCggtohj4gbImJ7RAxLulLS7yPiqtorAwBUwj5yAEiu6CX6EfGgpAdLjgkAWBlH5ACQHCEHgOS4+yGASuLGU6Wb3tOfedATQg6gEn/nlTW/xbJ05DbLcdOaTzNQOLUCAMkRcgBIjpADQHKEHACSI+QAkBwhB4DkCDkAJEfIASA5Qg4AyRFyAEiOS/TXmX7cz4J7WQCDhZCvM/24nwX3sgAGC6dWACA5Qg4AyRFyAEiOkANAcoQcAJIj5ACQHNsPAVRme83n2Lx585rPMWgIOYBKjuX6Btt9+Z7P413tUyu2d9ietj1n+6Dta0osDABQTYkj8sOSvhER+2yfImnW9v0R8WiBsQEAq6h9RB4Rz0fEvoU/vyppTtK2uuMC
AKopumvF9rCkcyTtXeK1cdsd251ut1tyWgA4rhULue2TJd0t6dqIeGXx6xExGRHtiGi3Wq1S0wLAca9IyG1v1JGI74mIe0qMCQCopsSuFUvaLWkuIm6tvyQAQC9KHJGfJ+lqSRfZ3r/wc1mBcQEAFdTefhgRM5LW/nKv48haXz3HlXPAYOHKznWGq+cA9IqbZgFAcoQcAJIj5ACQHCEHgOQIOQAkR8gBIDlCDgDJEXIASI6QA0ByhBwAkiPkAJAcIQeA5Ag5ACRHyAEgOUIOAMlxP3IAta30ZSjLvcY99Msh5ABqI8rN4tQKACRHyAEgOUIOAMkRcgBIjpADQHKEHACSKxJy25fa/qvtx21fX2JMAEA1tUNue0jSjyR9RtLZknbaPrvuuACAakockZ8r6fGIeCIiXpd0p6TPFRgXAFBBiZBvk/TMWx4fWnjubWyP2+7Y7nS73QLTAgCkMiFf6kYK77heNyImI6IdEe1Wq1VgWgCAVCbkhyTteMvj7ZKeKzAuAKCCEiH/s6QP2T7T9gmSrpT0qwLjAgAqqH33w4g4bPurkn4naUjSHRFxsPbKAACVFLmNbUTcK+neEmMBAHrDlZ0AkBwhB4DkCDkAJEfIASA5Qg4AyRFyAEiOkANAcoQcAJIj5ACQHCEHgOQIOQAkR8gBIDlCDgDJEXIASI6QA0ByhBwAkiPkAJAcIQeA5Ag5ACRHyAEguSJfvoy1Z/uYXo+ItVgOgHWEkCdBkAEsh1MrAJAcIQeA5GqF3PYtth+z/Rfbv7B9WqmFAQCqqXtEfr+k0Yj4iKS/Sbqh/pIAAL2oFfKIuC8iDi88/JOk7fWXBADoRclz5F+R9JvlXrQ9brtju9PtdgtOCwDHt1W3H9p+QNLpS7w0ERG/XHjPhKTDkvYsN05ETEqalKR2u81eOgAoZNWQR8QlK71u+4uSLpd0cbDZGQD6rtYFQbYvlfQtSZ+MiP+UWRIAoBd1z5H/UNIpku63vd/2TwqsCQDQg1pH5BHxwVILAQAcG67sBIDkCDkAJEfIASA5Qg4AyRFyAEiOkANAcoQcAJIj5ACQHCEHgOQIOQAkR8gBIDlCDgDJEXIASI6QA0ByhBwAkiPkAJAcIQeA5Ag5ACRHyBObmprS6OiohoaGNDo6qqmpqaaXBKABtb6zE82ZmprSxMSEdu/erfPPP18zMzMaGxuTJO3cubPh1QHoJ0dE3ydtt9vR6XT6Pu8gGR0d1W233aYLL7zwzeemp6e1a9cuHThwoMGVAVgrtmcjov2O5wl5TkNDQ3rttde0cePGN5+bn5/Xpk2b9MYbbzS4MgBrZbmQc448qZGREc3MzLztuZmZGY2MjDS0IgBNIeRJTUxMaGxsTNPT05qfn9f09LTGxsY0MTHR9NIA9FmRDzttf1PSLZJaEfFSiTGxsqMfaO7atUtzc3MaGRnRzTffzAedwHGodsht75D0KUlP118OerFz507CDaDIqZXvSbpOUv8/NQUA1Au57SskPRsRD1d477jtju1Ot9utMy0A4C1WPbVi+wFJpy/x0oSkb0v6dJWJImJS0qR0ZPthD2sEAKxg1ZBHxCVLPW/7w5LOlPSwbUnaLmmf7XMj4oWiqwQALOuYP+yMiEckvffoY9tPSmqzawUA+qvYlZ29hNx2V9JTRSaGJG2VxH+gWI/43Szr/RHRWvxkI5fooyzbnaUu2wWaxu9mf3BlJwAkR8gBIDlCPhgmm14AsAx+N/uAc+QAkBxH5ACQHCEHgOQIeWK277D9om2+2w3riu0dtqdtz9k+aPuaptc0yDhHnpjtCyT9S9LPImK06fUAR9k+Q9IZEbHP9imSZiV9PiIebXhpA4kj8sQi4g+S/tH0OoDFIuL5iNi38OdXJc1J2tbsqgYXIQewpmwPSzpH0t5mVzK4CDmANWP7ZEl3S7o2Il5pej2DipADWBO2N+pIxPdExD1Nr2eQEXIAxfnIlxTsljQXEbc2vZ5BR8gTsz0l6Y+SzrJ9
yPZY02sCFpwn6WpJF9nev/BzWdOLGlRsPwSA5DgiB4DkCDkAJEfIASA5Qg4AyRFyAEiOkANAcoQcAJL7H3uQpLnOTmvYAAAAAElFTkSuQmCC\n", | |
| "text/plain": [ | |
| "<Figure size 432x288 with 1 Axes>" | |
| ] | |
| }, | |
| "metadata": { | |
| "needs_background": "light" | |
| }, | |
| "output_type": "display_data" | |
| } | |
| ], | |
| "source": [ | |
| "plt.boxplot([cats, dogs]);" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 50, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "Ttest_indResult(statistic=-2.7559348517984787, pvalue=0.006398917300031993)" | |
| ] | |
| }, | |
| "execution_count": 50, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "from scipy.stats import ttest_ind\n", | |
| "ttest_ind(cats,dogs)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "# Comparing Continuous Numeric Values " | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 142, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "height = np.random.normal(66,6,100)\n", | |
| "weight = (30+2*np.sqrt(height)) + np.random.normal(0,2,size=(100))" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "### Suppose we have the height and weight of various people. There is a slight trend, but also a lot of noise.\n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 143, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "Text(0, 0.5, 'weight')" | |
| ] | |
| }, | |
| "execution_count": 143, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| }, | |
| { | |
| "data": { | |
| "image/png": "\n", | |
| "text/plain": [ | |
| "<Figure size 432x288 with 1 Axes>" | |
| ] | |
| }, | |
| "metadata": { | |
| "needs_background": "light" | |
| }, | |
| "output_type": "display_data" | |
| } | |
| ], | |
| "source": [ | |
| "plt.scatter(height,weight);\n", | |
| "plt.xlabel('height')\n", | |
| "plt.ylabel('weight')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "## Question: Do taller people weigh more than shorter people?" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "### We could just split this into two groups, choosing the mean value as a threshold." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 144, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "sample1 = weight[height<height.mean()]\n", | |
| "sample2 = weight[height>=height.mean()]" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "## By doing this, we can perform a two-sample t-test, as if height were a categorical variable" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 145, | |
| "metadata": { | |
| "scrolled": true | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stderr", | |
| "output_type": "stream", | |
| "text": [ | |
| "/home/land/anaconda3/lib/python3.7/site-packages/numpy/core/_asarray.py:83: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray\n", | |
| " return array(a, dtype, copy=False, order=order)\n" | |
| ] | |
| }, | |
| { | |
| "data": { | |
| "image/png": "\n", | |
| "text/plain": [ | |
| "<Figure size 432x288 with 2 Axes>" | |
| ] | |
| }, | |
| "metadata": { | |
| "needs_background": "light" | |
| }, | |
| "output_type": "display_data" | |
| } | |
| ], | |
| "source": [ | |
| "fig, axs = plt.subplots(1,2)\n", | |
| "\n", | |
| "axs[0].scatter(height[height<height.mean()], weight[height<height.mean()])\n", | |
| "axs[0].scatter(height[height>height.mean()], weight[height>height.mean()])\n", | |
| "axs[1].boxplot([weight[height<height.mean()],weight[height>height.mean()]]);\n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 146, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "Ttest_indResult(statistic=-3.7868129617579447, pvalue=0.00026318530898211554)" | |
| ] | |
| }, | |
| "execution_count": 146, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "from scipy.stats import ttest_ind\n", | |
| "ttest_ind(sample1, sample2)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "### The t-test shows us there is a significant difference between the two groups, which is a valid conclusion. But in dividing the data into two groups, we have lost information: we no longer know how far to the left or right, within its group, each data point lies." | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "### By using the Pearson correlation test, we test not just the difference between two groups, but the relationship across the whole range of the data. Often, we'll see greater significance in correlation tests (if a correlation truly exists)." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 148, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "(0.4501710655403607, 2.611235184744756e-06)" | |
| ] | |
| }, | |
| "execution_count": 148, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "\n", | |
| "from scipy.stats import pearsonr\n", | |
| "pearsonr(height, weight)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "Python 3", | |
| "language": "python", | |
| "name": "python3" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.7.6" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 4 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment