datavudeja · August 15, 2025 10:50
diff --git a/pandas.ipynb b/pandas.ipynb
 {
  "metadata": {
    "kernelspec": {
      "name": "python",
      "display_name": "Python (Pyodide)",
      "language": "python"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "python",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.8"
    }
  },
  "nbformat_minor": 4,
  "nbformat": 4,
  "cells": [
    {
      "cell_type": "code",
      "source": "import pandas as pd\nimport numpy as np",
      "metadata": {
        "trusted": true
      },
      "outputs": [],
      "execution_count": 1
    },
    {
      "cell_type": "markdown",
      "source": "### Series\n",
      "metadata": {
        "jp-MarkdownHeadingCollapsed": true
      }
    },
    {
      "cell_type": "markdown",
      "source": "#### 2.1 Series Creation",
      "metadata": {
        "jp-MarkdownHeadingCollapsed": true
      }
    },
    {
      "cell_type": "code",
      "source": "x = pd.Series([5, 10, 15, 20, 25, 30, 35])",
      "metadata": {
        "trusted": true
      },
      "outputs": [],
      "execution_count": 4
    },
    {
      "cell_type": "code",
      "source": "print(x)",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "name": "stdout",
          "text": "0     5\n1    10\n2    15\n3    20\n4    25\n5    30\n6    35\ndtype: int64\n",
          "output_type": "stream"
        }
      ],
      "execution_count": 5
    },
    {
      "cell_type": "code",
      "source": "type(x)",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "execution_count": 6,
          "output_type": "execute_result",
          "data": {
            "text/plain": "pandas.core.series.Series"
          },
          "metadata": {}
        }
      ],
      "execution_count": 6
    },
    {
      "cell_type": "code",
      "source": "x.head()",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "execution_count": 7,
          "output_type": "execute_result",
          "data": {
            "text/plain": "0     5\n1    10\n2    15\n3    20\n4    25\ndtype: int64"
          },
          "metadata": {}
        }
      ],
      "execution_count": 7
    },
    {
      "cell_type": "code",
      "source": "x.tail()",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "execution_count": 8,
          "output_type": "execute_result",
          "data": {
            "text/plain": "2    15\n3    20\n4    25\n5    30\n6    35\ndtype: int64"
          },
          "metadata": {}
        }
      ],
      "execution_count": 8
    },
    {
      "cell_type": "code",
      "source": "data = {'a' : 0.,'b' : 1., 'c' : 2.,'d': 3,}\n\ny = pd.Series(data)\n\nprint(y)",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "name": "stdout",
          "text": "a    0.0\nb    1.0\nc    2.0\nd    3.0\ndtype: float64\n",
          "output_type": "stream"
        }
      ],
      "execution_count": 11
    },
    {
      "cell_type": "code",
      "source": "z = np.array([10, 20, 30 , 40], dtype = 'int64')\n\npd.Series(z)",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "execution_count": 16,
          "output_type": "execute_result",
          "data": {
            "text/plain": "0    10\n1    20\n2    30\n3    40\ndtype: int64"
          },
          "metadata": {}
        }
      ],
      "execution_count": 16
    },
    {
      "cell_type": "code",
      "source": "pd.Series(np.linspace(start = 1, stop = 2, num = 10))",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "execution_count": 17,
          "output_type": "execute_result",
          "data": {
            "text/plain": "0    1.000000\n1    1.111111\n2    1.222222\n3    1.333333\n4    1.444444\n5    1.555556\n6    1.666667\n7    1.777778\n8    1.888889\n9    2.000000\ndtype: float64"
          },
          "metadata": {}
        }
      ],
      "execution_count": 17
    },
    {
      "cell_type": "markdown",
      "source": "#### 2.2 Basic Indexing",
      "metadata": {
        "jp-MarkdownHeadingCollapsed": true
      }
    },
    {
      "cell_type": "code",
      "source": "x = pd.Series([5, 10, 15, 20, 25])\nx.index = [3, 1, 4, 0, 2]\n\nprint(x)",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "name": "stdout",
          "text": "3     5\n1    10\n4    15\n0    20\n2    25\ndtype: int64\n",
          "output_type": "stream"
        }
      ],
      "execution_count": 21
    },
    {
      "cell_type": "code",
      "source": "x[0]",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "execution_count": 22,
          "output_type": "execute_result",
          "data": {
            "text/plain": "20"
          },
          "metadata": {}
        }
      ],
      "execution_count": 22
    },
    {
      "cell_type": "code",
      "source": "x.loc[0]",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "execution_count": 25,
          "output_type": "execute_result",
          "data": {
            "text/plain": "20"
          },
          "metadata": {}
        }
      ],
      "execution_count": 25
    },
    {
      "cell_type": "code",
      "source": "x.iloc[0]",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "execution_count": 29,
          "output_type": "execute_result",
          "data": {
            "text/plain": "5"
          },
          "metadata": {}
        }
      ],
      "execution_count": 29
    },
    {
      "cell_type": "code",
      "source": "x.iloc[1:4:2]",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "execution_count": 30,
          "output_type": "execute_result",
          "data": {
            "text/plain": "1    10\n0    20\ndtype: int64"
          },
          "metadata": {}
        }
      ],
      "execution_count": 30
    },
    {
      "cell_type": "markdown",
      "source": "###### Range index vs int64 Index",
      "metadata": {}
    },
    {
      "cell_type": "code",
      "source": "x = pd.Series(np.random.normal(size = 10**7))\nprint(x)",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "name": "stdout",
          "text": "0         -0.095360\n1         -0.531923\n2          0.989790\n3         -0.515691\n4          0.229343\n             ...   \n9999995   -0.308055\n9999996    0.685634\n9999997    1.574605\n9999998   -0.551347\n9999999   -0.376001\nLength: 10000000, dtype: float64\n",
          "output_type": "stream"
        }
      ],
      "execution_count": 33
    },
    {
      "cell_type": "code",
      "source": "x.index",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "execution_count": 39,
          "output_type": "execute_result",
          "data": {
            "text/plain": "RangeIndex(start=0, stop=10000000, step=1)"
          },
          "metadata": {}
        }
      ],
      "execution_count": 39
    },
    {
      "cell_type": "code",
      "source": "y = pd.Series(np.random.normal(size = 10**7), index = np.arange(10**7))\nprint(y)",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "name": "stdout",
          "text": "0         -0.048644\n1          1.150449\n2          1.340439\n3          1.286724\n4         -1.398852\n             ...   \n9999995   -1.448750\n9999996    0.604911\n9999997    0.025833\n9999998   -0.064581\n9999999   -0.112200\nLength: 10000000, dtype: float64\n",
          "output_type": "stream"
        }
      ],
      "execution_count": 36
    },
    {
      "cell_type": "code",
      "source": "y.index",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "execution_count": 38,
          "output_type": "execute_result",
          "data": {
            "text/plain": "Int64Index([      0,       1,       2,       3,       4,       5,       6,\n                  7,       8,       9,\n            ...\n            9999990, 9999991, 9999992, 9999993, 9999994, 9999995, 9999996,\n            9999997, 9999998, 9999999],\n           dtype='int64', length=10000000)"
          },
          "metadata": {}
        }
      ],
      "execution_count": 38
    },
    {
      "cell_type": "markdown",
      "source": "An `Int64Index` consumes more memory than a `RangeIndex`: it literally stores all 10^7 index labels, whereas a `RangeIndex` only stores its start, stop and step.",
      "metadata": {}
    },
    {
      "cell_type": "code",
      "source": "import sys",
      "metadata": {
        "trusted": true
      },
      "outputs": [],
      "execution_count": 40
    },
    {
      "cell_type": "code",
      "source": "sys.getsizeof(x)",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "execution_count": 41,
          "output_type": "execute_result",
          "data": {
            "text/plain": "80000088"
          },
          "metadata": {}
        }
      ],
      "execution_count": 41
    },
    {
      "cell_type": "code",
      "source": "sys.getsizeof(y)",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "execution_count": 43,
          "output_type": "execute_result",
          "data": {
            "text/plain": "160000016"
          },
          "metadata": {}
        }
      ],
      "execution_count": 43
    },
    {
      "cell_type": "markdown",
      "source": "###### Overwriting data in a series",
      "metadata": {}
    },
    {
      "cell_type": "code",
      "source": "foo = pd.Series([10, 20, 30, 40, 50], index = ['a', 'b', 'c','d','e'])\nprint(foo)",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "name": "stdout",
          "text": "a    10\nb    20\nc    30\nd    40\ne    50\ndtype: int64\n",
          "output_type": "stream"
        }
      ],
      "execution_count": 44
    },
    {
      "cell_type": "code",
      "source": "foo.iloc[1] = 200\nprint(foo)",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "name": "stdout",
          "text": "a     10\nb    200\nc     30\nd     40\ne     50\ndtype: int64\n",
          "output_type": "stream"
        }
      ],
      "execution_count": 46
    },
    {
      "cell_type": "code",
      "source": "foo.iloc[[0,1,2]] = 99\nprint(foo)",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "name": "stdout",
          "text": "a    99\nb    99\nc    99\nd    40\ne    50\ndtype: int64\n",
          "output_type": "stream"
        }
      ],
      "execution_count": 48
    },
    {
      "cell_type": "code",
      "source": "foo.iloc[:3] = 999\nprint(foo)",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "name": "stdout",
          "text": "a    999\nb    999\nc    999\nd     40\ne     50\ndtype: int64\n",
          "output_type": "stream"
        }
      ],
      "execution_count": 50
    },
    {
      "cell_type": "code",
      "source": "x = pd.Series([10, 20, 30, 40])\ny = pd.Series([1, 11, 111, 1111], index = [7,3,2,0])\n\nprint(y)",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "name": "stdout",
          "text": "7       1\n3      11\n2     111\n0    1111\ndtype: int64\n",
          "output_type": "stream"
        }
      ],
      "execution_count": 61
    },
    {
      "cell_type": "code",
      "source": "print(x)\n",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "name": "stdout",
          "text": "0    10\n1    20\n2    30\n3    40\ndtype: int64\n",
          "output_type": "stream"
        }
      ],
      "execution_count": 62
    },
    {
      "cell_type": "code",
      "source": "x.loc[[0,1]] = y",
      "metadata": {
        "trusted": true
      },
      "outputs": [],
      "execution_count": 53
    },
    {
      "cell_type": "code",
      "source": "print(x)  # .loc looks up labels 0 and 1 in x and assigns the values found at labels 0 and 1 in y; y has no label 1, so that entry becomes NaN",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "name": "stdout",
          "text": "0    1111.0\n1       NaN\n2      30.0\n3      40.0\ndtype: float64\n",
          "output_type": "stream"
        }
      ],
      "execution_count": 55
    },
    {
      "cell_type": "code",
      "source": "# Rebuild x first so this cell does not depend on whether the label-aligned\n# assignment above (which turns x into float64 with a NaN) has been run.\nx = pd.Series([10, 20, 30, 40])\n\n# .to_numpy() drops y's index, so the values are assigned purely by position.\nx.iloc[:2] = y.to_numpy()[:2]\nprint(x)",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "name": "stdout",
          "text": "0     1\n1    11\n2    30\n3    40\ndtype: int64\n",
          "output_type": "stream"
        }
      ],
      "execution_count": 63
    },
    {
      "cell_type": "markdown",
      "source": "#### 2.3 Series Basic Operations",
      "metadata": {
        "jp-MarkdownHeadingCollapsed": true
      }
    },
    {
      "cell_type": "code",
      "source": "x = pd.Series([1,2,3,4])\nprint(x)",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "name": "stdout",
          "text": "0    1\n1    2\n2    3\n3    4\ndtype: int64\n",
          "output_type": "stream"
        }
      ],
      "execution_count": 66
    },
    {
      "cell_type": "code",
      "source": "x + 1",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "execution_count": 67,
          "output_type": "execute_result",
          "data": {
            "text/plain": "0    2\n1    3\n2    4\n3    5\ndtype: int64"
          },
          "metadata": {}
        }
      ],
      "execution_count": 67
    },
    {
      "cell_type": "code",
      "source": "x + pd.Series(1)  # only adds at index 0",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "execution_count": 68,
          "output_type": "execute_result",
          "data": {
            "text/plain": "0    2.0\n1    NaN\n2    NaN\n3    NaN\ndtype: float64"
          },
          "metadata": {}
        }
      ],
      "execution_count": 68
    },
    {
      "cell_type": "markdown",
      "source": "What if we don't want NaNs?",
      "metadata": {}
    },
    {
      "cell_type": "code",
      "source": "x = pd.Series([1, 2,3,4])\ny = pd.Series([10, 20], index = [2,1])\n\nprint(x)\nprint(y)",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "name": "stdout",
          "text": "0    1\n1    2\n2    3\n3    4\ndtype: int64\n2    10\n1    20\ndtype: int64\n",
          "output_type": "stream"
        }
      ],
      "execution_count": 69
    },
    {
      "cell_type": "code",
      "source": "x.loc[y.index] += y\nprint(x)",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "name": "stdout",
          "text": "0     1\n1    22\n2    13\n3     4\ndtype: int64\n",
          "output_type": "stream"
        }
      ],
      "execution_count": 71
    },
    {
      "cell_type": "markdown",
      "source": "#### 2.4 Boolean Indexing",
      "metadata": {
        "jp-MarkdownHeadingCollapsed": true
      }
    },
    {
      "cell_type": "code",
      "source": "foo = pd.Series([20, 50, 11, 45, 17, 31])\nprint(foo)",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "name": "stdout",
          "text": "0    20\n1    50\n2    11\n3    45\n4    17\n5    31\ndtype: int64\n",
          "output_type": "stream"
        }
      ],
      "execution_count": 73
    },
    {
      "cell_type": "code",
      "source": "foo < 20",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "execution_count": 74,
          "output_type": "execute_result",
          "data": {
            "text/plain": "0    False\n1    False\n2     True\n3    False\n4     True\n5    False\ndtype: bool"
          },
          "metadata": {}
        }
      ],
      "execution_count": 74
    },
    {
      "cell_type": "code",
      "source": "mask = foo < 20\nfoo.loc[mask]  #gets only values below 20. you can write foo.loc[foo < 20] too.",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "execution_count": 75,
          "output_type": "execute_result",
          "data": {
            "text/plain": "2    11\n4    17\ndtype: int64"
          },
          "metadata": {}
        }
      ],
      "execution_count": 75
    },
    {
      "cell_type": "code",
      "source": "foo.index = [0,1,2,3,5,4] # swap last 2 indices\nprint(foo)",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "name": "stdout",
          "text": "0    20\n1    50\n2    11\n3    45\n5    17\n4    31\ndtype: int64\n",
          "output_type": "stream"
        }
      ],
      "execution_count": 77
    },
    {
      "cell_type": "code",
      "source": "foo.loc[mask] # matches the indices of foo and mask (aligns by label, not by position)",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "execution_count": 78,
          "output_type": "execute_result",
          "data": {
            "text/plain": "2    11\n4    31\ndtype: int64"
          },
          "metadata": {}
        }
      ],
      "execution_count": 78
    },
    {
      "cell_type": "code",
      "source": "foo.loc[mask.to_numpy()]",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "execution_count": 80,
          "output_type": "execute_result",
          "data": {
            "text/plain": "2    11\n5    17\ndtype: int64"
          },
          "metadata": {}
        }
      ],
      "execution_count": 80
    },
    {
      "cell_type": "markdown",
      "source": "Combining boolean Series: element-wise and (`&`), element-wise or (`|`), and negation (`~`)",
      "metadata": {}
    },
    {
      "cell_type": "code",
      "source": "ages = pd.Series(\n    data = [42, 43, 14, 18, 1],\n    index = ['peter', 'lois', 'chris', 'meg', 'stewie']\n)\nprint(ages)",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "name": "stdout",
          "text": "peter     42\nlois      43\nchris     14\nmeg       18\nstewie     1\ndtype: int64\n",
          "output_type": "stream"
        }
      ],
      "execution_count": 81
    },
    {
      "cell_type": "code",
      "source": "genders = pd.Series(\n    data = ['female', 'female', 'male', 'male', 'male'],\n    index = ['lois', 'meg', 'chris', 'peter', 'stewie'],\n    dtype = 'string'\n)\nprint(genders)",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "name": "stdout",
          "text": "lois      female\nmeg       female\nchris       male\npeter       male\nstewie      male\ndtype: string\n",
          "output_type": "stream"
        }
      ],
      "execution_count": 83
    },
    {
      "cell_type": "code",
      "source": "# Even though their indices are in a different order, we can still answer questions like:\n# who's a male younger than 18?\n\nmask = (genders == 'male') & (ages < 18)\nmask.loc[mask]",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "execution_count": 85,
          "output_type": "execute_result",
          "data": {
            "text/plain": "chris     True\nstewie    True\ndtype: bool"
          },
          "metadata": {}
        }
      ],
      "execution_count": 85
    },
    {
      "cell_type": "code",
      "source": "mask",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "execution_count": 86,
          "output_type": "execute_result",
          "data": {
            "text/plain": "chris      True\nlois      False\nmeg       False\npeter     False\nstewie     True\ndtype: bool"
          },
          "metadata": {}
        }
      ],
      "execution_count": 86
    },
    {
      "cell_type": "code",
      "source": "~mask",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "execution_count": 87,
          "output_type": "execute_result",
          "data": {
            "text/plain": "chris     False\nlois       True\nmeg        True\npeter      True\nstewie    False\ndtype: bool"
          },
          "metadata": {}
        }
      ],
      "execution_count": 87
    },
    {
      "cell_type": "markdown",
      "source": "#### 2.5 Series Missing Values",
      "metadata": {
        "jp-MarkdownHeadingCollapsed": true
      }
    },
    {
      "cell_type": "code",
      "source": "# history of NaN\n\nroux = pd.Series([1,2,3])\nprint(roux)\nroux.iloc[1] = np.nan\nprint(roux) # converts the whole Series to floating point",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "name": "stdout",
          "text": "0    1\n1    2\n2    3\ndtype: int64\n0    1.0\n1    NaN\n2    3.0\ndtype: float64\n",
          "output_type": "stream"
        }
      ],
      "execution_count": 92
    },
    {
      "cell_type": "code",
      "source": "roux = pd.Series([1,2,3], dtype = 'Int64')\nprint(roux)",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "name": "stdout",
          "text": "0    1\n1    2\n2    3\ndtype: Int64\n",
          "output_type": "stream"
        }
      ],
      "execution_count": 93
    },
    {
      "cell_type": "code",
      "source": "roux.iloc[1] = np.nan # or pd.NA or None\nprint(roux)",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "name": "stdout",
          "text": "0       1\n1    <NA>\n2       3\ndtype: Int64\n",
          "output_type": "stream"
        }
      ],
      "execution_count": 95
    },
    {
      "cell_type": "code",
      "source": "# fill na\nx = pd.Series([1, pd.NA, 3, pd.NA], dtype = 'Int64')\nprint(x)",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "name": "stdout",
          "text": "0       1\n1    <NA>\n2       3\n3    <NA>\ndtype: Int64\n",
          "output_type": "stream"
        }
      ],
      "execution_count": 96
    },
    {
      "cell_type": "code",
      "source": "x.fillna(-1)",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "execution_count": 97,
          "output_type": "execute_result",
          "data": {
            "text/plain": "0     1\n1    -1\n2     3\n3    -1\ndtype: Int64"
          },
          "metadata": {}
        }
      ],
      "execution_count": 97
    },
    {
      "cell_type": "code",
      "source": "x # notice the NA values haven't changed",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "execution_count": 98,
          "output_type": "execute_result",
          "data": {
            "text/plain": "0       1\n1    <NA>\n2       3\n3    <NA>\ndtype: Int64"
          },
          "metadata": {}
        }
      ],
      "execution_count": 98
    },
    {
      "cell_type": "code",
      "source": "# to change x itself, use inplace=True (or reassign: x = x.fillna(-1))\nx.fillna(-1, inplace=True)\nprint(x)",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "name": "stdout",
          "text": "0     1\n1    -1\n2     3\n3    -1\ndtype: Int64\n",
          "output_type": "stream"
        }
      ],
      "execution_count": 99
    },
    {
      "cell_type": "markdown",
      "source": "#### 2.6 Vectorization",
      "metadata": {
        "jp-MarkdownHeadingCollapsed": true
      }
    },
    {
      "cell_type": "code",
      "source": "x = pd.Series(np.random.uniform(low = 1, high = 2, size = 10**6))\nprint(x)\n",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "name": "stdout",
          "text": "0         1.410044\n1         1.161561\n2         1.107593\n3         1.293172\n4         1.155268\n            ...   \n999995    1.682250\n999996    1.589057\n999997    1.310423\n999998    1.540775\n999999    1.796745\nLength: 1000000, dtype: float64\n",
          "output_type": "stream"
        }
      ],
      "execution_count": 101
    },
    {
      "cell_type": "code",
      "source": "def average(x):\n    avg = 0.0\n    for i in range(x.size):\n        avg += x.iloc[i]/x.size\n    return avg",
      "metadata": {
        "trusted": true
      },
      "outputs": [],
      "execution_count": 104
    },
    {
      "cell_type": "code",
      "source": "average(x) # very slow",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "execution_count": 105,
          "output_type": "execute_result",
          "data": {
            "text/plain": "1.4999582043534072"
          },
          "metadata": {}
        }
      ],
      "execution_count": 105
    },
    {
      "cell_type": "code",
      "source": "np.mean(x) # very fast\n\n# avoid using loops.",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "execution_count": 106,
          "output_type": "execute_result",
          "data": {
            "text/plain": "1.4999582043534536"
          },
          "metadata": {}
        }
      ],
      "execution_count": 106
    },
    {
      "cell_type": "markdown",
      "source": "#### 2.7 apply()",
      "metadata": {
        "jp-MarkdownHeadingCollapsed": true
      }
    },
    {
      "cell_type": "code",
      "source": "foo = pd.Series([3,9,2,2,8,7])\nprint(foo)",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "name": "stdout",
          "text": "0    3\n1    9\n2    2\n3    2\n4    8\n5    7\ndtype: int64\n",
          "output_type": "stream"
        }
      ],
      "execution_count": 108
    },
    {
      "cell_type": "code",
      "source": "def my_func(x, a = 1, b = 3):\n    return x - a if x % 2 == 0 else x + b",
      "metadata": {
        "trusted": true
      },
      "outputs": [],
      "execution_count": 112
    },
    {
      "cell_type": "code",
      "source": "foo.apply(my_func, a = 2, b = 4)",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "execution_count": 114,
          "output_type": "execute_result",
          "data": {
            "text/plain": "0     7\n1    13\n2     0\n3     0\n4     6\n5    11\ndtype: int64"
          },
          "metadata": {}
        }
      ],
      "execution_count": 114
    },
    {
      "cell_type": "code",
      "source": "# apply() is not vectorized\n\nbigfoo = pd.Series(np.random.randint(low = 0, high = 9, size = 10**7))\nprint(bigfoo)",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "name": "stdout",
          "text": "0          0\n1          7\n2          2\n3          2\n4          8\n          ..\n9999995    5\n9999996    1\n9999997    6\n9999998    4\n9999999    5\nLength: 10000000, dtype: int32\n",
          "output_type": "stream"
        }
      ],
      "execution_count": 116
    },
    {
      "cell_type": "code",
      "source": "%%timeit\ny1 = bigfoo.apply(my_func)",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "name": "stdout",
          "text": "4.96 s ± 771 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n",
          "output_type": "stream"
        }
      ],
      "execution_count": 117
    },
    {
      "cell_type": "code",
      "source": "%%timeit\na = bigfoo.to_numpy()\ny2 = pd.Series(np.where(a % 2 == 0, a - 1, a + 3))",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "name": "stdout",
          "text": "90.2 ms ± 29.2 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n",
          "output_type": "stream"
        }
      ],
      "execution_count": 118
    },
    {
      "cell_type": "code",
      "source": "# apply method is for convenience and clarity, not speed",
      "metadata": {
        "trusted": true
      },
      "outputs": [],
      "execution_count": 119
    },
    {
      "cell_type": "markdown",
      "source": "#### 2.8 View vs Copy",
      "metadata": {
        "jp-MarkdownHeadingCollapsed": true
      }
    },
    {
      "cell_type": "code",
      "source": "x = pd.Series(\n    data = [2,3,5,7,11],\n    index = [2,11,12, 30, 30]\n)\nprint(x)",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "name": "stdout",
          "text": "2      2\n11     3\n12     5\n30     7\n30    11\ndtype: int64\n",
          "output_type": "stream"
        }
      ],
      "execution_count": 120
    },
    {
      "cell_type": "code",
      "source": "y = x\ny.iloc[0] = 99\nprint(y)",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "name": "stdout",
          "text": "2     99\n11     3\n12     5\n30     7\n30    11\ndtype: int64\n",
          "output_type": "stream"
        }
      ],
      "execution_count": 121
    },
    {
      "cell_type": "code",
      "source": "print(x) # also modifies x",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "name": "stdout",
          "text": "2     99\n11     3\n12     5\n30     7\n30    11\ndtype: int64\n",
          "output_type": "stream"
        }
      ],
      "execution_count": 123
    },
    {
      "cell_type": "code",
      "source": "y = x.copy()\ny.iloc[0] = 999\nprint(y)",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "name": "stdout",
          "text": "2     999\n11      3\n12      5\n30      7\n30     11\ndtype: int64\n",
          "output_type": "stream"
        }
      ],
      "execution_count": 124
    },
    {
      "cell_type": "code",
      "source": "print(x) # now x is not changed",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "name": "stdout",
          "text": "2     99\n11     3\n12     5\n30     7\n30    11\ndtype: int64\n",
          "output_type": "stream"
        }
      ],
      "execution_count": 125
    },
    {
      "cell_type": "markdown",
      "source": "#### 2.9 Challenge : Baby Names",
      "metadata": {
        "jp-MarkdownHeadingCollapsed": true
      }
    },
    {
      "cell_type": "code",
      "source": "# You and your spouse have decided to let the internet name your next child. You've asked the great people of the web to\n# submit their favorite names, and you've compiled their submissions\n# into a Series called babynames. Determine how many people have voted for the following names: 'Chad', 'Ruger', and 'Zeltron'.",
      "metadata": {
        "trusted": true
      },
      "outputs": [],
      "execution_count": 126
    },
    {
      "cell_type": "code",
      "source": "# Every name must be its own comma-separated element: adjacent string literals\n# with no comma between them are silently concatenated into a single string.\nbabynames = pd.Series(\n    [\n        'Jathonathon', 'Zeltron', 'Ruger', 'Phreddy', 'Ruger', 'Chad', 'Chad', 'Ruger', 'Ryan',\n        'Ruger', 'Chad', 'Ryan', 'Phreddy', 'Phreddy', 'Phreddy', 'Mister', 'Zeltron', 'Ryan',\n        'Ruger', 'Ruger', 'Jathonathon', 'Jathonathon', 'Ruger', 'Chad', 'Zeltron',\n    ],\n    dtype = 'string'\n)",
      "metadata": {
        "trusted": true
      },
      "outputs": [],
      "execution_count": 127
    },
    {
      "cell_type": "code",
      "source": "# solution\n\nbabynames.value_counts().loc[['Chad', 'Ruger', 'Zeltron']] # value_counts - returns a series containing counts of unique values.",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "execution_count": 129,
          "output_type": "execute_result",
          "data": {
            "text/plain": "Chad       4\nRuger      7\nZeltron    3\ndtype: Int64"
          },
          "metadata": {}
        }
      ],
      "execution_count": 129
    },
    {
      "cell_type": "markdown",
      "source": "#### 2.10 Challenge: Bees Knees",
      "metadata": {
        "jp-MarkdownHeadingCollapsed": true
      }
    },
    {
      "cell_type": "code",
      "source": "# Given two Series, bees and knees: if the ith value of bees is NaN, double the ith value of knees.",
      "metadata": {
        "trusted": true
      },
      "outputs": [],
      "execution_count": 130
    },
    {
      "cell_type": "code",
      "source": "bees = pd.Series([True, True, False, np.nan, True, False, True, np.nan])\nprint(bees)",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "name": "stdout",
          "text": "0     True\n1     True\n2    False\n3      NaN\n4     True\n5    False\n6     True\n7      NaN\ndtype: object\n",
          "output_type": "stream"
        }
      ],
      "execution_count": 132
    },
    {
      "cell_type": "code",
      "source": "knees = pd.Series([5,2,9,1,3,10,5,2], index = [7,0,2,6,3,5,1,4])\nprint(knees)",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "name": "stdout",
          "text": "7     5\n0     2\n2     9\n6     1\n3     3\n5    10\n1     5\n4     2\ndtype: int64\n",
          "output_type": "stream"
        }
      ],
      "execution_count": 137
    },
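    {
      "cell_type": "code",
      "source": "# solution\n\n# to_numpy() is important: it turns the boolean mask into a plain array, so the mask is applied by position.\n# Without it, knees.loc[pd.isna(bees)] would align the mask with knees on index labels (bees' labels 3 and 7) instead.\n\nknees.loc[pd.isna(bees).to_numpy()] *= 2\nprint(knees)",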
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "name": "stdout",
          "text": "7     5\n0     2\n2     9\n6     2\n3     3\n5    10\n1     5\n4     4\ndtype: int64\n",
          "output_type": "stream"
        }
      ],
      "execution_count": 136
    },
    {
      "cell_type": "markdown",
      "source": "#### 2.11 Challenge: Car Shopping",
      "metadata": {
        "jp-MarkdownHeadingCollapsed": true
      }
    },
    {
      "cell_type": "code",
      "source": "# After accidentally leaving an ice chest of fish and shrimp in your car for a week while you were on vacation, you're now in the market for a new vehicle.\n# Your insurance didn't cover the loss, so you want to make sure you get a good deal on your new car. Given a series of car asking_prices and another series of car fair_prices,\n# determine which cars for sale are a good deal. In other words, identify cars whose asking price is less than their fair price.\n\n# The result should be a list of integer indices corresponding to the good deals in asking_prices.",
      "metadata": {
        "trusted": true
      },
      "outputs": [],
      "execution_count": 141
    },
    {
      "cell_type": "code",
      "source": "asking_prices = pd.Series([5000, 7600, 9000, 8500, 7000], index =['civic', 'civic', 'camry', 'mustang', 'mustang'])\nfair_prices = pd.Series([5500, 7500, 7500], index = ['civic', 'mustang', 'camry'])",
      "metadata": {
        "trusted": true
      },
      "outputs": [],
      "execution_count": 152
    },
    {
      "cell_type": "code",
      "source": "# solution\n\n# we can restructure this and solve it with Dataframes. we haven't learnt dataframes yet.\n\nall_fair_prices = fair_prices.loc[asking_prices.index]\noff_market_prices = asking_prices - all_fair_prices\nbelow_fair_prices = (off_market_prices < 0).reset_index(drop=True)\nbelow_fair_prices.loc[below_fair_prices].index.to_list()\n",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "execution_count": 156,
          "output_type": "execute_result",
          "data": {
            "text/plain": "[0, 4]"
          },
          "metadata": {}
        }
      ],
      "execution_count": 156
    },
    {
      "cell_type": "markdown",
      "source": "#### 2.12 Challenge: Price Gouging",
      "metadata": {
        "jp-MarkdownHeadingCollapsed": true
      }
    },
    {
      "cell_type": "code",
      "source": "# You track the price of ground beef every day for 10 days. You've compiled the data into a Series called beef_prices, whose index represents the day of each recording. Determine which\n# day had the biggest price increase from the prior day.",
      "metadata": {
        "trusted": true
      },
      "outputs": [],
      "execution_count": 157
    },
    {
      "cell_type": "code",
      "source": "generator = np.random.default_rng(123)\nbeef_prices = pd.Series(\n    data = np.round(generator.uniform(low = 3, high = 5, size = 10), 2),\n    index = generator.choice(10, size = 10, replace = False)\n)\nprint(beef_prices)",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "name": "stdout",
          "text": "4    4.36\n8    3.11\n2    3.44\n0    3.37\n6    3.35\n9    4.62\n3    4.85\n5    3.55\n1    4.64\n7    4.78\ndtype: float64\n",
          "output_type": "stream"
        }
      ],
      "execution_count": 159
    },
    {
      "cell_type": "code",
      "source": "# solution\n\nbeef_prices.sort_index(inplace = True)\nbeef_prices_prev = beef_prices.shift(periods = 1)\ndaily_changes = beef_prices - beef_prices_prev\ndaily_changes.idxmax()  # returns row label of the maximum value",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "execution_count": 164,
          "output_type": "execute_result",
          "data": {
            "text/plain": "9"
          },
          "metadata": {}
        }
      ],
      "execution_count": 164
    },
    {
      "cell_type": "markdown",
      "source": "#### 2.13 Challenge: Fair Teams",
      "metadata": {
        "jp-MarkdownHeadingCollapsed": true
      }
    },
    {
      "cell_type": "code",
      "source": "# 6 coaches and 20 players have signed up. Your job is to randomly and fairly determine the teams, assigning players to coaches. Keep in mind that some teams\n# will have three players and some teams will have four players. Given a Series of coaches and a Series of players, create a Series of random coach-to-player mappings.\n\n# The resulting Series should have coach names in its index and corresponding player names in its values.",
      "metadata": {
        "trusted": true
      },
      "outputs": [],
      "execution_count": 165
    },
    {
      "cell_type": "code",
      "source": "coaches = pd.Series(['Aaron', 'Donald', 'Joshua', 'Peter', 'Scott', 'Stephen'], dtype = 'string')\nplayers = pd.Series(['Asher', 'Connor', 'Elizabeth', 'Emily', 'Ethan', 'Hannah', 'Isabella', 'Isaiah', 'James', 'Joshua', 'Julian', 'Layla', 'Leo', \n                     'Madison', 'Mia', 'Oliver', 'Ryan', 'Scarlat', 'William', 'Wyatt'], dtype = 'string')",
      "metadata": {
        "trusted": true
      },
      "outputs": [],
      "execution_count": 166
    },
    {
      "cell_type": "code",
      "source": "coaches = coaches.sample(frac=1, random_state=2357) # sample(frac=1) returns all items in random order, i.e. a shuffle\nplayers = players.sample(frac=1, random_state=7532)\n\nrepeats = np.ceil(len(players)/len(coaches)).astype('int64') # number of times the coaches list must be repeated to cover every player: ceil(20/6) = 4\ncoaches_repeated = pd.concat([coaches] * repeats).head(len(players)) # repeat the coaches, then head(len(players)) keeps only the first 20 so the lengths match\n\nresult = players.copy()\nresult.index = pd.Index(coaches_repeated, name ='coach')\nprint(result)",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "name": "stdout",
          "text": "1     Donald\n3      Peter\n5    Stephen\n0      Aaron\n2     Joshua\n4      Scott\ndtype: string\n12          Leo\n13      Madison\n2     Elizabeth\n10       Julian\n8         James\n11        Layla\n6      Isabella\n7        Isaiah\n15       Oliver\n14          Mia\n17      Scarlat\n3         Emily\n5        Hannah\n1        Connor\n9        Joshua\n4         Ethan\n0         Asher\n18      William\n16         Ryan\n19        Wyatt\ndtype: string\ncoach\nDonald           Leo\nPeter        Madison\nStephen    Elizabeth\nAaron         Julian\nJoshua         James\nScott          Layla\nDonald      Isabella\nPeter         Isaiah\nStephen       Oliver\nAaron            Mia\nJoshua       Scarlat\nScott          Emily\nDonald        Hannah\nPeter         Connor\nStephen       Joshua\nAaron          Ethan\nJoshua         Asher\nScott        William\nDonald          Ryan\nPeter          Wyatt\ndtype: string\n",
          "output_type": "stream"
        }
      ],
      "execution_count": 168
    },
    {
      "cell_type": "markdown",
      "source": "### DataFrame",
      "metadata": {}
    },
    {
      "cell_type": "markdown",
      "source": "#### 3.1 DataFrame Creation",
      "metadata": {}
    },
    {
      "cell_type": "code",
      "source": "# A DataFrame is a table of data with a row index. The row index is the unlabelled column on the left.\n\ndf = pd.DataFrame({'name' : ['Bob', 'Sue', 'Mary'], 'age' : [39, 57, 28]})\n\nprint(df)",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "name": "stdout",
          "text": "   name  age\n0   Bob   39\n1   Sue   57\n2  Mary   28\n",
          "output_type": "stream"
        }
      ],
      "execution_count": 3
    },
    {
      "cell_type": "code",
      "source": "df = pd.DataFrame(\n    [\n        ['Bob', 39],\n        ['Sue', 57],\n        ['Mary', 28]\n    ], columns =['name', 'age']\n)\nprint(df)",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "name": "stdout",
          "text": "   name  age\n0   Bob   39\n1   Sue   57\n2  Mary   28\n",
          "output_type": "stream"
        }
      ],
      "execution_count": 5
    },
    {
      "cell_type": "code",
      "source": "df.info()",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "name": "stdout",
          "text": "<class 'pandas.core.frame.DataFrame'>\nRangeIndex: 3 entries, 0 to 2\nData columns (total 2 columns):\n #   Column  Non-Null Count  Dtype \n---  ------  --------------  ----- \n 0   name    3 non-null      object\n 1   age     3 non-null      int64 \ndtypes: int64(1), object(1)\nmemory usage: 108.0+ bytes\n",
          "output_type": "stream"
        }
      ],
      "execution_count": 6
    },
    {
      "cell_type": "code",
      "source": "df.shape",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "execution_count": 7,
          "output_type": "execute_result",
          "data": {
            "text/plain": "(3, 2)"
          },
          "metadata": {}
        }
      ],
      "execution_count": 7
    },
    {
      "cell_type": "code",
      "source": "df.axes",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "execution_count": 8,
          "output_type": "execute_result",
          "data": {
            "text/plain": "[RangeIndex(start=0, stop=3, step=1), Index(['name', 'age'], dtype='object')]"
          },
          "metadata": {}
        }
      ],
      "execution_count": 8
    },
    {
      "cell_type": "code",
      "source": "df.size",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "execution_count": 9,
          "output_type": "execute_result",
          "data": {
            "text/plain": "6"
          },
          "metadata": {}
        }
      ],
      "execution_count": 9
    },
    {
      "cell_type": "code",
      "source": "df.rename(columns = {'age' : 'years'}, inplace = True)\nprint(df)",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "name": "stdout",
          "text": "   name  years\n0   Bob     39\n1   Sue     57\n2  Mary     28\n",
          "output_type": "stream"
        }
      ],
      "execution_count": 11
    },
    {
      "cell_type": "markdown",
      "source": "#### 3.2 To and From CSV",
      "metadata": {}
    },
    {
      "cell_type": "code",
      "source": "# create data \n\ndf = pd.DataFrame(\n    {\n        'id' : np.arange(1000),\n        'b' : np.random.normal(size = 1000),\n        'c': pd.Series(np.random.choice(['cat', 'dog', 'hippo'], size = 1000, replace = True))\n    }\n)\nprint(df)",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "name": "stdout",
          "text": "      id         b      c\n0      0  1.703454    dog\n1      1 -0.291755    dog\n2      2 -0.255308    cat\n3      3 -0.628465    cat\n4      4 -0.108681    dog\n..   ...       ...    ...\n995  995 -0.127438  hippo\n996  996 -2.956545  hippo\n997  997  0.032741    cat\n998  998 -0.966289  hippo\n999  999  0.086645    cat\n\n[1000 rows x 3 columns]\n",
          "output_type": "stream"
        }
      ],
      "execution_count": 2
    },
    {
      "cell_type": "code",
      "source": "df.to_csv('pets.csv', index = False)",
      "metadata": {
        "trusted": true
      },
      "outputs": [],
      "execution_count": 3
    },
    {
      "cell_type": "code",
      "source": "pets = pd.read_csv('pets.csv')\nprint(pets)",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "name": "stdout",
          "text": "      id         b      c\n0      0  1.703454    dog\n1      1 -0.291755    dog\n2      2 -0.255308    cat\n3      3 -0.628465    cat\n4      4 -0.108681    dog\n..   ...       ...    ...\n995  995 -0.127438  hippo\n996  996 -2.956545  hippo\n997  997  0.032741    cat\n998  998 -0.966289  hippo\n999  999  0.086645    cat\n\n[1000 rows x 3 columns]\n",
          "output_type": "stream"
        }
      ],
      "execution_count": 4
    },
    {
      "cell_type": "markdown",
      "source": "#### 3.3 Basic Indexing",
      "metadata": {}
    },
    {
      "cell_type": "code",
      "source": "df = pd.DataFrame(\n    {\n        'shrimp' : [10, 20, 30 , 40, 50, 60],\n        'crab' : [5, 10, 15, 20, 25, 30],\n        'red fish' : [2,3,5,7,11, 13]\n    }\n)\nprint(df)",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "name": "stdout",
          "text": "   shrimp  crab  red fish\n0      10     5         2\n1      20    10         3\n2      30    15         5\n3      40    20         7\n4      50    25        11\n5      60    30        13\n",
          "output_type": "stream"
        }
      ],
      "execution_count": 5
    },
    {
      "cell_type": "code",
      "source": "df['shrimp']",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "execution_count": 6,
          "output_type": "execute_result",
          "data": {
            "text/plain": "0    10\n1    20\n2    30\n3    40\n4    50\n5    60\nName: shrimp, dtype: int64"
          },
          "metadata": {}
        }
      ],
      "execution_count": 6
    },
    {
      "cell_type": "code",
      "source": "df.shrimp",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "execution_count": 7,
          "output_type": "execute_result",
          "data": {
            "text/plain": "0    10\n1    20\n2    30\n3    40\n4    50\n5    60\nName: shrimp, dtype: int64"
          },
          "metadata": {}
        }
      ],
      "execution_count": 7
    },
    {
      "cell_type": "code",
      "source": "df[['shrimp', 'red fish']] # copy of original dataframe",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "execution_count": 9,
          "output_type": "execute_result",
          "data": {
            "text/plain": "   shrimp  red fish\n0      10         2\n1      20         3\n2      30         5\n3      40         7\n4      50        11\n5      60        13",
            "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>shrimp</th>\n      <th>red fish</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>10</td>\n      <td>2</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>20</td>\n      <td>3</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>30</td>\n      <td>5</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>40</td>\n      <td>7</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>50</td>\n      <td>11</td>\n    </tr>\n    <tr>\n      <th>5</th>\n      <td>60</td>\n      <td>13</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
          },
          "metadata": {}
        }
      ],
      "execution_count": 9
    },
    {
      "cell_type": "code",
      "source": "df.iloc[[0,2,4]]",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "execution_count": 10,
          "output_type": "execute_result",
          "data": {
            "text/plain": "   shrimp  crab  red fish\n0      10     5         2\n2      30    15         5\n4      50    25        11",
            "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>shrimp</th>\n      <th>crab</th>\n      <th>red fish</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>10</td>\n      <td>5</td>\n      <td>2</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>30</td>\n      <td>15</td>\n      <td>5</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>50</td>\n      <td>25</td>\n      <td>11</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
          },
          "metadata": {}
        }
      ],
      "execution_count": 10
    },
    {
      "cell_type": "code",
      "source": "df.iloc[0:5:2] #give me everything from 0 (inclusive) to 5 (exclusive), stepping by 2",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "execution_count": 14,
          "output_type": "execute_result",
          "data": {
            "text/plain": "   shrimp  crab  red fish\n0      10     5         2\n2      30    15         5\n4      50    25        11",
            "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>shrimp</th>\n      <th>crab</th>\n      <th>red fish</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>10</td>\n      <td>5</td>\n      <td>2</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>30</td>\n      <td>15</td>\n      <td>5</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>50</td>\n      <td>25</td>\n      <td>11</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
          },
          "metadata": {}
        }
      ],
      "execution_count": 14
    },
    {
      "cell_type": "code",
      "source": "df.iloc[1] # returns as Series",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "execution_count": 15,
          "output_type": "execute_result",
          "data": {
            "text/plain": "shrimp      20\ncrab        10\nred fish     3\nName: 1, dtype: int64"
          },
          "metadata": {}
        }
      ],
      "execution_count": 15
    },
    {
      "cell_type": "code",
      "source": "df.iloc[[1]]",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "execution_count": 16,
          "output_type": "execute_result",
          "data": {
            "text/plain": "   shrimp  crab  red fish\n1      20    10         3",
            "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>shrimp</th>\n      <th>crab</th>\n      <th>red fish</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>1</th>\n      <td>20</td>\n      <td>10</td>\n      <td>3</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
          },
          "metadata": {}
        }
      ],
      "execution_count": 16
    },
    {
      "cell_type": "code",
      "source": "df.iloc[:, [0,1]] # give me every row of the df but only columns 0, 1",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "execution_count": 20,
          "output_type": "execute_result",
          "data": {
            "text/plain": "   shrimp  crab\n0      10     5\n1      20    10\n2      30    15\n3      40    20\n4      50    25\n5      60    30",
            "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>shrimp</th>\n      <th>crab</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>10</td>\n      <td>5</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>20</td>\n      <td>10</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>30</td>\n      <td>15</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>40</td>\n      <td>20</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>50</td>\n      <td>25</td>\n    </tr>\n    <tr>\n      <th>5</th>\n      <td>60</td>\n      <td>30</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
          },
          "metadata": {}
        }
      ],
      "execution_count": 20
    },
    {
      "cell_type": "code",
      "source": "df.iloc[[0,2], [1,2]] # give me rows 0, 2 with columns 1, 2",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "execution_count": 22,
          "output_type": "execute_result",
          "data": {
            "text/plain": "   crab  red fish\n0     5         2\n2    15         5",
            "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>crab</th>\n      <th>red fish</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>5</td>\n      <td>2</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>15</td>\n      <td>5</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
          },
          "metadata": {}
        }
      ],
      "execution_count": 22
    },
    {
      "cell_type": "code",
      "source": "df.index = ['a', 'b', 'c', 'd', 'e', 'f']\nprint(df)",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "name": "stdout",
          "text": "   shrimp  crab  red fish\na      10     5         2\nb      20    10         3\nc      30    15         5\nd      40    20         7\ne      50    25        11\nf      60    30        13\n",
          "output_type": "stream"
        }
      ],
      "execution_count": 23
    },
    {
      "cell_type": "code",
      "source": "df.loc[['b', 'e']]",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "execution_count": 24,
          "output_type": "execute_result",
          "data": {
            "text/plain": "   shrimp  crab  red fish\nb      20    10         3\ne      50    25        11",
            "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>shrimp</th>\n      <th>crab</th>\n      <th>red fish</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>b</th>\n      <td>20</td>\n      <td>10</td>\n      <td>3</td>\n    </tr>\n    <tr>\n      <th>e</th>\n      <td>50</td>\n      <td>25</td>\n      <td>11</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
          },
          "metadata": {}
        }
      ],
      "execution_count": 24
    },
    {
      "cell_type": "code",
      "source": "df.loc[['a', 'c', 'f'], ['crab' , 'shrimp']]",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "execution_count": 25,
          "output_type": "execute_result",
          "data": {
            "text/plain": "   crab  shrimp\na     5      10\nc    15      30\nf    30      60",
            "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>crab</th>\n      <th>shrimp</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>a</th>\n      <td>5</td>\n      <td>10</td>\n    </tr>\n    <tr>\n      <th>c</th>\n      <td>15</td>\n      <td>30</td>\n    </tr>\n    <tr>\n      <th>f</th>\n      <td>30</td>\n      <td>60</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
          },
          "metadata": {}
        }
      ],
      "execution_count": 25
    },
    {
      "cell_type": "code",
      "source": "df.loc['b':'e', ['crab', 'shrimp']]",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "execution_count": 28,
          "output_type": "execute_result",
          "data": {
            "text/plain": "   crab  shrimp\nb    10      20\nc    15      30\nd    20      40\ne    25      50",
            "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>crab</th>\n      <th>shrimp</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>b</th>\n      <td>10</td>\n      <td>20</td>\n    </tr>\n    <tr>\n      <th>c</th>\n      <td>15</td>\n      <td>30</td>\n    </tr>\n    <tr>\n      <th>d</th>\n      <td>20</td>\n      <td>40</td>\n    </tr>\n    <tr>\n      <th>e</th>\n      <td>25</td>\n      <td>50</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
          },
          "metadata": {}
        }
      ],
      "execution_count": 28
    },
    {
      "cell_type": "code",
      "source": "# df.iloc[:3, ['shrimp']] doesn't work in pandas: iloc is position-based and expects integer indexers, not labels\n\ndf.iloc[:3, [0, 2]]",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "execution_count": 32,
          "output_type": "execute_result",
          "data": {
            "text/plain": "   shrimp  red fish\na      10         2\nb      20         3\nc      30         5",
            "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>shrimp</th>\n      <th>red fish</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>a</th>\n      <td>10</td>\n      <td>2</td>\n    </tr>\n    <tr>\n      <th>b</th>\n      <td>20</td>\n      <td>3</td>\n    </tr>\n    <tr>\n      <th>c</th>\n      <td>30</td>\n      <td>5</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
          },
          "metadata": {}
        }
      ],
      "execution_count": 32
    },
    {
      "cell_type": "code",
      "source": "# get_loc - get integer location for requested label\ndf.iloc[:3, [df.columns.get_loc(c) for c in ['shrimp', 'red fish']]]",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "execution_count": 34,
          "output_type": "execute_result",
          "data": {
            "text/plain": "   shrimp  red fish\na      10         2\nb      20         3\nc      30         5",
            "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>shrimp</th>\n      <th>red fish</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>a</th>\n      <td>10</td>\n      <td>2</td>\n    </tr>\n    <tr>\n      <th>b</th>\n      <td>20</td>\n      <td>3</td>\n    </tr>\n    <tr>\n      <th>c</th>\n      <td>30</td>\n      <td>5</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
          },
          "metadata": {}
        }
      ],
      "execution_count": 34
    },
    {
      "cell_type": "code",
      "source": "df.loc[df.shrimp > 40, ['shrimp']]",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "execution_count": 39,
          "output_type": "execute_result",
          "data": {
            "text/plain": "   shrimp\ne      50\nf      60",
            "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>shrimp</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>e</th>\n      <td>50</td>\n    </tr>\n    <tr>\n      <th>f</th>\n      <td>60</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
          },
          "metadata": {}
        }
      ],
      "execution_count": 39
    },
    {
      "cell_type": "markdown",
      "source": "#### 3.4 Basic Operations",
      "metadata": {}
    },
    {
      "cell_type": "code",
      "source": "df = pd.DataFrame(\n    {\n        'a' : [2,3,11, 13],\n        'b' : ['fox', 'rabbit', 'hound', 'rabbit']\n    }\n)\nprint(df)",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "name": "stdout",
          "text": "    a       b\n0   2     fox\n1   3  rabbit\n2  11   hound\n3  13  rabbit\n",
          "output_type": "stream"
        }
      ],
      "execution_count": 41
    },
    {
      "cell_type": "code",
      "source": "# inserting a new column into df\n\ndf['c'] = [1, 0 , 1, 2]\n\n# we cannot use df.c = 1\nprint(df)",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "name": "stdout",
          "text": "    a       b  c\n0   2     fox  1\n1   3  rabbit  0\n2  11   hound  1\n3  13  rabbit  2\n",
          "output_type": "stream"
        }
      ],
      "execution_count": 44
    },
    {
      "cell_type": "code",
      "source": "df['e'] = df.a + df.c\nprint(df)",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "name": "stdout",
          "text": "    a       b  c   e\n0   2     fox  1   3\n1   3  rabbit  0   3\n2  11   hound  1  12\n3  13  rabbit  2  15\n",
          "output_type": "stream"
        }
      ],
      "execution_count": 45
    },
    {
      "cell_type": "code",
      "source": "df.loc[df.b == 'rabbit', 'f'] = 0\nprint(df)",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "name": "stdout",
          "text": "    a       b  c   e    f\n0   2     fox  1   3  NaN\n1   3  rabbit  0   3  0.0\n2  11   hound  1  12  NaN\n3  13  rabbit  2  15  0.0\n",
          "output_type": "stream"
        }
      ],
      "execution_count": 47
    },
    {
      "cell_type": "code",
      "source": "df.drop(columns = ['a', 'c'], inplace = True)\nprint(df)",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "name": "stdout",
          "text": "        b   e    f\n0     fox   3  NaN\n1  rabbit   3  0.0\n2   hound  12  NaN\n3  rabbit  15  0.0\n",
          "output_type": "stream"
        }
      ],
      "execution_count": 48
    },
    {
      "cell_type": "markdown",
      "source": "#### 3.5 apply()",
      "metadata": {}
    },
    {
      "cell_type": "code",
      "source": "df = pd.DataFrame(\n    {\n        'A' : [5.2, 1.7, 9.4],\n        'B' : [3.9, 4.0, 7.8]\n    }\n)\nprint(df)",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "name": "stdout",
          "text": "     A    B\n0  5.2  3.9\n1  1.7  4.0\n2  9.4  7.8\n",
          "output_type": "stream"
        }
      ],
      "execution_count": 49
    },
    {
      "cell_type": "code",
      "source": "# apply a function along an axis of the DataFrame\n\ndf.apply(func = np.sum, axis = 1)",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "execution_count": 51,
          "output_type": "execute_result",
          "data": {
            "text/plain": "0     9.1\n1     5.7\n2    17.2\ndtype: float64"
          },
          "metadata": {}
        }
      ],
      "execution_count": 51
    },
    {
      "cell_type": "code",
      "source": "kids = pd.DataFrame(\n    {\n        'name' : pd.Series(['alice', 'mary', 'jimmy', 'johnny', 'susan'], dtype = 'string'),\n        'age' : [9,13,11,15, 8],\n        'with_adult' : [True, False, False, True, True]\n    }\n)\nprint(kids)",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "name": "stdout",
          "text": "     name  age  with_adult\n0   alice    9        True\n1    mary   13       False\n2   jimmy   11       False\n3  johnny   15        True\n4   susan    8        True\n",
          "output_type": "stream"
        }
      ],
      "execution_count": 52
    },
    {
      "cell_type": "code",
      "source": "def is_allowed(age, with_adult):\n    return age >= 12 or with_adult",
      "metadata": {
        "trusted": true
      },
      "outputs": [],
      "execution_count": 57
    },
    {
      "cell_type": "code",
      "source": "# kids.apply(is_allowed, axis = 1) # doesn't work\n\nkids.apply(lambda row : is_allowed(row.loc['age'], row.loc['with_adult']), axis = 1)",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "execution_count": 59,
          "output_type": "execute_result",
          "data": {
            "text/plain": "0     True\n1     True\n2    False\n3     True\n4     True\ndtype: bool"
          },
          "metadata": {}
        }
      ],
      "execution_count": 59
    },
    {
      "cell_type": "code",
      "source": "kids['is_allowed'] = kids.apply(lambda row: is_allowed(row.loc['age'], row.loc['with_adult']), axis = 1)\nprint(kids)",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "name": "stdout",
          "text": "     name  age  with_adult  is_allowed\n0   alice    9        True        True\n1    mary   13       False        True\n2   jimmy   11       False       False\n3  johnny   15        True        True\n4   susan    8        True        True\n",
          "output_type": "stream"
        }
      ],
      "execution_count": 61
    },
    {
      "cell_type": "markdown",
      "source": "#### 3.6 view vs copy",
      "metadata": {
        "jp-MarkdownHeadingCollapsed": true
      }
    },
    {
      "cell_type": "code",
      "source": "df = pd.DataFrame(\n    {\n        'x' : [1,2,3],\n        'y' : [10, 20, 30]\n    }\n)\nprint(df)",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "name": "stdout",
          "text": "   x   y\n0  1  10\n1  2  20\n2  3  30\n",
          "output_type": "stream"
        }
      ],
      "execution_count": 77
    },
    {
      "cell_type": "code",
      "source": "v1 = df['x']",
      "metadata": {
        "trusted": true
      },
      "outputs": [],
      "execution_count": 78
    },
    {
      "cell_type": "code",
      "source": "v1.iloc[0] = 999\nprint(df) # df changed",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "name": "stdout",
          "text": "     x   y\n0  999  10\n1    2  20\n2    3  30\n",
          "output_type": "stream"
        }
      ],
      "execution_count": 79
    },
    {
      "cell_type": "code",
      "source": "v2 = df['y'] + 1",
      "metadata": {
        "trusted": true
      },
      "outputs": [],
      "execution_count": 80
    },
    {
      "cell_type": "code",
      "source": "v2.iloc[0] = 0\nprint(df) # df doesn't change",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "name": "stdout",
          "text": "     x   y\n0  999  10\n1    2  20\n2    3  30\n",
          "output_type": "stream"
        }
      ],
      "execution_count": 81
    },
    {
      "cell_type": "code",
      "source": "v1 = df.iloc[[0,1]]\nv2 = df.iloc[:2]",
      "metadata": {
        "trusted": true
      },
      "outputs": [],
      "execution_count": 82
    },
    {
      "cell_type": "code",
      "source": "v1.iloc[1] = 111\nprint(df)",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "name": "stderr",
          "text": "<ipython-input-84-4c2417139498>:1: SettingWithCopyWarning: \nA value is trying to be set on a copy of a slice from a DataFrame\n\nSee the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n  v1.iloc[1] = 111\n",
          "output_type": "stream"
        },
        {
          "name": "stdout",
          "text": "     x   y\n0  999  10\n1    2  20\n2    3  30\n",
          "output_type": "stream"
        }
      ],
      "execution_count": 84
    },
    {
      "cell_type": "code",
      "source": "v2.iloc[1] = 222\nprint(df) #changed because v2 is just a reference to the memory. v2 is a view not a copy.",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "name": "stderr",
          "text": "<ipython-input-85-6c752e41547c>:1: SettingWithCopyWarning: \nA value is trying to be set on a copy of a slice from a DataFrame\n\nSee the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n  v2.iloc[1] = 222\n",
          "output_type": "stream"
        },
        {
          "name": "stdout",
          "text": "     x    y\n0  999   10\n1  222  222\n2    3   30\n",
          "output_type": "stream"
        }
      ],
      "execution_count": 85
    },
    {
      "cell_type": "markdown",
      "source": "#### 3.7 merge()",
      "metadata": {
        "jp-MarkdownHeadingCollapsed": true
      }
    },
    {
      "cell_type": "code",
      "source": "pets = pd.DataFrame(\n    data = {\n        'name' : ['Mr. Snuggles', 'Honey Chew Chew', 'Professor', 'Chairman Meow', 'hello'],\n        'type' : ['cat', 'dog', 'dog', 'cat', 'horse']\n    },\n    index = [71, 42, 11, 98, 42]\n)\n\nprint(pets)",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "name": "stdout",
          "text": "               name   type\n71     Mr. Snuggles    cat\n42  Honey Chew Chew    dog\n11        Professor    dog\n98         Chairman    cat\n42            hello  horse\n",
          "output_type": "stream"
        }
      ],
      "execution_count": 87
    },
    {
      "cell_type": "code",
      "source": "visits = pd.DataFrame(\n    data = {\n        'pet_id' : [42, 31, 71, 42, 98, 42],\n        'date' : ['2019-03-15', '2019-03-15', '2019-04-05', '2019-04-06', '2019-04-07', '2019-04-08']\n    }\n)\nprint(visits)",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "name": "stdout",
          "text": "   pet_id        date\n0      42  2019-03-15\n1      31  2019-03-15\n2      71  2019-04-05\n3      42  2019-04-06\n4      98  2019-04-07\n5      42  2019-04-08\n",
          "output_type": "stream"
        }
      ],
      "execution_count": 90
    },
    {
      "cell_type": "code",
      "source": "# merge - merges data frames or named series objects with a databse-style join\n\n# params - left, right, how (left, right, outer, inner(default))\n\npd.merge(left = pets, right = visits, how = 'inner', left_index =True, right_on = 'pet_id')\n",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "execution_count": 92,
          "output_type": "execute_result",
          "data": {
            "text/plain": "              name   type  pet_id        date\n2     Mr. Snuggles    cat      71  2019-04-05\n0  Honey Chew Chew    dog      42  2019-03-15\n3  Honey Chew Chew    dog      42  2019-04-06\n5  Honey Chew Chew    dog      42  2019-04-08\n0            hello  horse      42  2019-03-15\n3            hello  horse      42  2019-04-06\n5            hello  horse      42  2019-04-08\n4         Chairman    cat      98  2019-04-07",
            "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>name</th>\n      <th>type</th>\n      <th>pet_id</th>\n      <th>date</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>2</th>\n      <td>Mr. Snuggles</td>\n      <td>cat</td>\n      <td>71</td>\n      <td>2019-04-05</td>\n    </tr>\n    <tr>\n      <th>0</th>\n      <td>Honey Chew Chew</td>\n      <td>dog</td>\n      <td>42</td>\n      <td>2019-03-15</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>Honey Chew Chew</td>\n      <td>dog</td>\n      <td>42</td>\n      <td>2019-04-06</td>\n    </tr>\n    <tr>\n      <th>5</th>\n      <td>Honey Chew Chew</td>\n      <td>dog</td>\n      <td>42</td>\n      <td>2019-04-08</td>\n    </tr>\n    <tr>\n      <th>0</th>\n      <td>hello</td>\n      <td>horse</td>\n      <td>42</td>\n      <td>2019-03-15</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>hello</td>\n      <td>horse</td>\n      <td>42</td>\n      <td>2019-04-06</td>\n    </tr>\n    <tr>\n      <th>5</th>\n      <td>hello</td>\n      <td>horse</td>\n      <td>42</td>\n      <td>2019-04-08</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>Chairman</td>\n      <td>cat</td>\n      <td>98</td>\n      <td>2019-04-07</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
          },
          "metadata": {}
        }
      ],
      "execution_count": 92
    },
    {
      "cell_type": "code",
      "source": "pets.index.rename('pet_id', inplace = True)\nprint(pets)",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "name": "stdout",
          "text": "                   name   type\npet_id                        \n71         Mr. Snuggles    cat\n42      Honey Chew Chew    dog\n11            Professor    dog\n98             Chairman    cat\n42                hello  horse\n",
          "output_type": "stream"
        }
      ],
      "execution_count": 93
    },
    {
      "cell_type": "code",
      "source": "visits.index.rename('visit_id', inplace = True)\nprint(visits)",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "name": "stdout",
          "text": "          pet_id        date\nvisit_id                    \n0             42  2019-03-15\n1             31  2019-03-15\n2             71  2019-04-05\n3             42  2019-04-06\n4             98  2019-04-07\n5             42  2019-04-08\n",
          "output_type": "stream"
        }
      ],
      "execution_count": 94
    },
    {
      "cell_type": "code",
      "source": "# advantage of setting petid, visit_id is we can us 'on' while merging\n\npd.merge(left = pets, right = visits, how = 'inner', on = 'pet_id')",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "execution_count": 95,
          "output_type": "execute_result",
          "data": {
            "text/plain": "   pet_id             name   type        date\n0      71     Mr. Snuggles    cat  2019-04-05\n1      42  Honey Chew Chew    dog  2019-03-15\n2      42  Honey Chew Chew    dog  2019-04-06\n3      42  Honey Chew Chew    dog  2019-04-08\n4      42            hello  horse  2019-03-15\n5      42            hello  horse  2019-04-06\n6      42            hello  horse  2019-04-08\n7      98         Chairman    cat  2019-04-07",
            "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>pet_id</th>\n      <th>name</th>\n      <th>type</th>\n      <th>date</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>71</td>\n      <td>Mr. Snuggles</td>\n      <td>cat</td>\n      <td>2019-04-05</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>42</td>\n      <td>Honey Chew Chew</td>\n      <td>dog</td>\n      <td>2019-03-15</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>42</td>\n      <td>Honey Chew Chew</td>\n      <td>dog</td>\n      <td>2019-04-06</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>42</td>\n      <td>Honey Chew Chew</td>\n      <td>dog</td>\n      <td>2019-04-08</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>42</td>\n      <td>hello</td>\n      <td>horse</td>\n      <td>2019-03-15</td>\n    </tr>\n    <tr>\n      <th>5</th>\n      <td>42</td>\n      <td>hello</td>\n      <td>horse</td>\n      <td>2019-04-06</td>\n    </tr>\n    <tr>\n      <th>6</th>\n      <td>42</td>\n      <td>hello</td>\n      <td>horse</td>\n      <td>2019-04-08</td>\n    </tr>\n    <tr>\n      <th>7</th>\n      <td>98</td>\n      <td>Chairman</td>\n      <td>cat</td>\n      <td>2019-04-07</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
          },
          "metadata": {}
        }
      ],
      "execution_count": 95
    },
    {
      "cell_type": "code",
      "source": "pd.merge(left = pets, right = visits, how = 'left', on = 'pet_id')",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "execution_count": 96,
          "output_type": "execute_result",
          "data": {
            "text/plain": "   pet_id             name   type        date\n0      71     Mr. Snuggles    cat  2019-04-05\n1      42  Honey Chew Chew    dog  2019-03-15\n2      42  Honey Chew Chew    dog  2019-04-06\n3      42  Honey Chew Chew    dog  2019-04-08\n4      11        Professor    dog         NaN\n5      98         Chairman    cat  2019-04-07\n6      42            hello  horse  2019-03-15\n7      42            hello  horse  2019-04-06\n8      42            hello  horse  2019-04-08",
            "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>pet_id</th>\n      <th>name</th>\n      <th>type</th>\n      <th>date</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>71</td>\n      <td>Mr. Snuggles</td>\n      <td>cat</td>\n      <td>2019-04-05</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>42</td>\n      <td>Honey Chew Chew</td>\n      <td>dog</td>\n      <td>2019-03-15</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>42</td>\n      <td>Honey Chew Chew</td>\n      <td>dog</td>\n      <td>2019-04-06</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>42</td>\n      <td>Honey Chew Chew</td>\n      <td>dog</td>\n      <td>2019-04-08</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>11</td>\n      <td>Professor</td>\n      <td>dog</td>\n      <td>NaN</td>\n    </tr>\n    <tr>\n      <th>5</th>\n      <td>98</td>\n      <td>Chairman</td>\n      <td>cat</td>\n      <td>2019-04-07</td>\n    </tr>\n    <tr>\n      <th>6</th>\n      <td>42</td>\n      <td>hello</td>\n      <td>horse</td>\n      <td>2019-03-15</td>\n    </tr>\n    <tr>\n      <th>7</th>\n      <td>42</td>\n      <td>hello</td>\n      <td>horse</td>\n      <td>2019-04-06</td>\n    </tr>\n    <tr>\n      <th>8</th>\n      <td>42</td>\n      <td>hello</td>\n      <td>horse</td>\n      <td>2019-04-08</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
          },
          "metadata": {}
        }
      ],
      "execution_count": 96
    },
    {
      "cell_type": "code",
      "source": "pd.merge(left = pets, right = visits, how = 'right', on = 'pet_id')",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "execution_count": 98,
          "output_type": "execute_result",
          "data": {
            "text/plain": "   pet_id             name   type        date\n0      42  Honey Chew Chew    dog  2019-03-15\n1      42            hello  horse  2019-03-15\n2      31              NaN    NaN  2019-03-15\n3      71     Mr. Snuggles    cat  2019-04-05\n4      42  Honey Chew Chew    dog  2019-04-06\n5      42            hello  horse  2019-04-06\n6      98         Chairman    cat  2019-04-07\n7      42  Honey Chew Chew    dog  2019-04-08\n8      42            hello  horse  2019-04-08",
            "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>pet_id</th>\n      <th>name</th>\n      <th>type</th>\n      <th>date</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>42</td>\n      <td>Honey Chew Chew</td>\n      <td>dog</td>\n      <td>2019-03-15</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>42</td>\n      <td>hello</td>\n      <td>horse</td>\n      <td>2019-03-15</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>31</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>2019-03-15</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>71</td>\n      <td>Mr. Snuggles</td>\n      <td>cat</td>\n      <td>2019-04-05</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>42</td>\n      <td>Honey Chew Chew</td>\n      <td>dog</td>\n      <td>2019-04-06</td>\n    </tr>\n    <tr>\n      <th>5</th>\n      <td>42</td>\n      <td>hello</td>\n      <td>horse</td>\n      <td>2019-04-06</td>\n    </tr>\n    <tr>\n      <th>6</th>\n      <td>98</td>\n      <td>Chairman</td>\n      <td>cat</td>\n      <td>2019-04-07</td>\n    </tr>\n    <tr>\n      <th>7</th>\n      <td>42</td>\n      <td>Honey Chew Chew</td>\n      <td>dog</td>\n      <td>2019-04-08</td>\n    </tr>\n    <tr>\n      <th>8</th>\n      <td>42</td>\n      <td>hello</td>\n      <td>horse</td>\n      <td>2019-04-08</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
          },
          "metadata": {}
        }
      ],
      "execution_count": 98
    },
    {
      "cell_type": "code",
      "source": "pd.merge(left = pets, right = visits, how = 'outer', on = 'pet_id')",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "execution_count": 99,
          "output_type": "execute_result",
          "data": {
            "text/plain": "   pet_id             name   type        date\n0      71     Mr. Snuggles    cat  2019-04-05\n1      42  Honey Chew Chew    dog  2019-03-15\n2      42  Honey Chew Chew    dog  2019-04-06\n3      42  Honey Chew Chew    dog  2019-04-08\n4      42            hello  horse  2019-03-15\n5      42            hello  horse  2019-04-06\n6      42            hello  horse  2019-04-08\n7      11        Professor    dog         NaN\n8      98         Chairman    cat  2019-04-07\n9      31              NaN    NaN  2019-03-15",
            "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>pet_id</th>\n      <th>name</th>\n      <th>type</th>\n      <th>date</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>71</td>\n      <td>Mr. Snuggles</td>\n      <td>cat</td>\n      <td>2019-04-05</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>42</td>\n      <td>Honey Chew Chew</td>\n      <td>dog</td>\n      <td>2019-03-15</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>42</td>\n      <td>Honey Chew Chew</td>\n      <td>dog</td>\n      <td>2019-04-06</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>42</td>\n      <td>Honey Chew Chew</td>\n      <td>dog</td>\n      <td>2019-04-08</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>42</td>\n      <td>hello</td>\n      <td>horse</td>\n      <td>2019-03-15</td>\n    </tr>\n    <tr>\n      <th>5</th>\n      <td>42</td>\n      <td>hello</td>\n      <td>horse</td>\n      <td>2019-04-06</td>\n    </tr>\n    <tr>\n      <th>6</th>\n      <td>42</td>\n      <td>hello</td>\n      <td>horse</td>\n      <td>2019-04-08</td>\n    </tr>\n    <tr>\n      <th>7</th>\n      <td>11</td>\n      <td>Professor</td>\n      <td>dog</td>\n      <td>NaN</td>\n    </tr>\n    <tr>\n      <th>8</th>\n      <td>98</td>\n      <td>Chairman</td>\n      <td>cat</td>\n      <td>2019-04-07</td>\n    </tr>\n    <tr>\n      <th>9</th>\n      <td>31</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>2019-03-15</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
          },
          "metadata": {}
        }
      ],
      "execution_count": 99
    },
    {
      "cell_type": "code",
      "source": "# anti join i.e records that are in pets but not in visits\n# easy but less efficient and uses lot of memory\nouter = pd.merge(left = pets, right = visits, how = 'outer', on = 'pet_id', indicator = True)\nprint(outer)",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "name": "stdout",
          "text": "   pet_id             name   type        date      _merge\n0      71     Mr. Snuggles    cat  2019-04-05        both\n1      42  Honey Chew Chew    dog  2019-03-15        both\n2      42  Honey Chew Chew    dog  2019-04-06        both\n3      42  Honey Chew Chew    dog  2019-04-08        both\n4      42            hello  horse  2019-03-15        both\n5      42            hello  horse  2019-04-06        both\n6      42            hello  horse  2019-04-08        both\n7      11        Professor    dog         NaN   left_only\n8      98         Chairman    cat  2019-04-07        both\n9      31              NaN    NaN  2019-03-15  right_only\n",
          "output_type": "stream"
        }
      ],
      "execution_count": 100
    },
    {
      "cell_type": "code",
      "source": "a_not_in_b = outer.loc[outer._merge == 'left_only']\nprint(a_not_in_b)",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "name": "stdout",
          "text": "   pet_id       name type date     _merge\n7      11  Professor  dog  NaN  left_only\n",
          "output_type": "stream"
        }
      ],
      "execution_count": 102
    },
    {
      "cell_type": "code",
      "source": "# memory efficient way\npets.index.isin(visits.pet_id)\n",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "execution_count": 103,
          "output_type": "execute_result",
          "data": {
            "text/plain": "array([ True,  True, False,  True,  True])"
          },
          "metadata": {}
        }
      ],
      "execution_count": 103
    },
    {
      "cell_type": "code",
      "source": "pets.loc[~pets.index.isin(visits.pet_id)]",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "execution_count": 104,
          "output_type": "execute_result",
          "data": {
            "text/plain": "             name type\npet_id                \n11      Professor  dog",
            "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>name</th>\n      <th>type</th>\n    </tr>\n    <tr>\n      <th>pet_id</th>\n      <th></th>\n      <th></th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>11</th>\n      <td>Professor</td>\n      <td>dog</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
          },
          "metadata": {}
        }
      ],
      "execution_count": 104
    },
    {
      "cell_type": "markdown",
      "source": "#### 3.8 Aggregation",
      "metadata": {}
    },
    {
      "cell_type": "code",
      "source": "df = pd.DataFrame({\n    'x' : [3.1, 5.5, 9.2, 1.7, 1.2, 8.3, 2.6],\n    'y' : [1.4, np.nan, 5.0, 5.8, 9.0, np.nan, 9.2]\n})\n\nprint(df)",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "name": "stdout",
          "text": "     x    y\n0  3.1  1.4\n1  5.5  NaN\n2  9.2  5.0\n3  1.7  5.8\n4  1.2  9.0\n5  8.3  NaN\n6  2.6  9.2\n",
          "output_type": "stream"
        }
      ],
      "execution_count": 105
    },
    {
      "cell_type": "code",
      "source": "# agg - aggregate using one or more operations over the specified axis.\n\ndf.agg('sum')",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "execution_count": 106,
          "output_type": "execute_result",
          "data": {
            "text/plain": "x    31.6\ny    30.4\ndtype: float64"
          },
          "metadata": {}
        }
      ],
      "execution_count": 106
    },
    {
      "cell_type": "code",
      "source": "# as a df\ndf.agg(['sum']) ",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "execution_count": 107,
          "output_type": "execute_result",
          "data": {
            "text/plain": "        x     y\nsum  31.6  30.4",
            "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>x</th>\n      <th>y</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>sum</th>\n      <td>31.6</td>\n      <td>30.4</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
          },
          "metadata": {}
        }
      ],
      "execution_count": 107
    },
    {
      "cell_type": "code",
      "source": "df.agg(['sum', 'mean']) ",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "execution_count": 108,
          "output_type": "execute_result",
          "data": {
            "text/plain": "              x      y\nsum   31.600000  30.40\nmean   4.514286   6.08",
            "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>x</th>\n      <th>y</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>sum</th>\n      <td>31.600000</td>\n      <td>30.40</td>\n    </tr>\n    <tr>\n      <th>mean</th>\n      <td>4.514286</td>\n      <td>6.08</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
          },
          "metadata": {}
        }
      ],
      "execution_count": 108
    },
    {
      "cell_type": "code",
      "source": "# you can even pass custom functions\ndef foofun(x):\n    return x.iloc[1]\n\ndf.agg(foofun)",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "execution_count": 110,
          "output_type": "execute_result",
          "data": {
            "text/plain": "x    5.5\ny    NaN\ndtype: float64"
          },
          "metadata": {}
        }
      ],
      "execution_count": 110
    },
    {
      "cell_type": "code",
      "source": "df.agg({'x' : ['sum', 'mean'], 'y' : ['min', 'max']})",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "execution_count": 111,
          "output_type": "execute_result",
          "data": {
            "text/plain": "              x    y\nsum   31.600000  NaN\nmean   4.514286  NaN\nmin         NaN  1.4\nmax         NaN  9.2",
            "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>x</th>\n      <th>y</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>sum</th>\n      <td>31.600000</td>\n      <td>NaN</td>\n    </tr>\n    <tr>\n      <th>mean</th>\n      <td>4.514286</td>\n      <td>NaN</td>\n    </tr>\n    <tr>\n      <th>min</th>\n      <td>NaN</td>\n      <td>1.4</td>\n    </tr>\n    <tr>\n      <th>max</th>\n      <td>NaN</td>\n      <td>9.2</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
          },
          "metadata": {}
        }
      ],
      "execution_count": 111
    },
    {
      "cell_type": "markdown",
      "source": "#### 3.9 groupby()",
      "metadata": {}
    },
    {
      "cell_type": "code",
      "source": "df = pd.DataFrame({\n    'A' : ['foo', 'bar', 'foo', 'bar', 'bar', 'foo', 'foo'],\n    'B' : [False, True, False, True, True, True, True],\n    'C' : [2.1, 1.9, 3.6, 4.0, 1.9, 7.8, 2.8],\n    'D' : [50, np.nan, 30, 90, 10, np.nan, 10]\n}).convert_dtypes() # convert_dtypes - convert columns to best possible dtypes using dtypes supporting pd.NA\nprint(df)",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "name": "stdout",
          "text": "     A      B    C     D\n0  foo  False  2.1    50\n1  bar   True  1.9  <NA>\n2  foo  False  3.6    30\n3  bar   True  4.0    90\n4  bar   True  1.9    10\n5  foo   True  7.8  <NA>\n6  foo   True  2.8    10\n",
          "output_type": "stream"
        }
      ],
      "execution_count": 135
    },
    {
      "cell_type": "code",
      "source": "groups = df.groupby(by = 'A')\nprint(groups.groups)",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "name": "stdout",
          "text": "{'bar': [1, 3, 4], 'foo': [0, 2, 5, 6]}\n",
          "output_type": "stream"
        }
      ],
      "execution_count": 136
    },
    {
      "cell_type": "code",
      "source": "print(df)",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "name": "stdout",
          "text": "     A      B    C     D\n0  foo  False  2.1    50\n1  bar   True  1.9  <NA>\n2  foo  False  3.6    30\n3  bar   True  4.0    90\n4  bar   True  1.9    10\n5  foo   True  7.8  <NA>\n6  foo   True  2.8    10\n",
          "output_type": "stream"
        }
      ],
      "execution_count": 137
    },
    {
      "cell_type": "code",
      "source": "df.groupby('A')['C'].sum()",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "execution_count": 117,
          "output_type": "execute_result",
          "data": {
            "text/plain": "A\nbar     7.8\nfoo    16.3\nName: C, dtype: Float64"
          },
          "metadata": {}
        }
      ],
      "execution_count": 117
    },
    {
      "cell_type": "code",
      "source": "df.groupby('A')[['C']].sum()",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "execution_count": 118,
          "output_type": "execute_result",
          "data": {
            "text/plain": "        C\nA        \nbar   7.8\nfoo  16.3",
            "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>C</th>\n    </tr>\n    <tr>\n      <th>A</th>\n      <th></th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>bar</th>\n      <td>7.8</td>\n    </tr>\n    <tr>\n      <th>foo</th>\n      <td>16.3</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
          },
          "metadata": {}
        }
      ],
      "execution_count": 118
    },
    {
      "cell_type": "code",
      "source": "df.groupby('A')[['C', 'D']].agg('sum')",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "execution_count": 119,
          "output_type": "execute_result",
          "data": {
            "text/plain": "        C    D\nA             \nbar   7.8  100\nfoo  16.3   90",
            "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>C</th>\n      <th>D</th>\n    </tr>\n    <tr>\n      <th>A</th>\n      <th></th>\n      <th></th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>bar</th>\n      <td>7.8</td>\n      <td>100</td>\n    </tr>\n    <tr>\n      <th>foo</th>\n      <td>16.3</td>\n      <td>90</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
          },
          "metadata": {}
        }
      ],
      "execution_count": 119
    },
    {
      "cell_type": "code",
      "source": "df.groupby(by = ['B','D'], dropna = False).agg(\n    C_min = ('C', 'min'),\n    C_max = ('C', np.max)\n)",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "execution_count": 134,
          "output_type": "execute_result",
          "data": {
            "text/plain": "            C_min  C_max\nB     D                 \nFalse 30      3.6    3.6\n      50      2.1    2.1\nTrue  10      1.9    2.8\n      90      4.0    4.0\n      <NA>    1.9    7.8",
            "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th></th>\n      <th>C_min</th>\n      <th>C_max</th>\n    </tr>\n    <tr>\n      <th>B</th>\n      <th>D</th>\n      <th></th>\n      <th></th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th rowspan=\"2\" valign=\"top\">False</th>\n      <th>30</th>\n      <td>3.6</td>\n      <td>3.6</td>\n    </tr>\n    <tr>\n      <th>50</th>\n      <td>2.1</td>\n      <td>2.1</td>\n    </tr>\n    <tr>\n      <th rowspan=\"3\" valign=\"top\">True</th>\n      <th>10</th>\n      <td>1.9</td>\n      <td>2.8</td>\n    </tr>\n    <tr>\n      <th>90</th>\n      <td>4.0</td>\n      <td>4.0</td>\n    </tr>\n    <tr>\n      <th>&lt;NA&gt;</th>\n      <td>1.9</td>\n      <td>7.8</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
          },
          "metadata": {}
        }
      ],
      "execution_count": 134
    },
    {
      "cell_type": "code",
      "source": "df.groupby('A')['C'].transform(lambda x : x.sort_values().values)",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "execution_count": 141,
          "output_type": "execute_result",
          "data": {
            "text/plain": "0    2.1\n1    1.9\n2    2.8\n3    1.9\n4    4.0\n5    3.6\n6    7.8\nName: C, dtype: Float64"
          },
          "metadata": {}
        }
      ],
      "execution_count": 141
    },
    {
      "cell_type": "code",
      "source": "df['C_sorted_within_A'] = df.groupby('A')['C'].transform(lambda x : x.sort_values().values)\nprint(df)",
      "metadata": {
        "trusted": true
      },
      "outputs": [
        {
          "name": "stdout",
          "text": "     A      B    C     D  C_sorted_within_A\n0  foo  False  2.1    50                2.1\n1  bar   True  1.9  <NA>                1.9\n2  foo  False  3.6    30                2.8\n3  bar   True  4.0    90                1.9\n4  bar   True  1.9    10                4.0\n5  foo   True  7.8  <NA>                3.6\n6  foo   True  2.8    10                7.8\n",
          "output_type": "stream"
        }
      ],
      "execution_count": 144
    },
    {
      "cell_type": "code",
      "source": "",
      "metadata": {
        "trusted": true
      },
      "outputs": [],
      "execution_count": null
    }
  ]
 }