-
-
Save datavudeja/20de87878a156b6546f938784a22212b to your computer and use it in GitHub Desktop.
Pandas cheatsheet with examples
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "metadata": { | |
| "kernelspec": { | |
| "name": "python", | |
| "display_name": "Python (Pyodide)", | |
| "language": "python" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "python", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.8" | |
| } | |
| }, | |
| "nbformat_minor": 4, | |
| "nbformat": 4, | |
| "cells": [ | |
| { | |
| "cell_type": "code", | |
| "source": "import pandas as pd\nimport numpy as np", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [], | |
| "execution_count": 1 | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "source": "### Series\n", | |
| "metadata": { | |
| "jp-MarkdownHeadingCollapsed": true | |
| } | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "source": "#### 2.1 Series Creation", | |
| "metadata": { | |
| "jp-MarkdownHeadingCollapsed": true | |
| } | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "x = pd.Series([5, 10, 15, 20, 25, 30, 35])", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [], | |
| "execution_count": 4 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "print(x)", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "text": "0 5\n1 10\n2 15\n3 20\n4 25\n5 30\n6 35\ndtype: int64\n", | |
| "output_type": "stream" | |
| } | |
| ], | |
| "execution_count": 5 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "type(x)", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "execution_count": 6, | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": "pandas.core.series.Series" | |
| }, | |
| "metadata": {} | |
| } | |
| ], | |
| "execution_count": 6 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "x.head()", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "execution_count": 7, | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": "0 5\n1 10\n2 15\n3 20\n4 25\ndtype: int64" | |
| }, | |
| "metadata": {} | |
| } | |
| ], | |
| "execution_count": 7 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "x.tail()", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "execution_count": 8, | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": "2 15\n3 20\n4 25\n5 30\n6 35\ndtype: int64" | |
| }, | |
| "metadata": {} | |
| } | |
| ], | |
| "execution_count": 8 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "data = {'a' : 0.,'b' : 1., 'c' : 2.,'d': 3,}\n\ny = pd.Series(data)\n\nprint(y)", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "text": "a 0.0\nb 1.0\nc 2.0\nd 3.0\ndtype: float64\n", | |
| "output_type": "stream" | |
| } | |
| ], | |
| "execution_count": 11 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "z = np.array([10, 20, 30 , 40], dtype = 'int64')\n\npd.Series(z)", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "execution_count": 16, | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": "0 10\n1 20\n2 30\n3 40\ndtype: int64" | |
| }, | |
| "metadata": {} | |
| } | |
| ], | |
| "execution_count": 16 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "pd.Series(np.linspace(start = 1, stop = 2, num = 10))", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "execution_count": 17, | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": "0 1.000000\n1 1.111111\n2 1.222222\n3 1.333333\n4 1.444444\n5 1.555556\n6 1.666667\n7 1.777778\n8 1.888889\n9 2.000000\ndtype: float64" | |
| }, | |
| "metadata": {} | |
| } | |
| ], | |
| "execution_count": 17 | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "source": "#### 2.2 Basic Indexing", | |
| "metadata": { | |
| "jp-MarkdownHeadingCollapsed": true | |
| } | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "x = pd.Series([5, 10, 15, 20, 25])\nx.index = [3, 1, 4, 0, 2]\n\nprint(x)", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "text": "3 5\n1 10\n4 15\n0 20\n2 25\ndtype: int64\n", | |
| "output_type": "stream" | |
| } | |
| ], | |
| "execution_count": 21 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "x[0]", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "execution_count": 22, | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": "20" | |
| }, | |
| "metadata": {} | |
| } | |
| ], | |
| "execution_count": 22 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "x.loc[0]", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "execution_count": 25, | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": "20" | |
| }, | |
| "metadata": {} | |
| } | |
| ], | |
| "execution_count": 25 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "x.iloc[0]", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "execution_count": 29, | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": "5" | |
| }, | |
| "metadata": {} | |
| } | |
| ], | |
| "execution_count": 29 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "x.iloc[1:4:2]", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "execution_count": 30, | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": "1 10\n0 20\ndtype: int64" | |
| }, | |
| "metadata": {} | |
| } | |
| ], | |
| "execution_count": 30 | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "source": "###### Range index vs int64 Index", | |
| "metadata": {} | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "x = pd.Series(np.random.normal(size = 10**7))\nprint(x)", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "text": "0 -0.095360\n1 -0.531923\n2 0.989790\n3 -0.515691\n4 0.229343\n ... \n9999995 -0.308055\n9999996 0.685634\n9999997 1.574605\n9999998 -0.551347\n9999999 -0.376001\nLength: 10000000, dtype: float64\n", | |
| "output_type": "stream" | |
| } | |
| ], | |
| "execution_count": 33 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "x.index", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "execution_count": 39, | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": "RangeIndex(start=0, stop=10000000, step=1)" | |
| }, | |
| "metadata": {} | |
| } | |
| ], | |
| "execution_count": 39 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "y = pd.Series(np.random.normal(size = 10**7), index = np.arange(10**7))\nprint(y)", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "text": "0 -0.048644\n1 1.150449\n2 1.340439\n3 1.286724\n4 -1.398852\n ... \n9999995 -1.448750\n9999996 0.604911\n9999997 0.025833\n9999998 -0.064581\n9999999 -0.112200\nLength: 10000000, dtype: float64\n", | |
| "output_type": "stream" | |
| } | |
| ], | |
| "execution_count": 36 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "y.index", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "execution_count": 38, | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": "Int64Index([ 0, 1, 2, 3, 4, 5, 6,\n 7, 8, 9,\n ...\n 9999990, 9999991, 9999992, 9999993, 9999994, 9999995, 9999996,\n 9999997, 9999998, 9999999],\n dtype='int64', length=10000000)" | |
| }, | |
| "metadata": {} | |
| } | |
| ], | |
| "execution_count": 38 | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "source": "int64 index consumes more memory than range index. it literally stores all the 10^7 indices", | |
| "metadata": {} | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "import sys", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [], | |
| "execution_count": 40 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "sys.getsizeof(x)", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "execution_count": 41, | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": "80000088" | |
| }, | |
| "metadata": {} | |
| } | |
| ], | |
| "execution_count": 41 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "sys.getsizeof(y)", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "execution_count": 43, | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": "160000016" | |
| }, | |
| "metadata": {} | |
| } | |
| ], | |
| "execution_count": 43 | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "source": "###### Overwriting data in a series", | |
| "metadata": {} | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "foo = pd.Series([10, 20, 30, 40, 50], index = ['a', 'b', 'c','d','e'])\nprint(foo)", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "text": "a 10\nb 20\nc 30\nd 40\ne 50\ndtype: int64\n", | |
| "output_type": "stream" | |
| } | |
| ], | |
| "execution_count": 44 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "foo.iloc[1] = 200\nprint(foo)", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "text": "a 10\nb 200\nc 30\nd 40\ne 50\ndtype: int64\n", | |
| "output_type": "stream" | |
| } | |
| ], | |
| "execution_count": 46 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "foo.iloc[[0,1,2]] = 99\nprint(foo)", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "text": "a 99\nb 99\nc 99\nd 40\ne 50\ndtype: int64\n", | |
| "output_type": "stream" | |
| } | |
| ], | |
| "execution_count": 48 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "foo.iloc[:3] = 999\nprint(foo)", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "text": "a 999\nb 999\nc 999\nd 40\ne 50\ndtype: int64\n", | |
| "output_type": "stream" | |
| } | |
| ], | |
| "execution_count": 50 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "x = pd.Series([10, 20, 30, 40])\ny = pd.Series([1, 11, 111, 1111], index = [7,3,2,0])\n\nprint(y)", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "text": "7 1\n3 11\n2 111\n0 1111\ndtype: int64\n", | |
| "output_type": "stream" | |
| } | |
| ], | |
| "execution_count": 61 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "print(x)\n", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "text": "0 10\n1 20\n2 30\n3 40\ndtype: int64\n", | |
| "output_type": "stream" | |
| } | |
| ], | |
| "execution_count": 62 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "x.loc[[0,1]] = y", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [], | |
| "execution_count": 53 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "print(x) # looks for labels 0, 1 in x and try to replace the same with 0, 1 from y. as there is no 1 in y, it changes to NaN", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "text": "0 1111.0\n1 NaN\n2 30.0\n3 40.0\ndtype: float64\n", | |
| "output_type": "stream" | |
| } | |
| ], | |
| "execution_count": 55 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "x.iloc[:2] = y.to_numpy()[:2]\nprint(x)", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "text": "0 1\n1 11\n2 30\n3 40\ndtype: int64\n", | |
| "output_type": "stream" | |
| } | |
| ], | |
| "execution_count": 63 | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "source": "#### 2.3 Series Basic Operations", | |
| "metadata": { | |
| "jp-MarkdownHeadingCollapsed": true | |
| } | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "x = pd.Series([1,2,3,4])\nprint(x)", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "text": "0 1\n1 2\n2 3\n3 4\ndtype: int64\n", | |
| "output_type": "stream" | |
| } | |
| ], | |
| "execution_count": 66 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "x + 1", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "execution_count": 67, | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": "0 2\n1 3\n2 4\n3 5\ndtype: int64" | |
| }, | |
| "metadata": {} | |
| } | |
| ], | |
| "execution_count": 67 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "x + pd.Series(1) # only adds at index 0", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "execution_count": 68, | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": "0 2.0\n1 NaN\n2 NaN\n3 NaN\ndtype: float64" | |
| }, | |
| "metadata": {} | |
| } | |
| ], | |
| "execution_count": 68 | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "source": "what is we don't want NaNs?", | |
| "metadata": {} | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "x = pd.Series([1, 2,3,4])\ny = pd.Series([10, 20], index = [2,1])\n\nprint(x)\nprint(y)", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "text": "0 1\n1 2\n2 3\n3 4\ndtype: int64\n2 10\n1 20\ndtype: int64\n", | |
| "output_type": "stream" | |
| } | |
| ], | |
| "execution_count": 69 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "x.loc[y.index] += y\nprint(x)", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "text": "0 1\n1 22\n2 13\n3 4\ndtype: int64\n", | |
| "output_type": "stream" | |
| } | |
| ], | |
| "execution_count": 71 | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "source": "#### 2.4 Boolean Indexing", | |
| "metadata": { | |
| "jp-MarkdownHeadingCollapsed": true | |
| } | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "foo = pd.Series([20, 50, 11, 45, 17, 31])\nprint(foo)", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "text": "0 20\n1 50\n2 11\n3 45\n4 17\n5 31\ndtype: int64\n", | |
| "output_type": "stream" | |
| } | |
| ], | |
| "execution_count": 73 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "foo < 20", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "execution_count": 74, | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": "0 False\n1 False\n2 True\n3 False\n4 True\n5 False\ndtype: bool" | |
| }, | |
| "metadata": {} | |
| } | |
| ], | |
| "execution_count": 74 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "mask = foo < 20\nfoo.loc[mask] #gets only values below 20. you can write foo.loc[foo < 20] too.", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "execution_count": 75, | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": "2 11\n4 17\ndtype: int64" | |
| }, | |
| "metadata": {} | |
| } | |
| ], | |
| "execution_count": 75 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "foo.index = [0,1,2,3,5,4] # swap last 2 indices\nprint(foo)", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "text": "0 20\n1 50\n2 11\n3 45\n5 17\n4 31\ndtype: int64\n", | |
| "output_type": "stream" | |
| } | |
| ], | |
| "execution_count": 77 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "foo.loc[mask] # tries to match indicex of foo and mask", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "execution_count": 78, | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": "2 11\n4 31\ndtype: int64" | |
| }, | |
| "metadata": {} | |
| } | |
| ], | |
| "execution_count": 78 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "foo.loc[mask.to_numpy()]", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "execution_count": 80, | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": "2 11\n5 17\ndtype: int64" | |
| }, | |
| "metadata": {} | |
| } | |
| ], | |
| "execution_count": 80 | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "source": "combining boolean series - element wise and element wise or negation", | |
| "metadata": {} | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "ages = pd.Series(\n data = [42, 43, 14, 18, 1],\n index = ['peter', 'lois', 'chris', 'meg', 'stewie']\n)\nprint(ages)", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "text": "peter 42\nlois 43\nchris 14\nmeg 18\nstewie 1\ndtype: int64\n", | |
| "output_type": "stream" | |
| } | |
| ], | |
| "execution_count": 81 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "genders = pd.Series(\n data = ['female', 'female', 'male', 'male', 'male'],\n index = ['lois', 'meg', 'chris', 'peter', 'stewie'],\n dtype = 'string'\n)\nprint(genders)", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "text": "lois female\nmeg female\nchris male\npeter male\nstewie male\ndtype: string\n", | |
| "output_type": "stream" | |
| } | |
| ], | |
| "execution_count": 83 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "# Even though their indices are in diff order. we can still answer questions like\n# who's a male younger than 18?\n\nmask = (genders == 'male') & (ages < 18)\nmask.loc[mask]", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "execution_count": 85, | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": "chris True\nstewie True\ndtype: bool" | |
| }, | |
| "metadata": {} | |
| } | |
| ], | |
| "execution_count": 85 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "mask", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "execution_count": 86, | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": "chris True\nlois False\nmeg False\npeter False\nstewie True\ndtype: bool" | |
| }, | |
| "metadata": {} | |
| } | |
| ], | |
| "execution_count": 86 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "~mask", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "execution_count": 87, | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": "chris False\nlois True\nmeg True\npeter True\nstewie False\ndtype: bool" | |
| }, | |
| "metadata": {} | |
| } | |
| ], | |
| "execution_count": 87 | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "source": "#### 2.5 Series Missing Values", | |
| "metadata": { | |
| "jp-MarkdownHeadingCollapsed": true | |
| } | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "# history of Nan\n\nroux = pd.Series([1,2,3])\nprint(roux)\nroux.iloc[1] = np.nan\nprint(roux) # converts the whole series to floating point number", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "text": "0 1\n1 2\n2 3\ndtype: int64\n0 1.0\n1 NaN\n2 3.0\ndtype: float64\n", | |
| "output_type": "stream" | |
| } | |
| ], | |
| "execution_count": 92 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "roux = pd.Series([1,2,3], dtype = 'Int64')\nprint(roux)", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "text": "0 1\n1 2\n2 3\ndtype: Int64\n", | |
| "output_type": "stream" | |
| } | |
| ], | |
| "execution_count": 93 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "roux.iloc[1] = np.nan # or pd.Na or None\nprint(roux)", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "text": "0 1\n1 <NA>\n2 3\ndtype: Int64\n", | |
| "output_type": "stream" | |
| } | |
| ], | |
| "execution_count": 95 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "# fill na\nx = pd.Series([1, pd.NA, 3, pd.NA], dtype = 'Int64')\nprint(x)", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "text": "0 1\n1 <NA>\n2 3\n3 <NA>\ndtype: Int64\n", | |
| "output_type": "stream" | |
| } | |
| ], | |
| "execution_count": 96 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "x.fillna(-1)", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "execution_count": 97, | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": "0 1\n1 -1\n2 3\n3 -1\ndtype: Int64" | |
| }, | |
| "metadata": {} | |
| } | |
| ], | |
| "execution_count": 97 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "x # notice the na values hasn't changed", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "execution_count": 98, | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": "0 1\n1 <NA>\n2 3\n3 <NA>\ndtype: Int64" | |
| }, | |
| "metadata": {} | |
| } | |
| ], | |
| "execution_count": 98 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "# to change it. use inplace=True\nx.fillna(-1, inplace=True)\nprint(x)", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "text": "0 1\n1 -1\n2 3\n3 -1\ndtype: Int64\n", | |
| "output_type": "stream" | |
| } | |
| ], | |
| "execution_count": 99 | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "source": "#### 2.6 Vectorization", | |
| "metadata": { | |
| "jp-MarkdownHeadingCollapsed": true | |
| } | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "x = pd.Series(np.random.uniform(low = 1, high = 2, size = 10**6))\nprint(x)\n", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "text": "0 1.410044\n1 1.161561\n2 1.107593\n3 1.293172\n4 1.155268\n ... \n999995 1.682250\n999996 1.589057\n999997 1.310423\n999998 1.540775\n999999 1.796745\nLength: 1000000, dtype: float64\n", | |
| "output_type": "stream" | |
| } | |
| ], | |
| "execution_count": 101 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "def average(x):\n avg = 0.0\n for i in range(x.size):\n avg += x.iloc[i]/x.size\n return avg", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [], | |
| "execution_count": 104 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "average(x) # very slow", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "execution_count": 105, | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": "1.4999582043534072" | |
| }, | |
| "metadata": {} | |
| } | |
| ], | |
| "execution_count": 105 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "np.mean(x) # very fast\n\n# avoid using loops.", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "execution_count": 106, | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": "1.4999582043534536" | |
| }, | |
| "metadata": {} | |
| } | |
| ], | |
| "execution_count": 106 | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "source": "#### 2.7 apply()", | |
| "metadata": { | |
| "jp-MarkdownHeadingCollapsed": true | |
| } | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "foo = pd.Series([3,9,2,2,8,7])\nprint(foo)", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "text": "0 3\n1 9\n2 2\n3 2\n4 8\n5 7\ndtype: int64\n", | |
| "output_type": "stream" | |
| } | |
| ], | |
| "execution_count": 108 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "def my_func(x, a = 1, b = 3):\n return x - a if x % 2 == 0 else x + b", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [], | |
| "execution_count": 112 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "foo.apply(my_func, a = 2, b = 4)", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "execution_count": 114, | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": "0 7\n1 13\n2 0\n3 0\n4 6\n5 11\ndtype: int64" | |
| }, | |
| "metadata": {} | |
| } | |
| ], | |
| "execution_count": 114 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "# apply() is not vectorized\n\nbigfoo = pd.Series(np.random.randint(low = 0, high = 9, size = 10**7))\nprint(bigfoo)", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "text": "0 0\n1 7\n2 2\n3 2\n4 8\n ..\n9999995 5\n9999996 1\n9999997 6\n9999998 4\n9999999 5\nLength: 10000000, dtype: int32\n", | |
| "output_type": "stream" | |
| } | |
| ], | |
| "execution_count": 116 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "%%timeit\ny1 = bigfoo.apply(my_func)", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "text": "4.96 s ± 771 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n", | |
| "output_type": "stream" | |
| } | |
| ], | |
| "execution_count": 117 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "%%timeit\na = bigfoo.to_numpy()\ny2 = pd.Series(np.where(a % 2 == 0, a - 1, a + 3))", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "text": "90.2 ms ± 29.2 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n", | |
| "output_type": "stream" | |
| } | |
| ], | |
| "execution_count": 118 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "# apply method is for convienience and clarity. not speed", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [], | |
| "execution_count": 119 | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "source": "#### 2.8 View vs Copy", | |
| "metadata": { | |
| "jp-MarkdownHeadingCollapsed": true | |
| } | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "x = pd.Series(\n data = [2,3,5,7,11],\n index = [2,11,12, 30, 30]\n)\nprint(x)", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "text": "2 2\n11 3\n12 5\n30 7\n30 11\ndtype: int64\n", | |
| "output_type": "stream" | |
| } | |
| ], | |
| "execution_count": 120 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "y = x\ny.iloc[0] = 99\nprint(y)", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "text": "2 99\n11 3\n12 5\n30 7\n30 11\ndtype: int64\n", | |
| "output_type": "stream" | |
| } | |
| ], | |
| "execution_count": 121 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "print(x) # also modifies x", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "text": "2 99\n11 3\n12 5\n30 7\n30 11\ndtype: int64\n", | |
| "output_type": "stream" | |
| } | |
| ], | |
| "execution_count": 123 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "y = x.copy()\ny.iloc[0] = 999\nprint(y)", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "text": "2 999\n11 3\n12 5\n30 7\n30 11\ndtype: int64\n", | |
| "output_type": "stream" | |
| } | |
| ], | |
| "execution_count": 124 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "print(x) # now x is not changed", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "text": "2 99\n11 3\n12 5\n30 7\n30 11\ndtype: int64\n", | |
| "output_type": "stream" | |
| } | |
| ], | |
| "execution_count": 125 | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "source": "#### 2.9 Challenge : Baby Names", | |
| "metadata": { | |
| "jp-MarkdownHeadingCollapsed": true | |
| } | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "# you and your spouse have decide to let the internet name your next child. You've asked the great people of the web to \n# submit their favorite names and you've compiled their submissions\n# into a series called babynames. Determine how many people have voted for the following names: 'Chad', 'Ruger', and 'Zeltron'", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [], | |
| "execution_count": 126 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "babynames = pd.Series(['Jathonathon', 'Zeltron', 'Ruger', 'Phreddy', 'Ruger', 'Chad', 'Chad' 'Ruger', 'Ryan', 'Ruger', 'Chad', 'Ryan', 'Phreddy', 'Phreddy', 'Phreddy', 'Mister', 'Zeltron'\n , 'Ryan', 'Ruger', 'Ruger', 'Jathonathon', 'Jathonathon', 'Ruger', 'Chad', 'Zeltron'], dtype = 'string')", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [], | |
| "execution_count": 127 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "# solution\n\nbabynames.value_counts().loc[['Chad', 'Ruger', 'Zeltron']] # values_counts - returns a series containing counts of unique values.", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "execution_count": 129, | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": "Chad 3\nRuger 6\nZeltron 3\ndtype: Int64" | |
| }, | |
| "metadata": {} | |
| } | |
| ], | |
| "execution_count": 129 | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "source": "#### 2.10 Challenge: Bess Knees", | |
| "metadata": { | |
| "jp-MarkdownHeadingCollapsed": true | |
| } | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "# Given, two series, bees and knees, if the ith value of bees is NaN, double the ith value inside knees.", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [], | |
| "execution_count": 130 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "bees = pd.Series([True, True, False, np.nan, True, False, True, np.nan])\nprint(bees)", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "text": "0 True\n1 True\n2 False\n3 NaN\n4 True\n5 False\n6 True\n7 NaN\ndtype: object\n", | |
| "output_type": "stream" | |
| } | |
| ], | |
| "execution_count": 132 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "knees = pd.Series([5,2,9,1,3,10,5,2], index = [7,0,2,6,3,5,1,4])\nprint(knees)", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "text": "7 5\n0 2\n2 9\n6 1\n3 3\n5 10\n1 5\n4 2\ndtype: int64\n", | |
| "output_type": "stream" | |
| } | |
| ], | |
| "execution_count": 137 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "# solution\n\n# to_numpy is important else the index wouldn't have matched.\n# knees.loc[pd.isna(bees)] will indices from bees\n\nknees.loc[pd.isna(bees).to_numpy()] *= 2\nprint(knees)", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "text": "7 5\n0 2\n2 9\n6 2\n3 3\n5 10\n1 5\n4 4\ndtype: int64\n", | |
| "output_type": "stream" | |
| } | |
| ], | |
| "execution_count": 136 | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "source": "#### 2.11 Challenge: Car Shopping", | |
| "metadata": { | |
| "jp-MarkdownHeadingCollapsed": true | |
| } | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "# After accidentally leaving an ice chest of fish and shrimp in your car for a week while you were on vacation, you're now in the market for a new vehicle.\n# Your insurance didn't cover the loss, so you want to make sure you get a good deal on your new car. Given a series of car asking_prices and another series of car fair_prices,\n# determine which cars for sale are a good deal. In other words, identify cars whose asking price is less than their fair price.\n\n# The result should be a list of integer indices corresponding to the good deals in asking_prices.", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [], | |
| "execution_count": 141 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "asking_prices = pd.Series([5000, 7600, 9000, 8500, 7000], index =['civic', 'civic', 'camry', 'mustang', 'mustang'])\nfair_prices = pd.Series([5500, 7500, 7500], index = ['civic', 'mustang', 'camry'])", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [], | |
| "execution_count": 152 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "# solution\n\n# we can restructure this and solve it with Dataframes. we haven't learnt dataframes yet.\n\nall_fair_prices = fair_prices.loc[asking_prices.index]\noff_market_prices = asking_prices - all_fair_prices\nbelow_fair_prices = (off_market_prices < 0).reset_index(drop=True)\nbelow_fair_prices.loc[below_fair_prices].index.to_list()\n", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "execution_count": 156, | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": "[0, 4]" | |
| }, | |
| "metadata": {} | |
| } | |
| ], | |
| "execution_count": 156 | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "source": "#### 2.12 Challenge: Price Gouging", | |
| "metadata": { | |
| "jp-MarkdownHeadingCollapsed": true | |
| } | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "# Track the price of ground beef everyday for 10 days. You've compiled the data into a series called beef_prices, whose index represents the day of each recording. Determin which \n# day had the biggest price increase from the prior day.", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [], | |
| "execution_count": 157 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "generator = np.random.default_rng(123)\nbeef_prices = pd.Series(\n data = np.round(generator.uniform(low = 3, high = 5, size = 10), 2),\n index = generator.choice(10, size = 10, replace = False)\n)\nprint(beef_prices)", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "text": "4 4.36\n8 3.11\n2 3.44\n0 3.37\n6 3.35\n9 4.62\n3 4.85\n5 3.55\n1 4.64\n7 4.78\ndtype: float64\n", | |
| "output_type": "stream" | |
| } | |
| ], | |
| "execution_count": 159 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "# solution\n\nbeef_prices.sort_index(inplace = True)\nbeef_prices_prev = beef_prices.shift(periods = 1)\ndaily_changes = beef_prices - beef_prices_prev\ndaily_changes.idxmax() # returns row label of the maximum value", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "execution_count": 164, | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": "9" | |
| }, | |
| "metadata": {} | |
| } | |
| ], | |
| "execution_count": 164 | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "source": "#### 2.13 Challenge: Fair Teams", | |
| "metadata": { | |
| "jp-MarkdownHeadingCollapsed": true | |
| } | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "# 6 coaches and 20 players have signed up. Your job is to randomly and fairly determine the teams, assigning players to coaches. Keep in mind that some teams \n# will have three players and some teams will have four players. given a Series of coaches and Series of players, create a Series of random coach-to-player mappings.\n\n# The resulting Series should have coach names in its index and corresponding player names in its values.", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [], | |
| "execution_count": 165 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "coaches = pd.Series(['Aaron', 'Donald', 'Joshua', 'Peter', 'Scott', 'Stephen'], dtype = 'string')\nplayers = pd.Series(['Asher', 'Connor', 'Elizabeth', 'Emily', 'Ethan', 'Hannah', 'Isabella', 'Isaiah', 'James', 'Joshua', 'Julian', 'Layla', 'Leo', \n 'Madison', 'Mia', 'Oliver', 'Ryan', 'Scarlat', 'William', 'Wyatt'], dtype = 'string')", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [], | |
| "execution_count": 166 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "coaches = coaches.sample(frac=1, random_state=2357) # returns a random sample of items from an axis of object\nplayers = players.sample(frac=1, random_state=7532)\n\nrepeats = np.ceil(len(players)/len(coaches)).astype('int64') # num of times coaches list is repeated beside player list. ceil(20/6) \ncoaches_repeated = pd.concat([coaches] * repeats).head(len(players)) # coaches list to match the length of players. head(players) is used to pick only 20 \n\nresult = players.copy()\nresult.index = pd.Index(coaches_repeated, name ='coach')\nprint(result)", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "text": "1 Donald\n3 Peter\n5 Stephen\n0 Aaron\n2 Joshua\n4 Scott\ndtype: string\n12 Leo\n13 Madison\n2 Elizabeth\n10 Julian\n8 James\n11 Layla\n6 Isabella\n7 Isaiah\n15 Oliver\n14 Mia\n17 Scarlat\n3 Emily\n5 Hannah\n1 Connor\n9 Joshua\n4 Ethan\n0 Asher\n18 William\n16 Ryan\n19 Wyatt\ndtype: string\ncoach\nDonald Leo\nPeter Madison\nStephen Elizabeth\nAaron Julian\nJoshua James\nScott Layla\nDonald Isabella\nPeter Isaiah\nStephen Oliver\nAaron Mia\nJoshua Scarlat\nScott Emily\nDonald Hannah\nPeter Connor\nStephen Joshua\nAaron Ethan\nJoshua Asher\nScott William\nDonald Ryan\nPeter Wyatt\ndtype: string\n", | |
| "output_type": "stream" | |
| } | |
| ], | |
| "execution_count": 168 | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "source": "### DataFrame", | |
| "metadata": {} | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "source": "#### 3.1 DataFrame Creation", | |
| "metadata": {} | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "# Dataframe is a table of data with a row index. row index is an unlabelled column on the left.\n\ndf = pd.DataFrame({'name' : ['Bob', 'Sue', 'Mary'], 'age' : [39, 57, 28]})\n\nprint(df)", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "text": " name age\n0 Bob 39\n1 Sue 57\n2 Mary 28\n", | |
| "output_type": "stream" | |
| } | |
| ], | |
| "execution_count": 3 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "df = pd.DataFrame(\n [\n ['Bob', 39],\n ['Sue', 57],\n ['Mary', 28]\n ], columns =['name', 'age']\n)\nprint(df)", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "text": " name age\n0 Bob 39\n1 Sue 57\n2 Mary 28\n", | |
| "output_type": "stream" | |
| } | |
| ], | |
| "execution_count": 5 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "df.info()", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "text": "<class 'pandas.core.frame.DataFrame'>\nRangeIndex: 3 entries, 0 to 2\nData columns (total 2 columns):\n # Column Non-Null Count Dtype \n--- ------ -------------- ----- \n 0 name 3 non-null object\n 1 age 3 non-null int64 \ndtypes: int64(1), object(1)\nmemory usage: 108.0+ bytes\n", | |
| "output_type": "stream" | |
| } | |
| ], | |
| "execution_count": 6 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "df.shape", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "execution_count": 7, | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": "(3, 2)" | |
| }, | |
| "metadata": {} | |
| } | |
| ], | |
| "execution_count": 7 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "df.axes", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "execution_count": 8, | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": "[RangeIndex(start=0, stop=3, step=1), Index(['name', 'age'], dtype='object')]" | |
| }, | |
| "metadata": {} | |
| } | |
| ], | |
| "execution_count": 8 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "df.size", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "execution_count": 9, | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": "6" | |
| }, | |
| "metadata": {} | |
| } | |
| ], | |
| "execution_count": 9 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "df.rename(columns = {'age' : 'years'}, inplace = True)\nprint(df)", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "text": " name years\n0 Bob 39\n1 Sue 57\n2 Mary 28\n", | |
| "output_type": "stream" | |
| } | |
| ], | |
| "execution_count": 11 | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "source": "#### 3.2 To and From CSV", | |
| "metadata": {} | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "# create data \n\ndf = pd.DataFrame(\n {\n 'id' : np.arange(1000),\n 'b' : np.random.normal(size = 1000),\n 'c': pd.Series(np.random.choice(['cat', 'dog', 'hippo'], size = 1000, replace = True))\n }\n)\nprint(df)", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "text": " id b c\n0 0 1.703454 dog\n1 1 -0.291755 dog\n2 2 -0.255308 cat\n3 3 -0.628465 cat\n4 4 -0.108681 dog\n.. ... ... ...\n995 995 -0.127438 hippo\n996 996 -2.956545 hippo\n997 997 0.032741 cat\n998 998 -0.966289 hippo\n999 999 0.086645 cat\n\n[1000 rows x 3 columns]\n", | |
| "output_type": "stream" | |
| } | |
| ], | |
| "execution_count": 2 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "df.to_csv('pets.csv', index = False)", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [], | |
| "execution_count": 3 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "pets = pd.read_csv('pets.csv')\nprint(pets)", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "text": " id b c\n0 0 1.703454 dog\n1 1 -0.291755 dog\n2 2 -0.255308 cat\n3 3 -0.628465 cat\n4 4 -0.108681 dog\n.. ... ... ...\n995 995 -0.127438 hippo\n996 996 -2.956545 hippo\n997 997 0.032741 cat\n998 998 -0.966289 hippo\n999 999 0.086645 cat\n\n[1000 rows x 3 columns]\n", | |
| "output_type": "stream" | |
| } | |
| ], | |
| "execution_count": 4 | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "source": "#### 3.3 Basic Indexing", | |
| "metadata": {} | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "df = pd.DataFrame(\n {\n 'shrimp' : [10, 20, 30 , 40, 50, 60],\n 'crab' : [5, 10, 15, 20, 25, 30],\n 'red fish' : [2,3,5,7,11, 13]\n }\n)\nprint(df)", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "text": " shrimp crab red fish\n0 10 5 2\n1 20 10 3\n2 30 15 5\n3 40 20 7\n4 50 25 11\n5 60 30 13\n", | |
| "output_type": "stream" | |
| } | |
| ], | |
| "execution_count": 5 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "df['shrimp']", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "execution_count": 6, | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": "0 10\n1 20\n2 30\n3 40\n4 50\n5 60\nName: shrimp, dtype: int64" | |
| }, | |
| "metadata": {} | |
| } | |
| ], | |
| "execution_count": 6 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "df.shrimp", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "execution_count": 7, | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": "0 10\n1 20\n2 30\n3 40\n4 50\n5 60\nName: shrimp, dtype: int64" | |
| }, | |
| "metadata": {} | |
| } | |
| ], | |
| "execution_count": 7 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "df[['shrimp', 'red fish']] # copy of original dataframe", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "execution_count": 9, | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": " shrimp red fish\n0 10 2\n1 20 3\n2 30 5\n3 40 7\n4 50 11\n5 60 13", | |
| "text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>shrimp</th>\n <th>red fish</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>10</td>\n <td>2</td>\n </tr>\n <tr>\n <th>1</th>\n <td>20</td>\n <td>3</td>\n </tr>\n <tr>\n <th>2</th>\n <td>30</td>\n <td>5</td>\n </tr>\n <tr>\n <th>3</th>\n <td>40</td>\n <td>7</td>\n </tr>\n <tr>\n <th>4</th>\n <td>50</td>\n <td>11</td>\n </tr>\n <tr>\n <th>5</th>\n <td>60</td>\n <td>13</td>\n </tr>\n </tbody>\n</table>\n</div>" | |
| }, | |
| "metadata": {} | |
| } | |
| ], | |
| "execution_count": 9 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "df.iloc[[0,2,4]]", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "execution_count": 10, | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": " shrimp crab red fish\n0 10 5 2\n2 30 15 5\n4 50 25 11", | |
| "text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>shrimp</th>\n <th>crab</th>\n <th>red fish</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>10</td>\n <td>5</td>\n <td>2</td>\n </tr>\n <tr>\n <th>2</th>\n <td>30</td>\n <td>15</td>\n <td>5</td>\n </tr>\n <tr>\n <th>4</th>\n <td>50</td>\n <td>25</td>\n <td>11</td>\n </tr>\n </tbody>\n</table>\n</div>" | |
| }, | |
| "metadata": {} | |
| } | |
| ], | |
| "execution_count": 10 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "df.iloc[0:5:2] #give me everything from 0 (inclusive) to 5 (exclusive), stepping by 2", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "execution_count": 14, | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": " shrimp crab red fish\n0 10 5 2\n2 30 15 5\n4 50 25 11", | |
| "text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>shrimp</th>\n <th>crab</th>\n <th>red fish</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>10</td>\n <td>5</td>\n <td>2</td>\n </tr>\n <tr>\n <th>2</th>\n <td>30</td>\n <td>15</td>\n <td>5</td>\n </tr>\n <tr>\n <th>4</th>\n <td>50</td>\n <td>25</td>\n <td>11</td>\n </tr>\n </tbody>\n</table>\n</div>" | |
| }, | |
| "metadata": {} | |
| } | |
| ], | |
| "execution_count": 14 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "df.iloc[1] # returns as Series", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "execution_count": 15, | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": "shrimp 20\ncrab 10\nred fish 3\nName: 1, dtype: int64" | |
| }, | |
| "metadata": {} | |
| } | |
| ], | |
| "execution_count": 15 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "df.iloc[[1]]", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "execution_count": 16, | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": " shrimp crab red fish\n1 20 10 3", | |
| "text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>shrimp</th>\n <th>crab</th>\n <th>red fish</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>1</th>\n <td>20</td>\n <td>10</td>\n <td>3</td>\n </tr>\n </tbody>\n</table>\n</div>" | |
| }, | |
| "metadata": {} | |
| } | |
| ], | |
| "execution_count": 16 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "df.iloc[:, [0,1]] # give me every row of the df but only columns 0, 1", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "execution_count": 20, | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": " shrimp crab\n0 10 5\n1 20 10\n2 30 15\n3 40 20\n4 50 25\n5 60 30", | |
| "text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>shrimp</th>\n <th>crab</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>10</td>\n <td>5</td>\n </tr>\n <tr>\n <th>1</th>\n <td>20</td>\n <td>10</td>\n </tr>\n <tr>\n <th>2</th>\n <td>30</td>\n <td>15</td>\n </tr>\n <tr>\n <th>3</th>\n <td>40</td>\n <td>20</td>\n </tr>\n <tr>\n <th>4</th>\n <td>50</td>\n <td>25</td>\n </tr>\n <tr>\n <th>5</th>\n <td>60</td>\n <td>30</td>\n </tr>\n </tbody>\n</table>\n</div>" | |
| }, | |
| "metadata": {} | |
| } | |
| ], | |
| "execution_count": 20 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "df.iloc[[0,2], [1,2]] # give me rows 0, 2 with columns 1, 2", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "execution_count": 22, | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": " crab red fish\n0 5 2\n2 15 5", | |
| "text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>crab</th>\n <th>red fish</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>5</td>\n <td>2</td>\n </tr>\n <tr>\n <th>2</th>\n <td>15</td>\n <td>5</td>\n </tr>\n </tbody>\n</table>\n</div>" | |
| }, | |
| "metadata": {} | |
| } | |
| ], | |
| "execution_count": 22 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "df.index = ['a', 'b', 'c', 'd', 'e', 'f']\nprint(df)", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "text": " shrimp crab red fish\na 10 5 2\nb 20 10 3\nc 30 15 5\nd 40 20 7\ne 50 25 11\nf 60 30 13\n", | |
| "output_type": "stream" | |
| } | |
| ], | |
| "execution_count": 23 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "df.loc[['b', 'e']]", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "execution_count": 24, | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": " shrimp crab red fish\nb 20 10 3\ne 50 25 11", | |
| "text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>shrimp</th>\n <th>crab</th>\n <th>red fish</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>b</th>\n <td>20</td>\n <td>10</td>\n <td>3</td>\n </tr>\n <tr>\n <th>e</th>\n <td>50</td>\n <td>25</td>\n <td>11</td>\n </tr>\n </tbody>\n</table>\n</div>" | |
| }, | |
| "metadata": {} | |
| } | |
| ], | |
| "execution_count": 24 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "df.loc[['a', 'c', 'f'], ['crab' , 'shrimp']]", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "execution_count": 25, | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": " crab shrimp\na 5 10\nc 15 30\nf 30 60", | |
| "text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>crab</th>\n <th>shrimp</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>a</th>\n <td>5</td>\n <td>10</td>\n </tr>\n <tr>\n <th>c</th>\n <td>15</td>\n <td>30</td>\n </tr>\n <tr>\n <th>f</th>\n <td>30</td>\n <td>60</td>\n </tr>\n </tbody>\n</table>\n</div>" | |
| }, | |
| "metadata": {} | |
| } | |
| ], | |
| "execution_count": 25 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "df.loc['b':'e', ['crab', 'shrimp']]", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "execution_count": 28, | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": " crab shrimp\nb 10 20\nc 15 30\nd 20 40\ne 25 50", | |
| "text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>crab</th>\n <th>shrimp</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>b</th>\n <td>10</td>\n <td>20</td>\n </tr>\n <tr>\n <th>c</th>\n <td>15</td>\n <td>30</td>\n </tr>\n <tr>\n <th>d</th>\n <td>20</td>\n <td>40</td>\n </tr>\n <tr>\n <th>e</th>\n <td>25</td>\n <td>50</td>\n </tr>\n </tbody>\n</table>\n</div>" | |
| }, | |
| "metadata": {} | |
| } | |
| ], | |
| "execution_count": 28 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "# df.iloc[:3, ['shrimp']] doesn't work in pandas. it expects numeric indexers\n\ndf.iloc[:3, [0, 2]]", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "execution_count": 32, | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": " shrimp red fish\na 10 2\nb 20 3\nc 30 5", | |
| "text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>shrimp</th>\n <th>red fish</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>a</th>\n <td>10</td>\n <td>2</td>\n </tr>\n <tr>\n <th>b</th>\n <td>20</td>\n <td>3</td>\n </tr>\n <tr>\n <th>c</th>\n <td>30</td>\n <td>5</td>\n </tr>\n </tbody>\n</table>\n</div>" | |
| }, | |
| "metadata": {} | |
| } | |
| ], | |
| "execution_count": 32 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "# get_loc - get integer location for requested label\ndf.iloc[:3, [df.columns.get_loc(c) for c in ['shrimp', 'red fish']]]", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "execution_count": 34, | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": " shrimp red fish\na 10 2\nb 20 3\nc 30 5", | |
| "text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>shrimp</th>\n <th>red fish</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>a</th>\n <td>10</td>\n <td>2</td>\n </tr>\n <tr>\n <th>b</th>\n <td>20</td>\n <td>3</td>\n </tr>\n <tr>\n <th>c</th>\n <td>30</td>\n <td>5</td>\n </tr>\n </tbody>\n</table>\n</div>" | |
| }, | |
| "metadata": {} | |
| } | |
| ], | |
| "execution_count": 34 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "df.loc[df.shrimp > 40, ['shrimp']]", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "execution_count": 39, | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": " shrimp\ne 50\nf 60", | |
| "text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>shrimp</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>e</th>\n <td>50</td>\n </tr>\n <tr>\n <th>f</th>\n <td>60</td>\n </tr>\n </tbody>\n</table>\n</div>" | |
| }, | |
| "metadata": {} | |
| } | |
| ], | |
| "execution_count": 39 | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "source": "#### 3.4 Basic Operations", | |
| "metadata": {} | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "df = pd.DataFrame(\n {\n 'a' : [2,3,11, 13],\n 'b' : ['fox', 'rabbit', 'hound', 'rabbit']\n }\n)\nprint(df)", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "text": " a b\n0 2 fox\n1 3 rabbit\n2 11 hound\n3 13 rabbit\n", | |
| "output_type": "stream" | |
| } | |
| ], | |
| "execution_count": 41 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "# inserting a new column into df\n\ndf['c'] = [1, 0 , 1, 2]\n\n# we cannot use df.c = 1\nprint(df)", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "text": " a b c\n0 2 fox 1\n1 3 rabbit 0\n2 11 hound 1\n3 13 rabbit 2\n", | |
| "output_type": "stream" | |
| } | |
| ], | |
| "execution_count": 44 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "df['e'] = df.a + df.c\nprint(df)", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "text": " a b c e\n0 2 fox 1 3\n1 3 rabbit 0 3\n2 11 hound 1 12\n3 13 rabbit 2 15\n", | |
| "output_type": "stream" | |
| } | |
| ], | |
| "execution_count": 45 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "df.loc[df.b == 'rabbit', 'f'] = 0\nprint(df)", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "text": " a b c e f\n0 2 fox 1 3 NaN\n1 3 rabbit 0 3 0.0\n2 11 hound 1 12 NaN\n3 13 rabbit 2 15 0.0\n", | |
| "output_type": "stream" | |
| } | |
| ], | |
| "execution_count": 47 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "df.drop(columns = ['a', 'c'], inplace = True)\nprint(df)", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "text": " b e f\n0 fox 3 NaN\n1 rabbit 3 0.0\n2 hound 12 NaN\n3 rabbit 15 0.0\n", | |
| "output_type": "stream" | |
| } | |
| ], | |
| "execution_count": 48 | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "source": "#### 3.5 apply()", | |
| "metadata": {} | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "df = pd.DataFrame(\n {\n 'A' : [5.2, 1.7, 9.4],\n 'B' : [3.9, 4.0, 7.8]\n }\n)\nprint(df)", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "text": " A B\n0 5.2 3.9\n1 1.7 4.0\n2 9.4 7.8\n", | |
| "output_type": "stream" | |
| } | |
| ], | |
| "execution_count": 49 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "# apply a function along an axis of the DataFrame\n\ndf.apply(func = np.sum, axis = 1)", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "execution_count": 51, | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": "0 9.1\n1 5.7\n2 17.2\ndtype: float64" | |
| }, | |
| "metadata": {} | |
| } | |
| ], | |
| "execution_count": 51 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "kids = pd.DataFrame(\n {\n 'name' : pd.Series(['alice', 'mary', 'jimmy', 'johnny', 'susan'], dtype = 'string'),\n 'age' : [9,13,11,15, 8],\n 'with_adult' : [True, False, False, True, True]\n }\n)\nprint(kids)", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "text": " name age with_adult\n0 alice 9 True\n1 mary 13 False\n2 jimmy 11 False\n3 johnny 15 True\n4 susan 8 True\n", | |
| "output_type": "stream" | |
| } | |
| ], | |
| "execution_count": 52 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "def is_allowed(age, with_adult):\n return age >= 12 or with_adult", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [], | |
| "execution_count": 57 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "# kids.apply(is_allowed, axis = 1) # doesn't work\n\nkids.apply(lambda row : is_allowed(row.loc['age'], row.loc['with_adult']), axis = 1)", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "execution_count": 59, | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": "0 True\n1 True\n2 False\n3 True\n4 True\ndtype: bool" | |
| }, | |
| "metadata": {} | |
| } | |
| ], | |
| "execution_count": 59 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "kids['is_allowed'] = kids.apply(lambda row: is_allowed(row.loc['age'], row.loc['with_adult']), axis = 1)\nprint(kids)", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "text": " name age with_adult is_allowed\n0 alice 9 True True\n1 mary 13 False True\n2 jimmy 11 False False\n3 johnny 15 True True\n4 susan 8 True True\n", | |
| "output_type": "stream" | |
| } | |
| ], | |
| "execution_count": 61 | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "source": "#### 3.6 view vs copy", | |
| "metadata": { | |
| "jp-MarkdownHeadingCollapsed": true | |
| } | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "df = pd.DataFrame(\n {\n 'x' : [1,2,3],\n 'y' : [10, 20, 30]\n }\n)\nprint(df)", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "text": " x y\n0 1 10\n1 2 20\n2 3 30\n", | |
| "output_type": "stream" | |
| } | |
| ], | |
| "execution_count": 77 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "v1 = df['x']", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [], | |
| "execution_count": 78 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "v1.iloc[0] = 999\nprint(df) # df changed", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "text": " x y\n0 999 10\n1 2 20\n2 3 30\n", | |
| "output_type": "stream" | |
| } | |
| ], | |
| "execution_count": 79 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "v2 = df['y'] + 1", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [], | |
| "execution_count": 80 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "v2.iloc[0] = 0\nprint(df) # df doesn't change", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "text": " x y\n0 999 10\n1 2 20\n2 3 30\n", | |
| "output_type": "stream" | |
| } | |
| ], | |
| "execution_count": 81 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "v1 = df.iloc[[0,1]]\nv2 = df.iloc[:2]", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [], | |
| "execution_count": 82 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "v1.iloc[1] = 111\nprint(df)", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stderr", | |
| "text": "<ipython-input-84-4c2417139498>:1: SettingWithCopyWarning: \nA value is trying to be set on a copy of a slice from a DataFrame\n\nSee the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n v1.iloc[1] = 111\n", | |
| "output_type": "stream" | |
| }, | |
| { | |
| "name": "stdout", | |
| "text": " x y\n0 999 10\n1 2 20\n2 3 30\n", | |
| "output_type": "stream" | |
| } | |
| ], | |
| "execution_count": 84 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "v2.iloc[1] = 222\nprint(df) #changed because v2 is just a reference to the memory. v2 is a view not a copy.", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stderr", | |
| "text": "<ipython-input-85-6c752e41547c>:1: SettingWithCopyWarning: \nA value is trying to be set on a copy of a slice from a DataFrame\n\nSee the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n v2.iloc[1] = 222\n", | |
| "output_type": "stream" | |
| }, | |
| { | |
| "name": "stdout", | |
| "text": " x y\n0 999 10\n1 222 222\n2 3 30\n", | |
| "output_type": "stream" | |
| } | |
| ], | |
| "execution_count": 85 | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "source": "#### 3.7 merge()", | |
| "metadata": { | |
| "jp-MarkdownHeadingCollapsed": true | |
| } | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "pets = pd.DataFrame(\n data = {\n 'name' : ['Mr. Snuggles', 'Honey Chew Chew', 'Professor', 'Chairman Meow', 'hello'],\n 'type' : ['cat', 'dog', 'dog', 'cat', 'horse']\n },\n index = [71, 42, 11, 98, 42]\n)\n\nprint(pets)", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "text": " name type\n71 Mr. Snuggles cat\n42 Honey Chew Chew dog\n11 Professor dog\n98 Chairman cat\n42 hello horse\n", | |
| "output_type": "stream" | |
| } | |
| ], | |
| "execution_count": 87 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "visits = pd.DataFrame(\n data = {\n 'pet_id' : [42, 31, 71, 42, 98, 42],\n 'date' : ['2019-03-15', '2019-03-15', '2019-04-05', '2019-04-06', '2019-04-07', '2019-04-08']\n }\n)\nprint(visits)", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "text": " pet_id date\n0 42 2019-03-15\n1 31 2019-03-15\n2 71 2019-04-05\n3 42 2019-04-06\n4 98 2019-04-07\n5 42 2019-04-08\n", | |
| "output_type": "stream" | |
| } | |
| ], | |
| "execution_count": 90 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "# merge - merges data frames or named series objects with a databse-style join\n\n# params - left, right, how (left, right, outer, inner(default))\n\npd.merge(left = pets, right = visits, how = 'inner', left_index =True, right_on = 'pet_id')\n", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "execution_count": 92, | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": " name type pet_id date\n2 Mr. Snuggles cat 71 2019-04-05\n0 Honey Chew Chew dog 42 2019-03-15\n3 Honey Chew Chew dog 42 2019-04-06\n5 Honey Chew Chew dog 42 2019-04-08\n0 hello horse 42 2019-03-15\n3 hello horse 42 2019-04-06\n5 hello horse 42 2019-04-08\n4 Chairman cat 98 2019-04-07", | |
| "text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>name</th>\n <th>type</th>\n <th>pet_id</th>\n <th>date</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>2</th>\n <td>Mr. Snuggles</td>\n <td>cat</td>\n <td>71</td>\n <td>2019-04-05</td>\n </tr>\n <tr>\n <th>0</th>\n <td>Honey Chew Chew</td>\n <td>dog</td>\n <td>42</td>\n <td>2019-03-15</td>\n </tr>\n <tr>\n <th>3</th>\n <td>Honey Chew Chew</td>\n <td>dog</td>\n <td>42</td>\n <td>2019-04-06</td>\n </tr>\n <tr>\n <th>5</th>\n <td>Honey Chew Chew</td>\n <td>dog</td>\n <td>42</td>\n <td>2019-04-08</td>\n </tr>\n <tr>\n <th>0</th>\n <td>hello</td>\n <td>horse</td>\n <td>42</td>\n <td>2019-03-15</td>\n </tr>\n <tr>\n <th>3</th>\n <td>hello</td>\n <td>horse</td>\n <td>42</td>\n <td>2019-04-06</td>\n </tr>\n <tr>\n <th>5</th>\n <td>hello</td>\n <td>horse</td>\n <td>42</td>\n <td>2019-04-08</td>\n </tr>\n <tr>\n <th>4</th>\n <td>Chairman</td>\n <td>cat</td>\n <td>98</td>\n <td>2019-04-07</td>\n </tr>\n </tbody>\n</table>\n</div>" | |
| }, | |
| "metadata": {} | |
| } | |
| ], | |
| "execution_count": 92 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "pets.index.rename('pet_id', inplace = True)\nprint(pets)", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "text": " name type\npet_id \n71 Mr. Snuggles cat\n42 Honey Chew Chew dog\n11 Professor dog\n98 Chairman cat\n42 hello horse\n", | |
| "output_type": "stream" | |
| } | |
| ], | |
| "execution_count": 93 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "visits.index.rename('visit_id', inplace = True)\nprint(visits)", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "text": " pet_id date\nvisit_id \n0 42 2019-03-15\n1 31 2019-03-15\n2 71 2019-04-05\n3 42 2019-04-06\n4 98 2019-04-07\n5 42 2019-04-08\n", | |
| "output_type": "stream" | |
| } | |
| ], | |
| "execution_count": 94 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "# advantage of setting petid, visit_id is we can us 'on' while merging\n\npd.merge(left = pets, right = visits, how = 'inner', on = 'pet_id')", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "execution_count": 95, | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": " pet_id name type date\n0 71 Mr. Snuggles cat 2019-04-05\n1 42 Honey Chew Chew dog 2019-03-15\n2 42 Honey Chew Chew dog 2019-04-06\n3 42 Honey Chew Chew dog 2019-04-08\n4 42 hello horse 2019-03-15\n5 42 hello horse 2019-04-06\n6 42 hello horse 2019-04-08\n7 98 Chairman cat 2019-04-07", | |
| "text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>pet_id</th>\n <th>name</th>\n <th>type</th>\n <th>date</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>71</td>\n <td>Mr. Snuggles</td>\n <td>cat</td>\n <td>2019-04-05</td>\n </tr>\n <tr>\n <th>1</th>\n <td>42</td>\n <td>Honey Chew Chew</td>\n <td>dog</td>\n <td>2019-03-15</td>\n </tr>\n <tr>\n <th>2</th>\n <td>42</td>\n <td>Honey Chew Chew</td>\n <td>dog</td>\n <td>2019-04-06</td>\n </tr>\n <tr>\n <th>3</th>\n <td>42</td>\n <td>Honey Chew Chew</td>\n <td>dog</td>\n <td>2019-04-08</td>\n </tr>\n <tr>\n <th>4</th>\n <td>42</td>\n <td>hello</td>\n <td>horse</td>\n <td>2019-03-15</td>\n </tr>\n <tr>\n <th>5</th>\n <td>42</td>\n <td>hello</td>\n <td>horse</td>\n <td>2019-04-06</td>\n </tr>\n <tr>\n <th>6</th>\n <td>42</td>\n <td>hello</td>\n <td>horse</td>\n <td>2019-04-08</td>\n </tr>\n <tr>\n <th>7</th>\n <td>98</td>\n <td>Chairman</td>\n <td>cat</td>\n <td>2019-04-07</td>\n </tr>\n </tbody>\n</table>\n</div>" | |
| }, | |
| "metadata": {} | |
| } | |
| ], | |
| "execution_count": 95 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "pd.merge(left = pets, right = visits, how = 'left', on = 'pet_id')", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "execution_count": 96, | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": " pet_id name type date\n0 71 Mr. Snuggles cat 2019-04-05\n1 42 Honey Chew Chew dog 2019-03-15\n2 42 Honey Chew Chew dog 2019-04-06\n3 42 Honey Chew Chew dog 2019-04-08\n4 11 Professor dog NaN\n5 98 Chairman cat 2019-04-07\n6 42 hello horse 2019-03-15\n7 42 hello horse 2019-04-06\n8 42 hello horse 2019-04-08", | |
| "text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>pet_id</th>\n <th>name</th>\n <th>type</th>\n <th>date</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>71</td>\n <td>Mr. Snuggles</td>\n <td>cat</td>\n <td>2019-04-05</td>\n </tr>\n <tr>\n <th>1</th>\n <td>42</td>\n <td>Honey Chew Chew</td>\n <td>dog</td>\n <td>2019-03-15</td>\n </tr>\n <tr>\n <th>2</th>\n <td>42</td>\n <td>Honey Chew Chew</td>\n <td>dog</td>\n <td>2019-04-06</td>\n </tr>\n <tr>\n <th>3</th>\n <td>42</td>\n <td>Honey Chew Chew</td>\n <td>dog</td>\n <td>2019-04-08</td>\n </tr>\n <tr>\n <th>4</th>\n <td>11</td>\n <td>Professor</td>\n <td>dog</td>\n <td>NaN</td>\n </tr>\n <tr>\n <th>5</th>\n <td>98</td>\n <td>Chairman</td>\n <td>cat</td>\n <td>2019-04-07</td>\n </tr>\n <tr>\n <th>6</th>\n <td>42</td>\n <td>hello</td>\n <td>horse</td>\n <td>2019-03-15</td>\n </tr>\n <tr>\n <th>7</th>\n <td>42</td>\n <td>hello</td>\n <td>horse</td>\n <td>2019-04-06</td>\n </tr>\n <tr>\n <th>8</th>\n <td>42</td>\n <td>hello</td>\n <td>horse</td>\n <td>2019-04-08</td>\n </tr>\n </tbody>\n</table>\n</div>" | |
| }, | |
| "metadata": {} | |
| } | |
| ], | |
| "execution_count": 96 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "pd.merge(left = pets, right = visits, how = 'right', on = 'pet_id')", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "execution_count": 98, | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": " pet_id name type date\n0 42 Honey Chew Chew dog 2019-03-15\n1 42 hello horse 2019-03-15\n2 31 NaN NaN 2019-03-15\n3 71 Mr. Snuggles cat 2019-04-05\n4 42 Honey Chew Chew dog 2019-04-06\n5 42 hello horse 2019-04-06\n6 98 Chairman cat 2019-04-07\n7 42 Honey Chew Chew dog 2019-04-08\n8 42 hello horse 2019-04-08", | |
| "text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>pet_id</th>\n <th>name</th>\n <th>type</th>\n <th>date</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>42</td>\n <td>Honey Chew Chew</td>\n <td>dog</td>\n <td>2019-03-15</td>\n </tr>\n <tr>\n <th>1</th>\n <td>42</td>\n <td>hello</td>\n <td>horse</td>\n <td>2019-03-15</td>\n </tr>\n <tr>\n <th>2</th>\n <td>31</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>2019-03-15</td>\n </tr>\n <tr>\n <th>3</th>\n <td>71</td>\n <td>Mr. Snuggles</td>\n <td>cat</td>\n <td>2019-04-05</td>\n </tr>\n <tr>\n <th>4</th>\n <td>42</td>\n <td>Honey Chew Chew</td>\n <td>dog</td>\n <td>2019-04-06</td>\n </tr>\n <tr>\n <th>5</th>\n <td>42</td>\n <td>hello</td>\n <td>horse</td>\n <td>2019-04-06</td>\n </tr>\n <tr>\n <th>6</th>\n <td>98</td>\n <td>Chairman</td>\n <td>cat</td>\n <td>2019-04-07</td>\n </tr>\n <tr>\n <th>7</th>\n <td>42</td>\n <td>Honey Chew Chew</td>\n <td>dog</td>\n <td>2019-04-08</td>\n </tr>\n <tr>\n <th>8</th>\n <td>42</td>\n <td>hello</td>\n <td>horse</td>\n <td>2019-04-08</td>\n </tr>\n </tbody>\n</table>\n</div>" | |
| }, | |
| "metadata": {} | |
| } | |
| ], | |
| "execution_count": 98 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "pd.merge(left = pets, right = visits, how = 'outer', on = 'pet_id')", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "execution_count": 99, | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": " pet_id name type date\n0 71 Mr. Snuggles cat 2019-04-05\n1 42 Honey Chew Chew dog 2019-03-15\n2 42 Honey Chew Chew dog 2019-04-06\n3 42 Honey Chew Chew dog 2019-04-08\n4 42 hello horse 2019-03-15\n5 42 hello horse 2019-04-06\n6 42 hello horse 2019-04-08\n7 11 Professor dog NaN\n8 98 Chairman cat 2019-04-07\n9 31 NaN NaN 2019-03-15", | |
| "text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>pet_id</th>\n <th>name</th>\n <th>type</th>\n <th>date</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>71</td>\n <td>Mr. Snuggles</td>\n <td>cat</td>\n <td>2019-04-05</td>\n </tr>\n <tr>\n <th>1</th>\n <td>42</td>\n <td>Honey Chew Chew</td>\n <td>dog</td>\n <td>2019-03-15</td>\n </tr>\n <tr>\n <th>2</th>\n <td>42</td>\n <td>Honey Chew Chew</td>\n <td>dog</td>\n <td>2019-04-06</td>\n </tr>\n <tr>\n <th>3</th>\n <td>42</td>\n <td>Honey Chew Chew</td>\n <td>dog</td>\n <td>2019-04-08</td>\n </tr>\n <tr>\n <th>4</th>\n <td>42</td>\n <td>hello</td>\n <td>horse</td>\n <td>2019-03-15</td>\n </tr>\n <tr>\n <th>5</th>\n <td>42</td>\n <td>hello</td>\n <td>horse</td>\n <td>2019-04-06</td>\n </tr>\n <tr>\n <th>6</th>\n <td>42</td>\n <td>hello</td>\n <td>horse</td>\n <td>2019-04-08</td>\n </tr>\n <tr>\n <th>7</th>\n <td>11</td>\n <td>Professor</td>\n <td>dog</td>\n <td>NaN</td>\n </tr>\n <tr>\n <th>8</th>\n <td>98</td>\n <td>Chairman</td>\n <td>cat</td>\n <td>2019-04-07</td>\n </tr>\n <tr>\n <th>9</th>\n <td>31</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>2019-03-15</td>\n </tr>\n </tbody>\n</table>\n</div>" | |
| }, | |
| "metadata": {} | |
| } | |
| ], | |
| "execution_count": 99 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "# anti join i.e records that are in pets but not in visits\n# easy but less efficient and uses lot of memory\nouter = pd.merge(left = pets, right = visits, how = 'outer', on = 'pet_id', indicator = True)\nprint(outer)", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "text": " pet_id name type date _merge\n0 71 Mr. Snuggles cat 2019-04-05 both\n1 42 Honey Chew Chew dog 2019-03-15 both\n2 42 Honey Chew Chew dog 2019-04-06 both\n3 42 Honey Chew Chew dog 2019-04-08 both\n4 42 hello horse 2019-03-15 both\n5 42 hello horse 2019-04-06 both\n6 42 hello horse 2019-04-08 both\n7 11 Professor dog NaN left_only\n8 98 Chairman cat 2019-04-07 both\n9 31 NaN NaN 2019-03-15 right_only\n", | |
| "output_type": "stream" | |
| } | |
| ], | |
| "execution_count": 100 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "a_not_in_b = outer.loc[outer._merge == 'left_only']\nprint(a_not_in_b)", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "text": " pet_id name type date _merge\n7 11 Professor dog NaN left_only\n", | |
| "output_type": "stream" | |
| } | |
| ], | |
| "execution_count": 102 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "# memory efficient way\npets.index.isin(visits.pet_id)\n", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "execution_count": 103, | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": "array([ True, True, False, True, True])" | |
| }, | |
| "metadata": {} | |
| } | |
| ], | |
| "execution_count": 103 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "pets.loc[~pets.index.isin(visits.pet_id)]", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "execution_count": 104, | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": " name type\npet_id \n11 Professor dog", | |
| "text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>name</th>\n <th>type</th>\n </tr>\n <tr>\n <th>pet_id</th>\n <th></th>\n <th></th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>11</th>\n <td>Professor</td>\n <td>dog</td>\n </tr>\n </tbody>\n</table>\n</div>" | |
| }, | |
| "metadata": {} | |
| } | |
| ], | |
| "execution_count": 104 | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "source": "#### 3.8 Aggregation", | |
| "metadata": {} | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "df = pd.DataFrame({\n 'x' : [3.1, 5.5, 9.2, 1.7, 1.2, 8.3, 2.6],\n 'y' : [1.4, np.nan, 5.0, 5.8, 9.0, np.nan, 9.2]\n})\n\nprint(df)", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "text": " x y\n0 3.1 1.4\n1 5.5 NaN\n2 9.2 5.0\n3 1.7 5.8\n4 1.2 9.0\n5 8.3 NaN\n6 2.6 9.2\n", | |
| "output_type": "stream" | |
| } | |
| ], | |
| "execution_count": 105 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "# agg - aggregate using one or more operations over the specified axis.\n\ndf.agg('sum')", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "execution_count": 106, | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": "x 31.6\ny 30.4\ndtype: float64" | |
| }, | |
| "metadata": {} | |
| } | |
| ], | |
| "execution_count": 106 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "# as a df\ndf.agg(['sum']) ", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "execution_count": 107, | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": " x y\nsum 31.6 30.4", | |
| "text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>x</th>\n <th>y</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>sum</th>\n <td>31.6</td>\n <td>30.4</td>\n </tr>\n </tbody>\n</table>\n</div>" | |
| }, | |
| "metadata": {} | |
| } | |
| ], | |
| "execution_count": 107 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "df.agg(['sum', 'mean']) ", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "execution_count": 108, | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": " x y\nsum 31.600000 30.40\nmean 4.514286 6.08", | |
| "text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>x</th>\n <th>y</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>sum</th>\n <td>31.600000</td>\n <td>30.40</td>\n </tr>\n <tr>\n <th>mean</th>\n <td>4.514286</td>\n <td>6.08</td>\n </tr>\n </tbody>\n</table>\n</div>" | |
| }, | |
| "metadata": {} | |
| } | |
| ], | |
| "execution_count": 108 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "# you can even pass custom functions\ndef foofun(x):\n return x.iloc[1]\n\ndf.agg(foofun)", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "execution_count": 110, | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": "x 5.5\ny NaN\ndtype: float64" | |
| }, | |
| "metadata": {} | |
| } | |
| ], | |
| "execution_count": 110 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "df.agg({'x' : ['sum', 'mean'], 'y' : ['min', 'max']})", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "execution_count": 111, | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": " x y\nsum 31.600000 NaN\nmean 4.514286 NaN\nmin NaN 1.4\nmax NaN 9.2", | |
| "text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>x</th>\n <th>y</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>sum</th>\n <td>31.600000</td>\n <td>NaN</td>\n </tr>\n <tr>\n <th>mean</th>\n <td>4.514286</td>\n <td>NaN</td>\n </tr>\n <tr>\n <th>min</th>\n <td>NaN</td>\n <td>1.4</td>\n </tr>\n <tr>\n <th>max</th>\n <td>NaN</td>\n <td>9.2</td>\n </tr>\n </tbody>\n</table>\n</div>" | |
| }, | |
| "metadata": {} | |
| } | |
| ], | |
| "execution_count": 111 | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "source": "#### 3.9 groupby()", | |
| "metadata": {} | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "df = pd.DataFrame({\n 'A' : ['foo', 'bar', 'foo', 'bar', 'bar', 'foo', 'foo'],\n 'B' : [False, True, False, True, True, True, True],\n 'C' : [2.1, 1.9, 3.6, 4.0, 1.9, 7.8, 2.8],\n 'D' : [50, np.nan, 30, 90, 10, np.nan, 10]\n}).convert_dtypes() # convert_dtypes - convert columns to best possible dtypes using dtypes supporting pd.NA\nprint(df)", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "text": " A B C D\n0 foo False 2.1 50\n1 bar True 1.9 <NA>\n2 foo False 3.6 30\n3 bar True 4.0 90\n4 bar True 1.9 10\n5 foo True 7.8 <NA>\n6 foo True 2.8 10\n", | |
| "output_type": "stream" | |
| } | |
| ], | |
| "execution_count": 135 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "groups = df.groupby(by = 'A')\nprint(groups.groups)", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "text": "{'bar': [1, 3, 4], 'foo': [0, 2, 5, 6]}\n", | |
| "output_type": "stream" | |
| } | |
| ], | |
| "execution_count": 136 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "print(df)", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "text": " A B C D\n0 foo False 2.1 50\n1 bar True 1.9 <NA>\n2 foo False 3.6 30\n3 bar True 4.0 90\n4 bar True 1.9 10\n5 foo True 7.8 <NA>\n6 foo True 2.8 10\n", | |
| "output_type": "stream" | |
| } | |
| ], | |
| "execution_count": 137 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "df.groupby('A')['C'].sum()", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "execution_count": 117, | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": "A\nbar 7.8\nfoo 16.3\nName: C, dtype: Float64" | |
| }, | |
| "metadata": {} | |
| } | |
| ], | |
| "execution_count": 117 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "df.groupby('A')[['C']].sum()", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "execution_count": 118, | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": " C\nA \nbar 7.8\nfoo 16.3", | |
| "text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>C</th>\n </tr>\n <tr>\n <th>A</th>\n <th></th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>bar</th>\n <td>7.8</td>\n </tr>\n <tr>\n <th>foo</th>\n <td>16.3</td>\n </tr>\n </tbody>\n</table>\n</div>" | |
| }, | |
| "metadata": {} | |
| } | |
| ], | |
| "execution_count": 118 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "df.groupby('A')[['C', 'D']].agg('sum')", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "execution_count": 119, | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": " C D\nA \nbar 7.8 100\nfoo 16.3 90", | |
| "text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>C</th>\n <th>D</th>\n </tr>\n <tr>\n <th>A</th>\n <th></th>\n <th></th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>bar</th>\n <td>7.8</td>\n <td>100</td>\n </tr>\n <tr>\n <th>foo</th>\n <td>16.3</td>\n <td>90</td>\n </tr>\n </tbody>\n</table>\n</div>" | |
| }, | |
| "metadata": {} | |
| } | |
| ], | |
| "execution_count": 119 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "df.groupby(by = ['B','D'], dropna = False).agg(\n C_min = ('C', 'min'),\n C_max = ('C', np.max)\n)", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "execution_count": 134, | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": " C_min C_max\nB D \nFalse 30 3.6 3.6\n 50 2.1 2.1\nTrue 10 1.9 2.8\n 90 4.0 4.0\n <NA> 1.9 7.8", | |
| "text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th></th>\n <th>C_min</th>\n <th>C_max</th>\n </tr>\n <tr>\n <th>B</th>\n <th>D</th>\n <th></th>\n <th></th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th rowspan=\"2\" valign=\"top\">False</th>\n <th>30</th>\n <td>3.6</td>\n <td>3.6</td>\n </tr>\n <tr>\n <th>50</th>\n <td>2.1</td>\n <td>2.1</td>\n </tr>\n <tr>\n <th rowspan=\"3\" valign=\"top\">True</th>\n <th>10</th>\n <td>1.9</td>\n <td>2.8</td>\n </tr>\n <tr>\n <th>90</th>\n <td>4.0</td>\n <td>4.0</td>\n </tr>\n <tr>\n <th><NA></th>\n <td>1.9</td>\n <td>7.8</td>\n </tr>\n </tbody>\n</table>\n</div>" | |
| }, | |
| "metadata": {} | |
| } | |
| ], | |
| "execution_count": 134 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "df.groupby('A')['C'].transform(lambda x : x.sort_values().values)", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "execution_count": 141, | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": "0 2.1\n1 1.9\n2 2.8\n3 1.9\n4 4.0\n5 3.6\n6 7.8\nName: C, dtype: Float64" | |
| }, | |
| "metadata": {} | |
| } | |
| ], | |
| "execution_count": 141 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "df['C_sorted_within_A'] = df.groupby('A')['C'].transform(lambda x : x.sort_values().values)\nprint(df)", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "text": " A B C D C_sorted_within_A\n0 foo False 2.1 50 2.1\n1 bar True 1.9 <NA> 1.9\n2 foo False 3.6 30 2.8\n3 bar True 4.0 90 1.9\n4 bar True 1.9 10 4.0\n5 foo True 7.8 <NA> 3.6\n6 foo True 2.8 10 7.8\n", | |
| "output_type": "stream" | |
| } | |
| ], | |
| "execution_count": 144 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": "", | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "outputs": [], | |
| "execution_count": null | |
| } | |
| ] | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment