Created
July 19, 2019 17:57
-
-
Save FavioVazquez/576e91e2f26b55e7f52ecb2acd2deaa7 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Import optimus and pandas" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 110, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from optimus import Optimus\n", | |
"from pyspark.sql.functions import *\n", | |
"import pandas as pd\n", | |
"import numpy as np\n", | |
"op = Optimus()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Read the data" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 21, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"url = 'https://raw.githubusercontent.com/justmarkham/DAT8/master/data/chipotle.tsv'\n", | |
" \n", | |
"chipo_pd = pd.read_csv(url, sep = '\\t')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 25, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"order_id int64\n", | |
"quantity int64\n", | |
"item_name object\n", | |
"choice_description object\n", | |
"item_price object\n", | |
"dtype: object" | |
] | |
}, | |
"execution_count": 25, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"chipo_pd.dtypes" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### We need to to this to be able to read the data from Spark" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 28, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"chipo_pd[['item_name', 'choice_description', 'item_price']] = chipo_pd[['item_name', 'choice_description', 'item_price']].astype(str)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Transform data from Pandas to Optimus (Spark)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 30, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"chipo = op.spark.createDataFrame(chipo_pd)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# See the first 10 entries" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 33, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<style>\n", | |
" .data_type {\n", | |
" font-size: 0.8em;\n", | |
" font-weight: normal;\n", | |
" }\n", | |
"\n", | |
" .column_name {\n", | |
" font-size: 1.2em;\n", | |
" }\n", | |
"\n", | |
" .info_items {\n", | |
" margin: 10px 0;\n", | |
" font-size: 0.8em;\n", | |
" }\n", | |
"\n", | |
" .optimus_table td {\n", | |
" padding: 2px;\n", | |
" border-left: 1px solid #cccccc;\n", | |
" border-right: 1px solid #cccccc;\n", | |
" }\n", | |
"\n", | |
" .optimus_table tr:nth-child(even) {\n", | |
" background-color: #f2f2f2 !important;\n", | |
" }\n", | |
"\n", | |
" .optimus_table tr:nth-child(odd) {\n", | |
" background-color: #ffffff !important;\n", | |
" }\n", | |
"\n", | |
" .optimus_table thead {\n", | |
" border-bottom: 1px solid black;\n", | |
" }\n", | |
" .optimus_table{\n", | |
" font-size: 12px;\n", | |
" }\n", | |
"\n", | |
" .optimus_table tbody{\n", | |
" font-family: monospace;\n", | |
" border-bottom: 1px solid #cccccc;\n", | |
" }\n", | |
"\n", | |
"\n", | |
"</style>\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"<div class=\"info_items\">Viewing 10 of 4622 rows / 5 columns</div>\n", | |
"<div class=\"info_items\">4 partition(s)</div>\n", | |
"\n", | |
"<table class=\"optimus_table\">\n", | |
" <thead>\n", | |
" <tr>\n", | |
" \n", | |
" <th>\n", | |
" <div class=\"column_name\">order_id</div>\n", | |
" <div class=\"data_type\">1 (bigint)</div>\n", | |
" <div class=\"data_type\">\n", | |
" \n", | |
" nullable\n", | |
" \n", | |
" </div>\n", | |
" </th>\n", | |
" \n", | |
" <th>\n", | |
" <div class=\"column_name\">quantity</div>\n", | |
" <div class=\"data_type\">2 (bigint)</div>\n", | |
" <div class=\"data_type\">\n", | |
" \n", | |
" nullable\n", | |
" \n", | |
" </div>\n", | |
" </th>\n", | |
" \n", | |
" <th>\n", | |
" <div class=\"column_name\">item_name</div>\n", | |
" <div class=\"data_type\">3 (string)</div>\n", | |
" <div class=\"data_type\">\n", | |
" \n", | |
" nullable\n", | |
" \n", | |
" </div>\n", | |
" </th>\n", | |
" \n", | |
" <th>\n", | |
" <div class=\"column_name\">choice_description</div>\n", | |
" <div class=\"data_type\">4 (string)</div>\n", | |
" <div class=\"data_type\">\n", | |
" \n", | |
" nullable\n", | |
" \n", | |
" </div>\n", | |
" </th>\n", | |
" \n", | |
" <th>\n", | |
" <div class=\"column_name\">item_price</div>\n", | |
" <div class=\"data_type\">5 (string)</div>\n", | |
" <div class=\"data_type\">\n", | |
" \n", | |
" nullable\n", | |
" \n", | |
" </div>\n", | |
" </th>\n", | |
" \n", | |
" </tr>\n", | |
"\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div title='1'>1</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='1'>1</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Chips⸱and⸱Fresh⸱Tomato⸱Salsa'>Chips⸱and⸱Fresh⸱Tomato⸱Salsa</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='nan'>nan</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='$2.39⸱'>$2.39⸱</div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div title='1'>1</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='1'>1</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Izze'>Izze</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='[Clementine]'>[Clementine]</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='$3.39⸱'>$3.39⸱</div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div title='1'>1</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='1'>1</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Nantucket⸱Nectar'>Nantucket⸱Nectar</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='[Apple]'>[Apple]</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='$3.39⸱'>$3.39⸱</div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div title='1'>1</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='1'>1</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Chips⸱and⸱Tomatillo-Green⸱Chili⸱Salsa'>Chips⸱and⸱Tomatillo-Green⸱Chili⸱Salsa</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='nan'>nan</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='$2.39⸱'>$2.39⸱</div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div title='2'>2</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='2'>2</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Chicken⸱Bowl'>Chicken⸱Bowl</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='[Tomatillo-Red⸱Chili⸱Salsa⸱(Hot),⸱[Black⸱Beans,⸱Rice,⸱Cheese,⸱Sour⸱Cream]]'>[Tomatillo-Red⸱Chili⸱Salsa⸱(Hot),⸱[Black⸱Beans,⸱Rice,⸱Cheese,⸱Sour⸱Cream]]</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='$16.98⸱'>$16.98⸱</div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div title='3'>3</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='1'>1</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Chicken⸱Bowl'>Chicken⸱Bowl</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='[Fresh⸱Tomato⸱Salsa⸱(Mild),⸱[Rice,⸱Cheese,⸱Sour⸱Cream,⸱Guacamole,⸱Lettuce]]'>[Fresh⸱Tomato⸱Salsa⸱(Mild),⸱[Rice,⸱Cheese,⸱Sour⸱Cream,⸱Guacamole,⸱Lettuce]]</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='$10.98⸱'>$10.98⸱</div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div title='3'>3</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='1'>1</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Side⸱of⸱Chips'>Side⸱of⸱Chips</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='nan'>nan</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='$1.69⸱'>$1.69⸱</div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div title='4'>4</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='1'>1</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Steak⸱Burrito'>Steak⸱Burrito</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='[Tomatillo⸱Red⸱Chili⸱Salsa,⸱[Fajita⸱Vegetables,⸱Black⸱Beans,⸱Pinto⸱Beans,⸱Cheese,⸱Sour⸱Cream,⸱Guacamole,⸱Lettuce]]'>[Tomatillo⸱Red⸱Chili⸱Salsa,⸱[Fajita⸱Vegetables,⸱Black⸱Beans,⸱Pinto⸱Beans,⸱Cheese,⸱Sour⸱Cream,⸱Guacamole,⸱Lettuce]]</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='$11.75⸱'>$11.75⸱</div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div title='4'>4</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='1'>1</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Steak⸱Soft⸱Tacos'>Steak⸱Soft⸱Tacos</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='[Tomatillo⸱Green⸱Chili⸱Salsa,⸱[Pinto⸱Beans,⸱Cheese,⸱Sour⸱Cream,⸱Lettuce]]'>[Tomatillo⸱Green⸱Chili⸱Salsa,⸱[Pinto⸱Beans,⸱Cheese,⸱Sour⸱Cream,⸱Lettuce]]</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='$9.25⸱'>$9.25⸱</div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div title='5'>5</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='1'>1</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Steak⸱Burrito'>Steak⸱Burrito</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='[Fresh⸱Tomato⸱Salsa,⸱[Rice,⸱Black⸱Beans,⸱Pinto⸱Beans,⸱Cheese,⸱Sour⸱Cream,⸱Lettuce]]'>[Fresh⸱Tomato⸱Salsa,⸱[Rice,⸱Black⸱Beans,⸱Pinto⸱Beans,⸱Cheese,⸱Sour⸱Cream,⸱Lettuce]]</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='$9.25⸱'>$9.25⸱</div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" </tbody>\n", | |
"</table>\n", | |
"\n", | |
"<div class=\"info_items\">Viewing 10 of 4622 rows / 5 columns</div>\n", | |
"<div class=\"info_items\">4 partition(s)</div>\n" | |
], | |
"text/plain": [ | |
"<IPython.core.display.HTML object>" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"chipo.table(10)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# What is the number of observations and columns in the dataset?" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 34, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"4622" | |
] | |
}, | |
"execution_count": 34, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"chipo.count()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 36, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"{'cols_count': 5,\n", | |
" 'rows_count': 4622,\n", | |
" 'missing_count': '0.0%',\n", | |
" 'size': '-1 Bytes'}" | |
] | |
}, | |
"execution_count": 36, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"op.profiler.dataset_info(chipo)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Print the name of all the columns" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 37, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"['order_id', 'quantity', 'item_name', 'choice_description', 'item_price']" | |
] | |
}, | |
"execution_count": 37, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"chipo.columns" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"**NOTE: Spark dataframes are not indexed.**" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Which was the most-ordered item and how many items were ordered?" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 61, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<style>\n", | |
" .data_type {\n", | |
" font-size: 0.8em;\n", | |
" font-weight: normal;\n", | |
" }\n", | |
"\n", | |
" .column_name {\n", | |
" font-size: 1.2em;\n", | |
" }\n", | |
"\n", | |
" .info_items {\n", | |
" margin: 10px 0;\n", | |
" font-size: 0.8em;\n", | |
" }\n", | |
"\n", | |
" .optimus_table td {\n", | |
" padding: 2px;\n", | |
" border-left: 1px solid #cccccc;\n", | |
" border-right: 1px solid #cccccc;\n", | |
" }\n", | |
"\n", | |
" .optimus_table tr:nth-child(even) {\n", | |
" background-color: #f2f2f2 !important;\n", | |
" }\n", | |
"\n", | |
" .optimus_table tr:nth-child(odd) {\n", | |
" background-color: #ffffff !important;\n", | |
" }\n", | |
"\n", | |
" .optimus_table thead {\n", | |
" border-bottom: 1px solid black;\n", | |
" }\n", | |
" .optimus_table{\n", | |
" font-size: 12px;\n", | |
" }\n", | |
"\n", | |
" .optimus_table tbody{\n", | |
" font-family: monospace;\n", | |
" border-bottom: 1px solid #cccccc;\n", | |
" }\n", | |
"\n", | |
"\n", | |
"</style>\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"<div class=\"info_items\">Viewing 1 of 50 rows / 2 columns</div>\n", | |
"<div class=\"info_items\">41 partition(s)</div>\n", | |
"\n", | |
"<table class=\"optimus_table\">\n", | |
" <thead>\n", | |
" <tr>\n", | |
" \n", | |
" <th>\n", | |
" <div class=\"column_name\">item_name</div>\n", | |
" <div class=\"data_type\">1 (string)</div>\n", | |
" <div class=\"data_type\">\n", | |
" \n", | |
" nullable\n", | |
" \n", | |
" </div>\n", | |
" </th>\n", | |
" \n", | |
" <th>\n", | |
" <div class=\"column_name\">quantity</div>\n", | |
" <div class=\"data_type\">2 (bigint)</div>\n", | |
" <div class=\"data_type\">\n", | |
" \n", | |
" nullable\n", | |
" \n", | |
" </div>\n", | |
" </th>\n", | |
" \n", | |
" </tr>\n", | |
"\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Chicken⸱Bowl'>Chicken⸱Bowl</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='761'>761</div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" </tbody>\n", | |
"</table>\n", | |
"\n", | |
"<div class=\"info_items\">Viewing 1 of 50 rows / 2 columns</div>\n", | |
"<div class=\"info_items\">41 partition(s)</div>\n" | |
], | |
"text/plain": [ | |
"<IPython.core.display.HTML object>" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"# Here we are renaming the column sum(quantity) to quantity with Optimus function rename inside of cols\n", | |
"(chipo.groupby(\"item_name\")\n", | |
" .sum(\"quantity\")\n", | |
" .cols.rename(\"sum(quantity)\", \"quantity\")\n", | |
" .sort(desc(\"quantity\"))\n", | |
" .table(1))" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# What was the most ordered item in the choice_description column" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 65, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<style>\n", | |
" .data_type {\n", | |
" font-size: 0.8em;\n", | |
" font-weight: normal;\n", | |
" }\n", | |
"\n", | |
" .column_name {\n", | |
" font-size: 1.2em;\n", | |
" }\n", | |
"\n", | |
" .info_items {\n", | |
" margin: 10px 0;\n", | |
" font-size: 0.8em;\n", | |
" }\n", | |
"\n", | |
" .optimus_table td {\n", | |
" padding: 2px;\n", | |
" border-left: 1px solid #cccccc;\n", | |
" border-right: 1px solid #cccccc;\n", | |
" }\n", | |
"\n", | |
" .optimus_table tr:nth-child(even) {\n", | |
" background-color: #f2f2f2 !important;\n", | |
" }\n", | |
"\n", | |
" .optimus_table tr:nth-child(odd) {\n", | |
" background-color: #ffffff !important;\n", | |
" }\n", | |
"\n", | |
" .optimus_table thead {\n", | |
" border-bottom: 1px solid black;\n", | |
" }\n", | |
" .optimus_table{\n", | |
" font-size: 12px;\n", | |
" }\n", | |
"\n", | |
" .optimus_table tbody{\n", | |
" font-family: monospace;\n", | |
" border-bottom: 1px solid #cccccc;\n", | |
" }\n", | |
"\n", | |
"\n", | |
"</style>\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"<div class=\"info_items\">Viewing 1 of 1044 rows / 2 columns</div>\n", | |
"<div class=\"info_items\">21 partition(s)</div>\n", | |
"\n", | |
"<table class=\"optimus_table\">\n", | |
" <thead>\n", | |
" <tr>\n", | |
" \n", | |
" <th>\n", | |
" <div class=\"column_name\">choice_description</div>\n", | |
" <div class=\"data_type\">1 (string)</div>\n", | |
" <div class=\"data_type\">\n", | |
" \n", | |
" nullable\n", | |
" \n", | |
" </div>\n", | |
" </th>\n", | |
" \n", | |
" <th>\n", | |
" <div class=\"column_name\">quantity</div>\n", | |
" <div class=\"data_type\">2 (bigint)</div>\n", | |
" <div class=\"data_type\">\n", | |
" \n", | |
" nullable\n", | |
" \n", | |
" </div>\n", | |
" </th>\n", | |
" \n", | |
" </tr>\n", | |
"\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div title='nan'>nan</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='1382'>1382</div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" </tbody>\n", | |
"</table>\n", | |
"\n", | |
"<div class=\"info_items\">Viewing 1 of 1044 rows / 2 columns</div>\n", | |
"<div class=\"info_items\">21 partition(s)</div>\n" | |
], | |
"text/plain": [ | |
"<IPython.core.display.HTML object>" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"(chipo.groupby(\"choice_description\")\n", | |
" .sum(\"quantity\")\n", | |
" .cols.rename(\"sum(quantity)\", \"quantity\")\n", | |
" .sort(desc(\"quantity\"))\n", | |
" .table(1))" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Here we have a problem, is showing that nan was the most order item from `choice_desccription`. Let's solve that:" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 77, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<style>\n", | |
" .data_type {\n", | |
" font-size: 0.8em;\n", | |
" font-weight: normal;\n", | |
" }\n", | |
"\n", | |
" .column_name {\n", | |
" font-size: 1.2em;\n", | |
" }\n", | |
"\n", | |
" .info_items {\n", | |
" margin: 10px 0;\n", | |
" font-size: 0.8em;\n", | |
" }\n", | |
"\n", | |
" .optimus_table td {\n", | |
" padding: 2px;\n", | |
" border-left: 1px solid #cccccc;\n", | |
" border-right: 1px solid #cccccc;\n", | |
" }\n", | |
"\n", | |
" .optimus_table tr:nth-child(even) {\n", | |
" background-color: #f2f2f2 !important;\n", | |
" }\n", | |
"\n", | |
" .optimus_table tr:nth-child(odd) {\n", | |
" background-color: #ffffff !important;\n", | |
" }\n", | |
"\n", | |
" .optimus_table thead {\n", | |
" border-bottom: 1px solid black;\n", | |
" }\n", | |
" .optimus_table{\n", | |
" font-size: 12px;\n", | |
" }\n", | |
"\n", | |
" .optimus_table tbody{\n", | |
" font-family: monospace;\n", | |
" border-bottom: 1px solid #cccccc;\n", | |
" }\n", | |
"\n", | |
"\n", | |
"</style>\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"<div class=\"info_items\">Viewing 1 of 1043 rows / 2 columns</div>\n", | |
"<div class=\"info_items\">20 partition(s)</div>\n", | |
"\n", | |
"<table class=\"optimus_table\">\n", | |
" <thead>\n", | |
" <tr>\n", | |
" \n", | |
" <th>\n", | |
" <div class=\"column_name\">choice_description</div>\n", | |
" <div class=\"data_type\">1 (string)</div>\n", | |
" <div class=\"data_type\">\n", | |
" \n", | |
" nullable\n", | |
" \n", | |
" </div>\n", | |
" </th>\n", | |
" \n", | |
" <th>\n", | |
" <div class=\"column_name\">quantity</div>\n", | |
" <div class=\"data_type\">2 (bigint)</div>\n", | |
" <div class=\"data_type\">\n", | |
" \n", | |
" nullable\n", | |
" \n", | |
" </div>\n", | |
" </th>\n", | |
" \n", | |
" </tr>\n", | |
"\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div title='[Diet⸱Coke]'>[Diet⸱Coke]</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='159'>159</div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" </tbody>\n", | |
"</table>\n", | |
"\n", | |
"<div class=\"info_items\">Viewing 1 of 1043 rows / 2 columns</div>\n", | |
"<div class=\"info_items\">20 partition(s)</div>\n" | |
], | |
"text/plain": [ | |
"<IPython.core.display.HTML object>" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"# First we are transforming \"nan\" strings to real nulls, and then droping them\n", | |
"(chipo.cols.replace(\"choice_description\",\"nan\")\n", | |
" .dropna()\n", | |
" .groupby(\"choice_description\")\n", | |
" .sum(\"quantity\")\n", | |
" .cols.rename(\"sum(quantity)\", \"quantity\")\n", | |
" .sort(desc(\"quantity\"))\n", | |
" .table(1))" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# How many items were orderd in total?" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 80, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"4972" | |
] | |
}, | |
"execution_count": 80, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"chipo.cols.sum(\"quantity\")" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Turn the item price into a float" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 81, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[('order_id', 'bigint'),\n", | |
" ('quantity', 'bigint'),\n", | |
" ('item_name', 'string'),\n", | |
" ('choice_description', 'string'),\n", | |
" ('item_price', 'string')]" | |
] | |
}, | |
"execution_count": 81, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"chipo.dtypes" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 88, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<style>\n", | |
" .data_type {\n", | |
" font-size: 0.8em;\n", | |
" font-weight: normal;\n", | |
" }\n", | |
"\n", | |
" .column_name {\n", | |
" font-size: 1.2em;\n", | |
" }\n", | |
"\n", | |
" .info_items {\n", | |
" margin: 10px 0;\n", | |
" font-size: 0.8em;\n", | |
" }\n", | |
"\n", | |
" .optimus_table td {\n", | |
" padding: 2px;\n", | |
" border-left: 1px solid #cccccc;\n", | |
" border-right: 1px solid #cccccc;\n", | |
" }\n", | |
"\n", | |
" .optimus_table tr:nth-child(even) {\n", | |
" background-color: #f2f2f2 !important;\n", | |
" }\n", | |
"\n", | |
" .optimus_table tr:nth-child(odd) {\n", | |
" background-color: #ffffff !important;\n", | |
" }\n", | |
"\n", | |
" .optimus_table thead {\n", | |
" border-bottom: 1px solid black;\n", | |
" }\n", | |
" .optimus_table{\n", | |
" font-size: 12px;\n", | |
" }\n", | |
"\n", | |
" .optimus_table tbody{\n", | |
" font-family: monospace;\n", | |
" border-bottom: 1px solid #cccccc;\n", | |
" }\n", | |
"\n", | |
"\n", | |
"</style>\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"<div class=\"info_items\">Viewing 1 of 4622 rows / 5 columns</div>\n", | |
"<div class=\"info_items\">4 partition(s)</div>\n", | |
"\n", | |
"<table class=\"optimus_table\">\n", | |
" <thead>\n", | |
" <tr>\n", | |
" \n", | |
" <th>\n", | |
" <div class=\"column_name\">order_id</div>\n", | |
" <div class=\"data_type\">1 (bigint)</div>\n", | |
" <div class=\"data_type\">\n", | |
" \n", | |
" nullable\n", | |
" \n", | |
" </div>\n", | |
" </th>\n", | |
" \n", | |
" <th>\n", | |
" <div class=\"column_name\">quantity</div>\n", | |
" <div class=\"data_type\">2 (bigint)</div>\n", | |
" <div class=\"data_type\">\n", | |
" \n", | |
" nullable\n", | |
" \n", | |
" </div>\n", | |
" </th>\n", | |
" \n", | |
" <th>\n", | |
" <div class=\"column_name\">item_name</div>\n", | |
" <div class=\"data_type\">3 (string)</div>\n", | |
" <div class=\"data_type\">\n", | |
" \n", | |
" nullable\n", | |
" \n", | |
" </div>\n", | |
" </th>\n", | |
" \n", | |
" <th>\n", | |
" <div class=\"column_name\">choice_description</div>\n", | |
" <div class=\"data_type\">4 (string)</div>\n", | |
" <div class=\"data_type\">\n", | |
" \n", | |
" nullable\n", | |
" \n", | |
" </div>\n", | |
" </th>\n", | |
" \n", | |
" <th>\n", | |
" <div class=\"column_name\">item_price</div>\n", | |
" <div class=\"data_type\">5 (string)</div>\n", | |
" <div class=\"data_type\">\n", | |
" \n", | |
" nullable\n", | |
" \n", | |
" </div>\n", | |
" </th>\n", | |
" \n", | |
" </tr>\n", | |
"\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div title='1'>1</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='1'>1</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Chips⸱and⸱Fresh⸱Tomato⸱Salsa'>Chips⸱and⸱Fresh⸱Tomato⸱Salsa</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='nan'>nan</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='$2.39⸱'>$2.39⸱</div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" </tbody>\n", | |
"</table>\n", | |
"\n", | |
"<div class=\"info_items\">Viewing 1 of 4622 rows / 5 columns</div>\n", | |
"<div class=\"info_items\">4 partition(s)</div>\n" | |
], | |
"text/plain": [ | |
"<IPython.core.display.HTML object>" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"## Let's see the format of the price\n", | |
"chipo.table(1)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 94, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Use substr (like in SQL) to get from the first numer to the end and then cast it\n", | |
"chipo = chipo.withColumn(\"item_price\", chipo.item_price.substr(2,10).cast(\"float\"))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 96, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<style>\n", | |
" .data_type {\n", | |
" font-size: 0.8em;\n", | |
" font-weight: normal;\n", | |
" }\n", | |
"\n", | |
" .column_name {\n", | |
" font-size: 1.2em;\n", | |
" }\n", | |
"\n", | |
" .info_items {\n", | |
" margin: 10px 0;\n", | |
" font-size: 0.8em;\n", | |
" }\n", | |
"\n", | |
" .optimus_table td {\n", | |
" padding: 2px;\n", | |
" border-left: 1px solid #cccccc;\n", | |
" border-right: 1px solid #cccccc;\n", | |
" }\n", | |
"\n", | |
" .optimus_table tr:nth-child(even) {\n", | |
" background-color: #f2f2f2 !important;\n", | |
" }\n", | |
"\n", | |
" .optimus_table tr:nth-child(odd) {\n", | |
" background-color: #ffffff !important;\n", | |
" }\n", | |
"\n", | |
" .optimus_table thead {\n", | |
" border-bottom: 1px solid black;\n", | |
" }\n", | |
" .optimus_table{\n", | |
" font-size: 12px;\n", | |
" }\n", | |
"\n", | |
" .optimus_table tbody{\n", | |
" font-family: monospace;\n", | |
" border-bottom: 1px solid #cccccc;\n", | |
" }\n", | |
"\n", | |
"\n", | |
"</style>\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"<div class=\"info_items\">Viewing 2 of 4622 rows / 5 columns</div>\n", | |
"<div class=\"info_items\">4 partition(s)</div>\n", | |
"\n", | |
"<table class=\"optimus_table\">\n", | |
" <thead>\n", | |
" <tr>\n", | |
" \n", | |
" <th>\n", | |
" <div class=\"column_name\">order_id</div>\n", | |
" <div class=\"data_type\">1 (bigint)</div>\n", | |
" <div class=\"data_type\">\n", | |
" \n", | |
" nullable\n", | |
" \n", | |
" </div>\n", | |
" </th>\n", | |
" \n", | |
" <th>\n", | |
" <div class=\"column_name\">quantity</div>\n", | |
" <div class=\"data_type\">2 (bigint)</div>\n", | |
" <div class=\"data_type\">\n", | |
" \n", | |
" nullable\n", | |
" \n", | |
" </div>\n", | |
" </th>\n", | |
" \n", | |
" <th>\n", | |
" <div class=\"column_name\">item_name</div>\n", | |
" <div class=\"data_type\">3 (string)</div>\n", | |
" <div class=\"data_type\">\n", | |
" \n", | |
" nullable\n", | |
" \n", | |
" </div>\n", | |
" </th>\n", | |
" \n", | |
" <th>\n", | |
" <div class=\"column_name\">choice_description</div>\n", | |
" <div class=\"data_type\">4 (string)</div>\n", | |
" <div class=\"data_type\">\n", | |
" \n", | |
" nullable\n", | |
" \n", | |
" </div>\n", | |
" </th>\n", | |
" \n", | |
" <th>\n", | |
" <div class=\"column_name\">item_price</div>\n", | |
" <div class=\"data_type\">5 (float)</div>\n", | |
" <div class=\"data_type\">\n", | |
" \n", | |
" nullable\n", | |
" \n", | |
" </div>\n", | |
" </th>\n", | |
" \n", | |
" </tr>\n", | |
"\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div title='1'>1</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='1'>1</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Chips⸱and⸱Fresh⸱Tomato⸱Salsa'>Chips⸱and⸱Fresh⸱Tomato⸱Salsa</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='nan'>nan</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='2.390000104904175'>2.390000104904175</div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div title='1'>1</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='1'>1</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Izze'>Izze</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='[Clementine]'>[Clementine]</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='3.390000104904175'>3.390000104904175</div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" </tbody>\n", | |
"</table>\n", | |
"\n", | |
"<div class=\"info_items\">Viewing 2 of 4622 rows / 5 columns</div>\n", | |
"<div class=\"info_items\">4 partition(s)</div>\n" | |
], | |
"text/plain": [ | |
"<IPython.core.display.HTML object>" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"# Let's see our data now\n", | |
"chipo.table(2)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# How much was the revenue for the period in the dataset?" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 111, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Revenue was: $39237.02\n" | |
] | |
} | |
], | |
"source": [ | |
"# The function mul takes two or more columns and multiples them\n", | |
"# The function sum will sum the values in a specific column\n", | |
"revenue = (chipo.cols.mul(columns=[\"quantity\", \"item_price\"])\n", | |
" .cols.sum(\"mul\"))\n", | |
"print('Revenue was: $' + str(np.round(revenue,2)))" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# How many orders were made in the period?" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 117, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"1834" | |
] | |
}, | |
"execution_count": 117, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"chipo.select(\"order_id\").distinct().count()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# What is the average revenue amount per order?" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 132, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"21.39423" | |
] | |
}, | |
"execution_count": 132, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"(chipo.cols.mul(columns=[\"quantity\", \"item_price\"])\n", | |
" .cols.rename(\"mul\", \"revenue\")\n", | |
" .groupby(\"order_id\").sum(\"revenue\")\n", | |
" .cols.rename(\"sum(revenue)\", \"revenue\")\n", | |
" .cols.mean(\"revenue\"))" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# How many different items are sold?" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 133, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"50" | |
] | |
}, | |
"execution_count": 133, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"chipo.select(\"item_name\").distinct().count()" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.6.8" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment