Created
July 19, 2019 19:28
-
-
Save FavioVazquez/9e6b219bbe35f56c61c957dc5dbdc78f to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from optimus import Optimus\n", | |
"from pyspark.sql.functions import *\n", | |
"import pandas as pd\n", | |
"import numpy as np\n", | |
"op = Optimus()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Create sample data" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"raw_data = {'regiment': ['Nighthawks', 'Nighthawks', 'Nighthawks', 'Nighthawks', 'Dragoons', 'Dragoons', 'Dragoons', 'Dragoons', 'Scouts', 'Scouts', 'Scouts', 'Scouts'], \n", | |
" 'company': ['1st', '1st', '2nd', '2nd', '1st', '1st', '2nd', '2nd','1st', '1st', '2nd', '2nd'], \n", | |
" 'name': ['Miller', 'Jacobson', 'Ali', 'Milner', 'Cooze', 'Jacon', 'Ryaner', 'Sone', 'Sloan', 'Piger', 'Riani', 'Ali'], \n", | |
" 'preTestScore': [4, 24, 31, 2, 3, 4, 24, 31, 2, 3, 2, 3],\n", | |
" 'postTestScore': [25, 94, 57, 62, 70, 25, 94, 57, 62, 70, 62, 70]}" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Create Spark DF" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"df_pd = pd.DataFrame(raw_data)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"regiment = op.spark.createDataFrame(df_pd)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<style>\n", | |
" .data_type {\n", | |
" font-size: 0.8em;\n", | |
" font-weight: normal;\n", | |
" }\n", | |
"\n", | |
" .column_name {\n", | |
" font-size: 1.2em;\n", | |
" }\n", | |
"\n", | |
" .info_items {\n", | |
" margin: 10px 0;\n", | |
" font-size: 0.8em;\n", | |
" }\n", | |
"\n", | |
" .optimus_table td {\n", | |
" padding: 2px;\n", | |
" border-left: 1px solid #cccccc;\n", | |
" border-right: 1px solid #cccccc;\n", | |
" }\n", | |
"\n", | |
" .optimus_table tr:nth-child(even) {\n", | |
" background-color: #f2f2f2 !important;\n", | |
" }\n", | |
"\n", | |
" .optimus_table tr:nth-child(odd) {\n", | |
" background-color: #ffffff !important;\n", | |
" }\n", | |
"\n", | |
" .optimus_table thead {\n", | |
" border-bottom: 1px solid black;\n", | |
" }\n", | |
" .optimus_table{\n", | |
" font-size: 12px;\n", | |
" }\n", | |
"\n", | |
" .optimus_table tbody{\n", | |
" font-family: monospace;\n", | |
" border-bottom: 1px solid #cccccc;\n", | |
" }\n", | |
"\n", | |
"\n", | |
"</style>\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"<div class=\"info_items\">Viewing 12 of 12 rows / 5 columns</div>\n", | |
"<div class=\"info_items\">4 partition(s)</div>\n", | |
"\n", | |
"<table class=\"optimus_table\">\n", | |
" <thead>\n", | |
" <tr>\n", | |
" \n", | |
" <th>\n", | |
" <div class=\"column_name\">regiment</div>\n", | |
" <div class=\"data_type\">1 (string)</div>\n", | |
" <div class=\"data_type\">\n", | |
" \n", | |
" nullable\n", | |
" \n", | |
" </div>\n", | |
" </th>\n", | |
" \n", | |
" <th>\n", | |
" <div class=\"column_name\">company</div>\n", | |
" <div class=\"data_type\">2 (string)</div>\n", | |
" <div class=\"data_type\">\n", | |
" \n", | |
" nullable\n", | |
" \n", | |
" </div>\n", | |
" </th>\n", | |
" \n", | |
" <th>\n", | |
" <div class=\"column_name\">name</div>\n", | |
" <div class=\"data_type\">3 (string)</div>\n", | |
" <div class=\"data_type\">\n", | |
" \n", | |
" nullable\n", | |
" \n", | |
" </div>\n", | |
" </th>\n", | |
" \n", | |
" <th>\n", | |
" <div class=\"column_name\">preTestScore</div>\n", | |
" <div class=\"data_type\">4 (bigint)</div>\n", | |
" <div class=\"data_type\">\n", | |
" \n", | |
" nullable\n", | |
" \n", | |
" </div>\n", | |
" </th>\n", | |
" \n", | |
" <th>\n", | |
" <div class=\"column_name\">postTestScore</div>\n", | |
" <div class=\"data_type\">5 (bigint)</div>\n", | |
" <div class=\"data_type\">\n", | |
" \n", | |
" nullable\n", | |
" \n", | |
" </div>\n", | |
" </th>\n", | |
" \n", | |
" </tr>\n", | |
"\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Nighthawks'>Nighthawks</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='1st'>1st</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Miller'>Miller</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='4'>4</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='25'>25</div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Nighthawks'>Nighthawks</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='1st'>1st</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Jacobson'>Jacobson</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='24'>24</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='94'>94</div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Nighthawks'>Nighthawks</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='2nd'>2nd</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Ali'>Ali</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='31'>31</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='57'>57</div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Nighthawks'>Nighthawks</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='2nd'>2nd</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Milner'>Milner</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='2'>2</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='62'>62</div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Dragoons'>Dragoons</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='1st'>1st</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Cooze'>Cooze</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='3'>3</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='70'>70</div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Dragoons'>Dragoons</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='1st'>1st</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Jacon'>Jacon</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='4'>4</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='25'>25</div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Dragoons'>Dragoons</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='2nd'>2nd</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Ryaner'>Ryaner</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='24'>24</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='94'>94</div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Dragoons'>Dragoons</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='2nd'>2nd</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Sone'>Sone</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='31'>31</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='57'>57</div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Scouts'>Scouts</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='1st'>1st</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Sloan'>Sloan</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='2'>2</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='62'>62</div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Scouts'>Scouts</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='1st'>1st</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Piger'>Piger</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='3'>3</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='70'>70</div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Scouts'>Scouts</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='2nd'>2nd</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Riani'>Riani</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='2'>2</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='62'>62</div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Scouts'>Scouts</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='2nd'>2nd</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Ali'>Ali</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='3'>3</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='70'>70</div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" </tbody>\n", | |
"</table>\n", | |
"\n", | |
"<div class=\"info_items\">Viewing 12 of 12 rows / 5 columns</div>\n", | |
"<div class=\"info_items\">4 partition(s)</div>\n" | |
], | |
"text/plain": [ | |
"<IPython.core.display.HTML object>" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"regiment.table()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# What is the mean preTestScore from the regiment Nighthawks?" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<style>\n", | |
" .data_type {\n", | |
" font-size: 0.8em;\n", | |
" font-weight: normal;\n", | |
" }\n", | |
"\n", | |
" .column_name {\n", | |
" font-size: 1.2em;\n", | |
" }\n", | |
"\n", | |
" .info_items {\n", | |
" margin: 10px 0;\n", | |
" font-size: 0.8em;\n", | |
" }\n", | |
"\n", | |
" .optimus_table td {\n", | |
" padding: 2px;\n", | |
" border-left: 1px solid #cccccc;\n", | |
" border-right: 1px solid #cccccc;\n", | |
" }\n", | |
"\n", | |
" .optimus_table tr:nth-child(even) {\n", | |
" background-color: #f2f2f2 !important;\n", | |
" }\n", | |
"\n", | |
" .optimus_table tr:nth-child(odd) {\n", | |
" background-color: #ffffff !important;\n", | |
" }\n", | |
"\n", | |
" .optimus_table thead {\n", | |
" border-bottom: 1px solid black;\n", | |
" }\n", | |
" .optimus_table{\n", | |
" font-size: 12px;\n", | |
" }\n", | |
"\n", | |
" .optimus_table tbody{\n", | |
" font-family: monospace;\n", | |
" border-bottom: 1px solid #cccccc;\n", | |
" }\n", | |
"\n", | |
"\n", | |
"</style>\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"<div class=\"info_items\">Viewing 1 of 1 rows / 3 columns</div>\n", | |
"<div class=\"info_items\">200 partition(s)</div>\n", | |
"\n", | |
"<table class=\"optimus_table\">\n", | |
" <thead>\n", | |
" <tr>\n", | |
" \n", | |
" <th>\n", | |
" <div class=\"column_name\">regiment</div>\n", | |
" <div class=\"data_type\">1 (string)</div>\n", | |
" <div class=\"data_type\">\n", | |
" \n", | |
" nullable\n", | |
" \n", | |
" </div>\n", | |
" </th>\n", | |
" \n", | |
" <th>\n", | |
" <div class=\"column_name\">avg(preTestScore)</div>\n", | |
" <div class=\"data_type\">2 (double)</div>\n", | |
" <div class=\"data_type\">\n", | |
" \n", | |
" nullable\n", | |
" \n", | |
" </div>\n", | |
" </th>\n", | |
" \n", | |
" <th>\n", | |
" <div class=\"column_name\">avg(postTestScore)</div>\n", | |
" <div class=\"data_type\">3 (double)</div>\n", | |
" <div class=\"data_type\">\n", | |
" \n", | |
" nullable\n", | |
" \n", | |
" </div>\n", | |
" </th>\n", | |
" \n", | |
" </tr>\n", | |
"\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Nighthawks'>Nighthawks</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='15.25'>15.25</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='59.5'>59.5</div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" </tbody>\n", | |
"</table>\n", | |
"\n", | |
"<div class=\"info_items\">Viewing 1 of 1 rows / 3 columns</div>\n", | |
"<div class=\"info_items\">200 partition(s)</div>\n" | |
], | |
"text/plain": [ | |
"<IPython.core.display.HTML object>" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"regiment.where(regiment.regiment == \"Nighthawks\").groupBy(\"regiment\").mean().table()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Present general statistics by company" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 37, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<style>\n", | |
" .data_type {\n", | |
" font-size: 0.8em;\n", | |
" font-weight: normal;\n", | |
" }\n", | |
"\n", | |
" .column_name {\n", | |
" font-size: 1.2em;\n", | |
" }\n", | |
"\n", | |
" .info_items {\n", | |
" margin: 10px 0;\n", | |
" font-size: 0.8em;\n", | |
" }\n", | |
"\n", | |
" .optimus_table td {\n", | |
" padding: 2px;\n", | |
" border-left: 1px solid #cccccc;\n", | |
" border-right: 1px solid #cccccc;\n", | |
" }\n", | |
"\n", | |
" .optimus_table tr:nth-child(even) {\n", | |
" background-color: #f2f2f2 !important;\n", | |
" }\n", | |
"\n", | |
" .optimus_table tr:nth-child(odd) {\n", | |
" background-color: #ffffff !important;\n", | |
" }\n", | |
"\n", | |
" .optimus_table thead {\n", | |
" border-bottom: 1px solid black;\n", | |
" }\n", | |
" .optimus_table{\n", | |
" font-size: 12px;\n", | |
" }\n", | |
"\n", | |
" .optimus_table tbody{\n", | |
" font-family: monospace;\n", | |
" border-bottom: 1px solid #cccccc;\n", | |
" }\n", | |
"\n", | |
"\n", | |
"</style>\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"<div class=\"info_items\">Viewing 2 of 2 rows / 7 columns</div>\n", | |
"<div class=\"info_items\">200 partition(s)</div>\n", | |
"\n", | |
"<table class=\"optimus_table\">\n", | |
" <thead>\n", | |
" <tr>\n", | |
" \n", | |
" <th>\n", | |
" <div class=\"column_name\">company</div>\n", | |
" <div class=\"data_type\">1 (string)</div>\n", | |
" <div class=\"data_type\">\n", | |
" \n", | |
" nullable\n", | |
" \n", | |
" </div>\n", | |
" </th>\n", | |
" \n", | |
" <th>\n", | |
" <div class=\"column_name\">count(preTestScore)</div>\n", | |
" <div class=\"data_type\">2 (bigint)</div>\n", | |
" <div class=\"data_type\">\n", | |
" \n", | |
" </div>\n", | |
" </th>\n", | |
" \n", | |
" <th>\n", | |
" <div class=\"column_name\">count(postTestScore)</div>\n", | |
" <div class=\"data_type\">3 (bigint)</div>\n", | |
" <div class=\"data_type\">\n", | |
" \n", | |
" </div>\n", | |
" </th>\n", | |
" \n", | |
" <th>\n", | |
" <div class=\"column_name\">avg(preTestScore)</div>\n", | |
" <div class=\"data_type\">4 (double)</div>\n", | |
" <div class=\"data_type\">\n", | |
" \n", | |
" nullable\n", | |
" \n", | |
" </div>\n", | |
" </th>\n", | |
" \n", | |
" <th>\n", | |
" <div class=\"column_name\">avg(postTestScore)</div>\n", | |
" <div class=\"data_type\">5 (double)</div>\n", | |
" <div class=\"data_type\">\n", | |
" \n", | |
" nullable\n", | |
" \n", | |
" </div>\n", | |
" </th>\n", | |
" \n", | |
" <th>\n", | |
" <div class=\"column_name\">stddev_samp(preTestScore)</div>\n", | |
" <div class=\"data_type\">6 (double)</div>\n", | |
" <div class=\"data_type\">\n", | |
" \n", | |
" nullable\n", | |
" \n", | |
" </div>\n", | |
" </th>\n", | |
" \n", | |
" <th>\n", | |
" <div class=\"column_name\">stddev_samp(postTestScore)</div>\n", | |
" <div class=\"data_type\">7 (double)</div>\n", | |
" <div class=\"data_type\">\n", | |
" \n", | |
" nullable\n", | |
" \n", | |
" </div>\n", | |
" </th>\n", | |
" \n", | |
" </tr>\n", | |
"\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div title='2nd'>2nd</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='6'>6</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='6'>6</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='15.5'>15.5</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='67.0'>67.0</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='14.652644812456213'>14.652644812456213</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='14.057026712644463'>14.057026712644463</div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div title='1st'>1st</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='6'>6</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='6'>6</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='6.666666666666667'>6.666666666666667</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='57.666666666666664'>57.666666666666664</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='8.524474568362947'>8.524474568362947</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='27.48575388572536'>27.48575388572536</div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" </tbody>\n", | |
"</table>\n", | |
"\n", | |
"<div class=\"info_items\">Viewing 2 of 2 rows / 7 columns</div>\n", | |
"<div class=\"info_items\">200 partition(s)</div>\n" | |
], | |
"text/plain": [ | |
"<IPython.core.display.HTML object>" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"# Maybe not the cleanest solution but it works :)\n", | |
"regiment.groupby('company').agg(count(\"preTestScore\"),\n", | |
" count(\"postTestScore\"), \n", | |
" mean(\"preTestScore\"), \n", | |
" mean(\"postTestScore\"),\n", | |
" stddev(\"preTestScore\"), \n", | |
" stddev(\"postTestScore\")).table()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 38, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from pyspark.sql.functions import pandas_udf, PandasUDFType\n", | |
"\n", | |
"@pandas_udf(\"double\", PandasUDFType.GROUPED_AGG)\n", | |
"def mean_udf(v):\n", | |
" return v.mean()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# What is the mean each company's preTestScore?" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 46, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<style>\n", | |
" .data_type {\n", | |
" font-size: 0.8em;\n", | |
" font-weight: normal;\n", | |
" }\n", | |
"\n", | |
" .column_name {\n", | |
" font-size: 1.2em;\n", | |
" }\n", | |
"\n", | |
" .info_items {\n", | |
" margin: 10px 0;\n", | |
" font-size: 0.8em;\n", | |
" }\n", | |
"\n", | |
" .optimus_table td {\n", | |
" padding: 2px;\n", | |
" border-left: 1px solid #cccccc;\n", | |
" border-right: 1px solid #cccccc;\n", | |
" }\n", | |
"\n", | |
" .optimus_table tr:nth-child(even) {\n", | |
" background-color: #f2f2f2 !important;\n", | |
" }\n", | |
"\n", | |
" .optimus_table tr:nth-child(odd) {\n", | |
" background-color: #ffffff !important;\n", | |
" }\n", | |
"\n", | |
" .optimus_table thead {\n", | |
" border-bottom: 1px solid black;\n", | |
" }\n", | |
" .optimus_table{\n", | |
" font-size: 12px;\n", | |
" }\n", | |
"\n", | |
" .optimus_table tbody{\n", | |
" font-family: monospace;\n", | |
" border-bottom: 1px solid #cccccc;\n", | |
" }\n", | |
"\n", | |
"\n", | |
"</style>\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"<div class=\"info_items\">Viewing 2 of 2 rows / 2 columns</div>\n", | |
"<div class=\"info_items\">200 partition(s)</div>\n", | |
"\n", | |
"<table class=\"optimus_table\">\n", | |
" <thead>\n", | |
" <tr>\n", | |
" \n", | |
" <th>\n", | |
" <div class=\"column_name\">company</div>\n", | |
" <div class=\"data_type\">1 (string)</div>\n", | |
" <div class=\"data_type\">\n", | |
" \n", | |
" nullable\n", | |
" \n", | |
" </div>\n", | |
" </th>\n", | |
" \n", | |
" <th>\n", | |
" <div class=\"column_name\">avg(preTestScore)</div>\n", | |
" <div class=\"data_type\">2 (double)</div>\n", | |
" <div class=\"data_type\">\n", | |
" \n", | |
" nullable\n", | |
" \n", | |
" </div>\n", | |
" </th>\n", | |
" \n", | |
" </tr>\n", | |
"\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div title='2nd'>2nd</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='15.5'>15.5</div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div title='1st'>1st</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='6.666666666666667'>6.666666666666667</div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" </tbody>\n", | |
"</table>\n", | |
"\n", | |
"<div class=\"info_items\">Viewing 2 of 2 rows / 2 columns</div>\n", | |
"<div class=\"info_items\">200 partition(s)</div>\n" | |
], | |
"text/plain": [ | |
"<IPython.core.display.HTML object>" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"regiment.groupby('company').mean(\"preTestScore\").table()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Present the mean preTestScores grouped by regiment and company" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 47, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<style>\n", | |
" .data_type {\n", | |
" font-size: 0.8em;\n", | |
" font-weight: normal;\n", | |
" }\n", | |
"\n", | |
" .column_name {\n", | |
" font-size: 1.2em;\n", | |
" }\n", | |
"\n", | |
" .info_items {\n", | |
" margin: 10px 0;\n", | |
" font-size: 0.8em;\n", | |
" }\n", | |
"\n", | |
" .optimus_table td {\n", | |
" padding: 2px;\n", | |
" border-left: 1px solid #cccccc;\n", | |
" border-right: 1px solid #cccccc;\n", | |
" }\n", | |
"\n", | |
" .optimus_table tr:nth-child(even) {\n", | |
" background-color: #f2f2f2 !important;\n", | |
" }\n", | |
"\n", | |
" .optimus_table tr:nth-child(odd) {\n", | |
" background-color: #ffffff !important;\n", | |
" }\n", | |
"\n", | |
" .optimus_table thead {\n", | |
" border-bottom: 1px solid black;\n", | |
" }\n", | |
" .optimus_table{\n", | |
" font-size: 12px;\n", | |
" }\n", | |
"\n", | |
" .optimus_table tbody{\n", | |
" font-family: monospace;\n", | |
" border-bottom: 1px solid #cccccc;\n", | |
" }\n", | |
"\n", | |
"\n", | |
"</style>\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"<div class=\"info_items\">Viewing 6 of 6 rows / 3 columns</div>\n", | |
"<div class=\"info_items\">200 partition(s)</div>\n", | |
"\n", | |
"<table class=\"optimus_table\">\n", | |
" <thead>\n", | |
" <tr>\n", | |
" \n", | |
" <th>\n", | |
" <div class=\"column_name\">regiment</div>\n", | |
" <div class=\"data_type\">1 (string)</div>\n", | |
" <div class=\"data_type\">\n", | |
" \n", | |
" nullable\n", | |
" \n", | |
" </div>\n", | |
" </th>\n", | |
" \n", | |
" <th>\n", | |
" <div class=\"column_name\">company</div>\n", | |
" <div class=\"data_type\">2 (string)</div>\n", | |
" <div class=\"data_type\">\n", | |
" \n", | |
" nullable\n", | |
" \n", | |
" </div>\n", | |
" </th>\n", | |
" \n", | |
" <th>\n", | |
" <div class=\"column_name\">avg(preTestScore)</div>\n", | |
" <div class=\"data_type\">3 (double)</div>\n", | |
" <div class=\"data_type\">\n", | |
" \n", | |
" nullable\n", | |
" \n", | |
" </div>\n", | |
" </th>\n", | |
" \n", | |
" </tr>\n", | |
"\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Nighthawks'>Nighthawks</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='1st'>1st</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='14.0'>14.0</div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Dragoons'>Dragoons</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='1st'>1st</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='3.5'>3.5</div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Nighthawks'>Nighthawks</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='2nd'>2nd</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='16.5'>16.5</div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Dragoons'>Dragoons</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='2nd'>2nd</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='27.5'>27.5</div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Scouts'>Scouts</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='2nd'>2nd</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='2.5'>2.5</div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Scouts'>Scouts</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='1st'>1st</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='2.5'>2.5</div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" </tbody>\n", | |
"</table>\n", | |
"\n", | |
"<div class=\"info_items\">Viewing 6 of 6 rows / 3 columns</div>\n", | |
"<div class=\"info_items\">200 partition(s)</div>\n" | |
], | |
"text/plain": [ | |
"<IPython.core.display.HTML object>" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"regiment.groupby([\"regiment\",\"company\"]).mean(\"preTestScore\").table()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Present the mean preTestScores grouped by regiment and company without heirarchical indexing" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 59, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<style>\n", | |
" .data_type {\n", | |
" font-size: 0.8em;\n", | |
" font-weight: normal;\n", | |
" }\n", | |
"\n", | |
" .column_name {\n", | |
" font-size: 1.2em;\n", | |
" }\n", | |
"\n", | |
" .info_items {\n", | |
" margin: 10px 0;\n", | |
" font-size: 0.8em;\n", | |
" }\n", | |
"\n", | |
" .optimus_table td {\n", | |
" padding: 2px;\n", | |
" border-left: 1px solid #cccccc;\n", | |
" border-right: 1px solid #cccccc;\n", | |
" }\n", | |
"\n", | |
" .optimus_table tr:nth-child(even) {\n", | |
" background-color: #f2f2f2 !important;\n", | |
" }\n", | |
"\n", | |
" .optimus_table tr:nth-child(odd) {\n", | |
" background-color: #ffffff !important;\n", | |
" }\n", | |
"\n", | |
" .optimus_table thead {\n", | |
" border-bottom: 1px solid black;\n", | |
" }\n", | |
" .optimus_table{\n", | |
" font-size: 12px;\n", | |
" }\n", | |
"\n", | |
" .optimus_table tbody{\n", | |
" font-family: monospace;\n", | |
" border-bottom: 1px solid #cccccc;\n", | |
" }\n", | |
"\n", | |
"\n", | |
"</style>\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"<div class=\"info_items\">Viewing 6 of 6 rows / 3 columns</div>\n", | |
"<div class=\"info_items\">200 partition(s)</div>\n", | |
"\n", | |
"<table class=\"optimus_table\">\n", | |
" <thead>\n", | |
" <tr>\n", | |
" \n", | |
" <th>\n", | |
" <div class=\"column_name\">regiment</div>\n", | |
" <div class=\"data_type\">1 (string)</div>\n", | |
" <div class=\"data_type\">\n", | |
" \n", | |
" nullable\n", | |
" \n", | |
" </div>\n", | |
" </th>\n", | |
" \n", | |
" <th>\n", | |
" <div class=\"column_name\">1st</div>\n", | |
" <div class=\"data_type\">2 (double)</div>\n", | |
" <div class=\"data_type\">\n", | |
" \n", | |
" nullable\n", | |
" \n", | |
" </div>\n", | |
" </th>\n", | |
" \n", | |
" <th>\n", | |
" <div class=\"column_name\">2nd</div>\n", | |
" <div class=\"data_type\">3 (double)</div>\n", | |
" <div class=\"data_type\">\n", | |
" \n", | |
" nullable\n", | |
" \n", | |
" </div>\n", | |
" </th>\n", | |
" \n", | |
" </tr>\n", | |
"\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Nighthawks'>Nighthawks</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='14.0'>14.0</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='None'>None</div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Dragoons'>Dragoons</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='3.5'>3.5</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='None'>None</div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Nighthawks'>Nighthawks</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='None'>None</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='16.5'>16.5</div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Dragoons'>Dragoons</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='None'>None</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='27.5'>27.5</div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Scouts'>Scouts</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='None'>None</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='2.5'>2.5</div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Scouts'>Scouts</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='2.5'>2.5</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='None'>None</div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" </tbody>\n", | |
"</table>\n", | |
"\n", | |
"<div class=\"info_items\">Viewing 6 of 6 rows / 3 columns</div>\n", | |
"<div class=\"info_items\">200 partition(s)</div>\n" | |
], | |
"text/plain": [ | |
"<IPython.core.display.HTML object>" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"# Not great, not terrible\n", | |
"regiment.groupby([\"regiment\",\"company\"]).pivot(\"company\").mean(\"preTestScore\").drop(\"company\").table()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Group the entire dataframe by regiment and company" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 61, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<style>\n", | |
" .data_type {\n", | |
" font-size: 0.8em;\n", | |
" font-weight: normal;\n", | |
" }\n", | |
"\n", | |
" .column_name {\n", | |
" font-size: 1.2em;\n", | |
" }\n", | |
"\n", | |
" .info_items {\n", | |
" margin: 10px 0;\n", | |
" font-size: 0.8em;\n", | |
" }\n", | |
"\n", | |
" .optimus_table td {\n", | |
" padding: 2px;\n", | |
" border-left: 1px solid #cccccc;\n", | |
" border-right: 1px solid #cccccc;\n", | |
" }\n", | |
"\n", | |
" .optimus_table tr:nth-child(even) {\n", | |
" background-color: #f2f2f2 !important;\n", | |
" }\n", | |
"\n", | |
" .optimus_table tr:nth-child(odd) {\n", | |
" background-color: #ffffff !important;\n", | |
" }\n", | |
"\n", | |
" .optimus_table thead {\n", | |
" border-bottom: 1px solid black;\n", | |
" }\n", | |
" .optimus_table{\n", | |
" font-size: 12px;\n", | |
" }\n", | |
"\n", | |
" .optimus_table tbody{\n", | |
" font-family: monospace;\n", | |
" border-bottom: 1px solid #cccccc;\n", | |
" }\n", | |
"\n", | |
"\n", | |
"</style>\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"<div class=\"info_items\">Viewing 6 of 6 rows / 4 columns</div>\n", | |
"<div class=\"info_items\">200 partition(s)</div>\n", | |
"\n", | |
"<table class=\"optimus_table\">\n", | |
" <thead>\n", | |
" <tr>\n", | |
" \n", | |
" <th>\n", | |
" <div class=\"column_name\">regiment</div>\n", | |
" <div class=\"data_type\">1 (string)</div>\n", | |
" <div class=\"data_type\">\n", | |
" \n", | |
" nullable\n", | |
" \n", | |
" </div>\n", | |
" </th>\n", | |
" \n", | |
" <th>\n", | |
" <div class=\"column_name\">company</div>\n", | |
" <div class=\"data_type\">2 (string)</div>\n", | |
" <div class=\"data_type\">\n", | |
" \n", | |
" nullable\n", | |
" \n", | |
" </div>\n", | |
" </th>\n", | |
" \n", | |
" <th>\n", | |
" <div class=\"column_name\">avg(preTestScore)</div>\n", | |
" <div class=\"data_type\">3 (double)</div>\n", | |
" <div class=\"data_type\">\n", | |
" \n", | |
" nullable\n", | |
" \n", | |
" </div>\n", | |
" </th>\n", | |
" \n", | |
" <th>\n", | |
" <div class=\"column_name\">avg(postTestScore)</div>\n", | |
" <div class=\"data_type\">4 (double)</div>\n", | |
" <div class=\"data_type\">\n", | |
" \n", | |
" nullable\n", | |
" \n", | |
" </div>\n", | |
" </th>\n", | |
" \n", | |
" </tr>\n", | |
"\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Nighthawks'>Nighthawks</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='1st'>1st</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='14.0'>14.0</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='59.5'>59.5</div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Dragoons'>Dragoons</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='1st'>1st</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='3.5'>3.5</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='47.5'>47.5</div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Nighthawks'>Nighthawks</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='2nd'>2nd</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='16.5'>16.5</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='59.5'>59.5</div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Dragoons'>Dragoons</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='2nd'>2nd</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='27.5'>27.5</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='75.5'>75.5</div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Scouts'>Scouts</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='2nd'>2nd</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='2.5'>2.5</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='66.0'>66.0</div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Scouts'>Scouts</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='1st'>1st</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='2.5'>2.5</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='66.0'>66.0</div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" </tbody>\n", | |
"</table>\n", | |
"\n", | |
"<div class=\"info_items\">Viewing 6 of 6 rows / 4 columns</div>\n", | |
"<div class=\"info_items\">200 partition(s)</div>\n" | |
], | |
"text/plain": [ | |
"<IPython.core.display.HTML object>" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"regiment.groupby([\"regiment\",\"company\"]).mean().table()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# What is the number of observations in each regiment and company" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 62, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<style>\n", | |
" .data_type {\n", | |
" font-size: 0.8em;\n", | |
" font-weight: normal;\n", | |
" }\n", | |
"\n", | |
" .column_name {\n", | |
" font-size: 1.2em;\n", | |
" }\n", | |
"\n", | |
" .info_items {\n", | |
" margin: 10px 0;\n", | |
" font-size: 0.8em;\n", | |
" }\n", | |
"\n", | |
" .optimus_table td {\n", | |
" padding: 2px;\n", | |
" border-left: 1px solid #cccccc;\n", | |
" border-right: 1px solid #cccccc;\n", | |
" }\n", | |
"\n", | |
" .optimus_table tr:nth-child(even) {\n", | |
" background-color: #f2f2f2 !important;\n", | |
" }\n", | |
"\n", | |
" .optimus_table tr:nth-child(odd) {\n", | |
" background-color: #ffffff !important;\n", | |
" }\n", | |
"\n", | |
" .optimus_table thead {\n", | |
" border-bottom: 1px solid black;\n", | |
" }\n", | |
" .optimus_table{\n", | |
" font-size: 12px;\n", | |
" }\n", | |
"\n", | |
" .optimus_table tbody{\n", | |
" font-family: monospace;\n", | |
" border-bottom: 1px solid #cccccc;\n", | |
" }\n", | |
"\n", | |
"\n", | |
"</style>\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"<div class=\"info_items\">Viewing 6 of 6 rows / 3 columns</div>\n", | |
"<div class=\"info_items\">200 partition(s)</div>\n", | |
"\n", | |
"<table class=\"optimus_table\">\n", | |
" <thead>\n", | |
" <tr>\n", | |
" \n", | |
" <th>\n", | |
" <div class=\"column_name\">company</div>\n", | |
" <div class=\"data_type\">1 (string)</div>\n", | |
" <div class=\"data_type\">\n", | |
" \n", | |
" nullable\n", | |
" \n", | |
" </div>\n", | |
" </th>\n", | |
" \n", | |
" <th>\n", | |
" <div class=\"column_name\">regiment</div>\n", | |
" <div class=\"data_type\">2 (string)</div>\n", | |
" <div class=\"data_type\">\n", | |
" \n", | |
" nullable\n", | |
" \n", | |
" </div>\n", | |
" </th>\n", | |
" \n", | |
" <th>\n", | |
" <div class=\"column_name\">count</div>\n", | |
" <div class=\"data_type\">3 (bigint)</div>\n", | |
" <div class=\"data_type\">\n", | |
" \n", | |
" </div>\n", | |
" </th>\n", | |
" \n", | |
" </tr>\n", | |
"\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div title='1st'>1st</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Dragoons'>Dragoons</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='2'>2</div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div title='1st'>1st</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Scouts'>Scouts</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='2'>2</div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div title='2nd'>2nd</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Nighthawks'>Nighthawks</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='2'>2</div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div title='2nd'>2nd</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Dragoons'>Dragoons</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='2'>2</div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div title='1st'>1st</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Nighthawks'>Nighthawks</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='2'>2</div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div title='2nd'>2nd</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Scouts'>Scouts</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='2'>2</div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" </tbody>\n", | |
"</table>\n", | |
"\n", | |
"<div class=\"info_items\">Viewing 6 of 6 rows / 3 columns</div>\n", | |
"<div class=\"info_items\">200 partition(s)</div>\n" | |
], | |
"text/plain": [ | |
"<IPython.core.display.HTML object>" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"regiment.groupby(['company', 'regiment']).count().table()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Iterate over a group and print the name and the whole data from the regiment" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 68, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"+----------+-------+--------+------------+-------------+\n", | |
"| regiment|company| name|preTestScore|postTestScore|\n", | |
"+----------+-------+--------+------------+-------------+\n", | |
"|Nighthawks| 1st| Miller| 4| 25|\n", | |
"|Nighthawks| 1st|Jacobson| 24| 94|\n", | |
"|Nighthawks| 2nd| Ali| 31| 57|\n", | |
"|Nighthawks| 2nd| Milner| 2| 62|\n", | |
"+----------+-------+--------+------------+-------------+\n", | |
"\n", | |
"+--------+-------+------+------------+-------------+\n", | |
"|regiment|company| name|preTestScore|postTestScore|\n", | |
"+--------+-------+------+------------+-------------+\n", | |
"|Dragoons| 1st| Cooze| 3| 70|\n", | |
"|Dragoons| 1st| Jacon| 4| 25|\n", | |
"|Dragoons| 2nd|Ryaner| 24| 94|\n", | |
"|Dragoons| 2nd| Sone| 31| 57|\n", | |
"+--------+-------+------+------------+-------------+\n", | |
"\n", | |
"+--------+-------+-----+------------+-------------+\n", | |
"|regiment|company| name|preTestScore|postTestScore|\n", | |
"+--------+-------+-----+------------+-------------+\n", | |
"| Scouts| 1st|Sloan| 2| 62|\n", | |
"| Scouts| 1st|Piger| 3| 70|\n", | |
"| Scouts| 2nd|Riani| 2| 62|\n", | |
"| Scouts| 2nd| Ali| 3| 70|\n", | |
"+--------+-------+-----+------------+-------------+\n", | |
"\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/plain": [ | |
"[None, None, None]" | |
] | |
}, | |
"execution_count": 68, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Get unique values in the grouping column\n", | |
"groups = [x[0] for x in regiment.select(\"regiment\").distinct().collect()]\n", | |
"\n", | |
"# Create a filtered DataFrame for each group in a list comprehension\n", | |
"groups_list = [regiment.filter(col('regiment')==x) for x in groups]\n", | |
"\n", | |
"# show the results\n", | |
"[x.show() for x in groups_list]" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.6.8" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment