Created
July 19, 2019 19:29
-
-
Save FavioVazquez/6b1b4e5077c8a818cf301500d896150e to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# US - Baby Names\n", | |
"\n", | |
"## Introduction:\n", | |
"\n", | |
"We are going to use a subset of [US Baby Names](https://www.kaggle.com/kaggle/us-baby-names) from Kaggle.\n", | |
"\n", | |
"The file contains names from 2004 through 2014." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 53, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Import the necessary libraries\n", | |
"from optimus import Optimus\n", | |
"from pyspark.sql.functions import *\n", | |
"import pandas as pd\n", | |
"import numpy as np\n", | |
"op = Optimus()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Import the dataset and assign it to baby_names" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 54, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"baby_names_pd = pd.read_csv('https://raw.githubusercontent.com/guipsamora/pandas_exercises/master/06_Stats/US_Baby_Names/US_Baby_Names_right.csv')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 55, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"baby_names = op.spark.createDataFrame(baby_names_pd)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 56, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"\n", | |
"<h1></h1>\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"<div class=\"info_items\">Viewing 5 of 5 rows / 8 columns</div>\n", | |
"<div class=\"info_items\">1 partition(s)</div>\n", | |
"\n", | |
"<table class=\"optimus_table\">\n", | |
" <thead>\n", | |
" <tr>\n", | |
" \n", | |
" <th>\n", | |
" <div class=\"column_name\">summary</div>\n", | |
" <div class=\"data_type\">1 (string)</div>\n", | |
" <div class=\"data_type\">\n", | |
" \n", | |
" nullable\n", | |
" \n", | |
" </div>\n", | |
" </th>\n", | |
" \n", | |
" <th>\n", | |
" <div class=\"column_name\">Unnamed: 0</div>\n", | |
" <div class=\"data_type\">2 (string)</div>\n", | |
" <div class=\"data_type\">\n", | |
" \n", | |
" nullable\n", | |
" \n", | |
" </div>\n", | |
" </th>\n", | |
" \n", | |
" <th>\n", | |
" <div class=\"column_name\">Id</div>\n", | |
" <div class=\"data_type\">3 (string)</div>\n", | |
" <div class=\"data_type\">\n", | |
" \n", | |
" nullable\n", | |
" \n", | |
" </div>\n", | |
" </th>\n", | |
" \n", | |
" <th>\n", | |
" <div class=\"column_name\">Name</div>\n", | |
" <div class=\"data_type\">4 (string)</div>\n", | |
" <div class=\"data_type\">\n", | |
" \n", | |
" nullable\n", | |
" \n", | |
" </div>\n", | |
" </th>\n", | |
" \n", | |
" <th>\n", | |
" <div class=\"column_name\">Year</div>\n", | |
" <div class=\"data_type\">5 (string)</div>\n", | |
" <div class=\"data_type\">\n", | |
" \n", | |
" nullable\n", | |
" \n", | |
" </div>\n", | |
" </th>\n", | |
" \n", | |
" <th>\n", | |
" <div class=\"column_name\">Gender</div>\n", | |
" <div class=\"data_type\">6 (string)</div>\n", | |
" <div class=\"data_type\">\n", | |
" \n", | |
" nullable\n", | |
" \n", | |
" </div>\n", | |
" </th>\n", | |
" \n", | |
" <th>\n", | |
" <div class=\"column_name\">State</div>\n", | |
" <div class=\"data_type\">7 (string)</div>\n", | |
" <div class=\"data_type\">\n", | |
" \n", | |
" nullable\n", | |
" \n", | |
" </div>\n", | |
" </th>\n", | |
" \n", | |
" <th>\n", | |
" <div class=\"column_name\">Count</div>\n", | |
" <div class=\"data_type\">8 (string)</div>\n", | |
" <div class=\"data_type\">\n", | |
" \n", | |
" nullable\n", | |
" \n", | |
" </div>\n", | |
" </th>\n", | |
" \n", | |
" </tr>\n", | |
"\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='count'>count\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='1016395'>1016395\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='1016395'>1016395\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='1016395'>1016395\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='1016395'>1016395\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='1016395'>1016395\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='1016395'>1016395\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='1016395'>1016395\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='mean'>mean\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='2830990.4619178567'>2830990.4619178567\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='2830991.4619178567'>2830991.4619178567\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='Infinity'>Infinity\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='2009.0531899507573'>2009.0531899507573\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\"none \"\n", | |
" title='None'>None\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\"none \"\n", | |
" title='None'>None\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='34.85012421351935'>34.85012421351935\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='stddev'>stddev\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='1652475.6514804524'>1652475.6514804524\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='1652475.6514804524'>1652475.6514804524\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='NaN'>NaN\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='3.1382928281815494'>3.1382928281815494\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\"none \"\n", | |
" title='None'>None\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\"none \"\n", | |
" title='None'>None\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='97.3973464861767'>97.3973464861767\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='min'>min\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='11349'>11349\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='11350'>11350\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='Aaban'>Aaban\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='2004'>2004\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='F'>F\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='AK'>AK\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='5'>5\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='max'>max\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='5647425'>5647425\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='5647426'>5647426\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='Zyriah'>Zyriah\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='2014'>2014\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='M'>M\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='WY'>WY\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='4167'>4167\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" </tbody>\n", | |
"</table>\n", | |
"\n", | |
"\n", | |
"<div class=\"info_items\">Viewing 5 of 5 rows / 8 columns</div>\n", | |
"<div class=\"info_items\">1 partition(s)</div>\n" | |
], | |
"text/plain": [ | |
"<IPython.core.display.HTML object>" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"baby_names.describe().table()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# See the first 10 entries" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 57, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"\n", | |
"<h1></h1>\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"<div class=\"info_items\">Viewing 10 of 1.0 million rows / 7 columns</div>\n", | |
"<div class=\"info_items\">1 partition(s)</div>\n", | |
"\n", | |
"<table class=\"optimus_table\">\n", | |
" <thead>\n", | |
" <tr>\n", | |
" \n", | |
" <th>\n", | |
" <div class=\"column_name\">Unnamed: 0</div>\n", | |
" <div class=\"data_type\">1 (bigint)</div>\n", | |
" <div class=\"data_type\">\n", | |
" \n", | |
" nullable\n", | |
" \n", | |
" </div>\n", | |
" </th>\n", | |
" \n", | |
" <th>\n", | |
" <div class=\"column_name\">Id</div>\n", | |
" <div class=\"data_type\">2 (bigint)</div>\n", | |
" <div class=\"data_type\">\n", | |
" \n", | |
" nullable\n", | |
" \n", | |
" </div>\n", | |
" </th>\n", | |
" \n", | |
" <th>\n", | |
" <div class=\"column_name\">Name</div>\n", | |
" <div class=\"data_type\">3 (string)</div>\n", | |
" <div class=\"data_type\">\n", | |
" \n", | |
" nullable\n", | |
" \n", | |
" </div>\n", | |
" </th>\n", | |
" \n", | |
" <th>\n", | |
" <div class=\"column_name\">Year</div>\n", | |
" <div class=\"data_type\">4 (bigint)</div>\n", | |
" <div class=\"data_type\">\n", | |
" \n", | |
" nullable\n", | |
" \n", | |
" </div>\n", | |
" </th>\n", | |
" \n", | |
" <th>\n", | |
" <div class=\"column_name\">Gender</div>\n", | |
" <div class=\"data_type\">5 (string)</div>\n", | |
" <div class=\"data_type\">\n", | |
" \n", | |
" nullable\n", | |
" \n", | |
" </div>\n", | |
" </th>\n", | |
" \n", | |
" <th>\n", | |
" <div class=\"column_name\">State</div>\n", | |
" <div class=\"data_type\">6 (string)</div>\n", | |
" <div class=\"data_type\">\n", | |
" \n", | |
" nullable\n", | |
" \n", | |
" </div>\n", | |
" </th>\n", | |
" \n", | |
" <th>\n", | |
" <div class=\"column_name\">Count</div>\n", | |
" <div class=\"data_type\">7 (bigint)</div>\n", | |
" <div class=\"data_type\">\n", | |
" \n", | |
" nullable\n", | |
" \n", | |
" </div>\n", | |
" </th>\n", | |
" \n", | |
" </tr>\n", | |
"\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='11349'>11349\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='11350'>11350\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='Emma'>Emma\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='2004'>2004\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='F'>F\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='AK'>AK\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='62'>62\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='11350'>11350\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='11351'>11351\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='Madison'>Madison\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='2004'>2004\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='F'>F\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='AK'>AK\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='48'>48\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='11351'>11351\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='11352'>11352\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='Hannah'>Hannah\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='2004'>2004\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='F'>F\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='AK'>AK\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='46'>46\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='11352'>11352\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='11353'>11353\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='Grace'>Grace\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='2004'>2004\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='F'>F\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='AK'>AK\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='44'>44\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='11353'>11353\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='11354'>11354\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='Emily'>Emily\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='2004'>2004\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='F'>F\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='AK'>AK\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='41'>41\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='11354'>11354\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='11355'>11355\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='Abigail'>Abigail\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='2004'>2004\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='F'>F\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='AK'>AK\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='37'>37\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='11355'>11355\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='11356'>11356\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='Olivia'>Olivia\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='2004'>2004\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='F'>F\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='AK'>AK\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='33'>33\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='11356'>11356\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='11357'>11357\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='Isabella'>Isabella\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='2004'>2004\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='F'>F\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='AK'>AK\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='30'>30\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='11357'>11357\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='11358'>11358\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='Alyssa'>Alyssa\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='2004'>2004\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='F'>F\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='AK'>AK\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='29'>29\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='11358'>11358\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='11359'>11359\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='Sophia'>Sophia\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='2004'>2004\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='F'>F\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='AK'>AK\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='28'>28\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" </tbody>\n", | |
"</table>\n", | |
"\n", | |
"\n", | |
"<div class=\"info_items\">Viewing 10 of 1.0 million rows / 7 columns</div>\n", | |
"<div class=\"info_items\">1 partition(s)</div>\n" | |
], | |
"text/plain": [ | |
"<IPython.core.display.HTML object>" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"baby_names.table(10)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Delete the columns 'Unnamed: 0' and 'Id'" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 58, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"baby_names = baby_names.drop(\"Unnamed: 0\", \"Id\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 59, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"ename": "KeyboardInterrupt", | |
"evalue": "", | |
"output_type": "error", | |
"traceback": [ | |
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", | |
"\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", | |
"\u001b[0;32m<ipython-input-59-e3c19713dd04>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mbaby_names\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtable\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m5\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", | |
"\u001b[0;32m~/.local/lib/python3.6/site-packages/optimus/helpers/decorators.py\u001b[0m in \u001b[0;36mwrapper\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 14\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mdecorator\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfunc\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 15\u001b[0m \u001b[0;34m@\u001b[0m\u001b[0mwraps\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfunc\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 16\u001b[0;31m \u001b[0;32mdef\u001b[0m \u001b[0mwrapper\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 17\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 18\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", | |
"\u001b[0;32m~/.local/lib/python3.6/site-packages/optimus/dataframe/extension.py\u001b[0m in \u001b[0;36mtable\u001b[0;34m(self, limit, columns)\u001b[0m\n\u001b[1;32m 332\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mi\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mdtypes\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 333\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mj\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mcolumns\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 334\u001b[0;31m \u001b[0;32mif\u001b[0m \u001b[0mi\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0mj\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 335\u001b[0m \u001b[0mfinal_columns\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 336\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", | |
"\u001b[0;32m~/.local/lib/python3.6/site-packages/optimus/helpers/decorators.py\u001b[0m in \u001b[0;36mwrapper\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 14\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mdecorator\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfunc\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 15\u001b[0m \u001b[0;34m@\u001b[0m\u001b[0mwraps\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfunc\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 16\u001b[0;31m \u001b[0;32mdef\u001b[0m \u001b[0mwrapper\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 17\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 18\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", | |
"\u001b[0;32m~/.local/lib/python3.6/site-packages/optimus/dataframe/extension.py\u001b[0m in \u001b[0;36mtable_html\u001b[0;34m(self, limit, columns)\u001b[0m\n\u001b[1;32m 313\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 314\u001b[0m \u001b[0;34m:\u001b[0m\u001b[0;32mreturn\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 315\u001b[0;31m \"\"\"\n\u001b[0m\u001b[1;32m 316\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 317\u001b[0m \u001b[0mcolumns\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mparse_columns\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcolumns\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", | |
"\u001b[0;32m~/.local/lib/python3.6/site-packages/pyspark/sql/dataframe.py\u001b[0m in \u001b[0;36mcount\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 520\u001b[0m \u001b[0;36m2\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 521\u001b[0m \"\"\"\n\u001b[0;32m--> 522\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_jdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcount\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 523\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 524\u001b[0m \u001b[0;34m@\u001b[0m\u001b[0mignore_unicode_prefix\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", | |
"\u001b[0;32m~/.local/lib/python3.6/site-packages/py4j/java_gateway.py\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, *args)\u001b[0m\n\u001b[1;32m 1253\u001b[0m \u001b[0mproto\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mEND_COMMAND_PART\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1254\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1255\u001b[0;31m \u001b[0manswer\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgateway_client\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msend_command\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcommand\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1256\u001b[0m return_value = get_return_value(\n\u001b[1;32m 1257\u001b[0m answer, self.gateway_client, self.target_id, self.name)\n", | |
"\u001b[0;32m~/.local/lib/python3.6/site-packages/py4j/java_gateway.py\u001b[0m in \u001b[0;36msend_command\u001b[0;34m(self, command, retry, binary)\u001b[0m\n\u001b[1;32m 983\u001b[0m \u001b[0mconnection\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_get_connection\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 984\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 985\u001b[0;31m \u001b[0mresponse\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mconnection\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msend_command\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcommand\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 986\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mbinary\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 987\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mresponse\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_create_connection_guard\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mconnection\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", | |
"\u001b[0;32m~/.local/lib/python3.6/site-packages/py4j/java_gateway.py\u001b[0m in \u001b[0;36msend_command\u001b[0;34m(self, command)\u001b[0m\n\u001b[1;32m 1150\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1151\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1152\u001b[0;31m \u001b[0manswer\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0msmart_decode\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstream\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreadline\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1153\u001b[0m \u001b[0mlogger\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdebug\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Answer received: {0}\"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0manswer\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1154\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0manswer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstartswith\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mproto\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mRETURN_MESSAGE\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", | |
"\u001b[0;32m/opt/conda/lib/python3.6/socket.py\u001b[0m in \u001b[0;36mreadinto\u001b[0;34m(self, b)\u001b[0m\n\u001b[1;32m 584\u001b[0m \u001b[0;32mwhile\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 585\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 586\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_sock\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrecv_into\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mb\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 587\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mtimeout\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 588\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_timeout_occurred\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", | |
"\u001b[0;31mKeyboardInterrupt\u001b[0m: " | |
] | |
} | |
], | |
"source": [ | |
"baby_names.table(5)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Are there more male or female names in the dataset?" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 33, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"\n", | |
"<h1></h1>\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"<div class=\"info_items\">Viewing 2 of 2 rows / 2 columns</div>\n", | |
"<div class=\"info_items\">2 partition(s)</div>\n", | |
"\n", | |
"<table class=\"optimus_table\">\n", | |
" <thead>\n", | |
" <tr>\n", | |
" \n", | |
" <th>\n", | |
" <div class=\"column_name\">Gender</div>\n", | |
" <div class=\"data_type\">1 (string)</div>\n", | |
" <div class=\"data_type\">\n", | |
" \n", | |
" nullable\n", | |
" \n", | |
" </div>\n", | |
" </th>\n", | |
" \n", | |
" <th>\n", | |
" <div class=\"column_name\">count</div>\n", | |
" <div class=\"data_type\">2 (bigint)</div>\n", | |
" <div class=\"data_type\">\n", | |
" \n", | |
" not nullable\n", | |
" \n", | |
" </div>\n", | |
" </th>\n", | |
" \n", | |
" </tr>\n", | |
"\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='M'>M\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='457549'>457549\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='F'>F\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='558846'>558846\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" </tbody>\n", | |
"</table>\n", | |
"\n", | |
"\n", | |
"<div class=\"info_items\">Viewing 2 of 2 rows / 2 columns</div>\n", | |
"<div class=\"info_items\">2 partition(s)</div>\n" | |
], | |
"text/plain": [ | |
"<IPython.core.display.HTML object>" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"baby_names.groupby(\"Gender\").count().orderBy(\"count\").table()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Group the dataset by name and assign to names" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 34, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"baby_names = baby_names.drop(\"Year\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 43, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"names = baby_names.groupby(\"Name\").sum().cols.rename(\"sum(Count)\",\"count\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 44, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"\n", | |
"<h1></h1>\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"<div class=\"info_items\">Viewing 5 of 17632 rows / 2 columns</div>\n", | |
"<div class=\"info_items\">200 partition(s)</div>\n", | |
"\n", | |
"<table class=\"optimus_table\">\n", | |
" <thead>\n", | |
" <tr>\n", | |
" \n", | |
" <th>\n", | |
" <div class=\"column_name\">Name</div>\n", | |
" <div class=\"data_type\">1 (string)</div>\n", | |
" <div class=\"data_type\">\n", | |
" \n", | |
" nullable\n", | |
" \n", | |
" </div>\n", | |
" </th>\n", | |
" \n", | |
" <th>\n", | |
" <div class=\"column_name\">count</div>\n", | |
" <div class=\"data_type\">2 (bigint)</div>\n", | |
" <div class=\"data_type\">\n", | |
" \n", | |
" nullable\n", | |
" \n", | |
" </div>\n", | |
" </th>\n", | |
" \n", | |
" </tr>\n", | |
"\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='Kiana'>Kiana\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='5965'>5965\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='Alayna'>Alayna\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='14171'>14171\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='Ember'>Ember\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='3181'>3181\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='Tyler'>Tyler\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='129989'>129989\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='Maddox'>Maddox\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='20716'>20716\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" </tbody>\n", | |
"</table>\n", | |
"\n", | |
"\n", | |
"<div class=\"info_items\">Viewing 5 of 17632 rows / 2 columns</div>\n", | |
"<div class=\"info_items\">200 partition(s)</div>\n" | |
], | |
"text/plain": [ | |
"<IPython.core.display.HTML object>" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"names.table(5)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 42, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Columns: 2 Rows: 17632\n" | |
] | |
} | |
], | |
"source": [ | |
"print(f'Columns: {op.profiler.dataset_info(names)[\"cols_count\"]}', \n", | |
" f'Rows: {op.profiler.dataset_info(names)[\"rows_count\"]}')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 49, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"\n", | |
"<h1></h1>\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"<div class=\"info_items\">Viewing 10 of 17632 rows / 2 columns</div>\n", | |
"<div class=\"info_items\">141 partition(s)</div>\n", | |
"\n", | |
"<table class=\"optimus_table\">\n", | |
" <thead>\n", | |
" <tr>\n", | |
" \n", | |
" <th>\n", | |
" <div class=\"column_name\">Name</div>\n", | |
" <div class=\"data_type\">1 (string)</div>\n", | |
" <div class=\"data_type\">\n", | |
" \n", | |
" nullable\n", | |
" \n", | |
" </div>\n", | |
" </th>\n", | |
" \n", | |
" <th>\n", | |
" <div class=\"column_name\">count</div>\n", | |
" <div class=\"data_type\">2 (bigint)</div>\n", | |
" <div class=\"data_type\">\n", | |
" \n", | |
" nullable\n", | |
" \n", | |
" </div>\n", | |
" </th>\n", | |
" \n", | |
" </tr>\n", | |
"\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='Jacob'>Jacob\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='242874'>242874\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='Emma'>Emma\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='214852'>214852\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='Michael'>Michael\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='214405'>214405\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='Ethan'>Ethan\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='209277'>209277\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='Isabella'>Isabella\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='204798'>204798\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='William'>William\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='197894'>197894\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='Joshua'>Joshua\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='191551'>191551\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='Sophia'>Sophia\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='191446'>191446\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='Daniel'>Daniel\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='191440'>191440\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='Emily'>Emily\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='190318'>190318\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" </tbody>\n", | |
"</table>\n", | |
"\n", | |
"\n", | |
"<div class=\"info_items\">Viewing 10 of 17632 rows / 2 columns</div>\n", | |
"<div class=\"info_items\">141 partition(s)</div>\n" | |
], | |
"text/plain": [ | |
"<IPython.core.display.HTML object>" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"names.sort(desc(\"count\")).table(10)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# How many different names exist in the dataset?" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 50, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"17632" | |
] | |
}, | |
"execution_count": 50, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"names.count()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# What is the name with most occurrences?" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 62, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"\n", | |
"<h1></h1>\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"<div class=\"info_items\">Viewing 1 of 17632 rows / 1 columns</div>\n", | |
"<div class=\"info_items\">141 partition(s)</div>\n", | |
"\n", | |
"<table class=\"optimus_table\">\n", | |
" <thead>\n", | |
" <tr>\n", | |
" \n", | |
" <th>\n", | |
" <div class=\"column_name\">Name</div>\n", | |
" <div class=\"data_type\">1 (string)</div>\n", | |
" <div class=\"data_type\">\n", | |
" \n", | |
" nullable\n", | |
" \n", | |
" </div>\n", | |
" </th>\n", | |
" \n", | |
" </tr>\n", | |
"\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='Jacob'>Jacob\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" </tbody>\n", | |
"</table>\n", | |
"\n", | |
"\n", | |
"<div class=\"info_items\">Viewing 1 of 17632 rows / 1 columns</div>\n", | |
"<div class=\"info_items\">141 partition(s)</div>\n" | |
], | |
"text/plain": [ | |
"<IPython.core.display.HTML object>" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"names.sort(desc(\"count\")).select(\"Name\").table(1)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# How many different names have the least occurrences?" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 86, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"2578" | |
] | |
}, | |
"execution_count": 86, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"min_oc = names.cols.min(\"count\") # This will give you the min ocurrence for names\n", | |
"names.where(col(\"count\") == min_oc).count()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# What is the median name occurrence?" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 99, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"49.0" | |
] | |
}, | |
"execution_count": 99, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"median_oc = names.approxQuantile(\"count\", [0.5], relativeError=0)[0] # This will give you the median ocurrence for names\n", | |
"median_oc" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# What is the standard deviation of names?" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 100, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"11006.06947" | |
] | |
}, | |
"execution_count": 100, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"names.cols.std(\"count\")" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Get a summary with the mean, min, max, std and quartiles." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 111, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"\n", | |
"<h1></h1>\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"<div class=\"info_items\">Viewing 5 of 5 rows / 2 columns</div>\n", | |
"<div class=\"info_items\">1 partition(s)</div>\n", | |
"\n", | |
"<table class=\"optimus_table\">\n", | |
" <thead>\n", | |
" <tr>\n", | |
" \n", | |
" <th>\n", | |
" <div class=\"column_name\">summary</div>\n", | |
" <div class=\"data_type\">1 (string)</div>\n", | |
" <div class=\"data_type\">\n", | |
" \n", | |
" nullable\n", | |
" \n", | |
" </div>\n", | |
" </th>\n", | |
" \n", | |
" <th>\n", | |
" <div class=\"column_name\">count</div>\n", | |
" <div class=\"data_type\">2 (string)</div>\n", | |
" <div class=\"data_type\">\n", | |
" \n", | |
" nullable\n", | |
" \n", | |
" </div>\n", | |
" </th>\n", | |
" \n", | |
" </tr>\n", | |
"\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='count'>count\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='17632'>17632\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='mean'>mean\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='2008.932168784029'>2008.932168784029\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='stddev'>stddev\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='11006.069467890566'>11006.069467890566\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='min'>min\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='5'>5\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='max'>max\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div class=\" \"\n", | |
" title='242874'>242874\n", | |
" </div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" </tbody>\n", | |
"</table>\n", | |
"\n", | |
"\n", | |
"<div class=\"info_items\">Viewing 5 of 5 rows / 2 columns</div>\n", | |
"<div class=\"info_items\">1 partition(s)</div>\n" | |
], | |
"text/plain": [ | |
"<IPython.core.display.HTML object>" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"names.select(\"count\").describe().table()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 117, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[11.0, 49.0, 337.0]" | |
] | |
}, | |
"execution_count": 117, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"names.approxQuantile(\"count\",[0.25,0.5,0.75], relativeError=0)" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.6.8" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment