Created
July 19, 2019 19:29
-
-
Save FavioVazquez/9cb1a93b59813c3323b665d47de30242 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<style> /* Tables*/\n", | |
"\n", | |
" .data_type {\n", | |
" font-size: 0.8em;\n", | |
" font-weight: normal;\n", | |
" }\n", | |
"\n", | |
" .column_name {\n", | |
" font-size: 1.2em;\n", | |
" }\n", | |
"\n", | |
" .info_items {\n", | |
" margin: 10px 0;\n", | |
" font-size: 0.8em;\n", | |
" }\n", | |
"\n", | |
" .optimus_table td {\n", | |
" border: 0px;\n", | |
" }\n", | |
"\n", | |
" .optimus_table tr:nth-child(even) {\n", | |
" background-color: #f2f2f2 !important;\n", | |
" }\n", | |
"\n", | |
" .optimus_table tr:nth-child(odd) {\n", | |
" background-color: #ffffff !important;\n", | |
" }\n", | |
"\n", | |
" .optimus_table thead {\n", | |
" border-bottom: 1px solid black;\n", | |
" }\n", | |
" .optimus_table{\n", | |
" font-size: 12px;\n", | |
" }\n", | |
"\n", | |
" .optimus_table tbody{\n", | |
" font-family: monospace;\n", | |
" border-bottom: 1px solid #cccccc;\n", | |
" }\n", | |
"\n", | |
" /* Profiler */\n", | |
" .main{\n", | |
" width:100%;\n", | |
" overflow:auto;\n", | |
" border-bottom:1px solid #eeeeee;\n", | |
" padding: 10px 0;\n", | |
" }\n", | |
" .panel_profiler{\n", | |
" margin-right:2%;\n", | |
" float:left;\n", | |
" padding-bottom:2%;\n", | |
" }\n", | |
" .panel_profiler tbody{\n", | |
" font-family:monospace;\n", | |
" }\n", | |
" .title_profiler{\n", | |
" padding:20px;\n", | |
" background-color: #eeeeee\n", | |
" }\n", | |
" .info{\n", | |
" overflow: auto\n", | |
" }\n", | |
" .main td, main th{\n", | |
" padding:0em\n", | |
" }\n", | |
" .panel_profiler td {\n", | |
" padding:0.2em\n", | |
" }\n", | |
" .none, .true{\n", | |
" color:#0000ff\n", | |
" }\n", | |
" .optimus_table th {\n", | |
" font-family:sans-serif;\n", | |
" }\n", | |
"\n", | |
" .info_items{\n", | |
" font-family:sans-serif;\n", | |
" font-size:10px;\n", | |
" }\n", | |
"</style>" | |
], | |
"text/plain": [ | |
"<IPython.core.display.HTML object>" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"# Import the necessary libraries\n", | |
"from optimus import Optimus\n", | |
"from pyspark.sql.functions import *\n", | |
"import pandas as pd\n", | |
"import numpy as np\n", | |
"op = Optimus()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Create sample data" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"raw_data_1 = {\n", | |
" 'subject_id': ['1', '2', '3', '4', '5'],\n", | |
" 'first_name': ['Alex', 'Amy', 'Allen', 'Alice', 'Ayoung'], \n", | |
" 'last_name': ['Anderson', 'Ackerman', 'Ali', 'Aoni', 'Atiches']}\n", | |
"\n", | |
"raw_data_2 = {\n", | |
" 'subject_id': ['4', '5', '6', '7', '8'],\n", | |
" 'first_name': ['Billy', 'Brian', 'Bran', 'Bryce', 'Betty'], \n", | |
" 'last_name': ['Bonder', 'Black', 'Balwner', 'Brice', 'Btisan']}\n", | |
"\n", | |
"raw_data_3 = {\n", | |
" 'subject_id': ['1', '2', '3', '4', '5', '7', '8', '9', '10', '11'],\n", | |
" 'test_id': [51, 15, 15, 61, 16, 14, 15, 1, 61, 16]}" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"data1_pd = pd.DataFrame(raw_data_1, columns = ['subject_id', 'first_name', 'last_name'])\n", | |
"data2_pd = pd.DataFrame(raw_data_2, columns = ['subject_id', 'first_name', 'last_name'])\n", | |
"data3_pd = pd.DataFrame(raw_data_3, columns = ['subject_id','test_id'])" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Create Spark DF" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"data_1 = op.spark.createDataFrame(data1_pd)\n", | |
"data_2 = op.spark.createDataFrame(data2_pd)\n", | |
"data_3 = op.spark.createDataFrame(data3_pd)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<style>\n", | |
" .data_type {\n", | |
" font-size: 0.8em;\n", | |
" font-weight: normal;\n", | |
" }\n", | |
"\n", | |
" .column_name {\n", | |
" font-size: 1.2em;\n", | |
" }\n", | |
"\n", | |
" .info_items {\n", | |
" margin: 10px 0;\n", | |
" font-size: 0.8em;\n", | |
" }\n", | |
"\n", | |
" .optimus_table td {\n", | |
" padding: 2px;\n", | |
" border-left: 1px solid #cccccc;\n", | |
" border-right: 1px solid #cccccc;\n", | |
" }\n", | |
"\n", | |
" .optimus_table tr:nth-child(even) {\n", | |
" background-color: #f2f2f2 !important;\n", | |
" }\n", | |
"\n", | |
" .optimus_table tr:nth-child(odd) {\n", | |
" background-color: #ffffff !important;\n", | |
" }\n", | |
"\n", | |
" .optimus_table thead {\n", | |
" border-bottom: 1px solid black;\n", | |
" }\n", | |
" .optimus_table{\n", | |
" font-size: 12px;\n", | |
" }\n", | |
"\n", | |
" .optimus_table tbody{\n", | |
" font-family: monospace;\n", | |
" border-bottom: 1px solid #cccccc;\n", | |
" }\n", | |
"\n", | |
"\n", | |
"</style>\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"<div class=\"info_items\">Viewing 3 of 5 rows / 3 columns</div>\n", | |
"<div class=\"info_items\">1 partition(s)</div>\n", | |
"\n", | |
"<table class=\"optimus_table\">\n", | |
" <thead>\n", | |
" <tr>\n", | |
" \n", | |
" <th>\n", | |
" <div class=\"column_name\">subject_id</div>\n", | |
" <div class=\"data_type\">1 (string)</div>\n", | |
" <div class=\"data_type\">\n", | |
" \n", | |
" nullable\n", | |
" \n", | |
" </div>\n", | |
" </th>\n", | |
" \n", | |
" <th>\n", | |
" <div class=\"column_name\">first_name</div>\n", | |
" <div class=\"data_type\">2 (string)</div>\n", | |
" <div class=\"data_type\">\n", | |
" \n", | |
" nullable\n", | |
" \n", | |
" </div>\n", | |
" </th>\n", | |
" \n", | |
" <th>\n", | |
" <div class=\"column_name\">last_name</div>\n", | |
" <div class=\"data_type\">3 (string)</div>\n", | |
" <div class=\"data_type\">\n", | |
" \n", | |
" nullable\n", | |
" \n", | |
" </div>\n", | |
" </th>\n", | |
" \n", | |
" </tr>\n", | |
"\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div title='1'>1</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Alex'>Alex</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Anderson'>Anderson</div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div title='2'>2</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Amy'>Amy</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Ackerman'>Ackerman</div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div title='3'>3</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Allen'>Allen</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Ali'>Ali</div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" </tbody>\n", | |
"</table>\n", | |
"\n", | |
"<div class=\"info_items\">Viewing 3 of 5 rows / 3 columns</div>\n", | |
"<div class=\"info_items\">1 partition(s)</div>\n" | |
], | |
"text/plain": [ | |
"<IPython.core.display.HTML object>" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"data_1.table(3)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<style>\n", | |
" .data_type {\n", | |
" font-size: 0.8em;\n", | |
" font-weight: normal;\n", | |
" }\n", | |
"\n", | |
" .column_name {\n", | |
" font-size: 1.2em;\n", | |
" }\n", | |
"\n", | |
" .info_items {\n", | |
" margin: 10px 0;\n", | |
" font-size: 0.8em;\n", | |
" }\n", | |
"\n", | |
" .optimus_table td {\n", | |
" padding: 2px;\n", | |
" border-left: 1px solid #cccccc;\n", | |
" border-right: 1px solid #cccccc;\n", | |
" }\n", | |
"\n", | |
" .optimus_table tr:nth-child(even) {\n", | |
" background-color: #f2f2f2 !important;\n", | |
" }\n", | |
"\n", | |
" .optimus_table tr:nth-child(odd) {\n", | |
" background-color: #ffffff !important;\n", | |
" }\n", | |
"\n", | |
" .optimus_table thead {\n", | |
" border-bottom: 1px solid black;\n", | |
" }\n", | |
" .optimus_table{\n", | |
" font-size: 12px;\n", | |
" }\n", | |
"\n", | |
" .optimus_table tbody{\n", | |
" font-family: monospace;\n", | |
" border-bottom: 1px solid #cccccc;\n", | |
" }\n", | |
"\n", | |
"\n", | |
"</style>\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"<div class=\"info_items\">Viewing 3 of 5 rows / 3 columns</div>\n", | |
"<div class=\"info_items\">1 partition(s)</div>\n", | |
"\n", | |
"<table class=\"optimus_table\">\n", | |
" <thead>\n", | |
" <tr>\n", | |
" \n", | |
" <th>\n", | |
" <div class=\"column_name\">subject_id</div>\n", | |
" <div class=\"data_type\">1 (string)</div>\n", | |
" <div class=\"data_type\">\n", | |
" \n", | |
" nullable\n", | |
" \n", | |
" </div>\n", | |
" </th>\n", | |
" \n", | |
" <th>\n", | |
" <div class=\"column_name\">first_name</div>\n", | |
" <div class=\"data_type\">2 (string)</div>\n", | |
" <div class=\"data_type\">\n", | |
" \n", | |
" nullable\n", | |
" \n", | |
" </div>\n", | |
" </th>\n", | |
" \n", | |
" <th>\n", | |
" <div class=\"column_name\">last_name</div>\n", | |
" <div class=\"data_type\">3 (string)</div>\n", | |
" <div class=\"data_type\">\n", | |
" \n", | |
" nullable\n", | |
" \n", | |
" </div>\n", | |
" </th>\n", | |
" \n", | |
" </tr>\n", | |
"\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div title='4'>4</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Billy'>Billy</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Bonder'>Bonder</div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div title='5'>5</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Brian'>Brian</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Black'>Black</div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div title='6'>6</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Bran'>Bran</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Balwner'>Balwner</div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" </tbody>\n", | |
"</table>\n", | |
"\n", | |
"<div class=\"info_items\">Viewing 3 of 5 rows / 3 columns</div>\n", | |
"<div class=\"info_items\">1 partition(s)</div>\n" | |
], | |
"text/plain": [ | |
"<IPython.core.display.HTML object>" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"data_2.table(3)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<style>\n", | |
" .data_type {\n", | |
" font-size: 0.8em;\n", | |
" font-weight: normal;\n", | |
" }\n", | |
"\n", | |
" .column_name {\n", | |
" font-size: 1.2em;\n", | |
" }\n", | |
"\n", | |
" .info_items {\n", | |
" margin: 10px 0;\n", | |
" font-size: 0.8em;\n", | |
" }\n", | |
"\n", | |
" .optimus_table td {\n", | |
" padding: 2px;\n", | |
" border-left: 1px solid #cccccc;\n", | |
" border-right: 1px solid #cccccc;\n", | |
" }\n", | |
"\n", | |
" .optimus_table tr:nth-child(even) {\n", | |
" background-color: #f2f2f2 !important;\n", | |
" }\n", | |
"\n", | |
" .optimus_table tr:nth-child(odd) {\n", | |
" background-color: #ffffff !important;\n", | |
" }\n", | |
"\n", | |
" .optimus_table thead {\n", | |
" border-bottom: 1px solid black;\n", | |
" }\n", | |
" .optimus_table{\n", | |
" font-size: 12px;\n", | |
" }\n", | |
"\n", | |
" .optimus_table tbody{\n", | |
" font-family: monospace;\n", | |
" border-bottom: 1px solid #cccccc;\n", | |
" }\n", | |
"\n", | |
"\n", | |
"</style>\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"<div class=\"info_items\">Viewing 3 of 10 rows / 2 columns</div>\n", | |
"<div class=\"info_items\">1 partition(s)</div>\n", | |
"\n", | |
"<table class=\"optimus_table\">\n", | |
" <thead>\n", | |
" <tr>\n", | |
" \n", | |
" <th>\n", | |
" <div class=\"column_name\">subject_id</div>\n", | |
" <div class=\"data_type\">1 (string)</div>\n", | |
" <div class=\"data_type\">\n", | |
" \n", | |
" nullable\n", | |
" \n", | |
" </div>\n", | |
" </th>\n", | |
" \n", | |
" <th>\n", | |
" <div class=\"column_name\">test_id</div>\n", | |
" <div class=\"data_type\">2 (bigint)</div>\n", | |
" <div class=\"data_type\">\n", | |
" \n", | |
" nullable\n", | |
" \n", | |
" </div>\n", | |
" </th>\n", | |
" \n", | |
" </tr>\n", | |
"\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div title='1'>1</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='51'>51</div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div title='2'>2</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='15'>15</div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div title='3'>3</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='15'>15</div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" </tbody>\n", | |
"</table>\n", | |
"\n", | |
"<div class=\"info_items\">Viewing 3 of 10 rows / 2 columns</div>\n", | |
"<div class=\"info_items\">1 partition(s)</div>\n" | |
], | |
"text/plain": [ | |
"<IPython.core.display.HTML object>" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"data_3.table(3)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Join the two dataframes along rows and assign all_data" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"all_data = data_1.union(data_2)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<style>\n", | |
" .data_type {\n", | |
" font-size: 0.8em;\n", | |
" font-weight: normal;\n", | |
" }\n", | |
"\n", | |
" .column_name {\n", | |
" font-size: 1.2em;\n", | |
" }\n", | |
"\n", | |
" .info_items {\n", | |
" margin: 10px 0;\n", | |
" font-size: 0.8em;\n", | |
" }\n", | |
"\n", | |
" .optimus_table td {\n", | |
" padding: 2px;\n", | |
" border-left: 1px solid #cccccc;\n", | |
" border-right: 1px solid #cccccc;\n", | |
" }\n", | |
"\n", | |
" .optimus_table tr:nth-child(even) {\n", | |
" background-color: #f2f2f2 !important;\n", | |
" }\n", | |
"\n", | |
" .optimus_table tr:nth-child(odd) {\n", | |
" background-color: #ffffff !important;\n", | |
" }\n", | |
"\n", | |
" .optimus_table thead {\n", | |
" border-bottom: 1px solid black;\n", | |
" }\n", | |
" .optimus_table{\n", | |
" font-size: 12px;\n", | |
" }\n", | |
"\n", | |
" .optimus_table tbody{\n", | |
" font-family: monospace;\n", | |
" border-bottom: 1px solid #cccccc;\n", | |
" }\n", | |
"\n", | |
"\n", | |
"</style>\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"<div class=\"info_items\">Viewing 10 of 10 rows / 3 columns</div>\n", | |
"<div class=\"info_items\">2 partition(s)</div>\n", | |
"\n", | |
"<table class=\"optimus_table\">\n", | |
" <thead>\n", | |
" <tr>\n", | |
" \n", | |
" <th>\n", | |
" <div class=\"column_name\">subject_id</div>\n", | |
" <div class=\"data_type\">1 (string)</div>\n", | |
" <div class=\"data_type\">\n", | |
" \n", | |
" nullable\n", | |
" \n", | |
" </div>\n", | |
" </th>\n", | |
" \n", | |
" <th>\n", | |
" <div class=\"column_name\">first_name</div>\n", | |
" <div class=\"data_type\">2 (string)</div>\n", | |
" <div class=\"data_type\">\n", | |
" \n", | |
" nullable\n", | |
" \n", | |
" </div>\n", | |
" </th>\n", | |
" \n", | |
" <th>\n", | |
" <div class=\"column_name\">last_name</div>\n", | |
" <div class=\"data_type\">3 (string)</div>\n", | |
" <div class=\"data_type\">\n", | |
" \n", | |
" nullable\n", | |
" \n", | |
" </div>\n", | |
" </th>\n", | |
" \n", | |
" </tr>\n", | |
"\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div title='1'>1</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Alex'>Alex</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Anderson'>Anderson</div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div title='2'>2</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Amy'>Amy</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Ackerman'>Ackerman</div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div title='3'>3</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Allen'>Allen</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Ali'>Ali</div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div title='4'>4</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Alice'>Alice</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Aoni'>Aoni</div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div title='5'>5</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Ayoung'>Ayoung</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Atiches'>Atiches</div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div title='4'>4</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Billy'>Billy</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Bonder'>Bonder</div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div title='5'>5</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Brian'>Brian</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Black'>Black</div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div title='6'>6</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Bran'>Bran</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Balwner'>Balwner</div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div title='7'>7</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Bryce'>Bryce</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Brice'>Brice</div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div title='8'>8</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Betty'>Betty</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Btisan'>Btisan</div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" </tbody>\n", | |
"</table>\n", | |
"\n", | |
"<div class=\"info_items\">Viewing 10 of 10 rows / 3 columns</div>\n", | |
"<div class=\"info_items\">2 partition(s)</div>\n" | |
], | |
"text/plain": [ | |
"<IPython.core.display.HTML object>" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"all_data.table()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Join the two dataframes along columns and assing to all_data_col" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"The Spark way of doing this isn't great, check below for Optimus version" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from pyspark.sql.functions import monotonically_increasing_id" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 27, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# This is not that pretty\n", | |
"data_1_id = data_1.withColumn(\"id\", monotonically_increasing_id())\n", | |
"data_2_id = data_2.withColumn(\"id\", monotonically_increasing_id())\n", | |
"\n", | |
"all_data_col = data_1_id.join(data_2_id, \"id\", \"outer\").select(data_1_id.subject_id, data_1_id.first_name, \n", | |
" data_1_id.last_name, \n", | |
" data_2_id.subject_id.alias(\"subject_id_df2\"), \n", | |
" data_2_id.first_name.alias(\"first_name_df2\"), \n", | |
" data_2_id.last_name.alias(\"last_name_df2\")).drop(\"id\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 28, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<style>\n", | |
" .data_type {\n", | |
" font-size: 0.8em;\n", | |
" font-weight: normal;\n", | |
" }\n", | |
"\n", | |
" .column_name {\n", | |
" font-size: 1.2em;\n", | |
" }\n", | |
"\n", | |
" .info_items {\n", | |
" margin: 10px 0;\n", | |
" font-size: 0.8em;\n", | |
" }\n", | |
"\n", | |
" .optimus_table td {\n", | |
" padding: 2px;\n", | |
" border-left: 1px solid #cccccc;\n", | |
" border-right: 1px solid #cccccc;\n", | |
" }\n", | |
"\n", | |
" .optimus_table tr:nth-child(even) {\n", | |
" background-color: #f2f2f2 !important;\n", | |
" }\n", | |
"\n", | |
" .optimus_table tr:nth-child(odd) {\n", | |
" background-color: #ffffff !important;\n", | |
" }\n", | |
"\n", | |
" .optimus_table thead {\n", | |
" border-bottom: 1px solid black;\n", | |
" }\n", | |
" .optimus_table{\n", | |
" font-size: 12px;\n", | |
" }\n", | |
"\n", | |
" .optimus_table tbody{\n", | |
" font-family: monospace;\n", | |
" border-bottom: 1px solid #cccccc;\n", | |
" }\n", | |
"\n", | |
"\n", | |
"</style>\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"<div class=\"info_items\">Viewing 5 of 5 rows / 6 columns</div>\n", | |
"<div class=\"info_items\">200 partition(s)</div>\n", | |
"\n", | |
"<table class=\"optimus_table\">\n", | |
" <thead>\n", | |
" <tr>\n", | |
" \n", | |
" <th>\n", | |
" <div class=\"column_name\">subject_id</div>\n", | |
" <div class=\"data_type\">1 (string)</div>\n", | |
" <div class=\"data_type\">\n", | |
" \n", | |
" nullable\n", | |
" \n", | |
" </div>\n", | |
" </th>\n", | |
" \n", | |
" <th>\n", | |
" <div class=\"column_name\">first_name</div>\n", | |
" <div class=\"data_type\">2 (string)</div>\n", | |
" <div class=\"data_type\">\n", | |
" \n", | |
" nullable\n", | |
" \n", | |
" </div>\n", | |
" </th>\n", | |
" \n", | |
" <th>\n", | |
" <div class=\"column_name\">last_name</div>\n", | |
" <div class=\"data_type\">3 (string)</div>\n", | |
" <div class=\"data_type\">\n", | |
" \n", | |
" nullable\n", | |
" \n", | |
" </div>\n", | |
" </th>\n", | |
" \n", | |
" <th>\n", | |
" <div class=\"column_name\">subject_id_df2</div>\n", | |
" <div class=\"data_type\">4 (string)</div>\n", | |
" <div class=\"data_type\">\n", | |
" \n", | |
" nullable\n", | |
" \n", | |
" </div>\n", | |
" </th>\n", | |
" \n", | |
" <th>\n", | |
" <div class=\"column_name\">first_name_df2</div>\n", | |
" <div class=\"data_type\">5 (string)</div>\n", | |
" <div class=\"data_type\">\n", | |
" \n", | |
" nullable\n", | |
" \n", | |
" </div>\n", | |
" </th>\n", | |
" \n", | |
" <th>\n", | |
" <div class=\"column_name\">last_name_df2</div>\n", | |
" <div class=\"data_type\">6 (string)</div>\n", | |
" <div class=\"data_type\">\n", | |
" \n", | |
" nullable\n", | |
" \n", | |
" </div>\n", | |
" </th>\n", | |
" \n", | |
" </tr>\n", | |
"\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div title='1'>1</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Alex'>Alex</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Anderson'>Anderson</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='4'>4</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Billy'>Billy</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Bonder'>Bonder</div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div title='2'>2</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Amy'>Amy</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Ackerman'>Ackerman</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='5'>5</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Brian'>Brian</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Black'>Black</div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div title='4'>4</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Alice'>Alice</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Aoni'>Aoni</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='7'>7</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Bryce'>Bryce</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Brice'>Brice</div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div title='3'>3</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Allen'>Allen</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Ali'>Ali</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='6'>6</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Bran'>Bran</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Balwner'>Balwner</div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div title='5'>5</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Ayoung'>Ayoung</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Atiches'>Atiches</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='8'>8</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Betty'>Betty</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Btisan'>Btisan</div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" </tbody>\n", | |
"</table>\n", | |
"\n", | |
"<div class=\"info_items\">Viewing 5 of 5 rows / 6 columns</div>\n", | |
"<div class=\"info_items\">200 partition(s)</div>\n" | |
], | |
"text/plain": [ | |
"<IPython.core.display.HTML object>" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"all_data_col.table()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"What happened above is that Spark needs a unique identifier for each column, so you need to rename the first or the second set of columns for it to work. I know, not great, not terrible. But with Optimus is much simpler:" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"+----------+----------+---------+----------+----------+---------+\n", | |
"|subject_id|first_name|last_name|subject_id|first_name|last_name|\n", | |
"+----------+----------+---------+----------+----------+---------+\n", | |
"| 1| Alex| Anderson| 4| Billy| Bonder|\n", | |
"| 2| Amy| Ackerman| 5| Brian| Black|\n", | |
"| 4| Alice| Aoni| 7| Bryce| Brice|\n", | |
"| 3| Allen| Ali| 6| Bran| Balwner|\n", | |
"| 5| Ayoung| Atiches| 8| Betty| Btisan|\n", | |
"+----------+----------+---------+----------+----------+---------+\n", | |
"\n" | |
] | |
} | |
], | |
"source": [ | |
"data_1.cols.append([data_2]).show()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Better :)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Merge all_data and data_3 along the subject_id value" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 30, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<style>\n", | |
" .data_type {\n", | |
" font-size: 0.8em;\n", | |
" font-weight: normal;\n", | |
" }\n", | |
"\n", | |
" .column_name {\n", | |
" font-size: 1.2em;\n", | |
" }\n", | |
"\n", | |
" .info_items {\n", | |
" margin: 10px 0;\n", | |
" font-size: 0.8em;\n", | |
" }\n", | |
"\n", | |
" .optimus_table td {\n", | |
" padding: 2px;\n", | |
" border-left: 1px solid #cccccc;\n", | |
" border-right: 1px solid #cccccc;\n", | |
" }\n", | |
"\n", | |
" .optimus_table tr:nth-child(even) {\n", | |
" background-color: #f2f2f2 !important;\n", | |
" }\n", | |
"\n", | |
" .optimus_table tr:nth-child(odd) {\n", | |
" background-color: #ffffff !important;\n", | |
" }\n", | |
"\n", | |
" .optimus_table thead {\n", | |
" border-bottom: 1px solid black;\n", | |
" }\n", | |
" .optimus_table{\n", | |
" font-size: 12px;\n", | |
" }\n", | |
"\n", | |
" .optimus_table tbody{\n", | |
" font-family: monospace;\n", | |
" border-bottom: 1px solid #cccccc;\n", | |
" }\n", | |
"\n", | |
"\n", | |
"</style>\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"<div class=\"info_items\">Viewing 9 of 9 rows / 4 columns</div>\n", | |
"<div class=\"info_items\">200 partition(s)</div>\n", | |
"\n", | |
"<table class=\"optimus_table\">\n", | |
" <thead>\n", | |
" <tr>\n", | |
" \n", | |
" <th>\n", | |
" <div class=\"column_name\">subject_id</div>\n", | |
" <div class=\"data_type\">1 (string)</div>\n", | |
" <div class=\"data_type\">\n", | |
" \n", | |
" nullable\n", | |
" \n", | |
" </div>\n", | |
" </th>\n", | |
" \n", | |
" <th>\n", | |
" <div class=\"column_name\">first_name</div>\n", | |
" <div class=\"data_type\">2 (string)</div>\n", | |
" <div class=\"data_type\">\n", | |
" \n", | |
" nullable\n", | |
" \n", | |
" </div>\n", | |
" </th>\n", | |
" \n", | |
" <th>\n", | |
" <div class=\"column_name\">last_name</div>\n", | |
" <div class=\"data_type\">3 (string)</div>\n", | |
" <div class=\"data_type\">\n", | |
" \n", | |
" nullable\n", | |
" \n", | |
" </div>\n", | |
" </th>\n", | |
" \n", | |
" <th>\n", | |
" <div class=\"column_name\">test_id</div>\n", | |
" <div class=\"data_type\">4 (bigint)</div>\n", | |
" <div class=\"data_type\">\n", | |
" \n", | |
" nullable\n", | |
" \n", | |
" </div>\n", | |
" </th>\n", | |
" \n", | |
" </tr>\n", | |
"\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div title='7'>7</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Bryce'>Bryce</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Brice'>Brice</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='14'>14</div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div title='3'>3</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Allen'>Allen</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Ali'>Ali</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='15'>15</div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div title='8'>8</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Betty'>Betty</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Btisan'>Btisan</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='15'>15</div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div title='5'>5</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Ayoung'>Ayoung</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Atiches'>Atiches</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='16'>16</div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div title='5'>5</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Brian'>Brian</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Black'>Black</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='16'>16</div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div title='1'>1</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Alex'>Alex</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Anderson'>Anderson</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='51'>51</div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div title='4'>4</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Alice'>Alice</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Aoni'>Aoni</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='61'>61</div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div title='4'>4</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Billy'>Billy</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Bonder'>Bonder</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='61'>61</div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" <tr>\n", | |
" \n", | |
" <td>\n", | |
" <div title='2'>2</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Amy'>Amy</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='Ackerman'>Ackerman</div>\n", | |
" </td>\n", | |
" \n", | |
" <td>\n", | |
" <div title='15'>15</div>\n", | |
" </td>\n", | |
" \n", | |
" </tr>\n", | |
" \n", | |
" </tbody>\n", | |
"</table>\n", | |
"\n", | |
"<div class=\"info_items\">Viewing 9 of 9 rows / 4 columns</div>\n", | |
"<div class=\"info_items\">200 partition(s)</div>\n" | |
], | |
"text/plain": [ | |
"<IPython.core.display.HTML object>" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"all_data.join(data_3, on=\"subject_id\").table()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Merge only the data that has the same 'subject_id' on both data_1 and data_2" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 53, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"+----------+----------+---------+----------+----------+---------+\n", | |
"|subject_id|first_name|last_name|subject_id|first_name|last_name|\n", | |
"+----------+----------+---------+----------+----------+---------+\n", | |
"| 5| Ayoung| Atiches| 5| Brian| Black|\n", | |
"| 4| Alice| Aoni| 4| Billy| Bonder|\n", | |
"+----------+----------+---------+----------+----------+---------+\n", | |
"\n" | |
] | |
} | |
], | |
"source": [ | |
"data_1.alias(\"a\").join(data_2.alias(\"b\"), \\\n", | |
" on= col(\"a.subject_id\") == col(\"b.subject_id\"), \\\n", | |
" how= \"inner\").show() " | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Here the trick is to create an alias for each dataframe, otherwise Spark will complain." | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Merge all values in data1 and data2, with matching records from both sides where available." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 54, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"+----------+----------+---------+----------+----------+---------+\n", | |
"|subject_id|first_name|last_name|subject_id|first_name|last_name|\n", | |
"+----------+----------+---------+----------+----------+---------+\n", | |
"| null| null| null| 7| Bryce| Brice|\n", | |
"| 3| Allen| Ali| null| null| null|\n", | |
"| null| null| null| 8| Betty| Btisan|\n", | |
"| 5| Ayoung| Atiches| 5| Brian| Black|\n", | |
"| null| null| null| 6| Bran| Balwner|\n", | |
"| 1| Alex| Anderson| null| null| null|\n", | |
"| 4| Alice| Aoni| 4| Billy| Bonder|\n", | |
"| 2| Amy| Ackerman| null| null| null|\n", | |
"+----------+----------+---------+----------+----------+---------+\n", | |
"\n" | |
] | |
} | |
], | |
"source": [ | |
"data_1.alias(\"a\").join(data_2.alias(\"b\"), \\\n", | |
" on= col(\"a.subject_id\") == col(\"b.subject_id\"), \\\n", | |
" how= \"outer\").show() " | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.6.8" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment