Skip to content

Instantly share code, notes, and snippets.

@FavioVazquez
Created July 19, 2019 19:29
Show Gist options
  • Save FavioVazquez/9cb1a93b59813c3323b665d47de30242 to your computer and use it in GitHub Desktop.
Save FavioVazquez/9cb1a93b59813c3323b665d47de30242 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<style> /* Tables*/\n",
"\n",
" .data_type {\n",
" font-size: 0.8em;\n",
" font-weight: normal;\n",
" }\n",
"\n",
" .column_name {\n",
" font-size: 1.2em;\n",
" }\n",
"\n",
" .info_items {\n",
" margin: 10px 0;\n",
" font-size: 0.8em;\n",
" }\n",
"\n",
" .optimus_table td {\n",
" border: 0px;\n",
" }\n",
"\n",
" .optimus_table tr:nth-child(even) {\n",
" background-color: #f2f2f2 !important;\n",
" }\n",
"\n",
" .optimus_table tr:nth-child(odd) {\n",
" background-color: #ffffff !important;\n",
" }\n",
"\n",
" .optimus_table thead {\n",
" border-bottom: 1px solid black;\n",
" }\n",
" .optimus_table{\n",
" font-size: 12px;\n",
" }\n",
"\n",
" .optimus_table tbody{\n",
" font-family: monospace;\n",
" border-bottom: 1px solid #cccccc;\n",
" }\n",
"\n",
" /* Profiler */\n",
" .main{\n",
" width:100%;\n",
" overflow:auto;\n",
" border-bottom:1px solid #eeeeee;\n",
" padding: 10px 0;\n",
" }\n",
" .panel_profiler{\n",
" margin-right:2%;\n",
" float:left;\n",
" padding-bottom:2%;\n",
" }\n",
" .panel_profiler tbody{\n",
" font-family:monospace;\n",
" }\n",
" .title_profiler{\n",
" padding:20px;\n",
" background-color: #eeeeee\n",
" }\n",
" .info{\n",
" overflow: auto\n",
" }\n",
" .main td, main th{\n",
" padding:0em\n",
" }\n",
" .panel_profiler td {\n",
" padding:0.2em\n",
" }\n",
" .none, .true{\n",
" color:#0000ff\n",
" }\n",
" .optimus_table th {\n",
" font-family:sans-serif;\n",
" }\n",
"\n",
" .info_items{\n",
" font-family:sans-serif;\n",
" font-size:10px;\n",
" }\n",
"</style>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Import the necessary libraries\n",
"from optimus import Optimus\n",
"from pyspark.sql.functions import *\n",
"import pandas as pd\n",
"import numpy as np\n",
"op = Optimus()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Create sample data"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"raw_data_1 = {\n",
" 'subject_id': ['1', '2', '3', '4', '5'],\n",
" 'first_name': ['Alex', 'Amy', 'Allen', 'Alice', 'Ayoung'], \n",
" 'last_name': ['Anderson', 'Ackerman', 'Ali', 'Aoni', 'Atiches']}\n",
"\n",
"raw_data_2 = {\n",
" 'subject_id': ['4', '5', '6', '7', '8'],\n",
" 'first_name': ['Billy', 'Brian', 'Bran', 'Bryce', 'Betty'], \n",
" 'last_name': ['Bonder', 'Black', 'Balwner', 'Brice', 'Btisan']}\n",
"\n",
"raw_data_3 = {\n",
" 'subject_id': ['1', '2', '3', '4', '5', '7', '8', '9', '10', '11'],\n",
" 'test_id': [51, 15, 15, 61, 16, 14, 15, 1, 61, 16]}"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"data1_pd = pd.DataFrame(raw_data_1, columns = ['subject_id', 'first_name', 'last_name'])\n",
"data2_pd = pd.DataFrame(raw_data_2, columns = ['subject_id', 'first_name', 'last_name'])\n",
"data3_pd = pd.DataFrame(raw_data_3, columns = ['subject_id','test_id'])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Create Spark DF"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"data_1 = op.spark.createDataFrame(data1_pd)\n",
"data_2 = op.spark.createDataFrame(data2_pd)\n",
"data_3 = op.spark.createDataFrame(data3_pd)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<style>\n",
" .data_type {\n",
" font-size: 0.8em;\n",
" font-weight: normal;\n",
" }\n",
"\n",
" .column_name {\n",
" font-size: 1.2em;\n",
" }\n",
"\n",
" .info_items {\n",
" margin: 10px 0;\n",
" font-size: 0.8em;\n",
" }\n",
"\n",
" .optimus_table td {\n",
" padding: 2px;\n",
" border-left: 1px solid #cccccc;\n",
" border-right: 1px solid #cccccc;\n",
" }\n",
"\n",
" .optimus_table tr:nth-child(even) {\n",
" background-color: #f2f2f2 !important;\n",
" }\n",
"\n",
" .optimus_table tr:nth-child(odd) {\n",
" background-color: #ffffff !important;\n",
" }\n",
"\n",
" .optimus_table thead {\n",
" border-bottom: 1px solid black;\n",
" }\n",
" .optimus_table{\n",
" font-size: 12px;\n",
" }\n",
"\n",
" .optimus_table tbody{\n",
" font-family: monospace;\n",
" border-bottom: 1px solid #cccccc;\n",
" }\n",
"\n",
"\n",
"</style>\n",
"\n",
"\n",
"\n",
"\n",
"<div class=\"info_items\">Viewing 3 of 5 rows / 3 columns</div>\n",
"<div class=\"info_items\">1 partition(s)</div>\n",
"\n",
"<table class=\"optimus_table\">\n",
" <thead>\n",
" <tr>\n",
" \n",
" <th>\n",
" <div class=\"column_name\">subject_id</div>\n",
" <div class=\"data_type\">1 (string)</div>\n",
" <div class=\"data_type\">\n",
" \n",
" nullable\n",
" \n",
" </div>\n",
" </th>\n",
" \n",
" <th>\n",
" <div class=\"column_name\">first_name</div>\n",
" <div class=\"data_type\">2 (string)</div>\n",
" <div class=\"data_type\">\n",
" \n",
" nullable\n",
" \n",
" </div>\n",
" </th>\n",
" \n",
" <th>\n",
" <div class=\"column_name\">last_name</div>\n",
" <div class=\"data_type\">3 (string)</div>\n",
" <div class=\"data_type\">\n",
" \n",
" nullable\n",
" \n",
" </div>\n",
" </th>\n",
" \n",
" </tr>\n",
"\n",
" </thead>\n",
" <tbody>\n",
" \n",
" <tr>\n",
" \n",
" <td>\n",
" <div title='1'>1</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='Alex'>Alex</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='Anderson'>Anderson</div>\n",
" </td>\n",
" \n",
" </tr>\n",
" \n",
" <tr>\n",
" \n",
" <td>\n",
" <div title='2'>2</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='Amy'>Amy</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='Ackerman'>Ackerman</div>\n",
" </td>\n",
" \n",
" </tr>\n",
" \n",
" <tr>\n",
" \n",
" <td>\n",
" <div title='3'>3</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='Allen'>Allen</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='Ali'>Ali</div>\n",
" </td>\n",
" \n",
" </tr>\n",
" \n",
" </tbody>\n",
"</table>\n",
"\n",
"<div class=\"info_items\">Viewing 3 of 5 rows / 3 columns</div>\n",
"<div class=\"info_items\">1 partition(s)</div>\n"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"data_1.table(3)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<style>\n",
" .data_type {\n",
" font-size: 0.8em;\n",
" font-weight: normal;\n",
" }\n",
"\n",
" .column_name {\n",
" font-size: 1.2em;\n",
" }\n",
"\n",
" .info_items {\n",
" margin: 10px 0;\n",
" font-size: 0.8em;\n",
" }\n",
"\n",
" .optimus_table td {\n",
" padding: 2px;\n",
" border-left: 1px solid #cccccc;\n",
" border-right: 1px solid #cccccc;\n",
" }\n",
"\n",
" .optimus_table tr:nth-child(even) {\n",
" background-color: #f2f2f2 !important;\n",
" }\n",
"\n",
" .optimus_table tr:nth-child(odd) {\n",
" background-color: #ffffff !important;\n",
" }\n",
"\n",
" .optimus_table thead {\n",
" border-bottom: 1px solid black;\n",
" }\n",
" .optimus_table{\n",
" font-size: 12px;\n",
" }\n",
"\n",
" .optimus_table tbody{\n",
" font-family: monospace;\n",
" border-bottom: 1px solid #cccccc;\n",
" }\n",
"\n",
"\n",
"</style>\n",
"\n",
"\n",
"\n",
"\n",
"<div class=\"info_items\">Viewing 3 of 5 rows / 3 columns</div>\n",
"<div class=\"info_items\">1 partition(s)</div>\n",
"\n",
"<table class=\"optimus_table\">\n",
" <thead>\n",
" <tr>\n",
" \n",
" <th>\n",
" <div class=\"column_name\">subject_id</div>\n",
" <div class=\"data_type\">1 (string)</div>\n",
" <div class=\"data_type\">\n",
" \n",
" nullable\n",
" \n",
" </div>\n",
" </th>\n",
" \n",
" <th>\n",
" <div class=\"column_name\">first_name</div>\n",
" <div class=\"data_type\">2 (string)</div>\n",
" <div class=\"data_type\">\n",
" \n",
" nullable\n",
" \n",
" </div>\n",
" </th>\n",
" \n",
" <th>\n",
" <div class=\"column_name\">last_name</div>\n",
" <div class=\"data_type\">3 (string)</div>\n",
" <div class=\"data_type\">\n",
" \n",
" nullable\n",
" \n",
" </div>\n",
" </th>\n",
" \n",
" </tr>\n",
"\n",
" </thead>\n",
" <tbody>\n",
" \n",
" <tr>\n",
" \n",
" <td>\n",
" <div title='4'>4</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='Billy'>Billy</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='Bonder'>Bonder</div>\n",
" </td>\n",
" \n",
" </tr>\n",
" \n",
" <tr>\n",
" \n",
" <td>\n",
" <div title='5'>5</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='Brian'>Brian</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='Black'>Black</div>\n",
" </td>\n",
" \n",
" </tr>\n",
" \n",
" <tr>\n",
" \n",
" <td>\n",
" <div title='6'>6</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='Bran'>Bran</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='Balwner'>Balwner</div>\n",
" </td>\n",
" \n",
" </tr>\n",
" \n",
" </tbody>\n",
"</table>\n",
"\n",
"<div class=\"info_items\">Viewing 3 of 5 rows / 3 columns</div>\n",
"<div class=\"info_items\">1 partition(s)</div>\n"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"data_2.table(3)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<style>\n",
" .data_type {\n",
" font-size: 0.8em;\n",
" font-weight: normal;\n",
" }\n",
"\n",
" .column_name {\n",
" font-size: 1.2em;\n",
" }\n",
"\n",
" .info_items {\n",
" margin: 10px 0;\n",
" font-size: 0.8em;\n",
" }\n",
"\n",
" .optimus_table td {\n",
" padding: 2px;\n",
" border-left: 1px solid #cccccc;\n",
" border-right: 1px solid #cccccc;\n",
" }\n",
"\n",
" .optimus_table tr:nth-child(even) {\n",
" background-color: #f2f2f2 !important;\n",
" }\n",
"\n",
" .optimus_table tr:nth-child(odd) {\n",
" background-color: #ffffff !important;\n",
" }\n",
"\n",
" .optimus_table thead {\n",
" border-bottom: 1px solid black;\n",
" }\n",
" .optimus_table{\n",
" font-size: 12px;\n",
" }\n",
"\n",
" .optimus_table tbody{\n",
" font-family: monospace;\n",
" border-bottom: 1px solid #cccccc;\n",
" }\n",
"\n",
"\n",
"</style>\n",
"\n",
"\n",
"\n",
"\n",
"<div class=\"info_items\">Viewing 3 of 10 rows / 2 columns</div>\n",
"<div class=\"info_items\">1 partition(s)</div>\n",
"\n",
"<table class=\"optimus_table\">\n",
" <thead>\n",
" <tr>\n",
" \n",
" <th>\n",
" <div class=\"column_name\">subject_id</div>\n",
" <div class=\"data_type\">1 (string)</div>\n",
" <div class=\"data_type\">\n",
" \n",
" nullable\n",
" \n",
" </div>\n",
" </th>\n",
" \n",
" <th>\n",
" <div class=\"column_name\">test_id</div>\n",
" <div class=\"data_type\">2 (bigint)</div>\n",
" <div class=\"data_type\">\n",
" \n",
" nullable\n",
" \n",
" </div>\n",
" </th>\n",
" \n",
" </tr>\n",
"\n",
" </thead>\n",
" <tbody>\n",
" \n",
" <tr>\n",
" \n",
" <td>\n",
" <div title='1'>1</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='51'>51</div>\n",
" </td>\n",
" \n",
" </tr>\n",
" \n",
" <tr>\n",
" \n",
" <td>\n",
" <div title='2'>2</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='15'>15</div>\n",
" </td>\n",
" \n",
" </tr>\n",
" \n",
" <tr>\n",
" \n",
" <td>\n",
" <div title='3'>3</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='15'>15</div>\n",
" </td>\n",
" \n",
" </tr>\n",
" \n",
" </tbody>\n",
"</table>\n",
"\n",
"<div class=\"info_items\">Viewing 3 of 10 rows / 2 columns</div>\n",
"<div class=\"info_items\">1 partition(s)</div>\n"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"data_3.table(3)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Join the two dataframes along rows and assign all_data"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"all_data = data_1.union(data_2)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<style>\n",
" .data_type {\n",
" font-size: 0.8em;\n",
" font-weight: normal;\n",
" }\n",
"\n",
" .column_name {\n",
" font-size: 1.2em;\n",
" }\n",
"\n",
" .info_items {\n",
" margin: 10px 0;\n",
" font-size: 0.8em;\n",
" }\n",
"\n",
" .optimus_table td {\n",
" padding: 2px;\n",
" border-left: 1px solid #cccccc;\n",
" border-right: 1px solid #cccccc;\n",
" }\n",
"\n",
" .optimus_table tr:nth-child(even) {\n",
" background-color: #f2f2f2 !important;\n",
" }\n",
"\n",
" .optimus_table tr:nth-child(odd) {\n",
" background-color: #ffffff !important;\n",
" }\n",
"\n",
" .optimus_table thead {\n",
" border-bottom: 1px solid black;\n",
" }\n",
" .optimus_table{\n",
" font-size: 12px;\n",
" }\n",
"\n",
" .optimus_table tbody{\n",
" font-family: monospace;\n",
" border-bottom: 1px solid #cccccc;\n",
" }\n",
"\n",
"\n",
"</style>\n",
"\n",
"\n",
"\n",
"\n",
"<div class=\"info_items\">Viewing 10 of 10 rows / 3 columns</div>\n",
"<div class=\"info_items\">2 partition(s)</div>\n",
"\n",
"<table class=\"optimus_table\">\n",
" <thead>\n",
" <tr>\n",
" \n",
" <th>\n",
" <div class=\"column_name\">subject_id</div>\n",
" <div class=\"data_type\">1 (string)</div>\n",
" <div class=\"data_type\">\n",
" \n",
" nullable\n",
" \n",
" </div>\n",
" </th>\n",
" \n",
" <th>\n",
" <div class=\"column_name\">first_name</div>\n",
" <div class=\"data_type\">2 (string)</div>\n",
" <div class=\"data_type\">\n",
" \n",
" nullable\n",
" \n",
" </div>\n",
" </th>\n",
" \n",
" <th>\n",
" <div class=\"column_name\">last_name</div>\n",
" <div class=\"data_type\">3 (string)</div>\n",
" <div class=\"data_type\">\n",
" \n",
" nullable\n",
" \n",
" </div>\n",
" </th>\n",
" \n",
" </tr>\n",
"\n",
" </thead>\n",
" <tbody>\n",
" \n",
" <tr>\n",
" \n",
" <td>\n",
" <div title='1'>1</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='Alex'>Alex</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='Anderson'>Anderson</div>\n",
" </td>\n",
" \n",
" </tr>\n",
" \n",
" <tr>\n",
" \n",
" <td>\n",
" <div title='2'>2</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='Amy'>Amy</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='Ackerman'>Ackerman</div>\n",
" </td>\n",
" \n",
" </tr>\n",
" \n",
" <tr>\n",
" \n",
" <td>\n",
" <div title='3'>3</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='Allen'>Allen</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='Ali'>Ali</div>\n",
" </td>\n",
" \n",
" </tr>\n",
" \n",
" <tr>\n",
" \n",
" <td>\n",
" <div title='4'>4</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='Alice'>Alice</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='Aoni'>Aoni</div>\n",
" </td>\n",
" \n",
" </tr>\n",
" \n",
" <tr>\n",
" \n",
" <td>\n",
" <div title='5'>5</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='Ayoung'>Ayoung</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='Atiches'>Atiches</div>\n",
" </td>\n",
" \n",
" </tr>\n",
" \n",
" <tr>\n",
" \n",
" <td>\n",
" <div title='4'>4</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='Billy'>Billy</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='Bonder'>Bonder</div>\n",
" </td>\n",
" \n",
" </tr>\n",
" \n",
" <tr>\n",
" \n",
" <td>\n",
" <div title='5'>5</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='Brian'>Brian</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='Black'>Black</div>\n",
" </td>\n",
" \n",
" </tr>\n",
" \n",
" <tr>\n",
" \n",
" <td>\n",
" <div title='6'>6</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='Bran'>Bran</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='Balwner'>Balwner</div>\n",
" </td>\n",
" \n",
" </tr>\n",
" \n",
" <tr>\n",
" \n",
" <td>\n",
" <div title='7'>7</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='Bryce'>Bryce</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='Brice'>Brice</div>\n",
" </td>\n",
" \n",
" </tr>\n",
" \n",
" <tr>\n",
" \n",
" <td>\n",
" <div title='8'>8</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='Betty'>Betty</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='Btisan'>Btisan</div>\n",
" </td>\n",
" \n",
" </tr>\n",
" \n",
" </tbody>\n",
"</table>\n",
"\n",
"<div class=\"info_items\">Viewing 10 of 10 rows / 3 columns</div>\n",
"<div class=\"info_items\">2 partition(s)</div>\n"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"all_data.table()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Join the two dataframes along columns and assing to all_data_col"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The Spark way of doing this isn't great, check below for Optimus version"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"from pyspark.sql.functions import monotonically_increasing_id"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"# This is not that pretty\n",
"data_1_id = data_1.withColumn(\"id\", monotonically_increasing_id())\n",
"data_2_id = data_2.withColumn(\"id\", monotonically_increasing_id())\n",
"\n",
"all_data_col = data_1_id.join(data_2_id, \"id\", \"outer\").select(data_1_id.subject_id, data_1_id.first_name, \n",
" data_1_id.last_name, \n",
" data_2_id.subject_id.alias(\"subject_id_df2\"), \n",
" data_2_id.first_name.alias(\"first_name_df2\"), \n",
" data_2_id.last_name.alias(\"last_name_df2\")).drop(\"id\")"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<style>\n",
" .data_type {\n",
" font-size: 0.8em;\n",
" font-weight: normal;\n",
" }\n",
"\n",
" .column_name {\n",
" font-size: 1.2em;\n",
" }\n",
"\n",
" .info_items {\n",
" margin: 10px 0;\n",
" font-size: 0.8em;\n",
" }\n",
"\n",
" .optimus_table td {\n",
" padding: 2px;\n",
" border-left: 1px solid #cccccc;\n",
" border-right: 1px solid #cccccc;\n",
" }\n",
"\n",
" .optimus_table tr:nth-child(even) {\n",
" background-color: #f2f2f2 !important;\n",
" }\n",
"\n",
" .optimus_table tr:nth-child(odd) {\n",
" background-color: #ffffff !important;\n",
" }\n",
"\n",
" .optimus_table thead {\n",
" border-bottom: 1px solid black;\n",
" }\n",
" .optimus_table{\n",
" font-size: 12px;\n",
" }\n",
"\n",
" .optimus_table tbody{\n",
" font-family: monospace;\n",
" border-bottom: 1px solid #cccccc;\n",
" }\n",
"\n",
"\n",
"</style>\n",
"\n",
"\n",
"\n",
"\n",
"<div class=\"info_items\">Viewing 5 of 5 rows / 6 columns</div>\n",
"<div class=\"info_items\">200 partition(s)</div>\n",
"\n",
"<table class=\"optimus_table\">\n",
" <thead>\n",
" <tr>\n",
" \n",
" <th>\n",
" <div class=\"column_name\">subject_id</div>\n",
" <div class=\"data_type\">1 (string)</div>\n",
" <div class=\"data_type\">\n",
" \n",
" nullable\n",
" \n",
" </div>\n",
" </th>\n",
" \n",
" <th>\n",
" <div class=\"column_name\">first_name</div>\n",
" <div class=\"data_type\">2 (string)</div>\n",
" <div class=\"data_type\">\n",
" \n",
" nullable\n",
" \n",
" </div>\n",
" </th>\n",
" \n",
" <th>\n",
" <div class=\"column_name\">last_name</div>\n",
" <div class=\"data_type\">3 (string)</div>\n",
" <div class=\"data_type\">\n",
" \n",
" nullable\n",
" \n",
" </div>\n",
" </th>\n",
" \n",
" <th>\n",
" <div class=\"column_name\">subject_id_df2</div>\n",
" <div class=\"data_type\">4 (string)</div>\n",
" <div class=\"data_type\">\n",
" \n",
" nullable\n",
" \n",
" </div>\n",
" </th>\n",
" \n",
" <th>\n",
" <div class=\"column_name\">first_name_df2</div>\n",
" <div class=\"data_type\">5 (string)</div>\n",
" <div class=\"data_type\">\n",
" \n",
" nullable\n",
" \n",
" </div>\n",
" </th>\n",
" \n",
" <th>\n",
" <div class=\"column_name\">last_name_df2</div>\n",
" <div class=\"data_type\">6 (string)</div>\n",
" <div class=\"data_type\">\n",
" \n",
" nullable\n",
" \n",
" </div>\n",
" </th>\n",
" \n",
" </tr>\n",
"\n",
" </thead>\n",
" <tbody>\n",
" \n",
" <tr>\n",
" \n",
" <td>\n",
" <div title='1'>1</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='Alex'>Alex</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='Anderson'>Anderson</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='4'>4</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='Billy'>Billy</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='Bonder'>Bonder</div>\n",
" </td>\n",
" \n",
" </tr>\n",
" \n",
" <tr>\n",
" \n",
" <td>\n",
" <div title='2'>2</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='Amy'>Amy</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='Ackerman'>Ackerman</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='5'>5</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='Brian'>Brian</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='Black'>Black</div>\n",
" </td>\n",
" \n",
" </tr>\n",
" \n",
" <tr>\n",
" \n",
" <td>\n",
" <div title='4'>4</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='Alice'>Alice</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='Aoni'>Aoni</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='7'>7</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='Bryce'>Bryce</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='Brice'>Brice</div>\n",
" </td>\n",
" \n",
" </tr>\n",
" \n",
" <tr>\n",
" \n",
" <td>\n",
" <div title='3'>3</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='Allen'>Allen</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='Ali'>Ali</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='6'>6</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='Bran'>Bran</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='Balwner'>Balwner</div>\n",
" </td>\n",
" \n",
" </tr>\n",
" \n",
" <tr>\n",
" \n",
" <td>\n",
" <div title='5'>5</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='Ayoung'>Ayoung</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='Atiches'>Atiches</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='8'>8</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='Betty'>Betty</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='Btisan'>Btisan</div>\n",
" </td>\n",
" \n",
" </tr>\n",
" \n",
" </tbody>\n",
"</table>\n",
"\n",
"<div class=\"info_items\">Viewing 5 of 5 rows / 6 columns</div>\n",
"<div class=\"info_items\">200 partition(s)</div>\n"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"all_data_col.table()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"What happened above is that Spark needs a unique identifier for each column, so you need to rename the first or the second set of columns for it to work. I know, not great, not terrible. But with Optimus is much simpler:"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+----------+----------+---------+----------+----------+---------+\n",
"|subject_id|first_name|last_name|subject_id|first_name|last_name|\n",
"+----------+----------+---------+----------+----------+---------+\n",
"| 1| Alex| Anderson| 4| Billy| Bonder|\n",
"| 2| Amy| Ackerman| 5| Brian| Black|\n",
"| 4| Alice| Aoni| 7| Bryce| Brice|\n",
"| 3| Allen| Ali| 6| Bran| Balwner|\n",
"| 5| Ayoung| Atiches| 8| Betty| Btisan|\n",
"+----------+----------+---------+----------+----------+---------+\n",
"\n"
]
}
],
"source": [
"data_1.cols.append([data_2]).show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Better :)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Merge all_data and data_3 along the subject_id value"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<style>\n",
" .data_type {\n",
" font-size: 0.8em;\n",
" font-weight: normal;\n",
" }\n",
"\n",
" .column_name {\n",
" font-size: 1.2em;\n",
" }\n",
"\n",
" .info_items {\n",
" margin: 10px 0;\n",
" font-size: 0.8em;\n",
" }\n",
"\n",
" .optimus_table td {\n",
" padding: 2px;\n",
" border-left: 1px solid #cccccc;\n",
" border-right: 1px solid #cccccc;\n",
" }\n",
"\n",
" .optimus_table tr:nth-child(even) {\n",
" background-color: #f2f2f2 !important;\n",
" }\n",
"\n",
" .optimus_table tr:nth-child(odd) {\n",
" background-color: #ffffff !important;\n",
" }\n",
"\n",
" .optimus_table thead {\n",
" border-bottom: 1px solid black;\n",
" }\n",
" .optimus_table{\n",
" font-size: 12px;\n",
" }\n",
"\n",
" .optimus_table tbody{\n",
" font-family: monospace;\n",
" border-bottom: 1px solid #cccccc;\n",
" }\n",
"\n",
"\n",
"</style>\n",
"\n",
"\n",
"\n",
"\n",
"<div class=\"info_items\">Viewing 9 of 9 rows / 4 columns</div>\n",
"<div class=\"info_items\">200 partition(s)</div>\n",
"\n",
"<table class=\"optimus_table\">\n",
" <thead>\n",
" <tr>\n",
" \n",
" <th>\n",
" <div class=\"column_name\">subject_id</div>\n",
" <div class=\"data_type\">1 (string)</div>\n",
" <div class=\"data_type\">\n",
" \n",
" nullable\n",
" \n",
" </div>\n",
" </th>\n",
" \n",
" <th>\n",
" <div class=\"column_name\">first_name</div>\n",
" <div class=\"data_type\">2 (string)</div>\n",
" <div class=\"data_type\">\n",
" \n",
" nullable\n",
" \n",
" </div>\n",
" </th>\n",
" \n",
" <th>\n",
" <div class=\"column_name\">last_name</div>\n",
" <div class=\"data_type\">3 (string)</div>\n",
" <div class=\"data_type\">\n",
" \n",
" nullable\n",
" \n",
" </div>\n",
" </th>\n",
" \n",
" <th>\n",
" <div class=\"column_name\">test_id</div>\n",
" <div class=\"data_type\">4 (bigint)</div>\n",
" <div class=\"data_type\">\n",
" \n",
" nullable\n",
" \n",
" </div>\n",
" </th>\n",
" \n",
" </tr>\n",
"\n",
" </thead>\n",
" <tbody>\n",
" \n",
" <tr>\n",
" \n",
" <td>\n",
" <div title='7'>7</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='Bryce'>Bryce</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='Brice'>Brice</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='14'>14</div>\n",
" </td>\n",
" \n",
" </tr>\n",
" \n",
" <tr>\n",
" \n",
" <td>\n",
" <div title='3'>3</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='Allen'>Allen</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='Ali'>Ali</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='15'>15</div>\n",
" </td>\n",
" \n",
" </tr>\n",
" \n",
" <tr>\n",
" \n",
" <td>\n",
" <div title='8'>8</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='Betty'>Betty</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='Btisan'>Btisan</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='15'>15</div>\n",
" </td>\n",
" \n",
" </tr>\n",
" \n",
" <tr>\n",
" \n",
" <td>\n",
" <div title='5'>5</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='Ayoung'>Ayoung</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='Atiches'>Atiches</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='16'>16</div>\n",
" </td>\n",
" \n",
" </tr>\n",
" \n",
" <tr>\n",
" \n",
" <td>\n",
" <div title='5'>5</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='Brian'>Brian</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='Black'>Black</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='16'>16</div>\n",
" </td>\n",
" \n",
" </tr>\n",
" \n",
" <tr>\n",
" \n",
" <td>\n",
" <div title='1'>1</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='Alex'>Alex</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='Anderson'>Anderson</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='51'>51</div>\n",
" </td>\n",
" \n",
" </tr>\n",
" \n",
" <tr>\n",
" \n",
" <td>\n",
" <div title='4'>4</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='Alice'>Alice</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='Aoni'>Aoni</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='61'>61</div>\n",
" </td>\n",
" \n",
" </tr>\n",
" \n",
" <tr>\n",
" \n",
" <td>\n",
" <div title='4'>4</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='Billy'>Billy</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='Bonder'>Bonder</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='61'>61</div>\n",
" </td>\n",
" \n",
" </tr>\n",
" \n",
" <tr>\n",
" \n",
" <td>\n",
" <div title='2'>2</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='Amy'>Amy</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='Ackerman'>Ackerman</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='15'>15</div>\n",
" </td>\n",
" \n",
" </tr>\n",
" \n",
" </tbody>\n",
"</table>\n",
"\n",
"<div class=\"info_items\">Viewing 9 of 9 rows / 4 columns</div>\n",
"<div class=\"info_items\">200 partition(s)</div>\n"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"all_data.join(data_3, on=\"subject_id\").table()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Merge only the data that has the same 'subject_id' on both data_1 and data_2"
]
},
{
"cell_type": "code",
"execution_count": 53,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+----------+----------+---------+----------+----------+---------+\n",
"|subject_id|first_name|last_name|subject_id|first_name|last_name|\n",
"+----------+----------+---------+----------+----------+---------+\n",
"| 5| Ayoung| Atiches| 5| Brian| Black|\n",
"| 4| Alice| Aoni| 4| Billy| Bonder|\n",
"+----------+----------+---------+----------+----------+---------+\n",
"\n"
]
}
],
"source": [
"data_1.alias(\"a\").join(data_2.alias(\"b\"), \\\n",
" on= col(\"a.subject_id\") == col(\"b.subject_id\"), \\\n",
" how= \"inner\").show() "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Here the trick is to create an alias for each dataframe, otherwise Spark will complain."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Merge all values in data1 and data2, with matching records from both sides where available."
]
},
{
"cell_type": "code",
"execution_count": 54,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+----------+----------+---------+----------+----------+---------+\n",
"|subject_id|first_name|last_name|subject_id|first_name|last_name|\n",
"+----------+----------+---------+----------+----------+---------+\n",
"| null| null| null| 7| Bryce| Brice|\n",
"| 3| Allen| Ali| null| null| null|\n",
"| null| null| null| 8| Betty| Btisan|\n",
"| 5| Ayoung| Atiches| 5| Brian| Black|\n",
"| null| null| null| 6| Bran| Balwner|\n",
"| 1| Alex| Anderson| null| null| null|\n",
"| 4| Alice| Aoni| 4| Billy| Bonder|\n",
"| 2| Amy| Ackerman| null| null| null|\n",
"+----------+----------+---------+----------+----------+---------+\n",
"\n"
]
}
],
"source": [
"data_1.alias(\"a\").join(data_2.alias(\"b\"), \\\n",
" on= col(\"a.subject_id\") == col(\"b.subject_id\"), \\\n",
" how= \"outer\").show() "
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment