vvgsrk · May 19, 2021 19:23
diff --git a/HR_Data_Employee_CSV_Parse_Flatten_JSON_Columns_Function.ipynb b/HR_Data_Employee_CSV_Parse_Flatten_JSON_Columns_Function.ipynb
 {
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "First of all, Import necessary modules"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import json\n",
    "import numpy as np\n",
    "from ast import literal_eval"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Set jupyter notebook row and column display"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Set ipython's max row display\n",
    "pd.set_option('display.max_row', 100)\n",
    "\n",
    "# Set iPython's max column width to 50\n",
    "pd.set_option('display.max_columns', 70)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Declare read schema to use while reading the CSV using pandas"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "employees_schema = {\n",
    "    'employee_id' : 'Int64',\n",
    "    'first_name' : 'str',\n",
    "    'last_name' : 'str',\n",
    "    'email' : 'str',\n",
    "    'offer_id' : 'str',\n",
    "    'phone_number' : 'str',\n",
    "    'hire_date' : 'str',\n",
    "    'job_id' : 'str',\n",
    "    'salary' : 'float64',\n",
    "    'commission_pct' : 'float64',\n",
    "    'manager_id' : 'Int64',\n",
    "    'department_id' : 'Int64'\n",
    "}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Declare all date columns in a list"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "employees_date_cols = ['hire_date']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Dclare all JSON columns in a list"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "employees_json_columns = ['custom_data']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Declare expected schema of the JSON "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "employees_custom_data_schema = {\n",
    "    'custom_data_address_line_1': [],\n",
    "    'custom_data_address_line_2': [],\n",
    "    'custom_data_zip': [],\n",
    "    'custom_data_date_of_birth': [],\n",
    "    'custom_data_gender': [],\n",
    "    'custom_data_marital_status': []\n",
    "}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Read CSV using pandas library and specify file path, data types and parse dates parameters"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "raw_df = pd.read_csv(filepath_or_buffer='employees.csv', dtype=employees_schema, parse_dates=employees_date_cols)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "See first 5 rows using below statement"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>employee_id</th>\n",
       "      <th>first_name</th>\n",
       "      <th>last_name</th>\n",
       "      <th>email</th>\n",
       "      <th>phone_number</th>\n",
       "      <th>hire_date</th>\n",
       "      <th>job_id</th>\n",
       "      <th>salary</th>\n",
       "      <th>commission_pct</th>\n",
       "      <th>manager_id</th>\n",
       "      <th>department_id</th>\n",
       "      <th>custom_data</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>100</td>\n",
       "      <td>Steven</td>\n",
       "      <td>King</td>\n",
       "      <td>SKING</td>\n",
       "      <td>515.123.4567</td>\n",
       "      <td>1987-06-17</td>\n",
       "      <td>AD_PRES</td>\n",
       "      <td>24000.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>90</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>101</td>\n",
       "      <td>Neena</td>\n",
       "      <td>Kochhar</td>\n",
       "      <td>NKOCHHAR</td>\n",
       "      <td>515.123.4568</td>\n",
       "      <td>1989-09-21</td>\n",
       "      <td>AD_VP</td>\n",
       "      <td>17000.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>100</td>\n",
       "      <td>90</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>102</td>\n",
       "      <td>Lex</td>\n",
       "      <td>De Haan</td>\n",
       "      <td>LDEHAAN</td>\n",
       "      <td>515.123.4569</td>\n",
       "      <td>1993-01-13</td>\n",
       "      <td>AD_VP</td>\n",
       "      <td>17000.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>100</td>\n",
       "      <td>90</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>103</td>\n",
       "      <td>Alexander</td>\n",
       "      <td>Hunold</td>\n",
       "      <td>AHUNOLD</td>\n",
       "      <td>590.423.4567</td>\n",
       "      <td>1990-01-03</td>\n",
       "      <td>IT_PROG</td>\n",
       "      <td>9000.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>102</td>\n",
       "      <td>60</td>\n",
       "      <td>{\"address_line_1\":\"6649 N Blue Gum St\",\"addres...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>104</td>\n",
       "      <td>Bruce</td>\n",
       "      <td>Ernst</td>\n",
       "      <td>BERNST</td>\n",
       "      <td>590.423.4568</td>\n",
       "      <td>1991-05-21</td>\n",
       "      <td>IT_PROG</td>\n",
       "      <td>6000.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>103</td>\n",
       "      <td>60</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   employee_id first_name last_name     email  phone_number  hire_date  \\\n",
       "0          100     Steven      King     SKING  515.123.4567 1987-06-17   \n",
       "1          101      Neena   Kochhar  NKOCHHAR  515.123.4568 1989-09-21   \n",
       "2          102        Lex   De Haan   LDEHAAN  515.123.4569 1993-01-13   \n",
       "3          103  Alexander    Hunold   AHUNOLD  590.423.4567 1990-01-03   \n",
       "4          104      Bruce     Ernst    BERNST  590.423.4568 1991-05-21   \n",
       "\n",
       "    job_id   salary  commission_pct  manager_id  department_id  \\\n",
       "0  AD_PRES  24000.0             NaN        <NA>             90   \n",
       "1    AD_VP  17000.0             NaN         100             90   \n",
       "2    AD_VP  17000.0             NaN         100             90   \n",
       "3  IT_PROG   9000.0             NaN         102             60   \n",
       "4  IT_PROG   6000.0             NaN         103             60   \n",
       "\n",
       "                                         custom_data  \n",
       "0                                                NaN  \n",
       "1                                                NaN  \n",
       "2                                                NaN  \n",
       "3  {\"address_line_1\":\"6649 N Blue Gum St\",\"addres...  \n",
       "4                                                NaN  "
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "raw_df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Veriy the data frame data types "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "employee_id                Int64\n",
       "first_name                object\n",
       "last_name                 object\n",
       "email                     object\n",
       "phone_number              object\n",
       "hire_date         datetime64[ns]\n",
       "job_id                    object\n",
       "salary                   float64\n",
       "commission_pct           float64\n",
       "manager_id                 Int64\n",
       "department_id              Int64\n",
       "custom_data               object\n",
       "dtype: object"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "raw_df.dtypes"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Create a function to flattern JSON to a structured data frame"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "def flatten_json_columns(df, json_cols, custom_df):\n",
    "    \"\"\"\n",
    "    This function flattens JSON columns to individual columns\n",
    "    It merges the flattened dataframe with expected dataframe to capture missing columns from JSON\n",
    "    :param df: Employees CSV raw dataframe\n",
    "    :param json_cols: custom data columns in employees CSV\n",
    "    :param custom_df: expected dataframe\n",
    "    :return: returns df pandas dataframe\n",
    "    \"\"\"\n",
    "\n",
    "    # Loop through all JSON columns\n",
    "    for column in json_cols:\n",
    "        if not df[column].isnull().all():\n",
    "            # create a temp col to preserve the orginal data\n",
    "            df['custom_data_temp'] = df[column]\n",
    "            # Replace None and NaN with empty braces\n",
    "            df[column].fillna(value='{}', inplace=True)\n",
    "            try:\n",
    "                #Deserialize's a str instance containing a JSON document to a Python object\n",
    "                df[column] = [json.loads(row, strict=False) for row in df[column]]           \n",
    "            except TypeError:\n",
    "                # Convert all values to string using literal eval\n",
    "                df[column] = df[column].apply(lambda x: literal_eval(str(x)))\n",
    "            # Normalize semi-structured JSON data into a flat table\n",
    "            column_as_df = pd.json_normalize(df[column])\n",
    "            # Extract main column name and attach it to each sub column name\n",
    "            column_as_df.columns = [f\"{column}_{subcolumn}\" for subcolumn in column_as_df.columns]\n",
    "            # Replace empty strings with None\n",
    "            column_as_df.replace('', np.nan, inplace=True)\n",
    "            # Replace orginal data with temp data\n",
    "            df[column] = df['custom_data_temp']\n",
    "            # Merge extracted result from custom_data field with expected fields\n",
    "            result_df = pd.merge(column_as_df, custom_df, how='left')\n",
    "            # Drop the temp column and merge the flattened dataframe with orginal dataframe\n",
    "            df = df.drop('custom_data_temp', axis=1).merge(result_df, right_index=True, left_index=True)\n",
    "        else:\n",
    "            df = pd.concat([df, custom_df], axis=1)\n",
    "\n",
    "    # Return dataframe with flatten columns\n",
    "    return df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Create expected dataframe from the dictonary"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "custom_data_expected_df=pd.DataFrame.from_dict(employees_custom_data_schema).astype(str, copy=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "View the empty data frame"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>custom_data_address_line_1</th>\n",
       "      <th>custom_data_address_line_2</th>\n",
       "      <th>custom_data_zip</th>\n",
       "      <th>custom_data_date_of_birth</th>\n",
       "      <th>custom_data_gender</th>\n",
       "      <th>custom_data_marital_status</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "Empty DataFrame\n",
       "Columns: [custom_data_address_line_1, custom_data_address_line_2, custom_data_zip, custom_data_date_of_birth, custom_data_gender, custom_data_marital_status]\n",
       "Index: []"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "custom_data_expected_df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Verify the data frame data types"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "custom_data_address_line_1    object\n",
       "custom_data_address_line_2    object\n",
       "custom_data_zip               object\n",
       "custom_data_date_of_birth     object\n",
       "custom_data_gender            object\n",
       "custom_data_marital_status    object\n",
       "dtype: object"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "custom_data_expected_df.dtypes"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Invoke flatten function by passing employee raw data frame, json column and expected data frame"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "flattened_df = flatten_json_columns(df=raw_df,\n",
    "                                 json_cols=employees_json_columns,\n",
    "                                 custom_df=custom_data_expected_df)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Verify the data frame data types"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "employee_id                            Int64\n",
       "first_name                            object\n",
       "last_name                             object\n",
       "email                                 object\n",
       "phone_number                          object\n",
       "hire_date                     datetime64[ns]\n",
       "job_id                                object\n",
       "salary                               float64\n",
       "commission_pct                       float64\n",
       "manager_id                             Int64\n",
       "department_id                          Int64\n",
       "custom_data                           object\n",
       "custom_data_address_line_1            object\n",
       "custom_data_address_line_2            object\n",
       "custom_data_zip                       object\n",
       "custom_data_date_of_birth             object\n",
       "custom_data_gender                    object\n",
       "custom_data_marital_status            object\n",
       "dtype: object"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "flattened_df.dtypes"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Verify data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>employee_id</th>\n",
       "      <th>first_name</th>\n",
       "      <th>last_name</th>\n",
       "      <th>email</th>\n",
       "      <th>phone_number</th>\n",
       "      <th>hire_date</th>\n",
       "      <th>job_id</th>\n",
       "      <th>salary</th>\n",
       "      <th>commission_pct</th>\n",
       "      <th>manager_id</th>\n",
       "      <th>department_id</th>\n",
       "      <th>custom_data</th>\n",
       "      <th>custom_data_address_line_1</th>\n",
       "      <th>custom_data_address_line_2</th>\n",
       "      <th>custom_data_zip</th>\n",
       "      <th>custom_data_date_of_birth</th>\n",
       "      <th>custom_data_gender</th>\n",
       "      <th>custom_data_marital_status</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>100</td>\n",
       "      <td>Steven</td>\n",
       "      <td>King</td>\n",
       "      <td>SKING</td>\n",
       "      <td>515.123.4567</td>\n",
       "      <td>1987-06-17</td>\n",
       "      <td>AD_PRES</td>\n",
       "      <td>24000.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>90</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>101</td>\n",
       "      <td>Neena</td>\n",
       "      <td>Kochhar</td>\n",
       "      <td>NKOCHHAR</td>\n",
       "      <td>515.123.4568</td>\n",
       "      <td>1989-09-21</td>\n",
       "      <td>AD_VP</td>\n",
       "      <td>17000.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>100</td>\n",
       "      <td>90</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>102</td>\n",
       "      <td>Lex</td>\n",
       "      <td>De Haan</td>\n",
       "      <td>LDEHAAN</td>\n",
       "      <td>515.123.4569</td>\n",
       "      <td>1993-01-13</td>\n",
       "      <td>AD_VP</td>\n",
       "      <td>17000.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>100</td>\n",
       "      <td>90</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>103</td>\n",
       "      <td>Alexander</td>\n",
       "      <td>Hunold</td>\n",
       "      <td>AHUNOLD</td>\n",
       "      <td>590.423.4567</td>\n",
       "      <td>1990-01-03</td>\n",
       "      <td>IT_PROG</td>\n",
       "      <td>9000.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>102</td>\n",
       "      <td>60</td>\n",
       "      <td>{\"address_line_1\":\"6649 N Blue Gum St\",\"addres...</td>\n",
       "      <td>6649 N Blue Gum St</td>\n",
       "      <td>C 22</td>\n",
       "      <td>70116</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>104</td>\n",
       "      <td>Bruce</td>\n",
       "      <td>Ernst</td>\n",
       "      <td>BERNST</td>\n",
       "      <td>590.423.4568</td>\n",
       "      <td>1991-05-21</td>\n",
       "      <td>IT_PROG</td>\n",
       "      <td>6000.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>103</td>\n",
       "      <td>60</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>102</th>\n",
       "      <td>202</td>\n",
       "      <td>Pat</td>\n",
       "      <td>Fay</td>\n",
       "      <td>PFAY</td>\n",
       "      <td>603.123.6666</td>\n",
       "      <td>1997-08-17</td>\n",
       "      <td>MK_REP</td>\n",
       "      <td>6000.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>201</td>\n",
       "      <td>20</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>103</th>\n",
       "      <td>203</td>\n",
       "      <td>Susan</td>\n",
       "      <td>Mavris</td>\n",
       "      <td>SMAVRIS</td>\n",
       "      <td>515.123.7777</td>\n",
       "      <td>1994-06-07</td>\n",
       "      <td>HR_REP</td>\n",
       "      <td>6500.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>101</td>\n",
       "      <td>40</td>\n",
       "      <td>{\"address_line_1\":\"2371 Jerrold Ave\",\"address_...</td>\n",
       "      <td>2371 Jerrold Ave</td>\n",
       "      <td>Block D</td>\n",
       "      <td>19443</td>\n",
       "      <td>1954-06-07</td>\n",
       "      <td>F</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>104</th>\n",
       "      <td>204</td>\n",
       "      <td>Hermann</td>\n",
       "      <td>Baer</td>\n",
       "      <td>HBAER</td>\n",
       "      <td>515.123.8888</td>\n",
       "      <td>1994-06-07</td>\n",
       "      <td>PR_REP</td>\n",
       "      <td>10000.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>101</td>\n",
       "      <td>70</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>105</th>\n",
       "      <td>205</td>\n",
       "      <td>Shelley</td>\n",
       "      <td>Higgins</td>\n",
       "      <td>SHIGGINS</td>\n",
       "      <td>515.123.8080</td>\n",
       "      <td>1994-06-07</td>\n",
       "      <td>AC_MGR</td>\n",
       "      <td>12000.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>101</td>\n",
       "      <td>110</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>106</th>\n",
       "      <td>206</td>\n",
       "      <td>William</td>\n",
       "      <td>Gietz</td>\n",
       "      <td>WGIETZ</td>\n",
       "      <td>515.123.8181</td>\n",
       "      <td>1994-06-07</td>\n",
       "      <td>AC_ACCOUNT</td>\n",
       "      <td>8300.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>205</td>\n",
       "      <td>110</td>\n",
       "      <td>{\"address_line_1\":\"37275 St Rt 17m M\",\"address...</td>\n",
       "      <td>37275 St Rt 17m M</td>\n",
       "      <td>Block C</td>\n",
       "      <td>11953</td>\n",
       "      <td>1964-01-07</td>\n",
       "      <td>M</td>\n",
       "      <td>Married</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>107 rows × 18 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "     employee_id first_name last_name     email  phone_number  hire_date  \\\n",
       "0            100     Steven      King     SKING  515.123.4567 1987-06-17   \n",
       "1            101      Neena   Kochhar  NKOCHHAR  515.123.4568 1989-09-21   \n",
       "2            102        Lex   De Haan   LDEHAAN  515.123.4569 1993-01-13   \n",
       "3            103  Alexander    Hunold   AHUNOLD  590.423.4567 1990-01-03   \n",
       "4            104      Bruce     Ernst    BERNST  590.423.4568 1991-05-21   \n",
       "..           ...        ...       ...       ...           ...        ...   \n",
       "102          202        Pat       Fay      PFAY  603.123.6666 1997-08-17   \n",
       "103          203      Susan    Mavris   SMAVRIS  515.123.7777 1994-06-07   \n",
       "104          204    Hermann      Baer     HBAER  515.123.8888 1994-06-07   \n",
       "105          205    Shelley   Higgins  SHIGGINS  515.123.8080 1994-06-07   \n",
       "106          206    William     Gietz    WGIETZ  515.123.8181 1994-06-07   \n",
       "\n",
       "         job_id   salary  commission_pct  manager_id  department_id  \\\n",
       "0       AD_PRES  24000.0             NaN        <NA>             90   \n",
       "1         AD_VP  17000.0             NaN         100             90   \n",
       "2         AD_VP  17000.0             NaN         100             90   \n",
       "3       IT_PROG   9000.0             NaN         102             60   \n",
       "4       IT_PROG   6000.0             NaN         103             60   \n",
       "..          ...      ...             ...         ...            ...   \n",
       "102      MK_REP   6000.0             NaN         201             20   \n",
       "103      HR_REP   6500.0             NaN         101             40   \n",
       "104      PR_REP  10000.0             NaN         101             70   \n",
       "105      AC_MGR  12000.0             NaN         101            110   \n",
       "106  AC_ACCOUNT   8300.0             NaN         205            110   \n",
       "\n",
       "                                           custom_data  \\\n",
       "0                                                  NaN   \n",
       "1                                                  NaN   \n",
       "2                                                  NaN   \n",
       "3    {\"address_line_1\":\"6649 N Blue Gum St\",\"addres...   \n",
       "4                                                  NaN   \n",
       "..                                                 ...   \n",
       "102                                                NaN   \n",
       "103  {\"address_line_1\":\"2371 Jerrold Ave\",\"address_...   \n",
       "104                                                NaN   \n",
       "105                                                NaN   \n",
       "106  {\"address_line_1\":\"37275 St Rt 17m M\",\"address...   \n",
       "\n",
       "    custom_data_address_line_1 custom_data_address_line_2 custom_data_zip  \\\n",
       "0                          NaN                        NaN             NaN   \n",
       "1                          NaN                        NaN             NaN   \n",
       "2                          NaN                        NaN             NaN   \n",
       "3           6649 N Blue Gum St                       C 22           70116   \n",
       "4                          NaN                        NaN             NaN   \n",
       "..                         ...                        ...             ...   \n",
       "102                        NaN                        NaN             NaN   \n",
       "103           2371 Jerrold Ave                    Block D           19443   \n",
       "104                        NaN                        NaN             NaN   \n",
       "105                        NaN                        NaN             NaN   \n",
       "106          37275 St Rt 17m M                    Block C           11953   \n",
       "\n",
       "    custom_data_date_of_birth custom_data_gender custom_data_marital_status  \n",
       "0                         NaN                NaN                        NaN  \n",
       "1                         NaN                NaN                        NaN  \n",
       "2                         NaN                NaN                        NaN  \n",
       "3                         NaN                NaN                        NaN  \n",
       "4                         NaN                NaN                        NaN  \n",
       "..                        ...                ...                        ...  \n",
       "102                       NaN                NaN                        NaN  \n",
       "103                1954-06-07                  F                        NaN  \n",
       "104                       NaN                NaN                        NaN  \n",
       "105                       NaN                NaN                        NaN  \n",
       "106                1964-01-07                  M                    Married  \n",
       "\n",
       "[107 rows x 18 columns]"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "flattened_df"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
 }
	{
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"First of all, Import necessary modules"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {},
	"outputs": [],
	"source": [
	"import pandas as pd\n",
	"import json\n",
	"import numpy as np\n",
	"from ast import literal_eval"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Set jupyter notebook row and column display"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Set ipython's max row display\n",
	"pd.set_option('display.max_row', 100)\n",
	"\n",
	"# Set iPython's max column width to 50\n",
	"pd.set_option('display.max_columns', 70)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Declare read schema to use while reading the CSV using pandas"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {},
	"outputs": [],
	"source": [
	"employees_schema = {\n",
	" 'employee_id' : 'Int64',\n",
	" 'first_name' : 'str',\n",
	" 'last_name' : 'str',\n",
	" 'email' : 'str',\n",
	" 'offer_id' : 'str',\n",
	" 'phone_number' : 'str',\n",
	" 'hire_date' : 'str',\n",
	" 'job_id' : 'str',\n",
	" 'salary' : 'float64',\n",
	" 'commission_pct' : 'float64',\n",
	" 'manager_id' : 'Int64',\n",
	" 'department_id' : 'Int64'\n",
	"}"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Declare all date columns in a list"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {},
	"outputs": [],
	"source": [
	"employees_date_cols = ['hire_date']"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Dclare all JSON columns in a list"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"metadata": {},
	"outputs": [],
	"source": [
	"employees_json_columns = ['custom_data']"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Declare expected schema of the JSON "
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"metadata": {},
	"outputs": [],
	"source": [
	"employees_custom_data_schema = {\n",
	" 'custom_data_address_line_1': [],\n",
	" 'custom_data_address_line_2': [],\n",
	" 'custom_data_zip': [],\n",
	" 'custom_data_date_of_birth': [],\n",
	" 'custom_data_gender': [],\n",
	" 'custom_data_marital_status': []\n",
	"}"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Read CSV using pandas library and specify file path, data types and parse dates parameters"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 7,
	"metadata": {},
	"outputs": [],
	"source": [
	"raw_df = pd.read_csv(filepath_or_buffer='employees.csv', dtype=employees_schema, parse_dates=employees_date_cols)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"See first 5 rows using below statement"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 8,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<style scoped>\n",
	" .dataframe tbody tr th:only-of-type {\n",
	" vertical-align: middle;\n",
	" }\n",
	"\n",
	" .dataframe tbody tr th {\n",
	" vertical-align: top;\n",
	" }\n",
	"\n",
	" .dataframe thead th {\n",
	" text-align: right;\n",
	" }\n",
	"</style>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>employee_id</th>\n",
	" <th>first_name</th>\n",
	" <th>last_name</th>\n",
	" <th>email</th>\n",
	" <th>phone_number</th>\n",
	" <th>hire_date</th>\n",
	" <th>job_id</th>\n",
	" <th>salary</th>\n",
	" <th>commission_pct</th>\n",
	" <th>manager_id</th>\n",
	" <th>department_id</th>\n",
	" <th>custom_data</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>0</th>\n",
	" <td>100</td>\n",
	" <td>Steven</td>\n",
	" <td>King</td>\n",
	" <td>SKING</td>\n",
	" <td>515.123.4567</td>\n",
	" <td>1987-06-17</td>\n",
	" <td>AD_PRES</td>\n",
	" <td>24000.0</td>\n",
	" <td>NaN</td>\n",
	" <td><NA></td>\n",
	" <td>90</td>\n",
	" <td>NaN</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1</th>\n",
	" <td>101</td>\n",
	" <td>Neena</td>\n",
	" <td>Kochhar</td>\n",
	" <td>NKOCHHAR</td>\n",
	" <td>515.123.4568</td>\n",
	" <td>1989-09-21</td>\n",
	" <td>AD_VP</td>\n",
	" <td>17000.0</td>\n",
	" <td>NaN</td>\n",
	" <td>100</td>\n",
	" <td>90</td>\n",
	" <td>NaN</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>2</th>\n",
	" <td>102</td>\n",
	" <td>Lex</td>\n",
	" <td>De Haan</td>\n",
	" <td>LDEHAAN</td>\n",
	" <td>515.123.4569</td>\n",
	" <td>1993-01-13</td>\n",
	" <td>AD_VP</td>\n",
	" <td>17000.0</td>\n",
	" <td>NaN</td>\n",
	" <td>100</td>\n",
	" <td>90</td>\n",
	" <td>NaN</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>3</th>\n",
	" <td>103</td>\n",
	" <td>Alexander</td>\n",
	" <td>Hunold</td>\n",
	" <td>AHUNOLD</td>\n",
	" <td>590.423.4567</td>\n",
	" <td>1990-01-03</td>\n",
	" <td>IT_PROG</td>\n",
	" <td>9000.0</td>\n",
	" <td>NaN</td>\n",
	" <td>102</td>\n",
	" <td>60</td>\n",
	" <td>{\"address_line_1\":\"6649 N Blue Gum St\",\"addres...</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>4</th>\n",
	" <td>104</td>\n",
	" <td>Bruce</td>\n",
	" <td>Ernst</td>\n",
	" <td>BERNST</td>\n",
	" <td>590.423.4568</td>\n",
	" <td>1991-05-21</td>\n",
	" <td>IT_PROG</td>\n",
	" <td>6000.0</td>\n",
	" <td>NaN</td>\n",
	" <td>103</td>\n",
	" <td>60</td>\n",
	" <td>NaN</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"text/plain": [
	" employee_id first_name last_name email phone_number hire_date \\\n",
	"0 100 Steven King SKING 515.123.4567 1987-06-17 \n",
	"1 101 Neena Kochhar NKOCHHAR 515.123.4568 1989-09-21 \n",
	"2 102 Lex De Haan LDEHAAN 515.123.4569 1993-01-13 \n",
	"3 103 Alexander Hunold AHUNOLD 590.423.4567 1990-01-03 \n",
	"4 104 Bruce Ernst BERNST 590.423.4568 1991-05-21 \n",
	"\n",
	" job_id salary commission_pct manager_id department_id \\\n",
	"0 AD_PRES 24000.0 NaN <NA> 90 \n",
	"1 AD_VP 17000.0 NaN 100 90 \n",
	"2 AD_VP 17000.0 NaN 100 90 \n",
	"3 IT_PROG 9000.0 NaN 102 60 \n",
	"4 IT_PROG 6000.0 NaN 103 60 \n",
	"\n",
	" custom_data \n",
	"0 NaN \n",
	"1 NaN \n",
	"2 NaN \n",
	"3 {\"address_line_1\":\"6649 N Blue Gum St\",\"addres... \n",
	"4 NaN "
	]
	},
	"execution_count": 8,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"raw_df.head()"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Veriy the data frame data types "
	]
	},
	{
	"cell_type": "code",
	"execution_count": 9,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"employee_id Int64\n",
	"first_name object\n",
	"last_name object\n",
	"email object\n",
	"phone_number object\n",
	"hire_date datetime64[ns]\n",
	"job_id object\n",
	"salary float64\n",
	"commission_pct float64\n",
	"manager_id Int64\n",
	"department_id Int64\n",
	"custom_data object\n",
	"dtype: object"
	]
	},
	"execution_count": 9,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"raw_df.dtypes"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Create a function to flattern JSON to a structured data frame"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 10,
	"metadata": {},
	"outputs": [],
	"source": [
	"def flatten_json_columns(df, json_cols, custom_df):\n",
	" \"\"\"\n",
	" This function flattens JSON columns to individual columns\n",
	" It merges the flattened dataframe with expected dataframe to capture missing columns from JSON\n",
	" :param df: Employees CSV raw dataframe\n",
	" :param json_cols: custom data columns in employees CSV\n",
	" :param custom_df: expected dataframe\n",
	" :return: returns df pandas dataframe\n",
	" \"\"\"\n",
	"\n",
	" # Loop through all JSON columns\n",
	" for column in json_cols:\n",
	" if not df[column].isnull().all():\n",
	" # create a temp col to preserve the orginal data\n",
	" df['custom_data_temp'] = df[column]\n",
	" # Replace None and NaN with empty braces\n",
	" df[column].fillna(value='{}', inplace=True)\n",
	" try:\n",
	" #Deserialize's a str instance containing a JSON document to a Python object\n",
	" df[column] = [json.loads(row, strict=False) for row in df[column]] \n",
	" except TypeError:\n",
	" # Convert all values to string using literal eval\n",
	" df[column] = df[column].apply(lambda x: literal_eval(str(x)))\n",
	" # Normalize semi-structured JSON data into a flat table\n",
	" column_as_df = pd.json_normalize(df[column])\n",
	" # Extract main column name and attach it to each sub column name\n",
	" column_as_df.columns = [f\"{column}_{subcolumn}\" for subcolumn in column_as_df.columns]\n",
	" # Replace empty strings with None\n",
	" column_as_df.replace('', np.nan, inplace=True)\n",
	" # Replace orginal data with temp data\n",
	" df[column] = df['custom_data_temp']\n",
	" # Merge extracted result from custom_data field with expected fields\n",
	" result_df = pd.merge(column_as_df, custom_df, how='left')\n",
	" # Drop the temp column and merge the flattened dataframe with orginal dataframe\n",
	" df = df.drop('custom_data_temp', axis=1).merge(result_df, right_index=True, left_index=True)\n",
	" else:\n",
	" df = pd.concat([df, custom_df], axis=1)\n",
	"\n",
	" # Return dataframe with flatten columns\n",
	" return df"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Create expected dataframe from the dictonary"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 11,
	"metadata": {},
	"outputs": [],
	"source": [
	"custom_data_expected_df=pd.DataFrame.from_dict(employees_custom_data_schema).astype(str, copy=True)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"View the empty data frame"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 12,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<style scoped>\n",
	" .dataframe tbody tr th:only-of-type {\n",
	" vertical-align: middle;\n",
	" }\n",
	"\n",
	" .dataframe tbody tr th {\n",
	" vertical-align: top;\n",
	" }\n",
	"\n",
	" .dataframe thead th {\n",
	" text-align: right;\n",
	" }\n",
	"</style>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>custom_data_address_line_1</th>\n",
	" <th>custom_data_address_line_2</th>\n",
	" <th>custom_data_zip</th>\n",
	" <th>custom_data_date_of_birth</th>\n",
	" <th>custom_data_gender</th>\n",
	" <th>custom_data_marital_status</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"text/plain": [
	"Empty DataFrame\n",
	"Columns: [custom_data_address_line_1, custom_data_address_line_2, custom_data_zip, custom_data_date_of_birth, custom_data_gender, custom_data_marital_status]\n",
	"Index: []"
	]
	},
	"execution_count": 12,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"custom_data_expected_df"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Verify the data frame data types"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 13,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"custom_data_address_line_1 object\n",
	"custom_data_address_line_2 object\n",
	"custom_data_zip object\n",
	"custom_data_date_of_birth object\n",
	"custom_data_gender object\n",
	"custom_data_marital_status object\n",
	"dtype: object"
	]
	},
	"execution_count": 13,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"custom_data_expected_df.dtypes"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Invoke flatten function by passing employee raw data frame, json column and expected data frame"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 14,
	"metadata": {},
	"outputs": [],
	"source": [
	"flattened_df = flatten_json_columns(df=raw_df,\n",
	" json_cols=employees_json_columns,\n",
	" custom_df=custom_data_expected_df)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Verify the data frame data types"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 15,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"employee_id Int64\n",
	"first_name object\n",
	"last_name object\n",
	"email object\n",
	"phone_number object\n",
	"hire_date datetime64[ns]\n",
	"job_id object\n",
	"salary float64\n",
	"commission_pct float64\n",
	"manager_id Int64\n",
	"department_id Int64\n",
	"custom_data object\n",
	"custom_data_address_line_1 object\n",
	"custom_data_address_line_2 object\n",
	"custom_data_zip object\n",
	"custom_data_date_of_birth object\n",
	"custom_data_gender object\n",
	"custom_data_marital_status object\n",
	"dtype: object"
	]
	},
	"execution_count": 15,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"flattened_df.dtypes"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Verify data"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 16,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<style scoped>\n",
	" .dataframe tbody tr th:only-of-type {\n",
	" vertical-align: middle;\n",
	" }\n",
	"\n",
	" .dataframe tbody tr th {\n",
	" vertical-align: top;\n",
	" }\n",
	"\n",
	" .dataframe thead th {\n",
	" text-align: right;\n",
	" }\n",
	"</style>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>employee_id</th>\n",
	" <th>first_name</th>\n",
	" <th>last_name</th>\n",
	" <th>email</th>\n",
	" <th>phone_number</th>\n",
	" <th>hire_date</th>\n",
	" <th>job_id</th>\n",
	" <th>salary</th>\n",
	" <th>commission_pct</th>\n",
	" <th>manager_id</th>\n",
	" <th>department_id</th>\n",
	" <th>custom_data</th>\n",
	" <th>custom_data_address_line_1</th>\n",
	" <th>custom_data_address_line_2</th>\n",
	" <th>custom_data_zip</th>\n",
	" <th>custom_data_date_of_birth</th>\n",
	" <th>custom_data_gender</th>\n",
	" <th>custom_data_marital_status</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>0</th>\n",
	" <td>100</td>\n",
	" <td>Steven</td>\n",
	" <td>King</td>\n",
	" <td>SKING</td>\n",
	" <td>515.123.4567</td>\n",
	" <td>1987-06-17</td>\n",
	" <td>AD_PRES</td>\n",
	" <td>24000.0</td>\n",
	" <td>NaN</td>\n",
	" <td><NA></td>\n",
	" <td>90</td>\n",
	" <td>NaN</td>\n",
	" <td>NaN</td>\n",
	" <td>NaN</td>\n",
	" <td>NaN</td>\n",
	" <td>NaN</td>\n",
	" <td>NaN</td>\n",
	" <td>NaN</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1</th>\n",
	" <td>101</td>\n",
	" <td>Neena</td>\n",
	" <td>Kochhar</td>\n",
	" <td>NKOCHHAR</td>\n",
	" <td>515.123.4568</td>\n",
	" <td>1989-09-21</td>\n",
	" <td>AD_VP</td>\n",
	" <td>17000.0</td>\n",
	" <td>NaN</td>\n",
	" <td>100</td>\n",
	" <td>90</td>\n",
	" <td>NaN</td>\n",
	" <td>NaN</td>\n",
	" <td>NaN</td>\n",
	" <td>NaN</td>\n",
	" <td>NaN</td>\n",
	" <td>NaN</td>\n",
	" <td>NaN</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>2</th>\n",
	" <td>102</td>\n",
	" <td>Lex</td>\n",
	" <td>De Haan</td>\n",
	" <td>LDEHAAN</td>\n",
	" <td>515.123.4569</td>\n",
	" <td>1993-01-13</td>\n",
	" <td>AD_VP</td>\n",
	" <td>17000.0</td>\n",
	" <td>NaN</td>\n",
	" <td>100</td>\n",
	" <td>90</td>\n",
	" <td>NaN</td>\n",
	" <td>NaN</td>\n",
	" <td>NaN</td>\n",
	" <td>NaN</td>\n",
	" <td>NaN</td>\n",
	" <td>NaN</td>\n",
	" <td>NaN</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>3</th>\n",
	" <td>103</td>\n",
	" <td>Alexander</td>\n",
	" <td>Hunold</td>\n",
	" <td>AHUNOLD</td>\n",
	" <td>590.423.4567</td>\n",
	" <td>1990-01-03</td>\n",
	" <td>IT_PROG</td>\n",
	" <td>9000.0</td>\n",
	" <td>NaN</td>\n",
	" <td>102</td>\n",
	" <td>60</td>\n",
	" <td>{\"address_line_1\":\"6649 N Blue Gum St\",\"addres...</td>\n",
	" <td>6649 N Blue Gum St</td>\n",
	" <td>C 22</td>\n",
	" <td>70116</td>\n",
	" <td>NaN</td>\n",
	" <td>NaN</td>\n",
	" <td>NaN</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>4</th>\n",
	" <td>104</td>\n",
	" <td>Bruce</td>\n",
	" <td>Ernst</td>\n",
	" <td>BERNST</td>\n",
	" <td>590.423.4568</td>\n",
	" <td>1991-05-21</td>\n",
	" <td>IT_PROG</td>\n",
	" <td>6000.0</td>\n",
	" <td>NaN</td>\n",
	" <td>103</td>\n",
	" <td>60</td>\n",
	" <td>NaN</td>\n",
	" <td>NaN</td>\n",
	" <td>NaN</td>\n",
	" <td>NaN</td>\n",
	" <td>NaN</td>\n",
	" <td>NaN</td>\n",
	" <td>NaN</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>...</th>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>102</th>\n",
	" <td>202</td>\n",
	" <td>Pat</td>\n",
	" <td>Fay</td>\n",
	" <td>PFAY</td>\n",
	" <td>603.123.6666</td>\n",
	" <td>1997-08-17</td>\n",
	" <td>MK_REP</td>\n",
	" <td>6000.0</td>\n",
	" <td>NaN</td>\n",
	" <td>201</td>\n",
	" <td>20</td>\n",
	" <td>NaN</td>\n",
	" <td>NaN</td>\n",
	" <td>NaN</td>\n",
	" <td>NaN</td>\n",
	" <td>NaN</td>\n",
	" <td>NaN</td>\n",
	" <td>NaN</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>103</th>\n",
	" <td>203</td>\n",
	" <td>Susan</td>\n",
	" <td>Mavris</td>\n",
	" <td>SMAVRIS</td>\n",
	" <td>515.123.7777</td>\n",
	" <td>1994-06-07</td>\n",
	" <td>HR_REP</td>\n",
	" <td>6500.0</td>\n",
	" <td>NaN</td>\n",
	" <td>101</td>\n",
	" <td>40</td>\n",
	" <td>{\"address_line_1\":\"2371 Jerrold Ave\",\"address_...</td>\n",
	" <td>2371 Jerrold Ave</td>\n",
	" <td>Block D</td>\n",
	" <td>19443</td>\n",
	" <td>1954-06-07</td>\n",
	" <td>F</td>\n",
	" <td>NaN</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>104</th>\n",
	" <td>204</td>\n",
	" <td>Hermann</td>\n",
	" <td>Baer</td>\n",
	" <td>HBAER</td>\n",
	" <td>515.123.8888</td>\n",
	" <td>1994-06-07</td>\n",
	" <td>PR_REP</td>\n",
	" <td>10000.0</td>\n",
	" <td>NaN</td>\n",
	" <td>101</td>\n",
	" <td>70</td>\n",
	" <td>NaN</td>\n",
	" <td>NaN</td>\n",
	" <td>NaN</td>\n",
	" <td>NaN</td>\n",
	" <td>NaN</td>\n",
	" <td>NaN</td>\n",
	" <td>NaN</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>105</th>\n",
	" <td>205</td>\n",
	" <td>Shelley</td>\n",
	" <td>Higgins</td>\n",
	" <td>SHIGGINS</td>\n",
	" <td>515.123.8080</td>\n",
	" <td>1994-06-07</td>\n",
	" <td>AC_MGR</td>\n",
	" <td>12000.0</td>\n",
	" <td>NaN</td>\n",
	" <td>101</td>\n",
	" <td>110</td>\n",
	" <td>NaN</td>\n",
	" <td>NaN</td>\n",
	" <td>NaN</td>\n",
	" <td>NaN</td>\n",
	" <td>NaN</td>\n",
	" <td>NaN</td>\n",
	" <td>NaN</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>106</th>\n",
	" <td>206</td>\n",
	" <td>William</td>\n",
	" <td>Gietz</td>\n",
	" <td>WGIETZ</td>\n",
	" <td>515.123.8181</td>\n",
	" <td>1994-06-07</td>\n",
	" <td>AC_ACCOUNT</td>\n",
	" <td>8300.0</td>\n",
	" <td>NaN</td>\n",
	" <td>205</td>\n",
	" <td>110</td>\n",
	" <td>{\"address_line_1\":\"37275 St Rt 17m M\",\"address...</td>\n",
	" <td>37275 St Rt 17m M</td>\n",
	" <td>Block C</td>\n",
	" <td>11953</td>\n",
	" <td>1964-01-07</td>\n",
	" <td>M</td>\n",
	" <td>Married</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"<p>107 rows × 18 columns</p>\n",
	"</div>"
	],
	"text/plain": [
	" employee_id first_name last_name email phone_number hire_date \\\n",
	"0 100 Steven King SKING 515.123.4567 1987-06-17 \n",
	"1 101 Neena Kochhar NKOCHHAR 515.123.4568 1989-09-21 \n",
	"2 102 Lex De Haan LDEHAAN 515.123.4569 1993-01-13 \n",
	"3 103 Alexander Hunold AHUNOLD 590.423.4567 1990-01-03 \n",
	"4 104 Bruce Ernst BERNST 590.423.4568 1991-05-21 \n",
	".. ... ... ... ... ... ... \n",
	"102 202 Pat Fay PFAY 603.123.6666 1997-08-17 \n",
	"103 203 Susan Mavris SMAVRIS 515.123.7777 1994-06-07 \n",
	"104 204 Hermann Baer HBAER 515.123.8888 1994-06-07 \n",
	"105 205 Shelley Higgins SHIGGINS 515.123.8080 1994-06-07 \n",
	"106 206 William Gietz WGIETZ 515.123.8181 1994-06-07 \n",
	"\n",
	" job_id salary commission_pct manager_id department_id \\\n",
	"0 AD_PRES 24000.0 NaN <NA> 90 \n",
	"1 AD_VP 17000.0 NaN 100 90 \n",
	"2 AD_VP 17000.0 NaN 100 90 \n",
	"3 IT_PROG 9000.0 NaN 102 60 \n",
	"4 IT_PROG 6000.0 NaN 103 60 \n",
	".. ... ... ... ... ... \n",
	"102 MK_REP 6000.0 NaN 201 20 \n",
	"103 HR_REP 6500.0 NaN 101 40 \n",
	"104 PR_REP 10000.0 NaN 101 70 \n",
	"105 AC_MGR 12000.0 NaN 101 110 \n",
	"106 AC_ACCOUNT 8300.0 NaN 205 110 \n",
	"\n",
	" custom_data \\\n",
	"0 NaN \n",
	"1 NaN \n",
	"2 NaN \n",
	"3 {\"address_line_1\":\"6649 N Blue Gum St\",\"addres... \n",
	"4 NaN \n",
	".. ... \n",
	"102 NaN \n",
	"103 {\"address_line_1\":\"2371 Jerrold Ave\",\"address_... \n",
	"104 NaN \n",
	"105 NaN \n",
	"106 {\"address_line_1\":\"37275 St Rt 17m M\",\"address... \n",
	"\n",
	" custom_data_address_line_1 custom_data_address_line_2 custom_data_zip \\\n",
	"0 NaN NaN NaN \n",
	"1 NaN NaN NaN \n",
	"2 NaN NaN NaN \n",
	"3 6649 N Blue Gum St C 22 70116 \n",
	"4 NaN NaN NaN \n",
	".. ... ... ... \n",
	"102 NaN NaN NaN \n",
	"103 2371 Jerrold Ave Block D 19443 \n",
	"104 NaN NaN NaN \n",
	"105 NaN NaN NaN \n",
	"106 37275 St Rt 17m M Block C 11953 \n",
	"\n",
	" custom_data_date_of_birth custom_data_gender custom_data_marital_status \n",
	"0 NaN NaN NaN \n",
	"1 NaN NaN NaN \n",
	"2 NaN NaN NaN \n",
	"3 NaN NaN NaN \n",
	"4 NaN NaN NaN \n",
	".. ... ... ... \n",
	"102 NaN NaN NaN \n",
	"103 1954-06-07 F NaN \n",
	"104 NaN NaN NaN \n",
	"105 NaN NaN NaN \n",
	"106 1964-01-07 M Married \n",
	"\n",
	"[107 rows x 18 columns]"
	]
	},
	"execution_count": 16,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"flattened_df"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.6.9"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 4
	}