Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save vvgsrk/d70f64b5ba7e70314f9dde4dd07f3138 to your computer and use it in GitHub Desktop.
Save vvgsrk/d70f64b5ba7e70314f9dde4dd07f3138 to your computer and use it in GitHub Desktop.
Parse CSV file column contains JSON
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"First of all, Import necessary modules"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import json\n",
"import numpy as np\n",
"from ast import literal_eval"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Set jupyter notebook row and column display"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# Set ipython's max row display\n",
"pd.set_option('display.max_row', 100)\n",
"\n",
"# Set iPython's max column width to 50\n",
"pd.set_option('display.max_columns', 70)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Declare read schema to use while reading the CSV using pandas"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"employees_schema = {\n",
" 'employee_id' : 'Int64',\n",
" 'first_name' : 'str',\n",
" 'last_name' : 'str',\n",
" 'email' : 'str',\n",
" 'offer_id' : 'str',\n",
" 'phone_number' : 'str',\n",
" 'hire_date' : 'str',\n",
" 'job_id' : 'str',\n",
" 'salary' : 'float64',\n",
" 'commission_pct' : 'float64',\n",
" 'manager_id' : 'Int64',\n",
" 'department_id' : 'Int64'\n",
"}"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Declare all date columns in a list"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"employees_date_cols = ['hire_date']"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Dclare all JSON columns in a list"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"employees_json_columns = ['custom_data']"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Declare expected schema of the JSON "
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"employees_custom_data_schema = {\n",
" 'custom_data_address_line_1': [],\n",
" 'custom_data_address_line_2': [],\n",
" 'custom_data_zip': [],\n",
" 'custom_data_date_of_birth': [],\n",
" 'custom_data_gender': [],\n",
" 'custom_data_marital_status': []\n",
"}"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Read CSV using pandas library and specify file path, data types and parse dates parameters"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"raw_df = pd.read_csv(filepath_or_buffer='employees.csv', dtype=employees_schema, parse_dates=employees_date_cols)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"See first 5 rows using below statement"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>employee_id</th>\n",
" <th>first_name</th>\n",
" <th>last_name</th>\n",
" <th>email</th>\n",
" <th>phone_number</th>\n",
" <th>hire_date</th>\n",
" <th>job_id</th>\n",
" <th>salary</th>\n",
" <th>commission_pct</th>\n",
" <th>manager_id</th>\n",
" <th>department_id</th>\n",
" <th>custom_data</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>100</td>\n",
" <td>Steven</td>\n",
" <td>King</td>\n",
" <td>SKING</td>\n",
" <td>515.123.4567</td>\n",
" <td>1987-06-17</td>\n",
" <td>AD_PRES</td>\n",
" <td>24000.0</td>\n",
" <td>NaN</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>90</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>101</td>\n",
" <td>Neena</td>\n",
" <td>Kochhar</td>\n",
" <td>NKOCHHAR</td>\n",
" <td>515.123.4568</td>\n",
" <td>1989-09-21</td>\n",
" <td>AD_VP</td>\n",
" <td>17000.0</td>\n",
" <td>NaN</td>\n",
" <td>100</td>\n",
" <td>90</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>102</td>\n",
" <td>Lex</td>\n",
" <td>De Haan</td>\n",
" <td>LDEHAAN</td>\n",
" <td>515.123.4569</td>\n",
" <td>1993-01-13</td>\n",
" <td>AD_VP</td>\n",
" <td>17000.0</td>\n",
" <td>NaN</td>\n",
" <td>100</td>\n",
" <td>90</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>103</td>\n",
" <td>Alexander</td>\n",
" <td>Hunold</td>\n",
" <td>AHUNOLD</td>\n",
" <td>590.423.4567</td>\n",
" <td>1990-01-03</td>\n",
" <td>IT_PROG</td>\n",
" <td>9000.0</td>\n",
" <td>NaN</td>\n",
" <td>102</td>\n",
" <td>60</td>\n",
" <td>{\"address_line_1\":\"6649 N Blue Gum St\",\"addres...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>104</td>\n",
" <td>Bruce</td>\n",
" <td>Ernst</td>\n",
" <td>BERNST</td>\n",
" <td>590.423.4568</td>\n",
" <td>1991-05-21</td>\n",
" <td>IT_PROG</td>\n",
" <td>6000.0</td>\n",
" <td>NaN</td>\n",
" <td>103</td>\n",
" <td>60</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" employee_id first_name last_name email phone_number hire_date \\\n",
"0 100 Steven King SKING 515.123.4567 1987-06-17 \n",
"1 101 Neena Kochhar NKOCHHAR 515.123.4568 1989-09-21 \n",
"2 102 Lex De Haan LDEHAAN 515.123.4569 1993-01-13 \n",
"3 103 Alexander Hunold AHUNOLD 590.423.4567 1990-01-03 \n",
"4 104 Bruce Ernst BERNST 590.423.4568 1991-05-21 \n",
"\n",
" job_id salary commission_pct manager_id department_id \\\n",
"0 AD_PRES 24000.0 NaN <NA> 90 \n",
"1 AD_VP 17000.0 NaN 100 90 \n",
"2 AD_VP 17000.0 NaN 100 90 \n",
"3 IT_PROG 9000.0 NaN 102 60 \n",
"4 IT_PROG 6000.0 NaN 103 60 \n",
"\n",
" custom_data \n",
"0 NaN \n",
"1 NaN \n",
"2 NaN \n",
"3 {\"address_line_1\":\"6649 N Blue Gum St\",\"addres... \n",
"4 NaN "
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"raw_df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Veriy the data frame data types "
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"employee_id Int64\n",
"first_name object\n",
"last_name object\n",
"email object\n",
"phone_number object\n",
"hire_date datetime64[ns]\n",
"job_id object\n",
"salary float64\n",
"commission_pct float64\n",
"manager_id Int64\n",
"department_id Int64\n",
"custom_data object\n",
"dtype: object"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"raw_df.dtypes"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Create a function to flattern JSON to a structured data frame"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"def flatten_json_columns(df, json_cols, custom_df):\n",
" \"\"\"\n",
" This function flattens JSON columns to individual columns\n",
" It merges the flattened dataframe with expected dataframe to capture missing columns from JSON\n",
" :param df: Employees CSV raw dataframe\n",
" :param json_cols: custom data columns in employees CSV\n",
" :param custom_df: expected dataframe\n",
" :return: returns df pandas dataframe\n",
" \"\"\"\n",
"\n",
" # Loop through all JSON columns\n",
" for column in json_cols:\n",
" if not df[column].isnull().all():\n",
" # create a temp col to preserve the orginal data\n",
" df['custom_data_temp'] = df[column]\n",
" # Replace None and NaN with empty braces\n",
" df[column].fillna(value='{}', inplace=True)\n",
" try:\n",
" #Deserialize's a str instance containing a JSON document to a Python object\n",
" df[column] = [json.loads(row, strict=False) for row in df[column]] \n",
" except TypeError:\n",
" # Convert all values to string using literal eval\n",
" df[column] = df[column].apply(lambda x: literal_eval(str(x)))\n",
" # Normalize semi-structured JSON data into a flat table\n",
" column_as_df = pd.json_normalize(df[column])\n",
" # Extract main column name and attach it to each sub column name\n",
" column_as_df.columns = [f\"{column}_{subcolumn}\" for subcolumn in column_as_df.columns]\n",
" # Replace empty strings with None\n",
" column_as_df.replace('', np.nan, inplace=True)\n",
" # Replace orginal data with temp data\n",
" df[column] = df['custom_data_temp']\n",
" # Merge extracted result from custom_data field with expected fields\n",
" result_df = pd.merge(column_as_df, custom_df, how='left')\n",
" # Drop the temp column and merge the flattened dataframe with orginal dataframe\n",
" df = df.drop('custom_data_temp', axis=1).merge(result_df, right_index=True, left_index=True)\n",
" else:\n",
" df = pd.concat([df, custom_df], axis=1)\n",
"\n",
" # Return dataframe with flatten columns\n",
" return df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Create expected dataframe from the dictonary"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"custom_data_expected_df=pd.DataFrame.from_dict(employees_custom_data_schema).astype(str, copy=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"View the empty data frame"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>custom_data_address_line_1</th>\n",
" <th>custom_data_address_line_2</th>\n",
" <th>custom_data_zip</th>\n",
" <th>custom_data_date_of_birth</th>\n",
" <th>custom_data_gender</th>\n",
" <th>custom_data_marital_status</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
"Empty DataFrame\n",
"Columns: [custom_data_address_line_1, custom_data_address_line_2, custom_data_zip, custom_data_date_of_birth, custom_data_gender, custom_data_marital_status]\n",
"Index: []"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"custom_data_expected_df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Verify the data frame data types"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"custom_data_address_line_1 object\n",
"custom_data_address_line_2 object\n",
"custom_data_zip object\n",
"custom_data_date_of_birth object\n",
"custom_data_gender object\n",
"custom_data_marital_status object\n",
"dtype: object"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"custom_data_expected_df.dtypes"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Invoke flatten function by passing employee raw data frame, json column and expected data frame"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"flattened_df = flatten_json_columns(df=raw_df,\n",
" json_cols=employees_json_columns,\n",
" custom_df=custom_data_expected_df)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Verify the data frame data types"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"employee_id Int64\n",
"first_name object\n",
"last_name object\n",
"email object\n",
"phone_number object\n",
"hire_date datetime64[ns]\n",
"job_id object\n",
"salary float64\n",
"commission_pct float64\n",
"manager_id Int64\n",
"department_id Int64\n",
"custom_data object\n",
"custom_data_address_line_1 object\n",
"custom_data_address_line_2 object\n",
"custom_data_zip object\n",
"custom_data_date_of_birth object\n",
"custom_data_gender object\n",
"custom_data_marital_status object\n",
"dtype: object"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"flattened_df.dtypes"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Verify data"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>employee_id</th>\n",
" <th>first_name</th>\n",
" <th>last_name</th>\n",
" <th>email</th>\n",
" <th>phone_number</th>\n",
" <th>hire_date</th>\n",
" <th>job_id</th>\n",
" <th>salary</th>\n",
" <th>commission_pct</th>\n",
" <th>manager_id</th>\n",
" <th>department_id</th>\n",
" <th>custom_data</th>\n",
" <th>custom_data_address_line_1</th>\n",
" <th>custom_data_address_line_2</th>\n",
" <th>custom_data_zip</th>\n",
" <th>custom_data_date_of_birth</th>\n",
" <th>custom_data_gender</th>\n",
" <th>custom_data_marital_status</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>100</td>\n",
" <td>Steven</td>\n",
" <td>King</td>\n",
" <td>SKING</td>\n",
" <td>515.123.4567</td>\n",
" <td>1987-06-17</td>\n",
" <td>AD_PRES</td>\n",
" <td>24000.0</td>\n",
" <td>NaN</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>90</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>101</td>\n",
" <td>Neena</td>\n",
" <td>Kochhar</td>\n",
" <td>NKOCHHAR</td>\n",
" <td>515.123.4568</td>\n",
" <td>1989-09-21</td>\n",
" <td>AD_VP</td>\n",
" <td>17000.0</td>\n",
" <td>NaN</td>\n",
" <td>100</td>\n",
" <td>90</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>102</td>\n",
" <td>Lex</td>\n",
" <td>De Haan</td>\n",
" <td>LDEHAAN</td>\n",
" <td>515.123.4569</td>\n",
" <td>1993-01-13</td>\n",
" <td>AD_VP</td>\n",
" <td>17000.0</td>\n",
" <td>NaN</td>\n",
" <td>100</td>\n",
" <td>90</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>103</td>\n",
" <td>Alexander</td>\n",
" <td>Hunold</td>\n",
" <td>AHUNOLD</td>\n",
" <td>590.423.4567</td>\n",
" <td>1990-01-03</td>\n",
" <td>IT_PROG</td>\n",
" <td>9000.0</td>\n",
" <td>NaN</td>\n",
" <td>102</td>\n",
" <td>60</td>\n",
" <td>{\"address_line_1\":\"6649 N Blue Gum St\",\"addres...</td>\n",
" <td>6649 N Blue Gum St</td>\n",
" <td>C 22</td>\n",
" <td>70116</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>104</td>\n",
" <td>Bruce</td>\n",
" <td>Ernst</td>\n",
" <td>BERNST</td>\n",
" <td>590.423.4568</td>\n",
" <td>1991-05-21</td>\n",
" <td>IT_PROG</td>\n",
" <td>6000.0</td>\n",
" <td>NaN</td>\n",
" <td>103</td>\n",
" <td>60</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>102</th>\n",
" <td>202</td>\n",
" <td>Pat</td>\n",
" <td>Fay</td>\n",
" <td>PFAY</td>\n",
" <td>603.123.6666</td>\n",
" <td>1997-08-17</td>\n",
" <td>MK_REP</td>\n",
" <td>6000.0</td>\n",
" <td>NaN</td>\n",
" <td>201</td>\n",
" <td>20</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>103</th>\n",
" <td>203</td>\n",
" <td>Susan</td>\n",
" <td>Mavris</td>\n",
" <td>SMAVRIS</td>\n",
" <td>515.123.7777</td>\n",
" <td>1994-06-07</td>\n",
" <td>HR_REP</td>\n",
" <td>6500.0</td>\n",
" <td>NaN</td>\n",
" <td>101</td>\n",
" <td>40</td>\n",
" <td>{\"address_line_1\":\"2371 Jerrold Ave\",\"address_...</td>\n",
" <td>2371 Jerrold Ave</td>\n",
" <td>Block D</td>\n",
" <td>19443</td>\n",
" <td>1954-06-07</td>\n",
" <td>F</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>104</th>\n",
" <td>204</td>\n",
" <td>Hermann</td>\n",
" <td>Baer</td>\n",
" <td>HBAER</td>\n",
" <td>515.123.8888</td>\n",
" <td>1994-06-07</td>\n",
" <td>PR_REP</td>\n",
" <td>10000.0</td>\n",
" <td>NaN</td>\n",
" <td>101</td>\n",
" <td>70</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>105</th>\n",
" <td>205</td>\n",
" <td>Shelley</td>\n",
" <td>Higgins</td>\n",
" <td>SHIGGINS</td>\n",
" <td>515.123.8080</td>\n",
" <td>1994-06-07</td>\n",
" <td>AC_MGR</td>\n",
" <td>12000.0</td>\n",
" <td>NaN</td>\n",
" <td>101</td>\n",
" <td>110</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>106</th>\n",
" <td>206</td>\n",
" <td>William</td>\n",
" <td>Gietz</td>\n",
" <td>WGIETZ</td>\n",
" <td>515.123.8181</td>\n",
" <td>1994-06-07</td>\n",
" <td>AC_ACCOUNT</td>\n",
" <td>8300.0</td>\n",
" <td>NaN</td>\n",
" <td>205</td>\n",
" <td>110</td>\n",
" <td>{\"address_line_1\":\"37275 St Rt 17m M\",\"address...</td>\n",
" <td>37275 St Rt 17m M</td>\n",
" <td>Block C</td>\n",
" <td>11953</td>\n",
" <td>1964-01-07</td>\n",
" <td>M</td>\n",
" <td>Married</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>107 rows × 18 columns</p>\n",
"</div>"
],
"text/plain": [
" employee_id first_name last_name email phone_number hire_date \\\n",
"0 100 Steven King SKING 515.123.4567 1987-06-17 \n",
"1 101 Neena Kochhar NKOCHHAR 515.123.4568 1989-09-21 \n",
"2 102 Lex De Haan LDEHAAN 515.123.4569 1993-01-13 \n",
"3 103 Alexander Hunold AHUNOLD 590.423.4567 1990-01-03 \n",
"4 104 Bruce Ernst BERNST 590.423.4568 1991-05-21 \n",
".. ... ... ... ... ... ... \n",
"102 202 Pat Fay PFAY 603.123.6666 1997-08-17 \n",
"103 203 Susan Mavris SMAVRIS 515.123.7777 1994-06-07 \n",
"104 204 Hermann Baer HBAER 515.123.8888 1994-06-07 \n",
"105 205 Shelley Higgins SHIGGINS 515.123.8080 1994-06-07 \n",
"106 206 William Gietz WGIETZ 515.123.8181 1994-06-07 \n",
"\n",
" job_id salary commission_pct manager_id department_id \\\n",
"0 AD_PRES 24000.0 NaN <NA> 90 \n",
"1 AD_VP 17000.0 NaN 100 90 \n",
"2 AD_VP 17000.0 NaN 100 90 \n",
"3 IT_PROG 9000.0 NaN 102 60 \n",
"4 IT_PROG 6000.0 NaN 103 60 \n",
".. ... ... ... ... ... \n",
"102 MK_REP 6000.0 NaN 201 20 \n",
"103 HR_REP 6500.0 NaN 101 40 \n",
"104 PR_REP 10000.0 NaN 101 70 \n",
"105 AC_MGR 12000.0 NaN 101 110 \n",
"106 AC_ACCOUNT 8300.0 NaN 205 110 \n",
"\n",
" custom_data \\\n",
"0 NaN \n",
"1 NaN \n",
"2 NaN \n",
"3 {\"address_line_1\":\"6649 N Blue Gum St\",\"addres... \n",
"4 NaN \n",
".. ... \n",
"102 NaN \n",
"103 {\"address_line_1\":\"2371 Jerrold Ave\",\"address_... \n",
"104 NaN \n",
"105 NaN \n",
"106 {\"address_line_1\":\"37275 St Rt 17m M\",\"address... \n",
"\n",
" custom_data_address_line_1 custom_data_address_line_2 custom_data_zip \\\n",
"0 NaN NaN NaN \n",
"1 NaN NaN NaN \n",
"2 NaN NaN NaN \n",
"3 6649 N Blue Gum St C 22 70116 \n",
"4 NaN NaN NaN \n",
".. ... ... ... \n",
"102 NaN NaN NaN \n",
"103 2371 Jerrold Ave Block D 19443 \n",
"104 NaN NaN NaN \n",
"105 NaN NaN NaN \n",
"106 37275 St Rt 17m M Block C 11953 \n",
"\n",
" custom_data_date_of_birth custom_data_gender custom_data_marital_status \n",
"0 NaN NaN NaN \n",
"1 NaN NaN NaN \n",
"2 NaN NaN NaN \n",
"3 NaN NaN NaN \n",
"4 NaN NaN NaN \n",
".. ... ... ... \n",
"102 NaN NaN NaN \n",
"103 1954-06-07 F NaN \n",
"104 NaN NaN NaN \n",
"105 NaN NaN NaN \n",
"106 1964-01-07 M Married \n",
"\n",
"[107 rows x 18 columns]"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"flattened_df"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.9"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment