vvgsrk · July 29, 2021 14:04
diff --git a/HR_CSV_DATA_PARSER_USING_PYTHON_PANDAS.ipynb b/HR_CSV_DATA_PARSER_USING_PYTHON_PANDAS.ipynb
 {
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Import all standard library modules"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import logging\n",
    "import os\n",
    "import pandas as pd\n",
    "import awswrangler as wr\n",
    "import boto3\n",
    "import sys\n",
    "import json\n",
    "import numpy as np\n",
    "from ast import literal_eval\n",
    "from datetime import datetime"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Declare Event logging"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "logger = logging.getLogger(__name__)\n",
    "logging.getLogger().setLevel(logging.INFO)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Set jupyter notebook row and column display"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Set ipython's max row display\n",
    "pd.set_option('display.max_row', 100)\n",
    "\n",
    "# Set iPython's max column width to 50\n",
    "pd.set_option('display.max_columns', 70)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Get OS environment credentials for AWS "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "os.environ['AWS_PROFILE'] = \"fdp-dev\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Amazon Simple Storage Service (S3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "s3 = boto3.client('s3')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "CSV Schema for each enitiy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "read_schema = {\n",
    "\n",
    "    'countries' : {\n",
    "        'country_id' : 'str',\n",
    "        'country_name' : 'str',\n",
    "        'region_id' : 'Int64'\n",
    "    },\n",
    "    \n",
    "    'departments' : {\n",
    "        'department_id' : 'Int64',\n",
    "        'department_name' : 'str',\n",
    "        'manager_id' : 'Int64',\n",
    "        'location_id' : 'Int64'\t\n",
    "    },\n",
    "\n",
    "    'employees' : {\n",
    "        'employee_id' : 'Int64',\n",
    "        'first_name' : 'str',\n",
    "        'last_name' : 'str',\n",
    "        'email' : 'str',\n",
    "        'phone_number' : 'str',\n",
    "        'hire_date' : 'str',\n",
    "        'job_id' : 'str',\n",
    "        'salary' : 'float64',\n",
    "        'commission_pct' : 'float64',\n",
    "        'manager_id' : 'Int64',\n",
    "        'department_id' : 'Int64',\n",
    "        'custom_data' : 'str'\n",
    "    },\n",
    "    \n",
    "    'job_history' : {\n",
    "        'employee_id' : 'Int64',\n",
    "        'start_date' : 'str',\n",
    "        'end_date' : 'str',\n",
    "        'job_id' : 'str',\n",
    "        'department_id' : 'Int64'\t\n",
    "    },\n",
    "    \n",
    "    'jobs' : {\n",
    "        'job_id' : 'str',\n",
    "        'job_title' : 'str',\n",
    "        'min_salary' : 'Int64',\n",
    "        'max_salary' : 'Int64'\n",
    "    },\n",
    "    \n",
    "    'locations' : {\n",
    "        'location_id' : 'Int64',\n",
    "        'street_address' : 'str',\n",
    "        'postal_code' : 'str',\n",
    "        'city' : 'str',\n",
    "        'state_province' : 'str',\n",
    "        'country_id' : 'str'\n",
    "    },\n",
    "    \n",
    "    'regions' : {\n",
    "        'region_id' : 'Int64', \n",
    "        'region_name' : 'str'\n",
    "    }\n",
    "}\n",
    "\n",
    "date_columns_schema = {\n",
    "\n",
    "    'employees' : ['hire_date'],\t\n",
    "    'job_history' : ['start_date', 'end_date'],\n",
    "    'countries' : [],\n",
    "    'departments' : [],\n",
    "    'jobs' : [], \n",
    "    'locations' : [], \n",
    "    'regions' : []\n",
    "}\n",
    "\n",
    "json_columns_schema = {\n",
    "    'employees' : ['custom_data']\n",
    "}\n",
    "\n",
    "custom_data_schema = {\n",
    "\n",
    "    'employees' : {\n",
    "        'custom_data_address_line_1': [],\n",
    "        'custom_data_address_line_2': [],\n",
    "        'custom_data_zip': [],\n",
    "        'custom_data_date_of_birth': [],\n",
    "        'custom_data_gender': [],\n",
    "        'custom_data_marital_status': []        \n",
    "    }\n",
    "}\n",
    "\n",
    "write_schema = {\n",
    "    'countries' : {\n",
    "        'country_id' : 'str',\n",
    "        'country_name' : 'str',\n",
    "        'region_id' : 'Int64'\n",
    "    },\n",
    "    \n",
    "    'departments' : {\n",
    "        'department_id' : 'Int64',\n",
    "        'department_name' : 'str',\n",
    "        'manager_id' : 'Int64',\n",
    "        'location_id' : 'Int64'\t\n",
    "    },\n",
    "    \n",
    "    'employees' : {\n",
    "        'employee_id' : 'Int64',\n",
    "        'first_name' : 'str',\n",
    "        'last_name' : 'str',\n",
    "        'email' : 'str',\n",
    "        'phone_number' : 'str',\n",
    "        'hire_date' : 'datetime64[ns]',\n",
    "        'job_id' : 'str',\n",
    "        'salary' : 'float64',\n",
    "        'commission_pct' : 'float64',\n",
    "        'manager_id' : 'Int64',\n",
    "        'department_id' : 'Int64',\n",
    "        'custom_data' : 'str'\n",
    "    },\n",
    "    \n",
    "    'job_history' : {\n",
    "        'employee_id' : 'Int64',\n",
    "        'start_date' : 'datetime64[ns]',\n",
    "        'end_date' : 'datetime64[ns]',\n",
    "        'job_id' : 'str',\n",
    "        'department_id' : 'Int64'\t\n",
    "    },\n",
    "    \n",
    "    'jobs' : {\n",
    "        'job_id' : 'str',\n",
    "        'job_title' : 'str',\n",
    "        'min_salary' : 'Int64',\n",
    "        'max_salary' : 'Int64'\n",
    "    },\n",
    "    \n",
    "    'locations' : {\n",
    "        'location_id' : 'Int64',\n",
    "        'street_address' : 'str',\n",
    "        'postal_code' : 'str',\n",
    "        'city' : 'str',\n",
    "        'state_province' : 'str',\n",
    "        'country_id' : 'str'\n",
    "    },\n",
    "    \n",
    "    'regions' : {\n",
    "        'region_id' : 'Int64', \n",
    "        'region_name' : 'str'\n",
    "    }\t\n",
    "}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Helper functions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "def write_to_s3(df, target_path):\n",
    "    \"\"\"\n",
    "    Write pandas dataframe data as parquet file(s) on Amazon S3.\n",
    "    :param df: Pandas DataFrame name\n",
    "    :param target path: full target path to store data\n",
    "    :return: Dictionary with the following information:\n",
    "        ‘paths’: List of all stored files paths on S3.\n",
    "        ‘partitions_values’: Dictionary of partitions added with keys as S3 path locations and values as a list of partitions values as str.\n",
    "    \"\"\"\n",
    "\n",
    "    try:\n",
    "        stored_files = wr.s3.to_parquet(\n",
    "            df=df,\n",
    "            path=target_path,\n",
    "            index=False,\n",
    "            compression=\"snappy\",\n",
    "            use_threads=False,\n",
    "            dataset=True,\n",
    "            mode=\"overwrite\"\n",
    "        )\n",
    "    except Exception as e:\n",
    "        raise(e)\n",
    "\n",
    "    logging.info(\"stored_files {}\".format(stored_files))\n",
    "    return stored_files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "def read_hr_data_csv(file_path, file_schema, date_cols):\n",
    "    \"\"\"\n",
    "    Extract HR Data column values from the CSV file\n",
    "    :param file_path: HR Data file path on the S3\n",
    "    :param file_schema: CSV schema\n",
    "    :param date_cols: date columns to parse\n",
    "    :return: returns raw_df pandas dataframe\n",
    "    \"\"\"\n",
    "\n",
    "    # Read HR Data CSV on S3 using pandas dataframe\n",
    "    raw_df = wr.s3.read_csv(path=file_path, dtype=file_schema, parse_dates=date_cols)\n",
    "\n",
    "    # Return pandas dataframe\n",
    "    return raw_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_file_name(ib_bucket_name, ib_bucket_prefix):\n",
    "    \"\"\"\n",
    "    Extract HR Data CSV file name from source path\n",
    "    :param ib_bucket_name: HR Data S3 bucket name\n",
    "    :param ib_bucket_prefix: HR Data CSV file path prefix on S3\n",
    "    :return: returns file name as a string\n",
    "    \"\"\"\n",
    "    try:\n",
    "        result = s3.list_objects_v2(Bucket=ib_bucket_name, Prefix=ib_bucket_prefix)\n",
    "        result_list = result['Contents'][0]['Key'].split(\"/\")\n",
    "\n",
    "        # Return file name based on source file path\n",
    "        return result_list[-1]\n",
    "    except:\n",
    "        return None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_source_file_date_time(file_name):\n",
    "    \"\"\"\n",
    "    Extract HR Data CSV file date from file name\n",
    "    :param file_name: HR Data CSV file name on S3\n",
    "    :return: returns file date as timestamp\n",
    "    \"\"\"\n",
    "    try:\n",
    "        # Split the file name based on .csv and _ to extract file name elements\n",
    "        file_name_list = file_name.split(\".csv\")[0].split(\"_\")\n",
    "\n",
    "        # Extract date and date time from list\n",
    "        source_file_date_str = file_name_list[-2]\n",
    "        source_file_date_time_str = file_name_list[-2] + file_name_list[-1]\n",
    "\n",
    "        logging.info(\"source_file_date_str= {}\".format(source_file_date_str))\n",
    "        logging.info(\"source_file_date_time_str= {}\".format(source_file_date_time_str))\n",
    "\n",
    "        # Return list of timestamp by converting string to datetime\n",
    "        return [pd.to_datetime(source_file_date_str), pd.to_datetime(source_file_date_time_str)]\n",
    "    except:\n",
    "        return [None, None]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "def flatten_json_columns(df, json_cols, custom_df):\n",
    "    \"\"\"\n",
    "    This function flattens JSON columns to individual columns\n",
    "    It merges the flattened dataframe with expected dataframe to capture missing columns from JSON\n",
    "    :param df: HR Data CSV raw dataframe\n",
    "    :param json_cols: custom data columns in CSV's\n",
    "    :param custom_df: expected dataframe\n",
    "    :return: returns df pandas dataframe\n",
    "    \"\"\"\n",
    "\n",
    "    # Loop through all JSON columns\n",
    "    for column in json_cols:\n",
    "        if not df[column].isnull().all():\n",
    "            # create a temp col to preserve the orginal data\n",
    "            df['custom_data_temp'] = df[column]\n",
    "            # Replace None and NaN with empty braces\n",
    "            df[column].fillna(value='{}', inplace=True)\n",
    "            try:\n",
    "                # Deserialize's a str instance containing a JSON document to a Python object\n",
    "                df[column] = [json.loads(row, strict=False) for row in df[column]]\n",
    "            except TypeError:\n",
    "                # Convert all values to string using literal eval\n",
    "                df[column] = df[column].apply(lambda x: literal_eval(str(x)))\n",
    "            # Normalize semi-structured JSON data into a flat table\n",
    "            column_as_df = pd.json_normalize(df[column])\n",
    "            # Extract main column name and attach it to each sub column name\n",
    "            column_as_df.columns = [f\"{column}_{subcolumn}\" for subcolumn in column_as_df.columns]\n",
    "            # Replace empty strings with None\n",
    "            column_as_df.replace('', np.nan, inplace=True)\n",
    "            # Replace orginal data with temp data\n",
    "            df[column] = df['custom_data_temp']\n",
    "            # Merge extracted result from custom_data field with expected fields\n",
    "            result_df = pd.merge(column_as_df, custom_df, how='left')\n",
    "            # Drop the temp column and merge the flattened dataframe with orginal dataframe\n",
    "            df = df.drop('custom_data_temp', axis=1).merge(result_df, right_index=True, left_index=True)\n",
    "        else:\n",
    "            df = pd.concat([df, custom_df], axis=1)\n",
    "\n",
    "    # Return dataframe with flatten columns\n",
    "    return df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "def parse_hr_data_csv_df(df, bucket_name, bucket_prefix, entity_name):\n",
    "    \"\"\"\n",
    "    Parse HR Data CSV column(s) values from the dataframe\n",
    "    :param df: HR Data CSV file path on the S3\n",
    "    :param bucket_name: AWS S3 bucket name\n",
    "    :param bucket_prefix: AWS S3 bucket prefix\n",
    "    :param entity_name: HR Data CSV entity name\n",
    "    :return: returns df pandas dataframe\n",
    "    \"\"\"\n",
    "\n",
    "    # Check if the entity is part of json columns schema\n",
    "    if entity_name in json_columns_schema.keys():\n",
    "\n",
    "        # Pre defined custom data expected data frame\n",
    "        custom_data_expected_df=pd.DataFrame.from_dict(custom_data_schema[entity_name])\n",
    "\n",
    "        if entity_name == 'user_point_transactions':\n",
    "            df['reference_type_json'] = [column if column[0] == '{' else None for column in df['reference_type']]\n",
    "            df['reference_type'] = [None if column[0] == '{' else column for column in df['reference_type']]\n",
    "\n",
    "            # Flatten reference_type_json CSV column to individual columns\n",
    "            upt_df = flatten_json_columns(df=df,\n",
    "                                          json_cols=json_columns_schema[entity_name],\n",
    "                                          custom_df=custom_data_expected_df)\n",
    "\n",
    "            parsed_df = upt_df.drop('reference_type_json', axis=1)\n",
    "        else:\n",
    "            # Flatten CSV JSON Columns to individual columns\n",
    "            parsed_df = flatten_json_columns(df=df,\n",
    "                                             json_cols=json_columns_schema[entity_name],\n",
    "                                             custom_df=custom_data_expected_df)\n",
    "    else:\n",
    "        # Return raw dataframe\n",
    "        parsed_df = df\n",
    "\n",
    "    # Get file name based on s3 source path\n",
    "    hr_data_csv_file_name = get_file_name(ib_bucket_name=bucket_name, ib_bucket_prefix=bucket_prefix)\n",
    "    logging.info(\"hr_data_csv_file_name= {}\".format(hr_data_csv_file_name))\n",
    "\n",
    "    # Create file name column in parsed data frame\n",
    "    parsed_df['source_file_name'] = hr_data_csv_file_name\n",
    "\n",
    "    # Get source file date and datetime into a list based on HR Data csv file name\n",
    "    hr_data_csv_source_file_date_and_datetime = get_source_file_date_time(file_name = hr_data_csv_file_name)\n",
    "\n",
    "    # Create source file date and datetime column(s) in parsed data frame\n",
    "    parsed_df['source_file_date'] = hr_data_csv_source_file_date_and_datetime[0]\n",
    "    parsed_df['source_file_datetime'] = hr_data_csv_source_file_date_and_datetime[1]\n",
    "\n",
    "    # Return pandas dataframe\n",
    "    return parsed_df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "CSV Parsing function with parameters"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "def parsing_launcher(ib_bucket_name, ib_bucket_name_prefix, dlp_bucket_name, dlp_bucket_name_prefix, hive_style_partitions, source_entities_list):\n",
    "    \"\"\"\n",
    "    Invokes the HR Data CSV Parser to ingest data to data lake\n",
    "    :param ib_bucket_name: AWS S3 bucket name in data platform accounts\n",
    "    :param ib_bucket_name_prefix: AWS S3 bucket name prefix\n",
    "    :param dlp_bucket_name: AWS S3 data lake processing bucket name in data platform accounts\n",
    "    :param dlp_bucket_name_prefix: AWS S3 datalake processing bucket name prefix\n",
    "    :param hive_style_partitions: S3 partitions to read the file\n",
    "    :param source_entities_list: List of entities to process\n",
    "    :return: None\n",
    "    \"\"\"\n",
    "\n",
    "    for entity in source_entities_list:\n",
    "\n",
    "        logging.info(\"entity_name= {}\".format(entity))\n",
    "\n",
    "        # Constructing source path based on source entity\n",
    "        s3_bucket = \"s3://\" + ib_bucket_name + \"/\"\n",
    "        s3_bucket_prefix = ib_bucket_name_prefix + \"/\" + entity + \"/\" + hive_style_partitions\n",
    "        source_path = s3_bucket + s3_bucket_prefix\n",
    "        logging.info(\"Full source_path {}\".format(source_path))\n",
    "\n",
    "        # Constructing target path based on source entity\n",
    "        target_path =  \"s3://\" + dlp_bucket_name + \"/\" + dlp_bucket_name_prefix + \"/\" + entity + \"/\" + hive_style_partitions\n",
    "        logging.info(\"Full target_path {}\".format(target_path))\n",
    "\n",
    "        # Read HR Data CSV data from S3 path\n",
    "        raw_df = read_hr_data_csv(file_path=source_path,\n",
    "                                                file_schema=read_schema[entity],\n",
    "                                                date_cols=date_columns_schema[entity])\n",
    "\n",
    "        # If raw data frame is not empty then parse it\n",
    "        if not raw_df.empty:\n",
    "            # Parse CSV data\n",
    "            parsed_hr_data_df = parse_hr_data_csv_df(df=raw_df,\n",
    "                                                     bucket_name=ib_bucket_name,\n",
    "                                                     bucket_prefix=s3_bucket_prefix,\n",
    "                                                     entity_name= entity)\n",
    "\n",
    "            try:\n",
    "                # Convert column data types using astype with write schema\n",
    "                hr_data_astype_dtypes = parsed_hr_data_df.astype(write_schema[entity], copy=True)\n",
    "            except:\n",
    "                # Convert column data types using astype with write schema and ignore errors\n",
    "                hr_data_astype_dtypes = parsed_hr_data_df.astype(write_schema[entity], copy=True, errors='ignore')\n",
    "\n",
    "            # Write result to S3\n",
    "            write_to_s3(df=hr_data_astype_dtypes,\n",
    "                        target_path=target_path)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Invoke launcher"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:root:entity_name= countries\n",
      "INFO:root:Full source_path s3://ay-dp-dev-test-bucket/test-hr-data/countries/yyyy=2021/mm=04/dd=22\n",
      "INFO:root:Full target_path s3://ay-dp-dev-test-bucket/test-hr-data-parquet/countries/yyyy=2021/mm=04/dd=22\n",
      "INFO:root:hr_data_csv_file_name= \n",
      "INFO:root:stored_files {'paths': ['s3://ay-dp-dev-test-bucket/test-hr-data-parquet/countries/yyyy=2021/mm=04/dd=22/59b1fa0b998e411e98e8ff6948f9c780.snappy.parquet'], 'partitions_values': {}}\n",
      "INFO:root:entity_name= departments\n",
      "INFO:root:Full source_path s3://ay-dp-dev-test-bucket/test-hr-data/departments/yyyy=2021/mm=04/dd=22\n",
      "INFO:root:Full target_path s3://ay-dp-dev-test-bucket/test-hr-data-parquet/departments/yyyy=2021/mm=04/dd=22\n",
      "INFO:root:hr_data_csv_file_name= \n",
      "INFO:root:stored_files {'paths': ['s3://ay-dp-dev-test-bucket/test-hr-data-parquet/departments/yyyy=2021/mm=04/dd=22/fcb5f951bd4f4077b37402b3f802d503.snappy.parquet'], 'partitions_values': {}}\n",
      "INFO:root:entity_name= employees\n",
      "INFO:root:Full source_path s3://ay-dp-dev-test-bucket/test-hr-data/employees/yyyy=2021/mm=04/dd=22\n",
      "INFO:root:Full target_path s3://ay-dp-dev-test-bucket/test-hr-data-parquet/employees/yyyy=2021/mm=04/dd=22\n",
      "INFO:root:hr_data_csv_file_name= \n",
      "INFO:root:stored_files {'paths': ['s3://ay-dp-dev-test-bucket/test-hr-data-parquet/employees/yyyy=2021/mm=04/dd=22/6484a1a54bd54651939a658a06f1c76e.snappy.parquet'], 'partitions_values': {}}\n",
      "INFO:root:entity_name= job_history\n",
      "INFO:root:Full source_path s3://ay-dp-dev-test-bucket/test-hr-data/job_history/yyyy=2021/mm=04/dd=22\n",
      "INFO:root:Full target_path s3://ay-dp-dev-test-bucket/test-hr-data-parquet/job_history/yyyy=2021/mm=04/dd=22\n",
      "INFO:root:hr_data_csv_file_name= \n",
      "INFO:root:stored_files {'paths': ['s3://ay-dp-dev-test-bucket/test-hr-data-parquet/job_history/yyyy=2021/mm=04/dd=22/d79def2d905342228e866a593361cb70.snappy.parquet'], 'partitions_values': {}}\n",
      "INFO:root:entity_name= jobs\n",
      "INFO:root:Full source_path s3://ay-dp-dev-test-bucket/test-hr-data/jobs/yyyy=2021/mm=04/dd=22\n",
      "INFO:root:Full target_path s3://ay-dp-dev-test-bucket/test-hr-data-parquet/jobs/yyyy=2021/mm=04/dd=22\n",
      "INFO:root:hr_data_csv_file_name= \n",
      "INFO:root:stored_files {'paths': ['s3://ay-dp-dev-test-bucket/test-hr-data-parquet/jobs/yyyy=2021/mm=04/dd=22/671b60c0ad834a579bd0ca04e258257b.snappy.parquet'], 'partitions_values': {}}\n",
      "INFO:root:entity_name= locations\n",
      "INFO:root:Full source_path s3://ay-dp-dev-test-bucket/test-hr-data/locations/yyyy=2021/mm=04/dd=22\n",
      "INFO:root:Full target_path s3://ay-dp-dev-test-bucket/test-hr-data-parquet/locations/yyyy=2021/mm=04/dd=22\n",
      "INFO:root:hr_data_csv_file_name= \n",
      "INFO:root:stored_files {'paths': ['s3://ay-dp-dev-test-bucket/test-hr-data-parquet/locations/yyyy=2021/mm=04/dd=22/f6a65a13ab30447a80a021da5c25156f.snappy.parquet'], 'partitions_values': {}}\n",
      "INFO:root:entity_name= regions\n",
      "INFO:root:Full source_path s3://ay-dp-dev-test-bucket/test-hr-data/regions/yyyy=2021/mm=04/dd=22\n",
      "INFO:root:Full target_path s3://ay-dp-dev-test-bucket/test-hr-data-parquet/regions/yyyy=2021/mm=04/dd=22\n",
      "INFO:root:hr_data_csv_file_name= \n",
      "INFO:root:stored_files {'paths': ['s3://ay-dp-dev-test-bucket/test-hr-data-parquet/regions/yyyy=2021/mm=04/dd=22/28ba548b8b454de7a6acd69a4d1d9b83.snappy.parquet'], 'partitions_values': {}}\n"
     ]
    }
   ],
   "source": [
    "parsing_launcher(ib_bucket_name=\"ay-dp-dev-test-bucket\",\n",
    "                 ib_bucket_name_prefix=\"test-hr-data\",\n",
    "                 dlp_bucket_name=\"ay-dp-dev-test-bucket\",\n",
    "                 dlp_bucket_name_prefix=\"test-hr-data-parquet\",\n",
    "                 hive_style_partitions=\"yyyy=2021/mm=04/dd=22\",\n",
    "                 source_entities_list=literal_eval(\"['countries', 'departments', 'employees', 'job_history', 'jobs', 'locations', 'regions']\"))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Read parquet data from target path"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "countries_df = wr.s3.read_parquet(path=\"s3://ay-dp-dev-test-bucket/test-hr-data-parquet/countries/yyyy=2021/mm=04/dd=22\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>country_id</th>\n",
       "      <th>country_name</th>\n",
       "      <th>region_id</th>\n",
       "      <th>source_file_name</th>\n",
       "      <th>source_file_date</th>\n",
       "      <th>source_file_datetime</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>IT</td>\n",
       "      <td>Italy</td>\n",
       "      <td>1</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>JP</td>\n",
       "      <td>Japan</td>\n",
       "      <td>3</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>US</td>\n",
       "      <td>United States of America</td>\n",
       "      <td>2</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>CA</td>\n",
       "      <td>Canada</td>\n",
       "      <td>2</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>CN</td>\n",
       "      <td>China</td>\n",
       "      <td>3</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>IN</td>\n",
       "      <td>India</td>\n",
       "      <td>3</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>AU</td>\n",
       "      <td>Australia</td>\n",
       "      <td>3</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>ZW</td>\n",
       "      <td>Zimbabwe</td>\n",
       "      <td>4</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>SG</td>\n",
       "      <td>Singapore</td>\n",
       "      <td>3</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>UK</td>\n",
       "      <td>United Kingdom</td>\n",
       "      <td>1</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>FR</td>\n",
       "      <td>France</td>\n",
       "      <td>1</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>DE</td>\n",
       "      <td>Germany</td>\n",
       "      <td>1</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>ZM</td>\n",
       "      <td>Zambia</td>\n",
       "      <td>4</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>EG</td>\n",
       "      <td>Egypt</td>\n",
       "      <td>4</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>BR</td>\n",
       "      <td>Brazil</td>\n",
       "      <td>2</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>CH</td>\n",
       "      <td>Switzerland</td>\n",
       "      <td>1</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>NL</td>\n",
       "      <td>Netherlands</td>\n",
       "      <td>1</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>MX</td>\n",
       "      <td>Mexico</td>\n",
       "      <td>2</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>KW</td>\n",
       "      <td>Kuwait</td>\n",
       "      <td>4</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>IL</td>\n",
       "      <td>Israel</td>\n",
       "      <td>4</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>DK</td>\n",
       "      <td>Denmark</td>\n",
       "      <td>1</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>21</th>\n",
       "      <td>HK</td>\n",
       "      <td>HongKong</td>\n",
       "      <td>3</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22</th>\n",
       "      <td>NG</td>\n",
       "      <td>Nigeria</td>\n",
       "      <td>4</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23</th>\n",
       "      <td>AR</td>\n",
       "      <td>Argentina</td>\n",
       "      <td>2</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24</th>\n",
       "      <td>BE</td>\n",
       "      <td>Belgium</td>\n",
       "      <td>1</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   country_id              country_name  region_id source_file_name  \\\n",
       "0          IT                     Italy          1                    \n",
       "1          JP                     Japan          3                    \n",
       "2          US  United States of America          2                    \n",
       "3          CA                    Canada          2                    \n",
       "4          CN                     China          3                    \n",
       "5          IN                     India          3                    \n",
       "6          AU                 Australia          3                    \n",
       "7          ZW                  Zimbabwe          4                    \n",
       "8          SG                 Singapore          3                    \n",
       "9          UK            United Kingdom          1                    \n",
       "10         FR                    France          1                    \n",
       "11         DE                   Germany          1                    \n",
       "12         ZM                    Zambia          4                    \n",
       "13         EG                     Egypt          4                    \n",
       "14         BR                    Brazil          2                    \n",
       "15         CH               Switzerland          1                    \n",
       "16         NL               Netherlands          1                    \n",
       "17         MX                    Mexico          2                    \n",
       "18         KW                    Kuwait          4                    \n",
       "19         IL                    Israel          4                    \n",
       "20         DK                   Denmark          1                    \n",
       "21         HK                  HongKong          3                    \n",
       "22         NG                   Nigeria          4                    \n",
       "23         AR                 Argentina          2                    \n",
       "24         BE                   Belgium          1                    \n",
       "\n",
       "   source_file_date source_file_datetime  \n",
       "0              None                 None  \n",
       "1              None                 None  \n",
       "2              None                 None  \n",
       "3              None                 None  \n",
       "4              None                 None  \n",
       "5              None                 None  \n",
       "6              None                 None  \n",
       "7              None                 None  \n",
       "8              None                 None  \n",
       "9              None                 None  \n",
       "10             None                 None  \n",
       "11             None                 None  \n",
       "12             None                 None  \n",
       "13             None                 None  \n",
       "14             None                 None  \n",
       "15             None                 None  \n",
       "16             None                 None  \n",
       "17             None                 None  \n",
       "18             None                 None  \n",
       "19             None                 None  \n",
       "20             None                 None  \n",
       "21             None                 None  \n",
       "22             None                 None  \n",
       "23             None                 None  \n",
       "24             None                 None  "
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "countries_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "departments_df = wr.s3.read_parquet(path=\"s3://ay-dp-dev-test-bucket/test-hr-data-parquet/departments/yyyy=2021/mm=04/dd=22\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>department_id</th>\n",
       "      <th>department_name</th>\n",
       "      <th>manager_id</th>\n",
       "      <th>location_id</th>\n",
       "      <th>source_file_name</th>\n",
       "      <th>source_file_date</th>\n",
       "      <th>source_file_datetime</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>10</td>\n",
       "      <td>Administration</td>\n",
       "      <td>200</td>\n",
       "      <td>1700</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>20</td>\n",
       "      <td>Marketing</td>\n",
       "      <td>201</td>\n",
       "      <td>1800</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>30</td>\n",
       "      <td>Purchasing</td>\n",
       "      <td>114</td>\n",
       "      <td>1700</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>40</td>\n",
       "      <td>Human Resources</td>\n",
       "      <td>203</td>\n",
       "      <td>2400</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>50</td>\n",
       "      <td>Shipping</td>\n",
       "      <td>121</td>\n",
       "      <td>1500</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>60</td>\n",
       "      <td>IT</td>\n",
       "      <td>103</td>\n",
       "      <td>1400</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>70</td>\n",
       "      <td>Public Relations</td>\n",
       "      <td>204</td>\n",
       "      <td>2700</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>80</td>\n",
       "      <td>Sales</td>\n",
       "      <td>145</td>\n",
       "      <td>2500</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>90</td>\n",
       "      <td>Executive</td>\n",
       "      <td>100</td>\n",
       "      <td>1700</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>100</td>\n",
       "      <td>Finance</td>\n",
       "      <td>108</td>\n",
       "      <td>1700</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>110</td>\n",
       "      <td>Accounting</td>\n",
       "      <td>205</td>\n",
       "      <td>1700</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>120</td>\n",
       "      <td>Treasury</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>1700</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>130</td>\n",
       "      <td>Corporate Tax</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>1700</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>140</td>\n",
       "      <td>Control And Credit</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>1700</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>150</td>\n",
       "      <td>Shareholder Services</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>1700</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>160</td>\n",
       "      <td>Benefits</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>1700</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>170</td>\n",
       "      <td>Manufacturing</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>1700</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>180</td>\n",
       "      <td>Construction</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>1700</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>190</td>\n",
       "      <td>Contracting</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>1700</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>200</td>\n",
       "      <td>Operations</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>1700</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>210</td>\n",
       "      <td>IT Support</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>1700</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>21</th>\n",
       "      <td>220</td>\n",
       "      <td>NOC</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>1700</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22</th>\n",
       "      <td>230</td>\n",
       "      <td>IT Helpdesk</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>1700</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23</th>\n",
       "      <td>240</td>\n",
       "      <td>Government Sales</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>1700</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24</th>\n",
       "      <td>250</td>\n",
       "      <td>Retail Sales</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>1700</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25</th>\n",
       "      <td>260</td>\n",
       "      <td>Recruiting</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>1700</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>26</th>\n",
       "      <td>270</td>\n",
       "      <td>Payroll</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>1700</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    department_id       department_name  manager_id  location_id  \\\n",
       "0              10        Administration         200         1700   \n",
       "1              20             Marketing         201         1800   \n",
       "2              30            Purchasing         114         1700   \n",
       "3              40       Human Resources         203         2400   \n",
       "4              50              Shipping         121         1500   \n",
       "5              60                    IT         103         1400   \n",
       "6              70      Public Relations         204         2700   \n",
       "7              80                 Sales         145         2500   \n",
       "8              90             Executive         100         1700   \n",
       "9             100               Finance         108         1700   \n",
       "10            110            Accounting         205         1700   \n",
       "11            120              Treasury        <NA>         1700   \n",
       "12            130         Corporate Tax        <NA>         1700   \n",
       "13            140    Control And Credit        <NA>         1700   \n",
       "14            150  Shareholder Services        <NA>         1700   \n",
       "15            160              Benefits        <NA>         1700   \n",
       "16            170         Manufacturing        <NA>         1700   \n",
       "17            180          Construction        <NA>         1700   \n",
       "18            190           Contracting        <NA>         1700   \n",
       "19            200            Operations        <NA>         1700   \n",
       "20            210            IT Support        <NA>         1700   \n",
       "21            220                   NOC        <NA>         1700   \n",
       "22            230           IT Helpdesk        <NA>         1700   \n",
       "23            240      Government Sales        <NA>         1700   \n",
       "24            250          Retail Sales        <NA>         1700   \n",
       "25            260            Recruiting        <NA>         1700   \n",
       "26            270               Payroll        <NA>         1700   \n",
       "\n",
       "   source_file_name source_file_date source_file_datetime  \n",
       "0                               None                 None  \n",
       "1                               None                 None  \n",
       "2                               None                 None  \n",
       "3                               None                 None  \n",
       "4                               None                 None  \n",
       "5                               None                 None  \n",
       "6                               None                 None  \n",
       "7                               None                 None  \n",
       "8                               None                 None  \n",
       "9                               None                 None  \n",
       "10                              None                 None  \n",
       "11                              None                 None  \n",
       "12                              None                 None  \n",
       "13                              None                 None  \n",
       "14                              None                 None  \n",
       "15                              None                 None  \n",
       "16                              None                 None  \n",
       "17                              None                 None  \n",
       "18                              None                 None  \n",
       "19                              None                 None  \n",
       "20                              None                 None  \n",
       "21                              None                 None  \n",
       "22                              None                 None  \n",
       "23                              None                 None  \n",
       "24                              None                 None  \n",
       "25                              None                 None  \n",
       "26                              None                 None  "
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "departments_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "employees_df = wr.s3.read_parquet(path=\"s3://ay-dp-dev-test-bucket/test-hr-data-parquet/employees/yyyy=2021/mm=04/dd=22\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>employee_id</th>\n",
       "      <th>first_name</th>\n",
       "      <th>last_name</th>\n",
       "      <th>email</th>\n",
       "      <th>phone_number</th>\n",
       "      <th>hire_date</th>\n",
       "      <th>job_id</th>\n",
       "      <th>salary</th>\n",
       "      <th>commission_pct</th>\n",
       "      <th>manager_id</th>\n",
       "      <th>department_id</th>\n",
       "      <th>custom_data</th>\n",
       "      <th>custom_data_address_line_1</th>\n",
       "      <th>custom_data_address_line_2</th>\n",
       "      <th>custom_data_zip</th>\n",
       "      <th>custom_data_date_of_birth</th>\n",
       "      <th>custom_data_gender</th>\n",
       "      <th>custom_data_marital_status</th>\n",
       "      <th>source_file_name</th>\n",
       "      <th>source_file_date</th>\n",
       "      <th>source_file_datetime</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>100</td>\n",
       "      <td>Steven</td>\n",
       "      <td>King</td>\n",
       "      <td>SKING</td>\n",
       "      <td>515.123.4567</td>\n",
       "      <td>1987-06-17</td>\n",
       "      <td>AD_PRES</td>\n",
       "      <td>24000.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>90</td>\n",
       "      <td>nan</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>101</td>\n",
       "      <td>Neena</td>\n",
       "      <td>Kochhar</td>\n",
       "      <td>NKOCHHAR</td>\n",
       "      <td>515.123.4568</td>\n",
       "      <td>1989-09-21</td>\n",
       "      <td>AD_VP</td>\n",
       "      <td>17000.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>100</td>\n",
       "      <td>90</td>\n",
       "      <td>nan</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>102</td>\n",
       "      <td>Lex</td>\n",
       "      <td>De Haan</td>\n",
       "      <td>LDEHAAN</td>\n",
       "      <td>515.123.4569</td>\n",
       "      <td>1993-01-13</td>\n",
       "      <td>AD_VP</td>\n",
       "      <td>17000.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>100</td>\n",
       "      <td>90</td>\n",
       "      <td>nan</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>103</td>\n",
       "      <td>Alexander</td>\n",
       "      <td>Hunold</td>\n",
       "      <td>AHUNOLD</td>\n",
       "      <td>590.423.4567</td>\n",
       "      <td>1990-01-03</td>\n",
       "      <td>IT_PROG</td>\n",
       "      <td>9000.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>102</td>\n",
       "      <td>60</td>\n",
       "      <td>{\"address_line_1\":\"6649 N Blue Gum St\",\"addres...</td>\n",
       "      <td>6649 N Blue Gum St</td>\n",
       "      <td>C 22</td>\n",
       "      <td>70116</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>104</td>\n",
       "      <td>Bruce</td>\n",
       "      <td>Ernst</td>\n",
       "      <td>BERNST</td>\n",
       "      <td>590.423.4568</td>\n",
       "      <td>1991-05-21</td>\n",
       "      <td>IT_PROG</td>\n",
       "      <td>6000.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>103</td>\n",
       "      <td>60</td>\n",
       "      <td>nan</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>102</th>\n",
       "      <td>202</td>\n",
       "      <td>Pat</td>\n",
       "      <td>Fay</td>\n",
       "      <td>PFAY</td>\n",
       "      <td>603.123.6666</td>\n",
       "      <td>1997-08-17</td>\n",
       "      <td>MK_REP</td>\n",
       "      <td>6000.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>201</td>\n",
       "      <td>20</td>\n",
       "      <td>nan</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>103</th>\n",
       "      <td>203</td>\n",
       "      <td>Susan</td>\n",
       "      <td>Mavris</td>\n",
       "      <td>SMAVRIS</td>\n",
       "      <td>515.123.7777</td>\n",
       "      <td>1994-06-07</td>\n",
       "      <td>HR_REP</td>\n",
       "      <td>6500.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>101</td>\n",
       "      <td>40</td>\n",
       "      <td>{\"address_line_1\":\"2371 Jerrold Ave\",\"address_...</td>\n",
       "      <td>2371 Jerrold Ave</td>\n",
       "      <td>Block D</td>\n",
       "      <td>19443</td>\n",
       "      <td>1954-06-07</td>\n",
       "      <td>F</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>104</th>\n",
       "      <td>204</td>\n",
       "      <td>Hermann</td>\n",
       "      <td>Baer</td>\n",
       "      <td>HBAER</td>\n",
       "      <td>515.123.8888</td>\n",
       "      <td>1994-06-07</td>\n",
       "      <td>PR_REP</td>\n",
       "      <td>10000.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>101</td>\n",
       "      <td>70</td>\n",
       "      <td>nan</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>105</th>\n",
       "      <td>205</td>\n",
       "      <td>Shelley</td>\n",
       "      <td>Higgins</td>\n",
       "      <td>SHIGGINS</td>\n",
       "      <td>515.123.8080</td>\n",
       "      <td>1994-06-07</td>\n",
       "      <td>AC_MGR</td>\n",
       "      <td>12000.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>101</td>\n",
       "      <td>110</td>\n",
       "      <td>nan</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>106</th>\n",
       "      <td>206</td>\n",
       "      <td>William</td>\n",
       "      <td>Gietz</td>\n",
       "      <td>WGIETZ</td>\n",
       "      <td>515.123.8181</td>\n",
       "      <td>1994-06-07</td>\n",
       "      <td>AC_ACCOUNT</td>\n",
       "      <td>8300.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>205</td>\n",
       "      <td>110</td>\n",
       "      <td>{\"address_line_1\":\"37275 St Rt 17m M\",\"address...</td>\n",
       "      <td>37275 St Rt 17m M</td>\n",
       "      <td>Block C</td>\n",
       "      <td>11953</td>\n",
       "      <td>1964-01-07</td>\n",
       "      <td>M</td>\n",
       "      <td>Married</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>107 rows × 21 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "     employee_id first_name last_name     email  phone_number  hire_date  \\\n",
       "0            100     Steven      King     SKING  515.123.4567 1987-06-17   \n",
       "1            101      Neena   Kochhar  NKOCHHAR  515.123.4568 1989-09-21   \n",
       "2            102        Lex   De Haan   LDEHAAN  515.123.4569 1993-01-13   \n",
       "3            103  Alexander    Hunold   AHUNOLD  590.423.4567 1990-01-03   \n",
       "4            104      Bruce     Ernst    BERNST  590.423.4568 1991-05-21   \n",
       "..           ...        ...       ...       ...           ...        ...   \n",
       "102          202        Pat       Fay      PFAY  603.123.6666 1997-08-17   \n",
       "103          203      Susan    Mavris   SMAVRIS  515.123.7777 1994-06-07   \n",
       "104          204    Hermann      Baer     HBAER  515.123.8888 1994-06-07   \n",
       "105          205    Shelley   Higgins  SHIGGINS  515.123.8080 1994-06-07   \n",
       "106          206    William     Gietz    WGIETZ  515.123.8181 1994-06-07   \n",
       "\n",
       "         job_id   salary  commission_pct  manager_id  department_id  \\\n",
       "0       AD_PRES  24000.0             NaN        <NA>             90   \n",
       "1         AD_VP  17000.0             NaN         100             90   \n",
       "2         AD_VP  17000.0             NaN         100             90   \n",
       "3       IT_PROG   9000.0             NaN         102             60   \n",
       "4       IT_PROG   6000.0             NaN         103             60   \n",
       "..          ...      ...             ...         ...            ...   \n",
       "102      MK_REP   6000.0             NaN         201             20   \n",
       "103      HR_REP   6500.0             NaN         101             40   \n",
       "104      PR_REP  10000.0             NaN         101             70   \n",
       "105      AC_MGR  12000.0             NaN         101            110   \n",
       "106  AC_ACCOUNT   8300.0             NaN         205            110   \n",
       "\n",
       "                                           custom_data  \\\n",
       "0                                                  nan   \n",
       "1                                                  nan   \n",
       "2                                                  nan   \n",
       "3    {\"address_line_1\":\"6649 N Blue Gum St\",\"addres...   \n",
       "4                                                  nan   \n",
       "..                                                 ...   \n",
       "102                                                nan   \n",
       "103  {\"address_line_1\":\"2371 Jerrold Ave\",\"address_...   \n",
       "104                                                nan   \n",
       "105                                                nan   \n",
       "106  {\"address_line_1\":\"37275 St Rt 17m M\",\"address...   \n",
       "\n",
       "    custom_data_address_line_1 custom_data_address_line_2 custom_data_zip  \\\n",
       "0                         <NA>                       <NA>            <NA>   \n",
       "1                         <NA>                       <NA>            <NA>   \n",
       "2                         <NA>                       <NA>            <NA>   \n",
       "3           6649 N Blue Gum St                       C 22           70116   \n",
       "4                         <NA>                       <NA>            <NA>   \n",
       "..                         ...                        ...             ...   \n",
       "102                       <NA>                       <NA>            <NA>   \n",
       "103           2371 Jerrold Ave                    Block D           19443   \n",
       "104                       <NA>                       <NA>            <NA>   \n",
       "105                       <NA>                       <NA>            <NA>   \n",
       "106          37275 St Rt 17m M                    Block C           11953   \n",
       "\n",
       "    custom_data_date_of_birth custom_data_gender custom_data_marital_status  \\\n",
       "0                        <NA>               <NA>                       <NA>   \n",
       "1                        <NA>               <NA>                       <NA>   \n",
       "2                        <NA>               <NA>                       <NA>   \n",
       "3                        <NA>               <NA>                       <NA>   \n",
       "4                        <NA>               <NA>                       <NA>   \n",
       "..                        ...                ...                        ...   \n",
       "102                      <NA>               <NA>                       <NA>   \n",
       "103                1954-06-07                  F                       <NA>   \n",
       "104                      <NA>               <NA>                       <NA>   \n",
       "105                      <NA>               <NA>                       <NA>   \n",
       "106                1964-01-07                  M                    Married   \n",
       "\n",
       "    source_file_name source_file_date source_file_datetime  \n",
       "0                                None                 None  \n",
       "1                                None                 None  \n",
       "2                                None                 None  \n",
       "3                                None                 None  \n",
       "4                                None                 None  \n",
       "..               ...              ...                  ...  \n",
       "102                              None                 None  \n",
       "103                              None                 None  \n",
       "104                              None                 None  \n",
       "105                              None                 None  \n",
       "106                              None                 None  \n",
       "\n",
       "[107 rows x 21 columns]"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "employees_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "job_history_df = wr.s3.read_parquet(path=\"s3://ay-dp-dev-test-bucket/test-hr-data-parquet/job_history/yyyy=2021/mm=04/dd=22\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>employee_id</th>\n",
       "      <th>start_date</th>\n",
       "      <th>end_date</th>\n",
       "      <th>job_id</th>\n",
       "      <th>department_id</th>\n",
       "      <th>source_file_name</th>\n",
       "      <th>source_file_date</th>\n",
       "      <th>source_file_datetime</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>102</td>\n",
       "      <td>1993-01-13</td>\n",
       "      <td>1998-07-24</td>\n",
       "      <td>IT_PROG</td>\n",
       "      <td>60</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>101</td>\n",
       "      <td>1989-09-21</td>\n",
       "      <td>1993-10-27</td>\n",
       "      <td>AC_ACCOUNT</td>\n",
       "      <td>110</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>101</td>\n",
       "      <td>1993-10-28</td>\n",
       "      <td>1997-03-15</td>\n",
       "      <td>AC_MGR</td>\n",
       "      <td>110</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>201</td>\n",
       "      <td>1996-02-17</td>\n",
       "      <td>1999-12-19</td>\n",
       "      <td>MK_REP</td>\n",
       "      <td>20</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>114</td>\n",
       "      <td>1998-03-24</td>\n",
       "      <td>1999-12-31</td>\n",
       "      <td>ST_CLERK</td>\n",
       "      <td>50</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>122</td>\n",
       "      <td>1999-01-01</td>\n",
       "      <td>1999-12-31</td>\n",
       "      <td>ST_CLERK</td>\n",
       "      <td>50</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>200</td>\n",
       "      <td>1987-09-17</td>\n",
       "      <td>1993-06-17</td>\n",
       "      <td>AD_ASST</td>\n",
       "      <td>90</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>176</td>\n",
       "      <td>1998-03-24</td>\n",
       "      <td>1998-12-31</td>\n",
       "      <td>SA_REP</td>\n",
       "      <td>80</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>176</td>\n",
       "      <td>1999-01-01</td>\n",
       "      <td>1999-12-31</td>\n",
       "      <td>SA_MAN</td>\n",
       "      <td>80</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>200</td>\n",
       "      <td>1994-07-01</td>\n",
       "      <td>1998-12-31</td>\n",
       "      <td>AC_ACCOUNT</td>\n",
       "      <td>90</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   employee_id start_date   end_date      job_id  department_id  \\\n",
       "0          102 1993-01-13 1998-07-24     IT_PROG             60   \n",
       "1          101 1989-09-21 1993-10-27  AC_ACCOUNT            110   \n",
       "2          101 1993-10-28 1997-03-15      AC_MGR            110   \n",
       "3          201 1996-02-17 1999-12-19      MK_REP             20   \n",
       "4          114 1998-03-24 1999-12-31    ST_CLERK             50   \n",
       "5          122 1999-01-01 1999-12-31    ST_CLERK             50   \n",
       "6          200 1987-09-17 1993-06-17     AD_ASST             90   \n",
       "7          176 1998-03-24 1998-12-31      SA_REP             80   \n",
       "8          176 1999-01-01 1999-12-31      SA_MAN             80   \n",
       "9          200 1994-07-01 1998-12-31  AC_ACCOUNT             90   \n",
       "\n",
       "  source_file_name source_file_date source_file_datetime  \n",
       "0                              None                 None  \n",
       "1                              None                 None  \n",
       "2                              None                 None  \n",
       "3                              None                 None  \n",
       "4                              None                 None  \n",
       "5                              None                 None  \n",
       "6                              None                 None  \n",
       "7                              None                 None  \n",
       "8                              None                 None  \n",
       "9                              None                 None  "
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "job_history_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "jobs_df = wr.s3.read_parquet(path=\"s3://ay-dp-dev-test-bucket/test-hr-data-parquet/jobs/yyyy=2021/mm=04/dd=22\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>job_id</th>\n",
       "      <th>job_title</th>\n",
       "      <th>min_salary</th>\n",
       "      <th>max_salary</th>\n",
       "      <th>source_file_name</th>\n",
       "      <th>source_file_date</th>\n",
       "      <th>source_file_datetime</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>AD_PRES</td>\n",
       "      <td>President</td>\n",
       "      <td>20000</td>\n",
       "      <td>40000</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>AD_VP</td>\n",
       "      <td>Administration Vice President</td>\n",
       "      <td>15000</td>\n",
       "      <td>30000</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>AD_ASST</td>\n",
       "      <td>Administration Assistant</td>\n",
       "      <td>3000</td>\n",
       "      <td>6000</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>FI_MGR</td>\n",
       "      <td>Finance Manager</td>\n",
       "      <td>8200</td>\n",
       "      <td>16000</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>FI_ACCOUNT</td>\n",
       "      <td>Accountant</td>\n",
       "      <td>4200</td>\n",
       "      <td>9000</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>AC_MGR</td>\n",
       "      <td>Accounting Manager</td>\n",
       "      <td>8200</td>\n",
       "      <td>16000</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>AC_ACCOUNT</td>\n",
       "      <td>Public Accountant</td>\n",
       "      <td>4200</td>\n",
       "      <td>9000</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>SA_MAN</td>\n",
       "      <td>Sales Manager</td>\n",
       "      <td>10000</td>\n",
       "      <td>20000</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>SA_REP</td>\n",
       "      <td>Sales Representative</td>\n",
       "      <td>6000</td>\n",
       "      <td>12000</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>PU_MAN</td>\n",
       "      <td>Purchasing Manager</td>\n",
       "      <td>8000</td>\n",
       "      <td>15000</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>PU_CLERK</td>\n",
       "      <td>Purchasing Clerk</td>\n",
       "      <td>2500</td>\n",
       "      <td>5500</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>ST_MAN</td>\n",
       "      <td>Stock Manager</td>\n",
       "      <td>5500</td>\n",
       "      <td>8500</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>ST_CLERK</td>\n",
       "      <td>Stock Clerk</td>\n",
       "      <td>2000</td>\n",
       "      <td>5000</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>SH_CLERK</td>\n",
       "      <td>Shipping Clerk</td>\n",
       "      <td>2500</td>\n",
       "      <td>5500</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>IT_PROG</td>\n",
       "      <td>Programmer</td>\n",
       "      <td>4000</td>\n",
       "      <td>10000</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>MK_MAN</td>\n",
       "      <td>Marketing Manager</td>\n",
       "      <td>9000</td>\n",
       "      <td>15000</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>MK_REP</td>\n",
       "      <td>Marketing Representative</td>\n",
       "      <td>4000</td>\n",
       "      <td>9000</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>HR_REP</td>\n",
       "      <td>Human Resources Representative</td>\n",
       "      <td>4000</td>\n",
       "      <td>9000</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>PR_REP</td>\n",
       "      <td>Public Relations Representative</td>\n",
       "      <td>4500</td>\n",
       "      <td>10500</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        job_id                        job_title  min_salary  max_salary  \\\n",
       "0      AD_PRES                        President       20000       40000   \n",
       "1        AD_VP    Administration Vice President       15000       30000   \n",
       "2      AD_ASST         Administration Assistant        3000        6000   \n",
       "3       FI_MGR                  Finance Manager        8200       16000   \n",
       "4   FI_ACCOUNT                       Accountant        4200        9000   \n",
       "5       AC_MGR               Accounting Manager        8200       16000   \n",
       "6   AC_ACCOUNT                Public Accountant        4200        9000   \n",
       "7       SA_MAN                    Sales Manager       10000       20000   \n",
       "8       SA_REP             Sales Representative        6000       12000   \n",
       "9       PU_MAN               Purchasing Manager        8000       15000   \n",
       "10    PU_CLERK                 Purchasing Clerk        2500        5500   \n",
       "11      ST_MAN                    Stock Manager        5500        8500   \n",
       "12    ST_CLERK                      Stock Clerk        2000        5000   \n",
       "13    SH_CLERK                   Shipping Clerk        2500        5500   \n",
       "14     IT_PROG                       Programmer        4000       10000   \n",
       "15      MK_MAN                Marketing Manager        9000       15000   \n",
       "16      MK_REP         Marketing Representative        4000        9000   \n",
       "17      HR_REP   Human Resources Representative        4000        9000   \n",
       "18      PR_REP  Public Relations Representative        4500       10500   \n",
       "\n",
       "   source_file_name source_file_date source_file_datetime  \n",
       "0                               None                 None  \n",
       "1                               None                 None  \n",
       "2                               None                 None  \n",
       "3                               None                 None  \n",
       "4                               None                 None  \n",
       "5                               None                 None  \n",
       "6                               None                 None  \n",
       "7                               None                 None  \n",
       "8                               None                 None  \n",
       "9                               None                 None  \n",
       "10                              None                 None  \n",
       "11                              None                 None  \n",
       "12                              None                 None  \n",
       "13                              None                 None  \n",
       "14                              None                 None  \n",
       "15                              None                 None  \n",
       "16                              None                 None  \n",
       "17                              None                 None  \n",
       "18                              None                 None  "
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "jobs_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "locations_df = wr.s3.read_parquet(path=\"s3://ay-dp-dev-test-bucket/test-hr-data-parquet/locations/yyyy=2021/mm=04/dd=22\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>location_id</th>\n",
       "      <th>street_address</th>\n",
       "      <th>postal_code</th>\n",
       "      <th>city</th>\n",
       "      <th>state_province</th>\n",
       "      <th>country_id</th>\n",
       "      <th>source_file_name</th>\n",
       "      <th>source_file_date</th>\n",
       "      <th>source_file_datetime</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1000</td>\n",
       "      <td>1297 Via Cola di Rie</td>\n",
       "      <td>989</td>\n",
       "      <td>Roma</td>\n",
       "      <td>nan</td>\n",
       "      <td>IT</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1100</td>\n",
       "      <td>93091 Calle della Testa</td>\n",
       "      <td>10934</td>\n",
       "      <td>Venice</td>\n",
       "      <td>nan</td>\n",
       "      <td>IT</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1200</td>\n",
       "      <td>2017 Shinjuku-ku</td>\n",
       "      <td>1689</td>\n",
       "      <td>Tokyo</td>\n",
       "      <td>Tokyo Prefecture</td>\n",
       "      <td>JP</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1300</td>\n",
       "      <td>9450 Kamiya-cho</td>\n",
       "      <td>6823</td>\n",
       "      <td>Hiroshima</td>\n",
       "      <td>nan</td>\n",
       "      <td>JP</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1400</td>\n",
       "      <td>2014 Jabberwocky Rd</td>\n",
       "      <td>26192</td>\n",
       "      <td>Southlake</td>\n",
       "      <td>Texas</td>\n",
       "      <td>US</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>1500</td>\n",
       "      <td>2011 Interiors Blvd</td>\n",
       "      <td>99236</td>\n",
       "      <td>South San Francisco</td>\n",
       "      <td>California</td>\n",
       "      <td>US</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>1600</td>\n",
       "      <td>2007 Zagora St</td>\n",
       "      <td>50090</td>\n",
       "      <td>South Brunswick</td>\n",
       "      <td>New Jersey</td>\n",
       "      <td>US</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>1700</td>\n",
       "      <td>2004 Charade Rd</td>\n",
       "      <td>98199</td>\n",
       "      <td>Seattle</td>\n",
       "      <td>Washington</td>\n",
       "      <td>US</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>1800</td>\n",
       "      <td>147 Spadina Ave</td>\n",
       "      <td>M5V 2L7</td>\n",
       "      <td>Toronto</td>\n",
       "      <td>Ontario</td>\n",
       "      <td>CA</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>1900</td>\n",
       "      <td>6092 Boxwood St</td>\n",
       "      <td>YSW 9T2</td>\n",
       "      <td>Whitehorse</td>\n",
       "      <td>Yukon</td>\n",
       "      <td>CA</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>2000</td>\n",
       "      <td>40-5-12 Laogianggen</td>\n",
       "      <td>190518</td>\n",
       "      <td>Beijing</td>\n",
       "      <td>nan</td>\n",
       "      <td>CN</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>2100</td>\n",
       "      <td>1298 Vileparle (E)</td>\n",
       "      <td>490231</td>\n",
       "      <td>Bombay</td>\n",
       "      <td>Maharashtra</td>\n",
       "      <td>IN</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>2200</td>\n",
       "      <td>12-98 Victoria Street</td>\n",
       "      <td>2901</td>\n",
       "      <td>Sydney</td>\n",
       "      <td>New South Wales</td>\n",
       "      <td>AU</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>2300</td>\n",
       "      <td>198 Clementi North</td>\n",
       "      <td>540198</td>\n",
       "      <td>Singapore</td>\n",
       "      <td>nan</td>\n",
       "      <td>SG</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>2400</td>\n",
       "      <td>8204 Arthur St</td>\n",
       "      <td>nan</td>\n",
       "      <td>London</td>\n",
       "      <td>nan</td>\n",
       "      <td>UK</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>2500</td>\n",
       "      <td>Magdalen Centre, The Oxford Science Park</td>\n",
       "      <td>OX9 9ZB</td>\n",
       "      <td>Oxford</td>\n",
       "      <td>Oxford</td>\n",
       "      <td>UK</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>2600</td>\n",
       "      <td>9702 Chester Road</td>\n",
       "      <td>9629850293</td>\n",
       "      <td>Stretford</td>\n",
       "      <td>Manchester</td>\n",
       "      <td>UK</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>2700</td>\n",
       "      <td>Schwanthalerstr. 7031</td>\n",
       "      <td>80925</td>\n",
       "      <td>Munich</td>\n",
       "      <td>Bavaria</td>\n",
       "      <td>DE</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>2800</td>\n",
       "      <td>Rua Frei Caneca 1360</td>\n",
       "      <td>01307-002</td>\n",
       "      <td>Sao Paulo</td>\n",
       "      <td>Sao Paulo</td>\n",
       "      <td>BR</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>2900</td>\n",
       "      <td>20 Rue des Corps-Saints</td>\n",
       "      <td>1730</td>\n",
       "      <td>Geneva</td>\n",
       "      <td>Geneve</td>\n",
       "      <td>CH</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>3000</td>\n",
       "      <td>Murtenstrasse 921</td>\n",
       "      <td>3095</td>\n",
       "      <td>Bern</td>\n",
       "      <td>BE</td>\n",
       "      <td>CH</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>21</th>\n",
       "      <td>3100</td>\n",
       "      <td>Pieter Breughelstraat 837</td>\n",
       "      <td>3029SK</td>\n",
       "      <td>Utrecht</td>\n",
       "      <td>Utrecht</td>\n",
       "      <td>NL</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22</th>\n",
       "      <td>3200</td>\n",
       "      <td>Mariano Escobedo 9991</td>\n",
       "      <td>11932</td>\n",
       "      <td>Mexico City</td>\n",
       "      <td>Distrito Federal,</td>\n",
       "      <td>MX</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    location_id                            street_address postal_code  \\\n",
       "0          1000                      1297 Via Cola di Rie         989   \n",
       "1          1100                   93091 Calle della Testa       10934   \n",
       "2          1200                          2017 Shinjuku-ku        1689   \n",
       "3          1300                           9450 Kamiya-cho        6823   \n",
       "4          1400                       2014 Jabberwocky Rd       26192   \n",
       "5          1500                       2011 Interiors Blvd       99236   \n",
       "6          1600                            2007 Zagora St       50090   \n",
       "7          1700                           2004 Charade Rd       98199   \n",
       "8          1800                           147 Spadina Ave     M5V 2L7   \n",
       "9          1900                           6092 Boxwood St     YSW 9T2   \n",
       "10         2000                       40-5-12 Laogianggen      190518   \n",
       "11         2100                        1298 Vileparle (E)      490231   \n",
       "12         2200                     12-98 Victoria Street        2901   \n",
       "13         2300                        198 Clementi North      540198   \n",
       "14         2400                            8204 Arthur St         nan   \n",
       "15         2500  Magdalen Centre, The Oxford Science Park     OX9 9ZB   \n",
       "16         2600                         9702 Chester Road  9629850293   \n",
       "17         2700                     Schwanthalerstr. 7031       80925   \n",
       "18         2800                     Rua Frei Caneca 1360    01307-002   \n",
       "19         2900                   20 Rue des Corps-Saints        1730   \n",
       "20         3000                         Murtenstrasse 921        3095   \n",
       "21         3100                 Pieter Breughelstraat 837      3029SK   \n",
       "22         3200                     Mariano Escobedo 9991       11932   \n",
       "\n",
       "                   city     state_province country_id source_file_name  \\\n",
       "0                  Roma                nan         IT                    \n",
       "1                Venice                nan         IT                    \n",
       "2                 Tokyo   Tokyo Prefecture         JP                    \n",
       "3             Hiroshima                nan         JP                    \n",
       "4             Southlake              Texas         US                    \n",
       "5   South San Francisco         California         US                    \n",
       "6       South Brunswick         New Jersey         US                    \n",
       "7               Seattle         Washington         US                    \n",
       "8               Toronto            Ontario         CA                    \n",
       "9            Whitehorse              Yukon         CA                    \n",
       "10              Beijing                nan         CN                    \n",
       "11               Bombay        Maharashtra         IN                    \n",
       "12               Sydney    New South Wales         AU                    \n",
       "13            Singapore                nan         SG                    \n",
       "14               London                nan         UK                    \n",
       "15               Oxford             Oxford         UK                    \n",
       "16            Stretford         Manchester         UK                    \n",
       "17               Munich            Bavaria         DE                    \n",
       "18            Sao Paulo          Sao Paulo         BR                    \n",
       "19               Geneva             Geneve         CH                    \n",
       "20                 Bern                 BE         CH                    \n",
       "21              Utrecht            Utrecht         NL                    \n",
       "22          Mexico City  Distrito Federal,         MX                    \n",
       "\n",
       "   source_file_date source_file_datetime  \n",
       "0              None                 None  \n",
       "1              None                 None  \n",
       "2              None                 None  \n",
       "3              None                 None  \n",
       "4              None                 None  \n",
       "5              None                 None  \n",
       "6              None                 None  \n",
       "7              None                 None  \n",
       "8              None                 None  \n",
       "9              None                 None  \n",
       "10             None                 None  \n",
       "11             None                 None  \n",
       "12             None                 None  \n",
       "13             None                 None  \n",
       "14             None                 None  \n",
       "15             None                 None  \n",
       "16             None                 None  \n",
       "17             None                 None  \n",
       "18             None                 None  \n",
       "19             None                 None  \n",
       "20             None                 None  \n",
       "21             None                 None  \n",
       "22             None                 None  "
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "locations_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "regions_df = wr.s3.read_parquet(path=\"s3://ay-dp-dev-test-bucket/test-hr-data-parquet/regions/yyyy=2021/mm=04/dd=22\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>region_id</th>\n",
       "      <th>region_name</th>\n",
       "      <th>source_file_name</th>\n",
       "      <th>source_file_date</th>\n",
       "      <th>source_file_datetime</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>Europe</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>Americas</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>Asia</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>Middle East and Africa</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   region_id             region_name source_file_name source_file_date  \\\n",
       "0          1                  Europe                              None   \n",
       "1          2                Americas                              None   \n",
       "2          3                    Asia                              None   \n",
       "3          4  Middle East and Africa                              None   \n",
       "\n",
       "  source_file_datetime  \n",
       "0                 None  \n",
       "1                 None  \n",
       "2                 None  \n",
       "3                 None  "
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "regions_df"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
 }