Created
July 29, 2021 14:04
-
-
Save vvgsrk/145f6bf64aa966cd7240b1cbcd399c5f to your computer and use it in GitHub Desktop.
HR CSV Data Parser Using Python and Pandas
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Import all standard library modules" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import logging\n", | |
"import os\n", | |
"import pandas as pd\n", | |
"import awswrangler as wr\n", | |
"import boto3\n", | |
"import sys\n", | |
"import json\n", | |
"import numpy as np\n", | |
"from ast import literal_eval\n", | |
"from datetime import datetime" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Declare Event logging" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"logger = logging.getLogger(__name__)\n", | |
"logging.getLogger().setLevel(logging.INFO)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Set jupyter notebook row and column display" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Set ipython's max row display\n", | |
"pd.set_option('display.max_row', 100)\n", | |
"\n", | |
"# Set iPython's max column width to 50\n", | |
"pd.set_option('display.max_columns', 70)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Get OS environment credentials for AWS " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"os.environ['AWS_PROFILE'] = \"fdp-dev\"" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Amazon Simple Storage Service (S3)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"s3 = boto3.client('s3')" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"CSV Schema for each enitiy" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"read_schema = {\n", | |
"\n", | |
" 'countries' : {\n", | |
" 'country_id' : 'str',\n", | |
" 'country_name' : 'str',\n", | |
" 'region_id' : 'Int64'\n", | |
" },\n", | |
" \n", | |
" 'departments' : {\n", | |
" 'department_id' : 'Int64',\n", | |
" 'department_name' : 'str',\n", | |
" 'manager_id' : 'Int64',\n", | |
" 'location_id' : 'Int64'\t\n", | |
" },\n", | |
"\n", | |
" 'employees' : {\n", | |
" 'employee_id' : 'Int64',\n", | |
" 'first_name' : 'str',\n", | |
" 'last_name' : 'str',\n", | |
" 'email' : 'str',\n", | |
" 'phone_number' : 'str',\n", | |
" 'hire_date' : 'str',\n", | |
" 'job_id' : 'str',\n", | |
" 'salary' : 'float64',\n", | |
" 'commission_pct' : 'float64',\n", | |
" 'manager_id' : 'Int64',\n", | |
" 'department_id' : 'Int64',\n", | |
" 'custom_data' : 'str'\n", | |
" },\n", | |
" \n", | |
" 'job_history' : {\n", | |
" 'employee_id' : 'Int64',\n", | |
" 'start_date' : 'str',\n", | |
" 'end_date' : 'str',\n", | |
" 'job_id' : 'str',\n", | |
" 'department_id' : 'Int64'\t\n", | |
" },\n", | |
" \n", | |
" 'jobs' : {\n", | |
" 'job_id' : 'str',\n", | |
" 'job_title' : 'str',\n", | |
" 'min_salary' : 'Int64',\n", | |
" 'max_salary' : 'Int64'\n", | |
" },\n", | |
" \n", | |
" 'locations' : {\n", | |
" 'location_id' : 'Int64',\n", | |
" 'street_address' : 'str',\n", | |
" 'postal_code' : 'str',\n", | |
" 'city' : 'str',\n", | |
" 'state_province' : 'str',\n", | |
" 'country_id' : 'str'\n", | |
" },\n", | |
" \n", | |
" 'regions' : {\n", | |
" 'region_id' : 'Int64', \n", | |
" 'region_name' : 'str'\n", | |
" }\n", | |
"}\n", | |
"\n", | |
"date_columns_schema = {\n", | |
"\n", | |
" 'employees' : ['hire_date'],\t\n", | |
" 'job_history' : ['start_date', 'end_date'],\n", | |
" 'countries' : [],\n", | |
" 'departments' : [],\n", | |
" 'jobs' : [], \n", | |
" 'locations' : [], \n", | |
" 'regions' : []\n", | |
"}\n", | |
"\n", | |
"json_columns_schema = {\n", | |
" 'employees' : ['custom_data']\n", | |
"}\n", | |
"\n", | |
"custom_data_schema = {\n", | |
"\n", | |
" 'employees' : {\n", | |
" 'custom_data_address_line_1': [],\n", | |
" 'custom_data_address_line_2': [],\n", | |
" 'custom_data_zip': [],\n", | |
" 'custom_data_date_of_birth': [],\n", | |
" 'custom_data_gender': [],\n", | |
" 'custom_data_marital_status': [] \n", | |
" }\n", | |
"}\n", | |
"\n", | |
"write_schema = {\n", | |
" 'countries' : {\n", | |
" 'country_id' : 'str',\n", | |
" 'country_name' : 'str',\n", | |
" 'region_id' : 'Int64'\n", | |
" },\n", | |
" \n", | |
" 'departments' : {\n", | |
" 'department_id' : 'Int64',\n", | |
" 'department_name' : 'str',\n", | |
" 'manager_id' : 'Int64',\n", | |
" 'location_id' : 'Int64'\t\n", | |
" },\n", | |
" \n", | |
" 'employees' : {\n", | |
" 'employee_id' : 'Int64',\n", | |
" 'first_name' : 'str',\n", | |
" 'last_name' : 'str',\n", | |
" 'email' : 'str',\n", | |
" 'phone_number' : 'str',\n", | |
" 'hire_date' : 'datetime64[ns]',\n", | |
" 'job_id' : 'str',\n", | |
" 'salary' : 'float64',\n", | |
" 'commission_pct' : 'float64',\n", | |
" 'manager_id' : 'Int64',\n", | |
" 'department_id' : 'Int64',\n", | |
" 'custom_data' : 'str'\n", | |
" },\n", | |
" \n", | |
" 'job_history' : {\n", | |
" 'employee_id' : 'Int64',\n", | |
" 'start_date' : 'datetime64[ns]',\n", | |
" 'end_date' : 'datetime64[ns]',\n", | |
" 'job_id' : 'str',\n", | |
" 'department_id' : 'Int64'\t\n", | |
" },\n", | |
" \n", | |
" 'jobs' : {\n", | |
" 'job_id' : 'str',\n", | |
" 'job_title' : 'str',\n", | |
" 'min_salary' : 'Int64',\n", | |
" 'max_salary' : 'Int64'\n", | |
" },\n", | |
" \n", | |
" 'locations' : {\n", | |
" 'location_id' : 'Int64',\n", | |
" 'street_address' : 'str',\n", | |
" 'postal_code' : 'str',\n", | |
" 'city' : 'str',\n", | |
" 'state_province' : 'str',\n", | |
" 'country_id' : 'str'\n", | |
" },\n", | |
" \n", | |
" 'regions' : {\n", | |
" 'region_id' : 'Int64', \n", | |
" 'region_name' : 'str'\n", | |
" }\t\n", | |
"}" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Helper functions" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def write_to_s3(df, target_path):\n", | |
" \"\"\"\n", | |
" Write pandas dataframe data as parquet file(s) on Amazon S3.\n", | |
" :param df: Pandas DataFrame name\n", | |
" :param target path: full target path to store data\n", | |
" :return: Dictionary with the following information:\n", | |
" ‘paths’: List of all stored files paths on S3.\n", | |
" ‘partitions_values’: Dictionary of partitions added with keys as S3 path locations and values as a list of partitions values as str.\n", | |
" \"\"\"\n", | |
"\n", | |
" try:\n", | |
" stored_files = wr.s3.to_parquet(\n", | |
" df=df,\n", | |
" path=target_path,\n", | |
" index=False,\n", | |
" compression=\"snappy\",\n", | |
" use_threads=False,\n", | |
" dataset=True,\n", | |
" mode=\"overwrite\"\n", | |
" )\n", | |
" except Exception as e:\n", | |
" raise(e)\n", | |
"\n", | |
" logging.info(\"stored_files {}\".format(stored_files))\n", | |
" return stored_files" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def read_hr_data_csv(file_path, file_schema, date_cols):\n", | |
" \"\"\"\n", | |
" Extract HR Data column values from the CSV file\n", | |
" :param file_path: HR Data file path on the S3\n", | |
" :param file_schema: CSV schema\n", | |
" :param date_cols: date columns to parse\n", | |
" :return: returns raw_df pandas dataframe\n", | |
" \"\"\"\n", | |
"\n", | |
" # Read HR Data CSV on S3 using pandas dataframe\n", | |
" raw_df = wr.s3.read_csv(path=file_path, dtype=file_schema, parse_dates=date_cols)\n", | |
"\n", | |
" # Return pandas dataframe\n", | |
" return raw_df" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def get_file_name(ib_bucket_name, ib_bucket_prefix):\n", | |
" \"\"\"\n", | |
" Extract HR Data CSV file name from source path\n", | |
" :param ib_bucket_name: HR Data S3 bucket name\n", | |
" :param ib_bucket_prefix: HR Data CSV file path prefix on S3\n", | |
" :return: returns file name as a string\n", | |
" \"\"\"\n", | |
" try:\n", | |
" result = s3.list_objects_v2(Bucket=ib_bucket_name, Prefix=ib_bucket_prefix)\n", | |
" result_list = result['Contents'][0]['Key'].split(\"/\")\n", | |
"\n", | |
" # Return file name based on source file path\n", | |
" return result_list[-1]\n", | |
" except:\n", | |
" return None" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def get_source_file_date_time(file_name):\n", | |
" \"\"\"\n", | |
" Extract HR Data CSV file date from file name\n", | |
" :param file_name: HR Data CSV file name on S3\n", | |
" :return: returns file date as timestamp\n", | |
" \"\"\"\n", | |
" try:\n", | |
" # Split the file name based on .csv and _ to extract file name elements\n", | |
" file_name_list = file_name.split(\".csv\")[0].split(\"_\")\n", | |
"\n", | |
" # Extract date and date time from list\n", | |
" source_file_date_str = file_name_list[-2]\n", | |
" source_file_date_time_str = file_name_list[-2] + file_name_list[-1]\n", | |
"\n", | |
" logging.info(\"source_file_date_str= {}\".format(source_file_date_str))\n", | |
" logging.info(\"source_file_date_time_str= {}\".format(source_file_date_time_str))\n", | |
"\n", | |
" # Return list of timestamp by converting string to datetime\n", | |
" return [pd.to_datetime(source_file_date_str), pd.to_datetime(source_file_date_time_str)]\n", | |
" except:\n", | |
" return [None, None]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def flatten_json_columns(df, json_cols, custom_df):\n", | |
" \"\"\"\n", | |
" This function flattens JSON columns to individual columns\n", | |
" It merges the flattened dataframe with expected dataframe to capture missing columns from JSON\n", | |
" :param df: HR Data CSV raw dataframe\n", | |
" :param json_cols: custom data columns in CSV's\n", | |
" :param custom_df: expected dataframe\n", | |
" :return: returns df pandas dataframe\n", | |
" \"\"\"\n", | |
"\n", | |
" # Loop through all JSON columns\n", | |
" for column in json_cols:\n", | |
" if not df[column].isnull().all():\n", | |
" # create a temp col to preserve the orginal data\n", | |
" df['custom_data_temp'] = df[column]\n", | |
" # Replace None and NaN with empty braces\n", | |
" df[column].fillna(value='{}', inplace=True)\n", | |
" try:\n", | |
" # Deserialize's a str instance containing a JSON document to a Python object\n", | |
" df[column] = [json.loads(row, strict=False) for row in df[column]]\n", | |
" except TypeError:\n", | |
" # Convert all values to string using literal eval\n", | |
" df[column] = df[column].apply(lambda x: literal_eval(str(x)))\n", | |
" # Normalize semi-structured JSON data into a flat table\n", | |
" column_as_df = pd.json_normalize(df[column])\n", | |
" # Extract main column name and attach it to each sub column name\n", | |
" column_as_df.columns = [f\"{column}_{subcolumn}\" for subcolumn in column_as_df.columns]\n", | |
" # Replace empty strings with None\n", | |
" column_as_df.replace('', np.nan, inplace=True)\n", | |
" # Replace orginal data with temp data\n", | |
" df[column] = df['custom_data_temp']\n", | |
" # Merge extracted result from custom_data field with expected fields\n", | |
" result_df = pd.merge(column_as_df, custom_df, how='left')\n", | |
" # Drop the temp column and merge the flattened dataframe with orginal dataframe\n", | |
" df = df.drop('custom_data_temp', axis=1).merge(result_df, right_index=True, left_index=True)\n", | |
" else:\n", | |
" df = pd.concat([df, custom_df], axis=1)\n", | |
"\n", | |
" # Return dataframe with flatten columns\n", | |
" return df" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def parse_hr_data_csv_df(df, bucket_name, bucket_prefix, entity_name):\n", | |
" \"\"\"\n", | |
" Parse HR Data CSV column(s) values from the dataframe\n", | |
" :param df: HR Data CSV file path on the S3\n", | |
" :param bucket_name: AWS S3 bucket name\n", | |
" :param bucket_prefix: AWS S3 bucket prefix\n", | |
" :param entity_name: HR Data CSV entity name\n", | |
" :return: returns df pandas dataframe\n", | |
" \"\"\"\n", | |
"\n", | |
" # Check if the entity is part of json columns schema\n", | |
" if entity_name in json_columns_schema.keys():\n", | |
"\n", | |
" # Pre defined custom data expected data frame\n", | |
" custom_data_expected_df=pd.DataFrame.from_dict(custom_data_schema[entity_name])\n", | |
"\n", | |
" if entity_name == 'user_point_transactions':\n", | |
" df['reference_type_json'] = [column if column[0] == '{' else None for column in df['reference_type']]\n", | |
" df['reference_type'] = [None if column[0] == '{' else column for column in df['reference_type']]\n", | |
"\n", | |
" # Flatten reference_type_json CSV column to individual columns\n", | |
" upt_df = flatten_json_columns(df=df,\n", | |
" json_cols=json_columns_schema[entity_name],\n", | |
" custom_df=custom_data_expected_df)\n", | |
"\n", | |
" parsed_df = upt_df.drop('reference_type_json', axis=1)\n", | |
" else:\n", | |
" # Flatten CSV JSON Columns to individual columns\n", | |
" parsed_df = flatten_json_columns(df=df,\n", | |
" json_cols=json_columns_schema[entity_name],\n", | |
" custom_df=custom_data_expected_df)\n", | |
" else:\n", | |
" # Return raw dataframe\n", | |
" parsed_df = df\n", | |
"\n", | |
" # Get file name based on s3 source path\n", | |
" hr_data_csv_file_name = get_file_name(ib_bucket_name=bucket_name, ib_bucket_prefix=bucket_prefix)\n", | |
" logging.info(\"hr_data_csv_file_name= {}\".format(hr_data_csv_file_name))\n", | |
"\n", | |
" # Create file name column in parsed data frame\n", | |
" parsed_df['source_file_name'] = hr_data_csv_file_name\n", | |
"\n", | |
" # Get source file date and datetime into a list based on HR Data csv file name\n", | |
" hr_data_csv_source_file_date_and_datetime = get_source_file_date_time(file_name = hr_data_csv_file_name)\n", | |
"\n", | |
" # Create source file date and datetime column(s) in parsed data frame\n", | |
" parsed_df['source_file_date'] = hr_data_csv_source_file_date_and_datetime[0]\n", | |
" parsed_df['source_file_datetime'] = hr_data_csv_source_file_date_and_datetime[1]\n", | |
"\n", | |
" # Return pandas dataframe\n", | |
" return parsed_df" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"CSV Parsing function with parameters" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def parsing_launcher(ib_bucket_name, ib_bucket_name_prefix, dlp_bucket_name, dlp_bucket_name_prefix, hive_style_partitions, source_entities_list):\n", | |
" \"\"\"\n", | |
" Invokes the HR Data CSV Parser to ingest data to data lake\n", | |
" :param ib_bucket_name: AWS S3 bucket name in data platform accounts\n", | |
" :param ib_bucket_name_prefix: AWS S3 bucket name prefix\n", | |
" :param dlp_bucket_name: AWS S3 data lake processing bucket name in data platform accounts\n", | |
" :param dlp_bucket_name_prefix: AWS S3 datalake processing bucket name prefix\n", | |
" :param hive_style_partitions: S3 partitions to read the file\n", | |
" :param source_entities_list: List of entities to process\n", | |
" :return: None\n", | |
" \"\"\"\n", | |
"\n", | |
" for entity in source_entities_list:\n", | |
"\n", | |
" logging.info(\"entity_name= {}\".format(entity))\n", | |
"\n", | |
" # Constructing source path based on source entity\n", | |
" s3_bucket = \"s3://\" + ib_bucket_name + \"/\"\n", | |
" s3_bucket_prefix = ib_bucket_name_prefix + \"/\" + entity + \"/\" + hive_style_partitions\n", | |
" source_path = s3_bucket + s3_bucket_prefix\n", | |
" logging.info(\"Full source_path {}\".format(source_path))\n", | |
"\n", | |
" # Constructing target path based on source entity\n", | |
" target_path = \"s3://\" + dlp_bucket_name + \"/\" + dlp_bucket_name_prefix + \"/\" + entity + \"/\" + hive_style_partitions\n", | |
" logging.info(\"Full target_path {}\".format(target_path))\n", | |
"\n", | |
" # Read HR Data CSV data from S3 path\n", | |
" raw_df = read_hr_data_csv(file_path=source_path,\n", | |
" file_schema=read_schema[entity],\n", | |
" date_cols=date_columns_schema[entity])\n", | |
"\n", | |
" # If raw data frame is not empty then parse it\n", | |
" if not raw_df.empty:\n", | |
" # Parse CSV data\n", | |
" parsed_hr_data_df = parse_hr_data_csv_df(df=raw_df,\n", | |
" bucket_name=ib_bucket_name,\n", | |
" bucket_prefix=s3_bucket_prefix,\n", | |
" entity_name= entity)\n", | |
"\n", | |
" try:\n", | |
" # Convert column data types using astype with write schema\n", | |
" hr_data_astype_dtypes = parsed_hr_data_df.astype(write_schema[entity], copy=True)\n", | |
" except:\n", | |
" # Convert column data types using astype with write schema and ignore errors\n", | |
" hr_data_astype_dtypes = parsed_hr_data_df.astype(write_schema[entity], copy=True, errors='ignore')\n", | |
"\n", | |
" # Write result to S3\n", | |
" write_to_s3(df=hr_data_astype_dtypes,\n", | |
" target_path=target_path)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Invoke launcher" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"INFO:root:entity_name= countries\n", | |
"INFO:root:Full source_path s3://ay-dp-dev-test-bucket/test-hr-data/countries/yyyy=2021/mm=04/dd=22\n", | |
"INFO:root:Full target_path s3://ay-dp-dev-test-bucket/test-hr-data-parquet/countries/yyyy=2021/mm=04/dd=22\n", | |
"INFO:root:hr_data_csv_file_name= \n", | |
"INFO:root:stored_files {'paths': ['s3://ay-dp-dev-test-bucket/test-hr-data-parquet/countries/yyyy=2021/mm=04/dd=22/59b1fa0b998e411e98e8ff6948f9c780.snappy.parquet'], 'partitions_values': {}}\n", | |
"INFO:root:entity_name= departments\n", | |
"INFO:root:Full source_path s3://ay-dp-dev-test-bucket/test-hr-data/departments/yyyy=2021/mm=04/dd=22\n", | |
"INFO:root:Full target_path s3://ay-dp-dev-test-bucket/test-hr-data-parquet/departments/yyyy=2021/mm=04/dd=22\n", | |
"INFO:root:hr_data_csv_file_name= \n", | |
"INFO:root:stored_files {'paths': ['s3://ay-dp-dev-test-bucket/test-hr-data-parquet/departments/yyyy=2021/mm=04/dd=22/fcb5f951bd4f4077b37402b3f802d503.snappy.parquet'], 'partitions_values': {}}\n", | |
"INFO:root:entity_name= employees\n", | |
"INFO:root:Full source_path s3://ay-dp-dev-test-bucket/test-hr-data/employees/yyyy=2021/mm=04/dd=22\n", | |
"INFO:root:Full target_path s3://ay-dp-dev-test-bucket/test-hr-data-parquet/employees/yyyy=2021/mm=04/dd=22\n", | |
"INFO:root:hr_data_csv_file_name= \n", | |
"INFO:root:stored_files {'paths': ['s3://ay-dp-dev-test-bucket/test-hr-data-parquet/employees/yyyy=2021/mm=04/dd=22/6484a1a54bd54651939a658a06f1c76e.snappy.parquet'], 'partitions_values': {}}\n", | |
"INFO:root:entity_name= job_history\n", | |
"INFO:root:Full source_path s3://ay-dp-dev-test-bucket/test-hr-data/job_history/yyyy=2021/mm=04/dd=22\n", | |
"INFO:root:Full target_path s3://ay-dp-dev-test-bucket/test-hr-data-parquet/job_history/yyyy=2021/mm=04/dd=22\n", | |
"INFO:root:hr_data_csv_file_name= \n", | |
"INFO:root:stored_files {'paths': ['s3://ay-dp-dev-test-bucket/test-hr-data-parquet/job_history/yyyy=2021/mm=04/dd=22/d79def2d905342228e866a593361cb70.snappy.parquet'], 'partitions_values': {}}\n", | |
"INFO:root:entity_name= jobs\n", | |
"INFO:root:Full source_path s3://ay-dp-dev-test-bucket/test-hr-data/jobs/yyyy=2021/mm=04/dd=22\n", | |
"INFO:root:Full target_path s3://ay-dp-dev-test-bucket/test-hr-data-parquet/jobs/yyyy=2021/mm=04/dd=22\n", | |
"INFO:root:hr_data_csv_file_name= \n", | |
"INFO:root:stored_files {'paths': ['s3://ay-dp-dev-test-bucket/test-hr-data-parquet/jobs/yyyy=2021/mm=04/dd=22/671b60c0ad834a579bd0ca04e258257b.snappy.parquet'], 'partitions_values': {}}\n", | |
"INFO:root:entity_name= locations\n", | |
"INFO:root:Full source_path s3://ay-dp-dev-test-bucket/test-hr-data/locations/yyyy=2021/mm=04/dd=22\n", | |
"INFO:root:Full target_path s3://ay-dp-dev-test-bucket/test-hr-data-parquet/locations/yyyy=2021/mm=04/dd=22\n", | |
"INFO:root:hr_data_csv_file_name= \n", | |
"INFO:root:stored_files {'paths': ['s3://ay-dp-dev-test-bucket/test-hr-data-parquet/locations/yyyy=2021/mm=04/dd=22/f6a65a13ab30447a80a021da5c25156f.snappy.parquet'], 'partitions_values': {}}\n", | |
"INFO:root:entity_name= regions\n", | |
"INFO:root:Full source_path s3://ay-dp-dev-test-bucket/test-hr-data/regions/yyyy=2021/mm=04/dd=22\n", | |
"INFO:root:Full target_path s3://ay-dp-dev-test-bucket/test-hr-data-parquet/regions/yyyy=2021/mm=04/dd=22\n", | |
"INFO:root:hr_data_csv_file_name= \n", | |
"INFO:root:stored_files {'paths': ['s3://ay-dp-dev-test-bucket/test-hr-data-parquet/regions/yyyy=2021/mm=04/dd=22/28ba548b8b454de7a6acd69a4d1d9b83.snappy.parquet'], 'partitions_values': {}}\n" | |
] | |
} | |
], | |
"source": [ | |
"parsing_launcher(ib_bucket_name=\"ay-dp-dev-test-bucket\",\n", | |
" ib_bucket_name_prefix=\"test-hr-data\",\n", | |
" dlp_bucket_name=\"ay-dp-dev-test-bucket\",\n", | |
" dlp_bucket_name_prefix=\"test-hr-data-parquet\",\n", | |
" hive_style_partitions=\"yyyy=2021/mm=04/dd=22\",\n", | |
" source_entities_list=literal_eval(\"['countries', 'departments', 'employees', 'job_history', 'jobs', 'locations', 'regions']\"))" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Read parquet data from target path" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 15, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"countries_df = wr.s3.read_parquet(path=\"s3://ay-dp-dev-test-bucket/test-hr-data-parquet/countries/yyyy=2021/mm=04/dd=22\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 16, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>country_id</th>\n", | |
" <th>country_name</th>\n", | |
" <th>region_id</th>\n", | |
" <th>source_file_name</th>\n", | |
" <th>source_file_date</th>\n", | |
" <th>source_file_datetime</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>IT</td>\n", | |
" <td>Italy</td>\n", | |
" <td>1</td>\n", | |
" <td></td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>JP</td>\n", | |
" <td>Japan</td>\n", | |
" <td>3</td>\n", | |
" <td></td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>US</td>\n", | |
" <td>United States of America</td>\n", | |
" <td>2</td>\n", | |
" <td></td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>CA</td>\n", | |
" <td>Canada</td>\n", | |
" <td>2</td>\n", | |
" <td></td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>CN</td>\n", | |
" <td>China</td>\n", | |
" <td>3</td>\n", | |
" <td></td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>5</th>\n", | |
" <td>IN</td>\n", | |
" <td>India</td>\n", | |
" <td>3</td>\n", | |
" <td></td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>6</th>\n", | |
" <td>AU</td>\n", | |
" <td>Australia</td>\n", | |
" <td>3</td>\n", | |
" <td></td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>7</th>\n", | |
" <td>ZW</td>\n", | |
" <td>Zimbabwe</td>\n", | |
" <td>4</td>\n", | |
" <td></td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>8</th>\n", | |
" <td>SG</td>\n", | |
" <td>Singapore</td>\n", | |
" <td>3</td>\n", | |
" <td></td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>9</th>\n", | |
" <td>UK</td>\n", | |
" <td>United Kingdom</td>\n", | |
" <td>1</td>\n", | |
" <td></td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>10</th>\n", | |
" <td>FR</td>\n", | |
" <td>France</td>\n", | |
" <td>1</td>\n", | |
" <td></td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>11</th>\n", | |
" <td>DE</td>\n", | |
" <td>Germany</td>\n", | |
" <td>1</td>\n", | |
" <td></td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>12</th>\n", | |
" <td>ZM</td>\n", | |
" <td>Zambia</td>\n", | |
" <td>4</td>\n", | |
" <td></td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>13</th>\n", | |
" <td>EG</td>\n", | |
" <td>Egypt</td>\n", | |
" <td>4</td>\n", | |
" <td></td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>14</th>\n", | |
" <td>BR</td>\n", | |
" <td>Brazil</td>\n", | |
" <td>2</td>\n", | |
" <td></td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>15</th>\n", | |
" <td>CH</td>\n", | |
" <td>Switzerland</td>\n", | |
" <td>1</td>\n", | |
" <td></td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>16</th>\n", | |
" <td>NL</td>\n", | |
" <td>Netherlands</td>\n", | |
" <td>1</td>\n", | |
" <td></td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>17</th>\n", | |
" <td>MX</td>\n", | |
" <td>Mexico</td>\n", | |
" <td>2</td>\n", | |
" <td></td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>18</th>\n", | |
" <td>KW</td>\n", | |
" <td>Kuwait</td>\n", | |
" <td>4</td>\n", | |
" <td></td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>19</th>\n", | |
" <td>IL</td>\n", | |
" <td>Israel</td>\n", | |
" <td>4</td>\n", | |
" <td></td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>20</th>\n", | |
" <td>DK</td>\n", | |
" <td>Denmark</td>\n", | |
" <td>1</td>\n", | |
" <td></td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>21</th>\n", | |
" <td>HK</td>\n", | |
" <td>HongKong</td>\n", | |
" <td>3</td>\n", | |
" <td></td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>22</th>\n", | |
" <td>NG</td>\n", | |
" <td>Nigeria</td>\n", | |
" <td>4</td>\n", | |
" <td></td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>23</th>\n", | |
" <td>AR</td>\n", | |
" <td>Argentina</td>\n", | |
" <td>2</td>\n", | |
" <td></td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>24</th>\n", | |
" <td>BE</td>\n", | |
" <td>Belgium</td>\n", | |
" <td>1</td>\n", | |
" <td></td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" country_id country_name region_id source_file_name \\\n", | |
"0 IT Italy 1 \n", | |
"1 JP Japan 3 \n", | |
"2 US United States of America 2 \n", | |
"3 CA Canada 2 \n", | |
"4 CN China 3 \n", | |
"5 IN India 3 \n", | |
"6 AU Australia 3 \n", | |
"7 ZW Zimbabwe 4 \n", | |
"8 SG Singapore 3 \n", | |
"9 UK United Kingdom 1 \n", | |
"10 FR France 1 \n", | |
"11 DE Germany 1 \n", | |
"12 ZM Zambia 4 \n", | |
"13 EG Egypt 4 \n", | |
"14 BR Brazil 2 \n", | |
"15 CH Switzerland 1 \n", | |
"16 NL Netherlands 1 \n", | |
"17 MX Mexico 2 \n", | |
"18 KW Kuwait 4 \n", | |
"19 IL Israel 4 \n", | |
"20 DK Denmark 1 \n", | |
"21 HK HongKong 3 \n", | |
"22 NG Nigeria 4 \n", | |
"23 AR Argentina 2 \n", | |
"24 BE Belgium 1 \n", | |
"\n", | |
" source_file_date source_file_datetime \n", | |
"0 None None \n", | |
"1 None None \n", | |
"2 None None \n", | |
"3 None None \n", | |
"4 None None \n", | |
"5 None None \n", | |
"6 None None \n", | |
"7 None None \n", | |
"8 None None \n", | |
"9 None None \n", | |
"10 None None \n", | |
"11 None None \n", | |
"12 None None \n", | |
"13 None None \n", | |
"14 None None \n", | |
"15 None None \n", | |
"16 None None \n", | |
"17 None None \n", | |
"18 None None \n", | |
"19 None None \n", | |
"20 None None \n", | |
"21 None None \n", | |
"22 None None \n", | |
"23 None None \n", | |
"24 None None " | |
] | |
}, | |
"execution_count": 16, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"countries_df" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 17, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"departments_df = wr.s3.read_parquet(path=\"s3://ay-dp-dev-test-bucket/test-hr-data-parquet/departments/yyyy=2021/mm=04/dd=22\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 18, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>department_id</th>\n", | |
" <th>department_name</th>\n", | |
" <th>manager_id</th>\n", | |
" <th>location_id</th>\n", | |
" <th>source_file_name</th>\n", | |
" <th>source_file_date</th>\n", | |
" <th>source_file_datetime</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>10</td>\n", | |
" <td>Administration</td>\n", | |
" <td>200</td>\n", | |
" <td>1700</td>\n", | |
" <td></td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>20</td>\n", | |
" <td>Marketing</td>\n", | |
" <td>201</td>\n", | |
" <td>1800</td>\n", | |
" <td></td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>30</td>\n", | |
" <td>Purchasing</td>\n", | |
" <td>114</td>\n", | |
" <td>1700</td>\n", | |
" <td></td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>40</td>\n", | |
" <td>Human Resources</td>\n", | |
" <td>203</td>\n", | |
" <td>2400</td>\n", | |
" <td></td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>50</td>\n", | |
" <td>Shipping</td>\n", | |
" <td>121</td>\n", | |
" <td>1500</td>\n", | |
" <td></td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>5</th>\n", | |
" <td>60</td>\n", | |
" <td>IT</td>\n", | |
" <td>103</td>\n", | |
" <td>1400</td>\n", | |
" <td></td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>6</th>\n", | |
" <td>70</td>\n", | |
" <td>Public Relations</td>\n", | |
" <td>204</td>\n", | |
" <td>2700</td>\n", | |
" <td></td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>7</th>\n", | |
" <td>80</td>\n", | |
" <td>Sales</td>\n", | |
" <td>145</td>\n", | |
" <td>2500</td>\n", | |
" <td></td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>8</th>\n", | |
" <td>90</td>\n", | |
" <td>Executive</td>\n", | |
" <td>100</td>\n", | |
" <td>1700</td>\n", | |
" <td></td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>9</th>\n", | |
" <td>100</td>\n", | |
" <td>Finance</td>\n", | |
" <td>108</td>\n", | |
" <td>1700</td>\n", | |
" <td></td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>10</th>\n", | |
" <td>110</td>\n", | |
" <td>Accounting</td>\n", | |
" <td>205</td>\n", | |
" <td>1700</td>\n", | |
" <td></td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>11</th>\n", | |
" <td>120</td>\n", | |
" <td>Treasury</td>\n", | |
" <td><NA></td>\n", | |
" <td>1700</td>\n", | |
" <td></td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>12</th>\n", | |
" <td>130</td>\n", | |
" <td>Corporate Tax</td>\n", | |
" <td><NA></td>\n", | |
" <td>1700</td>\n", | |
" <td></td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>13</th>\n", | |
" <td>140</td>\n", | |
" <td>Control And Credit</td>\n", | |
" <td><NA></td>\n", | |
" <td>1700</td>\n", | |
" <td></td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>14</th>\n", | |
" <td>150</td>\n", | |
" <td>Shareholder Services</td>\n", | |
" <td><NA></td>\n", | |
" <td>1700</td>\n", | |
" <td></td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>15</th>\n", | |
" <td>160</td>\n", | |
" <td>Benefits</td>\n", | |
" <td><NA></td>\n", | |
" <td>1700</td>\n", | |
" <td></td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>16</th>\n", | |
" <td>170</td>\n", | |
" <td>Manufacturing</td>\n", | |
" <td><NA></td>\n", | |
" <td>1700</td>\n", | |
" <td></td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>17</th>\n", | |
" <td>180</td>\n", | |
" <td>Construction</td>\n", | |
" <td><NA></td>\n", | |
" <td>1700</td>\n", | |
" <td></td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>18</th>\n", | |
" <td>190</td>\n", | |
" <td>Contracting</td>\n", | |
" <td><NA></td>\n", | |
" <td>1700</td>\n", | |
" <td></td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>19</th>\n", | |
" <td>200</td>\n", | |
" <td>Operations</td>\n", | |
" <td><NA></td>\n", | |
" <td>1700</td>\n", | |
" <td></td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>20</th>\n", | |
" <td>210</td>\n", | |
" <td>IT Support</td>\n", | |
" <td><NA></td>\n", | |
" <td>1700</td>\n", | |
" <td></td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>21</th>\n", | |
" <td>220</td>\n", | |
" <td>NOC</td>\n", | |
" <td><NA></td>\n", | |
" <td>1700</td>\n", | |
" <td></td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>22</th>\n", | |
" <td>230</td>\n", | |
" <td>IT Helpdesk</td>\n", | |
" <td><NA></td>\n", | |
" <td>1700</td>\n", | |
" <td></td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>23</th>\n", | |
" <td>240</td>\n", | |
" <td>Government Sales</td>\n", | |
" <td><NA></td>\n", | |
" <td>1700</td>\n", | |
" <td></td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>24</th>\n", | |
" <td>250</td>\n", | |
" <td>Retail Sales</td>\n", | |
" <td><NA></td>\n", | |
" <td>1700</td>\n", | |
" <td></td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>25</th>\n", | |
" <td>260</td>\n", | |
" <td>Recruiting</td>\n", | |
" <td><NA></td>\n", | |
" <td>1700</td>\n", | |
" <td></td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>26</th>\n", | |
" <td>270</td>\n", | |
" <td>Payroll</td>\n", | |
" <td><NA></td>\n", | |
" <td>1700</td>\n", | |
" <td></td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" department_id department_name manager_id location_id \\\n", | |
"0 10 Administration 200 1700 \n", | |
"1 20 Marketing 201 1800 \n", | |
"2 30 Purchasing 114 1700 \n", | |
"3 40 Human Resources 203 2400 \n", | |
"4 50 Shipping 121 1500 \n", | |
"5 60 IT 103 1400 \n", | |
"6 70 Public Relations 204 2700 \n", | |
"7 80 Sales 145 2500 \n", | |
"8 90 Executive 100 1700 \n", | |
"9 100 Finance 108 1700 \n", | |
"10 110 Accounting 205 1700 \n", | |
"11 120 Treasury <NA> 1700 \n", | |
"12 130 Corporate Tax <NA> 1700 \n", | |
"13 140 Control And Credit <NA> 1700 \n", | |
"14 150 Shareholder Services <NA> 1700 \n", | |
"15 160 Benefits <NA> 1700 \n", | |
"16 170 Manufacturing <NA> 1700 \n", | |
"17 180 Construction <NA> 1700 \n", | |
"18 190 Contracting <NA> 1700 \n", | |
"19 200 Operations <NA> 1700 \n", | |
"20 210 IT Support <NA> 1700 \n", | |
"21 220 NOC <NA> 1700 \n", | |
"22 230 IT Helpdesk <NA> 1700 \n", | |
"23 240 Government Sales <NA> 1700 \n", | |
"24 250 Retail Sales <NA> 1700 \n", | |
"25 260 Recruiting <NA> 1700 \n", | |
"26 270 Payroll <NA> 1700 \n", | |
"\n", | |
" source_file_name source_file_date source_file_datetime \n", | |
"0 None None \n", | |
"1 None None \n", | |
"2 None None \n", | |
"3 None None \n", | |
"4 None None \n", | |
"5 None None \n", | |
"6 None None \n", | |
"7 None None \n", | |
"8 None None \n", | |
"9 None None \n", | |
"10 None None \n", | |
"11 None None \n", | |
"12 None None \n", | |
"13 None None \n", | |
"14 None None \n", | |
"15 None None \n", | |
"16 None None \n", | |
"17 None None \n", | |
"18 None None \n", | |
"19 None None \n", | |
"20 None None \n", | |
"21 None None \n", | |
"22 None None \n", | |
"23 None None \n", | |
"24 None None \n", | |
"25 None None \n", | |
"26 None None " | |
] | |
}, | |
"execution_count": 18, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"departments_df" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 19, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"employees_df = wr.s3.read_parquet(path=\"s3://ay-dp-dev-test-bucket/test-hr-data-parquet/employees/yyyy=2021/mm=04/dd=22\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 20, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>employee_id</th>\n", | |
" <th>first_name</th>\n", | |
" <th>last_name</th>\n", | |
" <th>email</th>\n", | |
" <th>phone_number</th>\n", | |
" <th>hire_date</th>\n", | |
" <th>job_id</th>\n", | |
" <th>salary</th>\n", | |
" <th>commission_pct</th>\n", | |
" <th>manager_id</th>\n", | |
" <th>department_id</th>\n", | |
" <th>custom_data</th>\n", | |
" <th>custom_data_address_line_1</th>\n", | |
" <th>custom_data_address_line_2</th>\n", | |
" <th>custom_data_zip</th>\n", | |
" <th>custom_data_date_of_birth</th>\n", | |
" <th>custom_data_gender</th>\n", | |
" <th>custom_data_marital_status</th>\n", | |
" <th>source_file_name</th>\n", | |
" <th>source_file_date</th>\n", | |
" <th>source_file_datetime</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>100</td>\n", | |
" <td>Steven</td>\n", | |
" <td>King</td>\n", | |
" <td>SKING</td>\n", | |
" <td>515.123.4567</td>\n", | |
" <td>1987-06-17</td>\n", | |
" <td>AD_PRES</td>\n", | |
" <td>24000.0</td>\n", | |
" <td>NaN</td>\n", | |
" <td><NA></td>\n", | |
" <td>90</td>\n", | |
" <td>nan</td>\n", | |
" <td><NA></td>\n", | |
" <td><NA></td>\n", | |
" <td><NA></td>\n", | |
" <td><NA></td>\n", | |
" <td><NA></td>\n", | |
" <td><NA></td>\n", | |
" <td></td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>101</td>\n", | |
" <td>Neena</td>\n", | |
" <td>Kochhar</td>\n", | |
" <td>NKOCHHAR</td>\n", | |
" <td>515.123.4568</td>\n", | |
" <td>1989-09-21</td>\n", | |
" <td>AD_VP</td>\n", | |
" <td>17000.0</td>\n", | |
" <td>NaN</td>\n", | |
" <td>100</td>\n", | |
" <td>90</td>\n", | |
" <td>nan</td>\n", | |
" <td><NA></td>\n", | |
" <td><NA></td>\n", | |
" <td><NA></td>\n", | |
" <td><NA></td>\n", | |
" <td><NA></td>\n", | |
" <td><NA></td>\n", | |
" <td></td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>102</td>\n", | |
" <td>Lex</td>\n", | |
" <td>De Haan</td>\n", | |
" <td>LDEHAAN</td>\n", | |
" <td>515.123.4569</td>\n", | |
" <td>1993-01-13</td>\n", | |
" <td>AD_VP</td>\n", | |
" <td>17000.0</td>\n", | |
" <td>NaN</td>\n", | |
" <td>100</td>\n", | |
" <td>90</td>\n", | |
" <td>nan</td>\n", | |
" <td><NA></td>\n", | |
" <td><NA></td>\n", | |
" <td><NA></td>\n", | |
" <td><NA></td>\n", | |
" <td><NA></td>\n", | |
" <td><NA></td>\n", | |
" <td></td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>103</td>\n", | |
" <td>Alexander</td>\n", | |
" <td>Hunold</td>\n", | |
" <td>AHUNOLD</td>\n", | |
" <td>590.423.4567</td>\n", | |
" <td>1990-01-03</td>\n", | |
" <td>IT_PROG</td>\n", | |
" <td>9000.0</td>\n", | |
" <td>NaN</td>\n", | |
" <td>102</td>\n", | |
" <td>60</td>\n", | |
" <td>{\"address_line_1\":\"6649 N Blue Gum St\",\"addres...</td>\n", | |
" <td>6649 N Blue Gum St</td>\n", | |
" <td>C 22</td>\n", | |
" <td>70116</td>\n", | |
" <td><NA></td>\n", | |
" <td><NA></td>\n", | |
" <td><NA></td>\n", | |
" <td></td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>104</td>\n", | |
" <td>Bruce</td>\n", | |
" <td>Ernst</td>\n", | |
" <td>BERNST</td>\n", | |
" <td>590.423.4568</td>\n", | |
" <td>1991-05-21</td>\n", | |
" <td>IT_PROG</td>\n", | |
" <td>6000.0</td>\n", | |
" <td>NaN</td>\n", | |
" <td>103</td>\n", | |
" <td>60</td>\n", | |
" <td>nan</td>\n", | |
" <td><NA></td>\n", | |
" <td><NA></td>\n", | |
" <td><NA></td>\n", | |
" <td><NA></td>\n", | |
" <td><NA></td>\n", | |
" <td><NA></td>\n", | |
" <td></td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>...</th>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>102</th>\n", | |
" <td>202</td>\n", | |
" <td>Pat</td>\n", | |
" <td>Fay</td>\n", | |
" <td>PFAY</td>\n", | |
" <td>603.123.6666</td>\n", | |
" <td>1997-08-17</td>\n", | |
" <td>MK_REP</td>\n", | |
" <td>6000.0</td>\n", | |
" <td>NaN</td>\n", | |
" <td>201</td>\n", | |
" <td>20</td>\n", | |
" <td>nan</td>\n", | |
" <td><NA></td>\n", | |
" <td><NA></td>\n", | |
" <td><NA></td>\n", | |
" <td><NA></td>\n", | |
" <td><NA></td>\n", | |
" <td><NA></td>\n", | |
" <td></td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>103</th>\n", | |
" <td>203</td>\n", | |
" <td>Susan</td>\n", | |
" <td>Mavris</td>\n", | |
" <td>SMAVRIS</td>\n", | |
" <td>515.123.7777</td>\n", | |
" <td>1994-06-07</td>\n", | |
" <td>HR_REP</td>\n", | |
" <td>6500.0</td>\n", | |
" <td>NaN</td>\n", | |
" <td>101</td>\n", | |
" <td>40</td>\n", | |
" <td>{\"address_line_1\":\"2371 Jerrold Ave\",\"address_...</td>\n", | |
" <td>2371 Jerrold Ave</td>\n", | |
" <td>Block D</td>\n", | |
" <td>19443</td>\n", | |
" <td>1954-06-07</td>\n", | |
" <td>F</td>\n", | |
" <td><NA></td>\n", | |
" <td></td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>104</th>\n", | |
" <td>204</td>\n", | |
" <td>Hermann</td>\n", | |
" <td>Baer</td>\n", | |
" <td>HBAER</td>\n", | |
" <td>515.123.8888</td>\n", | |
" <td>1994-06-07</td>\n", | |
" <td>PR_REP</td>\n", | |
" <td>10000.0</td>\n", | |
" <td>NaN</td>\n", | |
" <td>101</td>\n", | |
" <td>70</td>\n", | |
" <td>nan</td>\n", | |
" <td><NA></td>\n", | |
" <td><NA></td>\n", | |
" <td><NA></td>\n", | |
" <td><NA></td>\n", | |
" <td><NA></td>\n", | |
" <td><NA></td>\n", | |
" <td></td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>105</th>\n", | |
" <td>205</td>\n", | |
" <td>Shelley</td>\n", | |
" <td>Higgins</td>\n", | |
" <td>SHIGGINS</td>\n", | |
" <td>515.123.8080</td>\n", | |
" <td>1994-06-07</td>\n", | |
" <td>AC_MGR</td>\n", | |
" <td>12000.0</td>\n", | |
" <td>NaN</td>\n", | |
" <td>101</td>\n", | |
" <td>110</td>\n", | |
" <td>nan</td>\n", | |
" <td><NA></td>\n", | |
" <td><NA></td>\n", | |
" <td><NA></td>\n", | |
" <td><NA></td>\n", | |
" <td><NA></td>\n", | |
" <td><NA></td>\n", | |
" <td></td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>106</th>\n", | |
" <td>206</td>\n", | |
" <td>William</td>\n", | |
" <td>Gietz</td>\n", | |
" <td>WGIETZ</td>\n", | |
" <td>515.123.8181</td>\n", | |
" <td>1994-06-07</td>\n", | |
" <td>AC_ACCOUNT</td>\n", | |
" <td>8300.0</td>\n", | |
" <td>NaN</td>\n", | |
" <td>205</td>\n", | |
" <td>110</td>\n", | |
" <td>{\"address_line_1\":\"37275 St Rt 17m M\",\"address...</td>\n", | |
" <td>37275 St Rt 17m M</td>\n", | |
" <td>Block C</td>\n", | |
" <td>11953</td>\n", | |
" <td>1964-01-07</td>\n", | |
" <td>M</td>\n", | |
" <td>Married</td>\n", | |
" <td></td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"<p>107 rows × 21 columns</p>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" employee_id first_name last_name email phone_number hire_date \\\n", | |
"0 100 Steven King SKING 515.123.4567 1987-06-17 \n", | |
"1 101 Neena Kochhar NKOCHHAR 515.123.4568 1989-09-21 \n", | |
"2 102 Lex De Haan LDEHAAN 515.123.4569 1993-01-13 \n", | |
"3 103 Alexander Hunold AHUNOLD 590.423.4567 1990-01-03 \n", | |
"4 104 Bruce Ernst BERNST 590.423.4568 1991-05-21 \n", | |
".. ... ... ... ... ... ... \n", | |
"102 202 Pat Fay PFAY 603.123.6666 1997-08-17 \n", | |
"103 203 Susan Mavris SMAVRIS 515.123.7777 1994-06-07 \n", | |
"104 204 Hermann Baer HBAER 515.123.8888 1994-06-07 \n", | |
"105 205 Shelley Higgins SHIGGINS 515.123.8080 1994-06-07 \n", | |
"106 206 William Gietz WGIETZ 515.123.8181 1994-06-07 \n", | |
"\n", | |
" job_id salary commission_pct manager_id department_id \\\n", | |
"0 AD_PRES 24000.0 NaN <NA> 90 \n", | |
"1 AD_VP 17000.0 NaN 100 90 \n", | |
"2 AD_VP 17000.0 NaN 100 90 \n", | |
"3 IT_PROG 9000.0 NaN 102 60 \n", | |
"4 IT_PROG 6000.0 NaN 103 60 \n", | |
".. ... ... ... ... ... \n", | |
"102 MK_REP 6000.0 NaN 201 20 \n", | |
"103 HR_REP 6500.0 NaN 101 40 \n", | |
"104 PR_REP 10000.0 NaN 101 70 \n", | |
"105 AC_MGR 12000.0 NaN 101 110 \n", | |
"106 AC_ACCOUNT 8300.0 NaN 205 110 \n", | |
"\n", | |
" custom_data \\\n", | |
"0 nan \n", | |
"1 nan \n", | |
"2 nan \n", | |
"3 {\"address_line_1\":\"6649 N Blue Gum St\",\"addres... \n", | |
"4 nan \n", | |
".. ... \n", | |
"102 nan \n", | |
"103 {\"address_line_1\":\"2371 Jerrold Ave\",\"address_... \n", | |
"104 nan \n", | |
"105 nan \n", | |
"106 {\"address_line_1\":\"37275 St Rt 17m M\",\"address... \n", | |
"\n", | |
" custom_data_address_line_1 custom_data_address_line_2 custom_data_zip \\\n", | |
"0 <NA> <NA> <NA> \n", | |
"1 <NA> <NA> <NA> \n", | |
"2 <NA> <NA> <NA> \n", | |
"3 6649 N Blue Gum St C 22 70116 \n", | |
"4 <NA> <NA> <NA> \n", | |
".. ... ... ... \n", | |
"102 <NA> <NA> <NA> \n", | |
"103 2371 Jerrold Ave Block D 19443 \n", | |
"104 <NA> <NA> <NA> \n", | |
"105 <NA> <NA> <NA> \n", | |
"106 37275 St Rt 17m M Block C 11953 \n", | |
"\n", | |
" custom_data_date_of_birth custom_data_gender custom_data_marital_status \\\n", | |
"0 <NA> <NA> <NA> \n", | |
"1 <NA> <NA> <NA> \n", | |
"2 <NA> <NA> <NA> \n", | |
"3 <NA> <NA> <NA> \n", | |
"4 <NA> <NA> <NA> \n", | |
".. ... ... ... \n", | |
"102 <NA> <NA> <NA> \n", | |
"103 1954-06-07 F <NA> \n", | |
"104 <NA> <NA> <NA> \n", | |
"105 <NA> <NA> <NA> \n", | |
"106 1964-01-07 M Married \n", | |
"\n", | |
" source_file_name source_file_date source_file_datetime \n", | |
"0 None None \n", | |
"1 None None \n", | |
"2 None None \n", | |
"3 None None \n", | |
"4 None None \n", | |
".. ... ... ... \n", | |
"102 None None \n", | |
"103 None None \n", | |
"104 None None \n", | |
"105 None None \n", | |
"106 None None \n", | |
"\n", | |
"[107 rows x 21 columns]" | |
] | |
}, | |
"execution_count": 20, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"employees_df" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 21, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"job_history_df = wr.s3.read_parquet(path=\"s3://ay-dp-dev-test-bucket/test-hr-data-parquet/job_history/yyyy=2021/mm=04/dd=22\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 22, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>employee_id</th>\n", | |
" <th>start_date</th>\n", | |
" <th>end_date</th>\n", | |
" <th>job_id</th>\n", | |
" <th>department_id</th>\n", | |
" <th>source_file_name</th>\n", | |
" <th>source_file_date</th>\n", | |
" <th>source_file_datetime</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>102</td>\n", | |
" <td>1993-01-13</td>\n", | |
" <td>1998-07-24</td>\n", | |
" <td>IT_PROG</td>\n", | |
" <td>60</td>\n", | |
" <td></td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>101</td>\n", | |
" <td>1989-09-21</td>\n", | |
" <td>1993-10-27</td>\n", | |
" <td>AC_ACCOUNT</td>\n", | |
" <td>110</td>\n", | |
" <td></td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>101</td>\n", | |
" <td>1993-10-28</td>\n", | |
" <td>1997-03-15</td>\n", | |
" <td>AC_MGR</td>\n", | |
" <td>110</td>\n", | |
" <td></td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>201</td>\n", | |
" <td>1996-02-17</td>\n", | |
" <td>1999-12-19</td>\n", | |
" <td>MK_REP</td>\n", | |
" <td>20</td>\n", | |
" <td></td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>114</td>\n", | |
" <td>1998-03-24</td>\n", | |
" <td>1999-12-31</td>\n", | |
" <td>ST_CLERK</td>\n", | |
" <td>50</td>\n", | |
" <td></td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>5</th>\n", | |
" <td>122</td>\n", | |
" <td>1999-01-01</td>\n", | |
" <td>1999-12-31</td>\n", | |
" <td>ST_CLERK</td>\n", | |
" <td>50</td>\n", | |
" <td></td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>6</th>\n", | |
" <td>200</td>\n", | |
" <td>1987-09-17</td>\n", | |
" <td>1993-06-17</td>\n", | |
" <td>AD_ASST</td>\n", | |
" <td>90</td>\n", | |
" <td></td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>7</th>\n", | |
" <td>176</td>\n", | |
" <td>1998-03-24</td>\n", | |
" <td>1998-12-31</td>\n", | |
" <td>SA_REP</td>\n", | |
" <td>80</td>\n", | |
" <td></td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>8</th>\n", | |
" <td>176</td>\n", | |
" <td>1999-01-01</td>\n", | |
" <td>1999-12-31</td>\n", | |
" <td>SA_MAN</td>\n", | |
" <td>80</td>\n", | |
" <td></td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>9</th>\n", | |
" <td>200</td>\n", | |
" <td>1994-07-01</td>\n", | |
" <td>1998-12-31</td>\n", | |
" <td>AC_ACCOUNT</td>\n", | |
" <td>90</td>\n", | |
" <td></td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" employee_id start_date end_date job_id department_id \\\n", | |
"0 102 1993-01-13 1998-07-24 IT_PROG 60 \n", | |
"1 101 1989-09-21 1993-10-27 AC_ACCOUNT 110 \n", | |
"2 101 1993-10-28 1997-03-15 AC_MGR 110 \n", | |
"3 201 1996-02-17 1999-12-19 MK_REP 20 \n", | |
"4 114 1998-03-24 1999-12-31 ST_CLERK 50 \n", | |
"5 122 1999-01-01 1999-12-31 ST_CLERK 50 \n", | |
"6 200 1987-09-17 1993-06-17 AD_ASST 90 \n", | |
"7 176 1998-03-24 1998-12-31 SA_REP 80 \n", | |
"8 176 1999-01-01 1999-12-31 SA_MAN 80 \n", | |
"9 200 1994-07-01 1998-12-31 AC_ACCOUNT 90 \n", | |
"\n", | |
" source_file_name source_file_date source_file_datetime \n", | |
"0 None None \n", | |
"1 None None \n", | |
"2 None None \n", | |
"3 None None \n", | |
"4 None None \n", | |
"5 None None \n", | |
"6 None None \n", | |
"7 None None \n", | |
"8 None None \n", | |
"9 None None " | |
] | |
}, | |
"execution_count": 22, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"job_history_df" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 23, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"jobs_df = wr.s3.read_parquet(path=\"s3://ay-dp-dev-test-bucket/test-hr-data-parquet/jobs/yyyy=2021/mm=04/dd=22\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 24, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>job_id</th>\n", | |
" <th>job_title</th>\n", | |
" <th>min_salary</th>\n", | |
" <th>max_salary</th>\n", | |
" <th>source_file_name</th>\n", | |
" <th>source_file_date</th>\n", | |
" <th>source_file_datetime</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>AD_PRES</td>\n", | |
" <td>President</td>\n", | |
" <td>20000</td>\n", | |
" <td>40000</td>\n", | |
" <td></td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>AD_VP</td>\n", | |
" <td>Administration Vice President</td>\n", | |
" <td>15000</td>\n", | |
" <td>30000</td>\n", | |
" <td></td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>AD_ASST</td>\n", | |
" <td>Administration Assistant</td>\n", | |
" <td>3000</td>\n", | |
" <td>6000</td>\n", | |
" <td></td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>FI_MGR</td>\n", | |
" <td>Finance Manager</td>\n", | |
" <td>8200</td>\n", | |
" <td>16000</td>\n", | |
" <td></td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>FI_ACCOUNT</td>\n", | |
" <td>Accountant</td>\n", | |
" <td>4200</td>\n", | |
" <td>9000</td>\n", | |
" <td></td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>5</th>\n", | |
" <td>AC_MGR</td>\n", | |
" <td>Accounting Manager</td>\n", | |
" <td>8200</td>\n", | |
" <td>16000</td>\n", | |
" <td></td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>6</th>\n", | |
" <td>AC_ACCOUNT</td>\n", | |
" <td>Public Accountant</td>\n", | |
" <td>4200</td>\n", | |
" <td>9000</td>\n", | |
" <td></td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>7</th>\n", | |
" <td>SA_MAN</td>\n", | |
" <td>Sales Manager</td>\n", | |
" <td>10000</td>\n", | |
" <td>20000</td>\n", | |
" <td></td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>8</th>\n", | |
" <td>SA_REP</td>\n", | |
" <td>Sales Representative</td>\n", | |
" <td>6000</td>\n", | |
" <td>12000</td>\n", | |
" <td></td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>9</th>\n", | |
" <td>PU_MAN</td>\n", | |
" <td>Purchasing Manager</td>\n", | |
" <td>8000</td>\n", | |
" <td>15000</td>\n", | |
" <td></td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>10</th>\n", | |
" <td>PU_CLERK</td>\n", | |
" <td>Purchasing Clerk</td>\n", | |
" <td>2500</td>\n", | |
" <td>5500</td>\n", | |
" <td></td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>11</th>\n", | |
" <td>ST_MAN</td>\n", | |
" <td>Stock Manager</td>\n", | |
" <td>5500</td>\n", | |
" <td>8500</td>\n", | |
" <td></td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>12</th>\n", | |
" <td>ST_CLERK</td>\n", | |
" <td>Stock Clerk</td>\n", | |
" <td>2000</td>\n", | |
" <td>5000</td>\n", | |
" <td></td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>13</th>\n", | |
" <td>SH_CLERK</td>\n", | |
" <td>Shipping Clerk</td>\n", | |
" <td>2500</td>\n", | |
" <td>5500</td>\n", | |
" <td></td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>14</th>\n", | |
" <td>IT_PROG</td>\n", | |
" <td>Programmer</td>\n", | |
" <td>4000</td>\n", | |
" <td>10000</td>\n", | |
" <td></td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>15</th>\n", | |
" <td>MK_MAN</td>\n", | |
" <td>Marketing Manager</td>\n", | |
" <td>9000</td>\n", | |
" <td>15000</td>\n", | |
" <td></td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>16</th>\n", | |
" <td>MK_REP</td>\n", | |
" <td>Marketing Representative</td>\n", | |
" <td>4000</td>\n", | |
" <td>9000</td>\n", | |
" <td></td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>17</th>\n", | |
" <td>HR_REP</td>\n", | |
" <td>Human Resources Representative</td>\n", | |
" <td>4000</td>\n", | |
" <td>9000</td>\n", | |
" <td></td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>18</th>\n", | |
" <td>PR_REP</td>\n", | |
" <td>Public Relations Representative</td>\n", | |
" <td>4500</td>\n", | |
" <td>10500</td>\n", | |
" <td></td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" job_id job_title min_salary max_salary \\\n", | |
"0 AD_PRES President 20000 40000 \n", | |
"1 AD_VP Administration Vice President 15000 30000 \n", | |
"2 AD_ASST Administration Assistant 3000 6000 \n", | |
"3 FI_MGR Finance Manager 8200 16000 \n", | |
"4 FI_ACCOUNT Accountant 4200 9000 \n", | |
"5 AC_MGR Accounting Manager 8200 16000 \n", | |
"6 AC_ACCOUNT Public Accountant 4200 9000 \n", | |
"7 SA_MAN Sales Manager 10000 20000 \n", | |
"8 SA_REP Sales Representative 6000 12000 \n", | |
"9 PU_MAN Purchasing Manager 8000 15000 \n", | |
"10 PU_CLERK Purchasing Clerk 2500 5500 \n", | |
"11 ST_MAN Stock Manager 5500 8500 \n", | |
"12 ST_CLERK Stock Clerk 2000 5000 \n", | |
"13 SH_CLERK Shipping Clerk 2500 5500 \n", | |
"14 IT_PROG Programmer 4000 10000 \n", | |
"15 MK_MAN Marketing Manager 9000 15000 \n", | |
"16 MK_REP Marketing Representative 4000 9000 \n", | |
"17 HR_REP Human Resources Representative 4000 9000 \n", | |
"18 PR_REP Public Relations Representative 4500 10500 \n", | |
"\n", | |
" source_file_name source_file_date source_file_datetime \n", | |
"0 None None \n", | |
"1 None None \n", | |
"2 None None \n", | |
"3 None None \n", | |
"4 None None \n", | |
"5 None None \n", | |
"6 None None \n", | |
"7 None None \n", | |
"8 None None \n", | |
"9 None None \n", | |
"10 None None \n", | |
"11 None None \n", | |
"12 None None \n", | |
"13 None None \n", | |
"14 None None \n", | |
"15 None None \n", | |
"16 None None \n", | |
"17 None None \n", | |
"18 None None " | |
] | |
}, | |
"execution_count": 24, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"jobs_df" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 25, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"locations_df = wr.s3.read_parquet(path=\"s3://ay-dp-dev-test-bucket/test-hr-data-parquet/locations/yyyy=2021/mm=04/dd=22\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 26, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>location_id</th>\n", | |
" <th>street_address</th>\n", | |
" <th>postal_code</th>\n", | |
" <th>city</th>\n", | |
" <th>state_province</th>\n", | |
" <th>country_id</th>\n", | |
" <th>source_file_name</th>\n", | |
" <th>source_file_date</th>\n", | |
" <th>source_file_datetime</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>1000</td>\n", | |
" <td>1297 Via Cola di Rie</td>\n", | |
" <td>989</td>\n", | |
" <td>Roma</td>\n", | |
" <td>nan</td>\n", | |
" <td>IT</td>\n", | |
" <td></td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>1100</td>\n", | |
" <td>93091 Calle della Testa</td>\n", | |
" <td>10934</td>\n", | |
" <td>Venice</td>\n", | |
" <td>nan</td>\n", | |
" <td>IT</td>\n", | |
" <td></td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>1200</td>\n", | |
" <td>2017 Shinjuku-ku</td>\n", | |
" <td>1689</td>\n", | |
" <td>Tokyo</td>\n", | |
" <td>Tokyo Prefecture</td>\n", | |
" <td>JP</td>\n", | |
" <td></td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>1300</td>\n", | |
" <td>9450 Kamiya-cho</td>\n", | |
" <td>6823</td>\n", | |
" <td>Hiroshima</td>\n", | |
" <td>nan</td>\n", | |
" <td>JP</td>\n", | |
" <td></td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>1400</td>\n", | |
" <td>2014 Jabberwocky Rd</td>\n", | |
" <td>26192</td>\n", | |
" <td>Southlake</td>\n", | |
" <td>Texas</td>\n", | |
" <td>US</td>\n", | |
" <td></td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>5</th>\n", | |
" <td>1500</td>\n", | |
" <td>2011 Interiors Blvd</td>\n", | |
" <td>99236</td>\n", | |
" <td>South San Francisco</td>\n", | |
" <td>California</td>\n", | |
" <td>US</td>\n", | |
" <td></td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>6</th>\n", | |
" <td>1600</td>\n", | |
" <td>2007 Zagora St</td>\n", | |
" <td>50090</td>\n", | |
" <td>South Brunswick</td>\n", | |
" <td>New Jersey</td>\n", | |
" <td>US</td>\n", | |
" <td></td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>7</th>\n", | |
" <td>1700</td>\n", | |
" <td>2004 Charade Rd</td>\n", | |
" <td>98199</td>\n", | |
" <td>Seattle</td>\n", | |
" <td>Washington</td>\n", | |
" <td>US</td>\n", | |
" <td></td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>8</th>\n", | |
" <td>1800</td>\n", | |
" <td>147 Spadina Ave</td>\n", | |
" <td>M5V 2L7</td>\n", | |
" <td>Toronto</td>\n", | |
" <td>Ontario</td>\n", | |
" <td>CA</td>\n", | |
" <td></td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>9</th>\n", | |
" <td>1900</td>\n", | |
" <td>6092 Boxwood St</td>\n", | |
" <td>YSW 9T2</td>\n", | |
" <td>Whitehorse</td>\n", | |
" <td>Yukon</td>\n", | |
" <td>CA</td>\n", | |
" <td></td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>10</th>\n", | |
" <td>2000</td>\n", | |
" <td>40-5-12 Laogianggen</td>\n", | |
" <td>190518</td>\n", | |
" <td>Beijing</td>\n", | |
" <td>nan</td>\n", | |
" <td>CN</td>\n", | |
" <td></td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>11</th>\n", | |
" <td>2100</td>\n", | |
" <td>1298 Vileparle (E)</td>\n", | |
" <td>490231</td>\n", | |
" <td>Bombay</td>\n", | |
" <td>Maharashtra</td>\n", | |
" <td>IN</td>\n", | |
" <td></td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>12</th>\n", | |
" <td>2200</td>\n", | |
" <td>12-98 Victoria Street</td>\n", | |
" <td>2901</td>\n", | |
" <td>Sydney</td>\n", | |
" <td>New South Wales</td>\n", | |
" <td>AU</td>\n", | |
" <td></td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>13</th>\n", | |
" <td>2300</td>\n", | |
" <td>198 Clementi North</td>\n", | |
" <td>540198</td>\n", | |
" <td>Singapore</td>\n", | |
" <td>nan</td>\n", | |
" <td>SG</td>\n", | |
" <td></td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>14</th>\n", | |
" <td>2400</td>\n", | |
" <td>8204 Arthur St</td>\n", | |
" <td>nan</td>\n", | |
" <td>London</td>\n", | |
" <td>nan</td>\n", | |
" <td>UK</td>\n", | |
" <td></td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>15</th>\n", | |
" <td>2500</td>\n", | |
" <td>Magdalen Centre, The Oxford Science Park</td>\n", | |
" <td>OX9 9ZB</td>\n", | |
" <td>Oxford</td>\n", | |
" <td>Oxford</td>\n", | |
" <td>UK</td>\n", | |
" <td></td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>16</th>\n", | |
" <td>2600</td>\n", | |
" <td>9702 Chester Road</td>\n", | |
" <td>9629850293</td>\n", | |
" <td>Stretford</td>\n", | |
" <td>Manchester</td>\n", | |
" <td>UK</td>\n", | |
" <td></td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>17</th>\n", | |
" <td>2700</td>\n", | |
" <td>Schwanthalerstr. 7031</td>\n", | |
" <td>80925</td>\n", | |
" <td>Munich</td>\n", | |
" <td>Bavaria</td>\n", | |
" <td>DE</td>\n", | |
" <td></td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>18</th>\n", | |
" <td>2800</td>\n", | |
" <td>Rua Frei Caneca 1360</td>\n", | |
" <td>01307-002</td>\n", | |
" <td>Sao Paulo</td>\n", | |
" <td>Sao Paulo</td>\n", | |
" <td>BR</td>\n", | |
" <td></td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>19</th>\n", | |
" <td>2900</td>\n", | |
" <td>20 Rue des Corps-Saints</td>\n", | |
" <td>1730</td>\n", | |
" <td>Geneva</td>\n", | |
" <td>Geneve</td>\n", | |
" <td>CH</td>\n", | |
" <td></td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>20</th>\n", | |
" <td>3000</td>\n", | |
" <td>Murtenstrasse 921</td>\n", | |
" <td>3095</td>\n", | |
" <td>Bern</td>\n", | |
" <td>BE</td>\n", | |
" <td>CH</td>\n", | |
" <td></td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>21</th>\n", | |
" <td>3100</td>\n", | |
" <td>Pieter Breughelstraat 837</td>\n", | |
" <td>3029SK</td>\n", | |
" <td>Utrecht</td>\n", | |
" <td>Utrecht</td>\n", | |
" <td>NL</td>\n", | |
" <td></td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>22</th>\n", | |
" <td>3200</td>\n", | |
" <td>Mariano Escobedo 9991</td>\n", | |
" <td>11932</td>\n", | |
" <td>Mexico City</td>\n", | |
" <td>Distrito Federal,</td>\n", | |
" <td>MX</td>\n", | |
" <td></td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" location_id street_address postal_code \\\n", | |
"0 1000 1297 Via Cola di Rie 989 \n", | |
"1 1100 93091 Calle della Testa 10934 \n", | |
"2 1200 2017 Shinjuku-ku 1689 \n", | |
"3 1300 9450 Kamiya-cho 6823 \n", | |
"4 1400 2014 Jabberwocky Rd 26192 \n", | |
"5 1500 2011 Interiors Blvd 99236 \n", | |
"6 1600 2007 Zagora St 50090 \n", | |
"7 1700 2004 Charade Rd 98199 \n", | |
"8 1800 147 Spadina Ave M5V 2L7 \n", | |
"9 1900 6092 Boxwood St YSW 9T2 \n", | |
"10 2000 40-5-12 Laogianggen 190518 \n", | |
"11 2100 1298 Vileparle (E) 490231 \n", | |
"12 2200 12-98 Victoria Street 2901 \n", | |
"13 2300 198 Clementi North 540198 \n", | |
"14 2400 8204 Arthur St nan \n", | |
"15 2500 Magdalen Centre, The Oxford Science Park OX9 9ZB \n", | |
"16 2600 9702 Chester Road 9629850293 \n", | |
"17 2700 Schwanthalerstr. 7031 80925 \n", | |
"18 2800 Rua Frei Caneca 1360 01307-002 \n", | |
"19 2900 20 Rue des Corps-Saints 1730 \n", | |
"20 3000 Murtenstrasse 921 3095 \n", | |
"21 3100 Pieter Breughelstraat 837 3029SK \n", | |
"22 3200 Mariano Escobedo 9991 11932 \n", | |
"\n", | |
" city state_province country_id source_file_name \\\n", | |
"0 Roma nan IT \n", | |
"1 Venice nan IT \n", | |
"2 Tokyo Tokyo Prefecture JP \n", | |
"3 Hiroshima nan JP \n", | |
"4 Southlake Texas US \n", | |
"5 South San Francisco California US \n", | |
"6 South Brunswick New Jersey US \n", | |
"7 Seattle Washington US \n", | |
"8 Toronto Ontario CA \n", | |
"9 Whitehorse Yukon CA \n", | |
"10 Beijing nan CN \n", | |
"11 Bombay Maharashtra IN \n", | |
"12 Sydney New South Wales AU \n", | |
"13 Singapore nan SG \n", | |
"14 London nan UK \n", | |
"15 Oxford Oxford UK \n", | |
"16 Stretford Manchester UK \n", | |
"17 Munich Bavaria DE \n", | |
"18 Sao Paulo Sao Paulo BR \n", | |
"19 Geneva Geneve CH \n", | |
"20 Bern BE CH \n", | |
"21 Utrecht Utrecht NL \n", | |
"22 Mexico City Distrito Federal, MX \n", | |
"\n", | |
" source_file_date source_file_datetime \n", | |
"0 None None \n", | |
"1 None None \n", | |
"2 None None \n", | |
"3 None None \n", | |
"4 None None \n", | |
"5 None None \n", | |
"6 None None \n", | |
"7 None None \n", | |
"8 None None \n", | |
"9 None None \n", | |
"10 None None \n", | |
"11 None None \n", | |
"12 None None \n", | |
"13 None None \n", | |
"14 None None \n", | |
"15 None None \n", | |
"16 None None \n", | |
"17 None None \n", | |
"18 None None \n", | |
"19 None None \n", | |
"20 None None \n", | |
"21 None None \n", | |
"22 None None " | |
] | |
}, | |
"execution_count": 26, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"locations_df" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 27, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"regions_df = wr.s3.read_parquet(path=\"s3://ay-dp-dev-test-bucket/test-hr-data-parquet/regions/yyyy=2021/mm=04/dd=22\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 28, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>region_id</th>\n", | |
" <th>region_name</th>\n", | |
" <th>source_file_name</th>\n", | |
" <th>source_file_date</th>\n", | |
" <th>source_file_datetime</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>1</td>\n", | |
" <td>Europe</td>\n", | |
" <td></td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>2</td>\n", | |
" <td>Americas</td>\n", | |
" <td></td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>3</td>\n", | |
" <td>Asia</td>\n", | |
" <td></td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>4</td>\n", | |
" <td>Middle East and Africa</td>\n", | |
" <td></td>\n", | |
" <td>None</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" region_id region_name source_file_name source_file_date \\\n", | |
"0 1 Europe None \n", | |
"1 2 Americas None \n", | |
"2 3 Asia None \n", | |
"3 4 Middle East and Africa None \n", | |
"\n", | |
" source_file_datetime \n", | |
"0 None \n", | |
"1 None \n", | |
"2 None \n", | |
"3 None " | |
] | |
}, | |
"execution_count": 28, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"regions_df" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.6.9" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 4 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment