vvgsrk · April 12, 2020 09:52
diff --git a/Parse_Fixed_Width_Text_File_Using_Pandas.ipynb b/Parse_Fixed_Width_Text_File_Using_Pandas.ipynb
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 251,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Import pandas library as pd\n",
    "import pandas as pd\n",
    "import hashlib\n",
    "import pathlib"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 252,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save file name in a variable so that it can be re used\n",
    "source_file_name = 'HR_Data_20200410.dat'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 253,
   "metadata": {},
   "outputs": [],
   "source": [
    "def genrate_primary_key_per_batch(file_path):\n",
    "    \"\"\"\n",
    "    Generate md5 hash key for every batch and use this key to join header, detail, trailer tables\n",
    "    :param file_path: yq remittance file path on the lambda server\n",
    "    :return: batch_pk: md5 hash as string\n",
    "    \"\"\"\n",
    "    batch_pk = hashlib.md5(pathlib.Path(file_path).read_bytes()).hexdigest()\n",
    "    return batch_pk"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 254,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Invoke the batch key generation function\n",
    "batch_key = genrate_primary_key_per_batch(file_path=source_file_name)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 255,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'2d20553c798d5e2d38a4b4bb5ef18c2f'"
      ]
     },
     "execution_count": 255,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Show batch key\n",
    "batch_key"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 256,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Declare herader columns & widths based on file structure \n",
    "header_columns = ['RECID', 'FILENAME', 'CREATEDTIME']\n",
    "header_widths = [1, 18, 14]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 257,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read HR fixed width text file using pandas library read_fwf method\n",
    "header_df = pd.read_fwf(source_file_name, widths=header_widths, header=None, names=header_columns, nrows=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 258,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Apply batch key to header dataframe\n",
    "header_df['PK_BATCH'] = batch_key"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 259,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "RECID          object\n",
       "FILENAME       object\n",
       "CREATEDTIME     int64\n",
       "PK_BATCH       object\n",
       "dtype: object"
      ]
     },
     "execution_count": 259,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Check data types\n",
    "header_df.dtypes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 260,
   "metadata": {},
   "outputs": [],
   "source": [
    "# convert intger to date time\n",
    "header_df['CREATEDTIME'] = pd.to_datetime(header_df['CREATEDTIME'], format='%Y%m%d%H%M%S')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 261,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "RECID                  object\n",
       "FILENAME               object\n",
       "CREATEDTIME    datetime64[ns]\n",
       "PK_BATCH               object\n",
       "dtype: object"
      ]
     },
     "execution_count": 261,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Check data types after converting and see the converted data type\n",
    "header_df.dtypes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 262,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>RECID</th>\n",
       "      <th>FILENAME</th>\n",
       "      <th>CREATEDTIME</th>\n",
       "      <th>PK_BATCH</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>H</td>\n",
       "      <td>HR-DATA-2020-04-10</td>\n",
       "      <td>2020-04-10 08:29:30</td>\n",
       "      <td>2d20553c798d5e2d38a4b4bb5ef18c2f</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  RECID            FILENAME         CREATEDTIME  \\\n",
       "0     H  HR-DATA-2020-04-10 2020-04-10 08:29:30   \n",
       "\n",
       "                           PK_BATCH  \n",
       "0  2d20553c798d5e2d38a4b4bb5ef18c2f  "
      ]
     },
     "execution_count": 262,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# show content of a pandas dataframe after converting to proper data types\n",
    "header_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 263,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Declare dept herader columns & widths based on file structure \n",
    "dept_columns = ['RECID', 'DEPTNO', 'DNAME', 'LOC']\n",
    "dept_widths = [2, 2, 25, 15]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 264,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read HR fixed width text file using pandas library read_fwf method\n",
    "dept_df = pd.read_fwf(source_file_name, widths=dept_widths, header=None, names=dept_columns)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 265,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Filter dataframe for department records\n",
    "dept_df = dept_df[dept_df['RECID'] == 'DR']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 266,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Apply batch key to dept dataframe (FK stands for Foreign key)\n",
    "dept_df['FK_BATCH'] = batch_key"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 267,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "RECID       object\n",
       "DEPTNO      object\n",
       "DNAME       object\n",
       "LOC         object\n",
       "FK_BATCH    object\n",
       "dtype: object"
      ]
     },
     "execution_count": 267,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# check data types\n",
    "dept_df.dtypes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 268,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Convert deptno column data type to int\n",
    "dept_df['DEPTNO'] = dept_df['DEPTNO'].astype(int)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 269,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "RECID       object\n",
       "DEPTNO       int64\n",
       "DNAME       object\n",
       "LOC         object\n",
       "FK_BATCH    object\n",
       "dtype: object"
      ]
     },
     "execution_count": 269,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Check data types after converting and see the converted data type\n",
    "dept_df.dtypes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 270,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>RECID</th>\n",
       "      <th>DEPTNO</th>\n",
       "      <th>DNAME</th>\n",
       "      <th>LOC</th>\n",
       "      <th>FK_BATCH</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>DR</td>\n",
       "      <td>10</td>\n",
       "      <td>ACCOUNTING</td>\n",
       "      <td>NEW YORK</td>\n",
       "      <td>2d20553c798d5e2d38a4b4bb5ef18c2f</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>DR</td>\n",
       "      <td>20</td>\n",
       "      <td>RESEARCH</td>\n",
       "      <td>DALLAS</td>\n",
       "      <td>2d20553c798d5e2d38a4b4bb5ef18c2f</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>DR</td>\n",
       "      <td>30</td>\n",
       "      <td>SALES</td>\n",
       "      <td>CHICAGO</td>\n",
       "      <td>2d20553c798d5e2d38a4b4bb5ef18c2f</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>DR</td>\n",
       "      <td>40</td>\n",
       "      <td>OPERATIONS</td>\n",
       "      <td>BOSTON</td>\n",
       "      <td>2d20553c798d5e2d38a4b4bb5ef18c2f</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  RECID  DEPTNO       DNAME       LOC                          FK_BATCH\n",
       "1    DR      10  ACCOUNTING  NEW YORK  2d20553c798d5e2d38a4b4bb5ef18c2f\n",
       "2    DR      20    RESEARCH    DALLAS  2d20553c798d5e2d38a4b4bb5ef18c2f\n",
       "3    DR      30       SALES   CHICAGO  2d20553c798d5e2d38a4b4bb5ef18c2f\n",
       "4    DR      40  OPERATIONS    BOSTON  2d20553c798d5e2d38a4b4bb5ef18c2f"
      ]
     },
     "execution_count": 270,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# show content of a pandas dataframe after converting to proper data types\n",
    "dept_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 271,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Declare emp herader columns & widths based on file structure \n",
    "emp_columns = ['RECID', 'EMPNO', 'ENAME', 'JOB', 'MGR', 'HIREDATE', 'SAL', 'COMM', 'DEPTNO']\n",
    "emp_widths = [2, 5, 25, 15, 5, 10, 6, 4, 2]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 272,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read HR fixed width text file using pandas library read_fwf method\n",
    "emp_df = pd.read_fwf(source_file_name, widths=emp_widths, header=None, names=emp_columns)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 273,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Filter dataframe for employee records\n",
    "emp_df = emp_df[emp_df['RECID'] == 'ER']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 274,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Apply batch key to emp dataframe (FK stands for Foreign key)\n",
    "emp_df['FK_BATCH'] = batch_key"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 275,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "RECID        object\n",
       "EMPNO        object\n",
       "ENAME        object\n",
       "JOB          object\n",
       "MGR         float64\n",
       "HIREDATE     object\n",
       "SAL         float64\n",
       "COMM        float64\n",
       "DEPTNO      float64\n",
       "FK_BATCH     object\n",
       "dtype: object"
      ]
     },
     "execution_count": 275,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "emp_df.dtypes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 276,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Convert hiredate column from string to date\n",
    "emp_df['HIREDATE'] = pd.to_datetime(emp_df['HIREDATE'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 279,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Convert empno, mgr, deptno columns from string to int\n",
    "emp_columns_dtypes = {'EMPNO': 'int', 'MGR': 'Int64', 'DEPTNO': 'int'}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 280,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Apply the conversion using astype\n",
    "emp_df = emp_df.astype(emp_columns_dtypes)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 281,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "RECID               object\n",
       "EMPNO                int64\n",
       "ENAME               object\n",
       "JOB                 object\n",
       "MGR                  Int64\n",
       "HIREDATE    datetime64[ns]\n",
       "SAL                float64\n",
       "COMM               float64\n",
       "DEPTNO               int64\n",
       "FK_BATCH            object\n",
       "dtype: object"
      ]
     },
     "execution_count": 281,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Check data types after converting and see the converted data type\n",
    "emp_df.dtypes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 282,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>RECID</th>\n",
       "      <th>EMPNO</th>\n",
       "      <th>ENAME</th>\n",
       "      <th>JOB</th>\n",
       "      <th>MGR</th>\n",
       "      <th>HIREDATE</th>\n",
       "      <th>SAL</th>\n",
       "      <th>COMM</th>\n",
       "      <th>DEPTNO</th>\n",
       "      <th>FK_BATCH</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>ER</td>\n",
       "      <td>7369</td>\n",
       "      <td>SMITH</td>\n",
       "      <td>CLERK</td>\n",
       "      <td>7902</td>\n",
       "      <td>1980-12-17</td>\n",
       "      <td>800.0</td>\n",
       "      <td>0.1</td>\n",
       "      <td>20</td>\n",
       "      <td>2d20553c798d5e2d38a4b4bb5ef18c2f</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>ER</td>\n",
       "      <td>7499</td>\n",
       "      <td>ALLEN</td>\n",
       "      <td>SALESMAN</td>\n",
       "      <td>7698</td>\n",
       "      <td>1981-02-20</td>\n",
       "      <td>1600.0</td>\n",
       "      <td>300.0</td>\n",
       "      <td>30</td>\n",
       "      <td>2d20553c798d5e2d38a4b4bb5ef18c2f</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>ER</td>\n",
       "      <td>7521</td>\n",
       "      <td>WARD</td>\n",
       "      <td>SALESMAN</td>\n",
       "      <td>7698</td>\n",
       "      <td>1981-02-22</td>\n",
       "      <td>1250.0</td>\n",
       "      <td>500.0</td>\n",
       "      <td>30</td>\n",
       "      <td>2d20553c798d5e2d38a4b4bb5ef18c2f</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>ER</td>\n",
       "      <td>7566</td>\n",
       "      <td>JONES</td>\n",
       "      <td>MANAGER</td>\n",
       "      <td>7839</td>\n",
       "      <td>1981-02-04</td>\n",
       "      <td>2975.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>20</td>\n",
       "      <td>2d20553c798d5e2d38a4b4bb5ef18c2f</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>ER</td>\n",
       "      <td>7654</td>\n",
       "      <td>MARTIN</td>\n",
       "      <td>SALESMAN</td>\n",
       "      <td>7698</td>\n",
       "      <td>1981-09-28</td>\n",
       "      <td>1250.0</td>\n",
       "      <td>1400.0</td>\n",
       "      <td>30</td>\n",
       "      <td>2d20553c798d5e2d38a4b4bb5ef18c2f</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>ER</td>\n",
       "      <td>7698</td>\n",
       "      <td>BLAKE</td>\n",
       "      <td>MANAGER</td>\n",
       "      <td>7839</td>\n",
       "      <td>1981-01-05</td>\n",
       "      <td>2850.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>30</td>\n",
       "      <td>2d20553c798d5e2d38a4b4bb5ef18c2f</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>ER</td>\n",
       "      <td>7782</td>\n",
       "      <td>CLARK</td>\n",
       "      <td>MANAGER</td>\n",
       "      <td>7839</td>\n",
       "      <td>1981-09-06</td>\n",
       "      <td>2450.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>10</td>\n",
       "      <td>2d20553c798d5e2d38a4b4bb5ef18c2f</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>ER</td>\n",
       "      <td>7788</td>\n",
       "      <td>SCOTT</td>\n",
       "      <td>ANALYST</td>\n",
       "      <td>7566</td>\n",
       "      <td>1982-09-12</td>\n",
       "      <td>3000.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>20</td>\n",
       "      <td>2d20553c798d5e2d38a4b4bb5ef18c2f</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>ER</td>\n",
       "      <td>7839</td>\n",
       "      <td>KING</td>\n",
       "      <td>PRESIDENT</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>1981-11-17</td>\n",
       "      <td>5000.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>10</td>\n",
       "      <td>2d20553c798d5e2d38a4b4bb5ef18c2f</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>ER</td>\n",
       "      <td>7844</td>\n",
       "      <td>TURNER</td>\n",
       "      <td>SALESMAN</td>\n",
       "      <td>7698</td>\n",
       "      <td>1981-08-09</td>\n",
       "      <td>1500.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>30</td>\n",
       "      <td>2d20553c798d5e2d38a4b4bb5ef18c2f</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>ER</td>\n",
       "      <td>7876</td>\n",
       "      <td>ADAMS</td>\n",
       "      <td>CLERK</td>\n",
       "      <td>7788</td>\n",
       "      <td>1983-12-01</td>\n",
       "      <td>1100.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>20</td>\n",
       "      <td>2d20553c798d5e2d38a4b4bb5ef18c2f</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>ER</td>\n",
       "      <td>7900</td>\n",
       "      <td>JAMES</td>\n",
       "      <td>CLERK</td>\n",
       "      <td>7698</td>\n",
       "      <td>1981-03-12</td>\n",
       "      <td>950.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>30</td>\n",
       "      <td>2d20553c798d5e2d38a4b4bb5ef18c2f</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>ER</td>\n",
       "      <td>7902</td>\n",
       "      <td>FORD</td>\n",
       "      <td>ANALYST</td>\n",
       "      <td>7566</td>\n",
       "      <td>1981-03-12</td>\n",
       "      <td>3000.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>20</td>\n",
       "      <td>2d20553c798d5e2d38a4b4bb5ef18c2f</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>ER</td>\n",
       "      <td>7934</td>\n",
       "      <td>MILLER</td>\n",
       "      <td>CLERK</td>\n",
       "      <td>7782</td>\n",
       "      <td>1982-01-23</td>\n",
       "      <td>1300.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>10</td>\n",
       "      <td>2d20553c798d5e2d38a4b4bb5ef18c2f</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   RECID  EMPNO   ENAME        JOB   MGR   HIREDATE     SAL    COMM  DEPTNO  \\\n",
       "5     ER   7369   SMITH      CLERK  7902 1980-12-17   800.0     0.1      20   \n",
       "6     ER   7499   ALLEN   SALESMAN  7698 1981-02-20  1600.0   300.0      30   \n",
       "7     ER   7521    WARD   SALESMAN  7698 1981-02-22  1250.0   500.0      30   \n",
       "8     ER   7566   JONES    MANAGER  7839 1981-02-04  2975.0     NaN      20   \n",
       "9     ER   7654  MARTIN   SALESMAN  7698 1981-09-28  1250.0  1400.0      30   \n",
       "10    ER   7698   BLAKE    MANAGER  7839 1981-01-05  2850.0     NaN      30   \n",
       "11    ER   7782   CLARK    MANAGER  7839 1981-09-06  2450.0     NaN      10   \n",
       "12    ER   7788   SCOTT    ANALYST  7566 1982-09-12  3000.0     NaN      20   \n",
       "13    ER   7839    KING  PRESIDENT  <NA> 1981-11-17  5000.0     NaN      10   \n",
       "14    ER   7844  TURNER   SALESMAN  7698 1981-08-09  1500.0     NaN      30   \n",
       "15    ER   7876   ADAMS      CLERK  7788 1983-12-01  1100.0     NaN      20   \n",
       "16    ER   7900   JAMES      CLERK  7698 1981-03-12   950.0     NaN      30   \n",
       "17    ER   7902    FORD    ANALYST  7566 1981-03-12  3000.0     NaN      20   \n",
       "18    ER   7934  MILLER      CLERK  7782 1982-01-23  1300.0     NaN      10   \n",
       "\n",
       "                            FK_BATCH  \n",
       "5   2d20553c798d5e2d38a4b4bb5ef18c2f  \n",
       "6   2d20553c798d5e2d38a4b4bb5ef18c2f  \n",
       "7   2d20553c798d5e2d38a4b4bb5ef18c2f  \n",
       "8   2d20553c798d5e2d38a4b4bb5ef18c2f  \n",
       "9   2d20553c798d5e2d38a4b4bb5ef18c2f  \n",
       "10  2d20553c798d5e2d38a4b4bb5ef18c2f  \n",
       "11  2d20553c798d5e2d38a4b4bb5ef18c2f  \n",
       "12  2d20553c798d5e2d38a4b4bb5ef18c2f  \n",
       "13  2d20553c798d5e2d38a4b4bb5ef18c2f  \n",
       "14  2d20553c798d5e2d38a4b4bb5ef18c2f  \n",
       "15  2d20553c798d5e2d38a4b4bb5ef18c2f  \n",
       "16  2d20553c798d5e2d38a4b4bb5ef18c2f  \n",
       "17  2d20553c798d5e2d38a4b4bb5ef18c2f  \n",
       "18  2d20553c798d5e2d38a4b4bb5ef18c2f  "
      ]
     },
     "execution_count": 282,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# show content of a pandas dataframe after converting to proper data types\n",
    "emp_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 283,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Declare trailer columns & widths based on file structure \n",
    "trailer_columns = ['RECID', 'ROWCOUNT']\n",
    "trailer_widths = [1, 5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 284,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read HR fixed width text file using pandas library read_fwf method\n",
    "trailer_df = pd.read_fwf(source_file_name, widths=trailer_widths, header=None, names=trailer_columns).tail(1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 285,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Apply batch key to trailer dataframe (FK stands for Foreign key)\n",
    "trailer_df['FK_BATCH'] = batch_key"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 286,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "RECID       object\n",
       "ROWCOUNT    object\n",
       "FK_BATCH    object\n",
       "dtype: object"
      ]
     },
     "execution_count": 286,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Check data types\n",
    "trailer_df.dtypes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 287,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Convert rowcount column from string to int\n",
    "trailer_dtypes = {'ROWCOUNT': int}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 289,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Apply the conversion using astype\n",
    "trailer_df = trailer_df.astype(trailer_dtypes)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 290,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>RECID</th>\n",
       "      <th>ROWCOUNT</th>\n",
       "      <th>FK_BATCH</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>T</td>\n",
       "      <td>18</td>\n",
       "      <td>2d20553c798d5e2d38a4b4bb5ef18c2f</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   RECID  ROWCOUNT                          FK_BATCH\n",
       "19     T        18  2d20553c798d5e2d38a4b4bb5ef18c2f"
      ]
     },
     "execution_count": 290,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# show content of a pandas dataframe after converting to proper data types\n",
    "trailer_df"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
 }
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 251,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Import pandas library as pd\n",
	"import pandas as pd\n",
	"import hashlib\n",
	"import pathlib"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 252,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Save file name in a variable so that it can be re used\n",
	"source_file_name = 'HR_Data_20200410.dat'"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 253,
	"metadata": {},
	"outputs": [],
	"source": [
	"def genrate_primary_key_per_batch(file_path):\n",
	" \"\"\"\n",
	" Generate md5 hash key for every batch and use this key to join header, detail, trailer tables\n",
	" :param file_path: yq remittance file path on the lambda server\n",
	" :return: batch_pk: md5 hash as string\n",
	" \"\"\"\n",
	" batch_pk = hashlib.md5(pathlib.Path(file_path).read_bytes()).hexdigest()\n",
	" return batch_pk"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 254,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Invoke the batch key generation function\n",
	"batch_key = genrate_primary_key_per_batch(file_path=source_file_name)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 255,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"'2d20553c798d5e2d38a4b4bb5ef18c2f'"
	]
	},
	"execution_count": 255,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# Show batch key\n",
	"batch_key"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 256,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Declare herader columns & widths based on file structure \n",
	"header_columns = ['RECID', 'FILENAME', 'CREATEDTIME']\n",
	"header_widths = [1, 18, 14]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 257,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Read HR fixed width text file using pandas library read_fwf method\n",
	"header_df = pd.read_fwf(source_file_name, widths=header_widths, header=None, names=header_columns, nrows=1)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 258,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Apply batch key to header dataframe\n",
	"header_df['PK_BATCH'] = batch_key"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 259,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"RECID object\n",
	"FILENAME object\n",
	"CREATEDTIME int64\n",
	"PK_BATCH object\n",
	"dtype: object"
	]
	},
	"execution_count": 259,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# Check data types\n",
	"header_df.dtypes"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 260,
	"metadata": {},
	"outputs": [],
	"source": [
	"# convert intger to date time\n",
	"header_df['CREATEDTIME'] = pd.to_datetime(header_df['CREATEDTIME'], format='%Y%m%d%H%M%S')"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 261,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"RECID object\n",
	"FILENAME object\n",
	"CREATEDTIME datetime64[ns]\n",
	"PK_BATCH object\n",
	"dtype: object"
	]
	},
	"execution_count": 261,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# Check data types after converting and see the converted data type\n",
	"header_df.dtypes"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 262,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<style scoped>\n",
	" .dataframe tbody tr th:only-of-type {\n",
	" vertical-align: middle;\n",
	" }\n",
	"\n",
	" .dataframe tbody tr th {\n",
	" vertical-align: top;\n",
	" }\n",
	"\n",
	" .dataframe thead th {\n",
	" text-align: right;\n",
	" }\n",
	"</style>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>RECID</th>\n",
	" <th>FILENAME</th>\n",
	" <th>CREATEDTIME</th>\n",
	" <th>PK_BATCH</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>0</th>\n",
	" <td>H</td>\n",
	" <td>HR-DATA-2020-04-10</td>\n",
	" <td>2020-04-10 08:29:30</td>\n",
	" <td>2d20553c798d5e2d38a4b4bb5ef18c2f</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"text/plain": [
	" RECID FILENAME CREATEDTIME \\\n",
	"0 H HR-DATA-2020-04-10 2020-04-10 08:29:30 \n",
	"\n",
	" PK_BATCH \n",
	"0 2d20553c798d5e2d38a4b4bb5ef18c2f "
	]
	},
	"execution_count": 262,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# show content of a pandas dataframe after converting to proper data types\n",
	"header_df"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 263,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Declare dept herader columns & widths based on file structure \n",
	"dept_columns = ['RECID', 'DEPTNO', 'DNAME', 'LOC']\n",
	"dept_widths = [2, 2, 25, 15]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 264,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Read HR fixed width text file using pandas library read_fwf method\n",
	"dept_df = pd.read_fwf(source_file_name, widths=dept_widths, header=None, names=dept_columns)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 265,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Filter dataframe for department records\n",
	"dept_df = dept_df[dept_df['RECID'] == 'DR']"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 266,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Apply batch key to dept dataframe (FK stands for Foreign key)\n",
	"dept_df['FK_BATCH'] = batch_key"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 267,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"RECID object\n",
	"DEPTNO object\n",
	"DNAME object\n",
	"LOC object\n",
	"FK_BATCH object\n",
	"dtype: object"
	]
	},
	"execution_count": 267,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# check data types\n",
	"dept_df.dtypes"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 268,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Convert deptno column data type to int\n",
	"dept_df['DEPTNO'] = dept_df['DEPTNO'].astype(int)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 269,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"RECID object\n",
	"DEPTNO int64\n",
	"DNAME object\n",
	"LOC object\n",
	"FK_BATCH object\n",
	"dtype: object"
	]
	},
	"execution_count": 269,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# Check data types after converting and see the converted data type\n",
	"dept_df.dtypes"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 270,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<style scoped>\n",
	" .dataframe tbody tr th:only-of-type {\n",
	" vertical-align: middle;\n",
	" }\n",
	"\n",
	" .dataframe tbody tr th {\n",
	" vertical-align: top;\n",
	" }\n",
	"\n",
	" .dataframe thead th {\n",
	" text-align: right;\n",
	" }\n",
	"</style>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>RECID</th>\n",
	" <th>DEPTNO</th>\n",
	" <th>DNAME</th>\n",
	" <th>LOC</th>\n",
	" <th>FK_BATCH</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>1</th>\n",
	" <td>DR</td>\n",
	" <td>10</td>\n",
	" <td>ACCOUNTING</td>\n",
	" <td>NEW YORK</td>\n",
	" <td>2d20553c798d5e2d38a4b4bb5ef18c2f</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>2</th>\n",
	" <td>DR</td>\n",
	" <td>20</td>\n",
	" <td>RESEARCH</td>\n",
	" <td>DALLAS</td>\n",
	" <td>2d20553c798d5e2d38a4b4bb5ef18c2f</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>3</th>\n",
	" <td>DR</td>\n",
	" <td>30</td>\n",
	" <td>SALES</td>\n",
	" <td>CHICAGO</td>\n",
	" <td>2d20553c798d5e2d38a4b4bb5ef18c2f</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>4</th>\n",
	" <td>DR</td>\n",
	" <td>40</td>\n",
	" <td>OPERATIONS</td>\n",
	" <td>BOSTON</td>\n",
	" <td>2d20553c798d5e2d38a4b4bb5ef18c2f</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"text/plain": [
	" RECID DEPTNO DNAME LOC FK_BATCH\n",
	"1 DR 10 ACCOUNTING NEW YORK 2d20553c798d5e2d38a4b4bb5ef18c2f\n",
	"2 DR 20 RESEARCH DALLAS 2d20553c798d5e2d38a4b4bb5ef18c2f\n",
	"3 DR 30 SALES CHICAGO 2d20553c798d5e2d38a4b4bb5ef18c2f\n",
	"4 DR 40 OPERATIONS BOSTON 2d20553c798d5e2d38a4b4bb5ef18c2f"
	]
	},
	"execution_count": 270,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# show content of a pandas dataframe after converting to proper data types\n",
	"dept_df"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 271,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Declare emp herader columns & widths based on file structure \n",
	"emp_columns = ['RECID', 'EMPNO', 'ENAME', 'JOB', 'MGR', 'HIREDATE', 'SAL', 'COMM', 'DEPTNO']\n",
	"emp_widths = [2, 5, 25, 15, 5, 10, 6, 4, 2]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 272,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Read HR fixed width text file using pandas library read_fwf method\n",
	"emp_df = pd.read_fwf(source_file_name, widths=emp_widths, header=None, names=emp_columns)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 273,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Filter dataframe for employee records\n",
	"emp_df = emp_df[emp_df['RECID'] == 'ER']"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 274,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Apply batch key to emp dataframe (FK stands for Foreign key)\n",
	"emp_df['FK_BATCH'] = batch_key"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 275,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"RECID object\n",
	"EMPNO object\n",
	"ENAME object\n",
	"JOB object\n",
	"MGR float64\n",
	"HIREDATE object\n",
	"SAL float64\n",
	"COMM float64\n",
	"DEPTNO float64\n",
	"FK_BATCH object\n",
	"dtype: object"
	]
	},
	"execution_count": 275,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"emp_df.dtypes"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 276,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Convert hiredate column from string to date\n",
	"emp_df['HIREDATE'] = pd.to_datetime(emp_df['HIREDATE'])"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 279,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Convert empno, mgr, deptno columns from string to int\n",
	"emp_columns_dtypes = {'EMPNO': 'int', 'MGR': 'Int64', 'DEPTNO': 'int'}"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 280,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Apply the conversion using astype\n",
	"emp_df = emp_df.astype(emp_columns_dtypes)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 281,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"RECID object\n",
	"EMPNO int64\n",
	"ENAME object\n",
	"JOB object\n",
	"MGR Int64\n",
	"HIREDATE datetime64[ns]\n",
	"SAL float64\n",
	"COMM float64\n",
	"DEPTNO int64\n",
	"FK_BATCH object\n",
	"dtype: object"
	]
	},
	"execution_count": 281,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# Check data types after converting and see the converted data type\n",
	"emp_df.dtypes"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 282,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<style scoped>\n",
	" .dataframe tbody tr th:only-of-type {\n",
	" vertical-align: middle;\n",
	" }\n",
	"\n",
	" .dataframe tbody tr th {\n",
	" vertical-align: top;\n",
	" }\n",
	"\n",
	" .dataframe thead th {\n",
	" text-align: right;\n",
	" }\n",
	"</style>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>RECID</th>\n",
	" <th>EMPNO</th>\n",
	" <th>ENAME</th>\n",
	" <th>JOB</th>\n",
	" <th>MGR</th>\n",
	" <th>HIREDATE</th>\n",
	" <th>SAL</th>\n",
	" <th>COMM</th>\n",
	" <th>DEPTNO</th>\n",
	" <th>FK_BATCH</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>5</th>\n",
	" <td>ER</td>\n",
	" <td>7369</td>\n",
	" <td>SMITH</td>\n",
	" <td>CLERK</td>\n",
	" <td>7902</td>\n",
	" <td>1980-12-17</td>\n",
	" <td>800.0</td>\n",
	" <td>0.1</td>\n",
	" <td>20</td>\n",
	" <td>2d20553c798d5e2d38a4b4bb5ef18c2f</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>6</th>\n",
	" <td>ER</td>\n",
	" <td>7499</td>\n",
	" <td>ALLEN</td>\n",
	" <td>SALESMAN</td>\n",
	" <td>7698</td>\n",
	" <td>1981-02-20</td>\n",
	" <td>1600.0</td>\n",
	" <td>300.0</td>\n",
	" <td>30</td>\n",
	" <td>2d20553c798d5e2d38a4b4bb5ef18c2f</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>7</th>\n",
	" <td>ER</td>\n",
	" <td>7521</td>\n",
	" <td>WARD</td>\n",
	" <td>SALESMAN</td>\n",
	" <td>7698</td>\n",
	" <td>1981-02-22</td>\n",
	" <td>1250.0</td>\n",
	" <td>500.0</td>\n",
	" <td>30</td>\n",
	" <td>2d20553c798d5e2d38a4b4bb5ef18c2f</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>8</th>\n",
	" <td>ER</td>\n",
	" <td>7566</td>\n",
	" <td>JONES</td>\n",
	" <td>MANAGER</td>\n",
	" <td>7839</td>\n",
	" <td>1981-02-04</td>\n",
	" <td>2975.0</td>\n",
	" <td>NaN</td>\n",
	" <td>20</td>\n",
	" <td>2d20553c798d5e2d38a4b4bb5ef18c2f</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>9</th>\n",
	" <td>ER</td>\n",
	" <td>7654</td>\n",
	" <td>MARTIN</td>\n",
	" <td>SALESMAN</td>\n",
	" <td>7698</td>\n",
	" <td>1981-09-28</td>\n",
	" <td>1250.0</td>\n",
	" <td>1400.0</td>\n",
	" <td>30</td>\n",
	" <td>2d20553c798d5e2d38a4b4bb5ef18c2f</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>10</th>\n",
	" <td>ER</td>\n",
	" <td>7698</td>\n",
	" <td>BLAKE</td>\n",
	" <td>MANAGER</td>\n",
	" <td>7839</td>\n",
	" <td>1981-01-05</td>\n",
	" <td>2850.0</td>\n",
	" <td>NaN</td>\n",
	" <td>30</td>\n",
	" <td>2d20553c798d5e2d38a4b4bb5ef18c2f</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>11</th>\n",
	" <td>ER</td>\n",
	" <td>7782</td>\n",
	" <td>CLARK</td>\n",
	" <td>MANAGER</td>\n",
	" <td>7839</td>\n",
	" <td>1981-09-06</td>\n",
	" <td>2450.0</td>\n",
	" <td>NaN</td>\n",
	" <td>10</td>\n",
	" <td>2d20553c798d5e2d38a4b4bb5ef18c2f</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>12</th>\n",
	" <td>ER</td>\n",
	" <td>7788</td>\n",
	" <td>SCOTT</td>\n",
	" <td>ANALYST</td>\n",
	" <td>7566</td>\n",
	" <td>1982-09-12</td>\n",
	" <td>3000.0</td>\n",
	" <td>NaN</td>\n",
	" <td>20</td>\n",
	" <td>2d20553c798d5e2d38a4b4bb5ef18c2f</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>13</th>\n",
	" <td>ER</td>\n",
	" <td>7839</td>\n",
	" <td>KING</td>\n",
	" <td>PRESIDENT</td>\n",
	" <td><NA></td>\n",
	" <td>1981-11-17</td>\n",
	" <td>5000.0</td>\n",
	" <td>NaN</td>\n",
	" <td>10</td>\n",
	" <td>2d20553c798d5e2d38a4b4bb5ef18c2f</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>14</th>\n",
	" <td>ER</td>\n",
	" <td>7844</td>\n",
	" <td>TURNER</td>\n",
	" <td>SALESMAN</td>\n",
	" <td>7698</td>\n",
	" <td>1981-08-09</td>\n",
	" <td>1500.0</td>\n",
	" <td>NaN</td>\n",
	" <td>30</td>\n",
	" <td>2d20553c798d5e2d38a4b4bb5ef18c2f</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>15</th>\n",
	" <td>ER</td>\n",
	" <td>7876</td>\n",
	" <td>ADAMS</td>\n",
	" <td>CLERK</td>\n",
	" <td>7788</td>\n",
	" <td>1983-12-01</td>\n",
	" <td>1100.0</td>\n",
	" <td>NaN</td>\n",
	" <td>20</td>\n",
	" <td>2d20553c798d5e2d38a4b4bb5ef18c2f</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>16</th>\n",
	" <td>ER</td>\n",
	" <td>7900</td>\n",
	" <td>JAMES</td>\n",
	" <td>CLERK</td>\n",
	" <td>7698</td>\n",
	" <td>1981-03-12</td>\n",
	" <td>950.0</td>\n",
	" <td>NaN</td>\n",
	" <td>30</td>\n",
	" <td>2d20553c798d5e2d38a4b4bb5ef18c2f</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>17</th>\n",
	" <td>ER</td>\n",
	" <td>7902</td>\n",
	" <td>FORD</td>\n",
	" <td>ANALYST</td>\n",
	" <td>7566</td>\n",
	" <td>1981-03-12</td>\n",
	" <td>3000.0</td>\n",
	" <td>NaN</td>\n",
	" <td>20</td>\n",
	" <td>2d20553c798d5e2d38a4b4bb5ef18c2f</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>18</th>\n",
	" <td>ER</td>\n",
	" <td>7934</td>\n",
	" <td>MILLER</td>\n",
	" <td>CLERK</td>\n",
	" <td>7782</td>\n",
	" <td>1982-01-23</td>\n",
	" <td>1300.0</td>\n",
	" <td>NaN</td>\n",
	" <td>10</td>\n",
	" <td>2d20553c798d5e2d38a4b4bb5ef18c2f</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"text/plain": [
	" RECID EMPNO ENAME JOB MGR HIREDATE SAL COMM DEPTNO \\\n",
	"5 ER 7369 SMITH CLERK 7902 1980-12-17 800.0 0.1 20 \n",
	"6 ER 7499 ALLEN SALESMAN 7698 1981-02-20 1600.0 300.0 30 \n",
	"7 ER 7521 WARD SALESMAN 7698 1981-02-22 1250.0 500.0 30 \n",
	"8 ER 7566 JONES MANAGER 7839 1981-02-04 2975.0 NaN 20 \n",
	"9 ER 7654 MARTIN SALESMAN 7698 1981-09-28 1250.0 1400.0 30 \n",
	"10 ER 7698 BLAKE MANAGER 7839 1981-01-05 2850.0 NaN 30 \n",
	"11 ER 7782 CLARK MANAGER 7839 1981-09-06 2450.0 NaN 10 \n",
	"12 ER 7788 SCOTT ANALYST 7566 1982-09-12 3000.0 NaN 20 \n",
	"13 ER 7839 KING PRESIDENT <NA> 1981-11-17 5000.0 NaN 10 \n",
	"14 ER 7844 TURNER SALESMAN 7698 1981-08-09 1500.0 NaN 30 \n",
	"15 ER 7876 ADAMS CLERK 7788 1983-12-01 1100.0 NaN 20 \n",
	"16 ER 7900 JAMES CLERK 7698 1981-03-12 950.0 NaN 30 \n",
	"17 ER 7902 FORD ANALYST 7566 1981-03-12 3000.0 NaN 20 \n",
	"18 ER 7934 MILLER CLERK 7782 1982-01-23 1300.0 NaN 10 \n",
	"\n",
	" FK_BATCH \n",
	"5 2d20553c798d5e2d38a4b4bb5ef18c2f \n",
	"6 2d20553c798d5e2d38a4b4bb5ef18c2f \n",
	"7 2d20553c798d5e2d38a4b4bb5ef18c2f \n",
	"8 2d20553c798d5e2d38a4b4bb5ef18c2f \n",
	"9 2d20553c798d5e2d38a4b4bb5ef18c2f \n",
	"10 2d20553c798d5e2d38a4b4bb5ef18c2f \n",
	"11 2d20553c798d5e2d38a4b4bb5ef18c2f \n",
	"12 2d20553c798d5e2d38a4b4bb5ef18c2f \n",
	"13 2d20553c798d5e2d38a4b4bb5ef18c2f \n",
	"14 2d20553c798d5e2d38a4b4bb5ef18c2f \n",
	"15 2d20553c798d5e2d38a4b4bb5ef18c2f \n",
	"16 2d20553c798d5e2d38a4b4bb5ef18c2f \n",
	"17 2d20553c798d5e2d38a4b4bb5ef18c2f \n",
	"18 2d20553c798d5e2d38a4b4bb5ef18c2f "
	]
	},
	"execution_count": 282,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# show content of a pandas dataframe after converting to proper data types\n",
	"emp_df"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 283,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Declare trailer columns & widths based on file structure \n",
	"trailer_columns = ['RECID', 'ROWCOUNT']\n",
	"trailer_widths = [1, 5]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 284,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Read HR fixed width text file using pandas library read_fwf method\n",
	"trailer_df = pd.read_fwf(source_file_name, widths=trailer_widths, header=None, names=trailer_columns).tail(1)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 285,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Apply batch key to trailer dataframe (FK stands for Foreign key)\n",
	"trailer_df['FK_BATCH'] = batch_key"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 286,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"RECID object\n",
	"ROWCOUNT object\n",
	"FK_BATCH object\n",
	"dtype: object"
	]
	},
	"execution_count": 286,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# Check data types\n",
	"trailer_df.dtypes"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 287,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Convert rowcount column from string to int\n",
	"trailer_dtypes = {'ROWCOUNT': int}"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 289,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Apply the conversion using astype\n",
	"trailer_df = trailer_df.astype(trailer_dtypes)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 290,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<style scoped>\n",
	" .dataframe tbody tr th:only-of-type {\n",
	" vertical-align: middle;\n",
	" }\n",
	"\n",
	" .dataframe tbody tr th {\n",
	" vertical-align: top;\n",
	" }\n",
	"\n",
	" .dataframe thead th {\n",
	" text-align: right;\n",
	" }\n",
	"</style>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>RECID</th>\n",
	" <th>ROWCOUNT</th>\n",
	" <th>FK_BATCH</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>19</th>\n",
	" <td>T</td>\n",
	" <td>18</td>\n",
	" <td>2d20553c798d5e2d38a4b4bb5ef18c2f</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"text/plain": [
	" RECID ROWCOUNT FK_BATCH\n",
	"19 T 18 2d20553c798d5e2d38a4b4bb5ef18c2f"
	]
	},
	"execution_count": 290,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# show content of a pandas dataframe after converting to proper data types\n",
	"trailer_df"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.6.9"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 4
	}