Created
April 12, 2020 09:52
-
-
Save vvgsrk/0072a41a59145931d64db705a2bf8d32 to your computer and use it in GitHub Desktop.
Parse fixed width text file using Pandas library read_fwf method
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 251, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Import pandas library as pd\n", | |
"import pandas as pd\n", | |
"import hashlib\n", | |
"import pathlib" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 252, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Save file name in a variable so that it can be re used\n", | |
"source_file_name = 'HR_Data_20200410.dat'" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 253, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def genrate_primary_key_per_batch(file_path):\n", | |
" \"\"\"\n", | |
" Generate md5 hash key for every batch and use this key to join header, detail, trailer tables\n", | |
" :param file_path: yq remittance file path on the lambda server\n", | |
" :return: batch_pk: md5 hash as string\n", | |
" \"\"\"\n", | |
" batch_pk = hashlib.md5(pathlib.Path(file_path).read_bytes()).hexdigest()\n", | |
" return batch_pk" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 254, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Invoke the batch key generation function\n", | |
"batch_key = genrate_primary_key_per_batch(file_path=source_file_name)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 255, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"'2d20553c798d5e2d38a4b4bb5ef18c2f'" | |
] | |
}, | |
"execution_count": 255, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Show batch key\n", | |
"batch_key" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 256, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Declare herader columns & widths based on file structure \n", | |
"header_columns = ['RECID', 'FILENAME', 'CREATEDTIME']\n", | |
"header_widths = [1, 18, 14]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 257, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Read HR fixed width text file using pandas library read_fwf method\n", | |
"header_df = pd.read_fwf(source_file_name, widths=header_widths, header=None, names=header_columns, nrows=1)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 258, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Apply batch key to header dataframe\n", | |
"header_df['PK_BATCH'] = batch_key" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 259, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"RECID object\n", | |
"FILENAME object\n", | |
"CREATEDTIME int64\n", | |
"PK_BATCH object\n", | |
"dtype: object" | |
] | |
}, | |
"execution_count": 259, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Check data types\n", | |
"header_df.dtypes" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 260, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# convert intger to date time\n", | |
"header_df['CREATEDTIME'] = pd.to_datetime(header_df['CREATEDTIME'], format='%Y%m%d%H%M%S')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 261, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"RECID object\n", | |
"FILENAME object\n", | |
"CREATEDTIME datetime64[ns]\n", | |
"PK_BATCH object\n", | |
"dtype: object" | |
] | |
}, | |
"execution_count": 261, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Check data types after converting and see the converted data type\n", | |
"header_df.dtypes" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 262, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>RECID</th>\n", | |
" <th>FILENAME</th>\n", | |
" <th>CREATEDTIME</th>\n", | |
" <th>PK_BATCH</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>H</td>\n", | |
" <td>HR-DATA-2020-04-10</td>\n", | |
" <td>2020-04-10 08:29:30</td>\n", | |
" <td>2d20553c798d5e2d38a4b4bb5ef18c2f</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" RECID FILENAME CREATEDTIME \\\n", | |
"0 H HR-DATA-2020-04-10 2020-04-10 08:29:30 \n", | |
"\n", | |
" PK_BATCH \n", | |
"0 2d20553c798d5e2d38a4b4bb5ef18c2f " | |
] | |
}, | |
"execution_count": 262, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# show content of a pandas dataframe after converting to proper data types\n", | |
"header_df" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 263, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Declare dept herader columns & widths based on file structure \n", | |
"dept_columns = ['RECID', 'DEPTNO', 'DNAME', 'LOC']\n", | |
"dept_widths = [2, 2, 25, 15]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 264, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Read HR fixed width text file using pandas library read_fwf method\n", | |
"dept_df = pd.read_fwf(source_file_name, widths=dept_widths, header=None, names=dept_columns)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 265, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Filter dataframe for department records\n", | |
"dept_df = dept_df[dept_df['RECID'] == 'DR']" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 266, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Apply batch key to dept dataframe (FK stands for Foreign key)\n", | |
"dept_df['FK_BATCH'] = batch_key" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 267, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"RECID object\n", | |
"DEPTNO object\n", | |
"DNAME object\n", | |
"LOC object\n", | |
"FK_BATCH object\n", | |
"dtype: object" | |
] | |
}, | |
"execution_count": 267, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# check data types\n", | |
"dept_df.dtypes" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 268, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Convert deptno column data type to int\n", | |
"dept_df['DEPTNO'] = dept_df['DEPTNO'].astype(int)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 269, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"RECID object\n", | |
"DEPTNO int64\n", | |
"DNAME object\n", | |
"LOC object\n", | |
"FK_BATCH object\n", | |
"dtype: object" | |
] | |
}, | |
"execution_count": 269, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Check data types after converting and see the converted data type\n", | |
"dept_df.dtypes" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 270, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>RECID</th>\n", | |
" <th>DEPTNO</th>\n", | |
" <th>DNAME</th>\n", | |
" <th>LOC</th>\n", | |
" <th>FK_BATCH</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>DR</td>\n", | |
" <td>10</td>\n", | |
" <td>ACCOUNTING</td>\n", | |
" <td>NEW YORK</td>\n", | |
" <td>2d20553c798d5e2d38a4b4bb5ef18c2f</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>DR</td>\n", | |
" <td>20</td>\n", | |
" <td>RESEARCH</td>\n", | |
" <td>DALLAS</td>\n", | |
" <td>2d20553c798d5e2d38a4b4bb5ef18c2f</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>DR</td>\n", | |
" <td>30</td>\n", | |
" <td>SALES</td>\n", | |
" <td>CHICAGO</td>\n", | |
" <td>2d20553c798d5e2d38a4b4bb5ef18c2f</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>DR</td>\n", | |
" <td>40</td>\n", | |
" <td>OPERATIONS</td>\n", | |
" <td>BOSTON</td>\n", | |
" <td>2d20553c798d5e2d38a4b4bb5ef18c2f</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" RECID DEPTNO DNAME LOC FK_BATCH\n", | |
"1 DR 10 ACCOUNTING NEW YORK 2d20553c798d5e2d38a4b4bb5ef18c2f\n", | |
"2 DR 20 RESEARCH DALLAS 2d20553c798d5e2d38a4b4bb5ef18c2f\n", | |
"3 DR 30 SALES CHICAGO 2d20553c798d5e2d38a4b4bb5ef18c2f\n", | |
"4 DR 40 OPERATIONS BOSTON 2d20553c798d5e2d38a4b4bb5ef18c2f" | |
] | |
}, | |
"execution_count": 270, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# show content of a pandas dataframe after converting to proper data types\n", | |
"dept_df" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 271, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Declare emp herader columns & widths based on file structure \n", | |
"emp_columns = ['RECID', 'EMPNO', 'ENAME', 'JOB', 'MGR', 'HIREDATE', 'SAL', 'COMM', 'DEPTNO']\n", | |
"emp_widths = [2, 5, 25, 15, 5, 10, 6, 4, 2]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 272, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Read HR fixed width text file using pandas library read_fwf method\n", | |
"emp_df = pd.read_fwf(source_file_name, widths=emp_widths, header=None, names=emp_columns)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 273, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Filter dataframe for employee records\n", | |
"emp_df = emp_df[emp_df['RECID'] == 'ER']" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 274, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Apply batch key to emp dataframe (FK stands for Foreign key)\n", | |
"emp_df['FK_BATCH'] = batch_key" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 275, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"RECID object\n", | |
"EMPNO object\n", | |
"ENAME object\n", | |
"JOB object\n", | |
"MGR float64\n", | |
"HIREDATE object\n", | |
"SAL float64\n", | |
"COMM float64\n", | |
"DEPTNO float64\n", | |
"FK_BATCH object\n", | |
"dtype: object" | |
] | |
}, | |
"execution_count": 275, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"emp_df.dtypes" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 276, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Convert hiredate column from string to date\n", | |
"emp_df['HIREDATE'] = pd.to_datetime(emp_df['HIREDATE'])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 279, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Convert empno, mgr, deptno columns from string to int\n", | |
"emp_columns_dtypes = {'EMPNO': 'int', 'MGR': 'Int64', 'DEPTNO': 'int'}" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 280, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Apply the conversion using astype\n", | |
"emp_df = emp_df.astype(emp_columns_dtypes)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 281, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"RECID object\n", | |
"EMPNO int64\n", | |
"ENAME object\n", | |
"JOB object\n", | |
"MGR Int64\n", | |
"HIREDATE datetime64[ns]\n", | |
"SAL float64\n", | |
"COMM float64\n", | |
"DEPTNO int64\n", | |
"FK_BATCH object\n", | |
"dtype: object" | |
] | |
}, | |
"execution_count": 281, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Check data types after converting and see the converted data type\n", | |
"emp_df.dtypes" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 282, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>RECID</th>\n", | |
" <th>EMPNO</th>\n", | |
" <th>ENAME</th>\n", | |
" <th>JOB</th>\n", | |
" <th>MGR</th>\n", | |
" <th>HIREDATE</th>\n", | |
" <th>SAL</th>\n", | |
" <th>COMM</th>\n", | |
" <th>DEPTNO</th>\n", | |
" <th>FK_BATCH</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>5</th>\n", | |
" <td>ER</td>\n", | |
" <td>7369</td>\n", | |
" <td>SMITH</td>\n", | |
" <td>CLERK</td>\n", | |
" <td>7902</td>\n", | |
" <td>1980-12-17</td>\n", | |
" <td>800.0</td>\n", | |
" <td>0.1</td>\n", | |
" <td>20</td>\n", | |
" <td>2d20553c798d5e2d38a4b4bb5ef18c2f</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>6</th>\n", | |
" <td>ER</td>\n", | |
" <td>7499</td>\n", | |
" <td>ALLEN</td>\n", | |
" <td>SALESMAN</td>\n", | |
" <td>7698</td>\n", | |
" <td>1981-02-20</td>\n", | |
" <td>1600.0</td>\n", | |
" <td>300.0</td>\n", | |
" <td>30</td>\n", | |
" <td>2d20553c798d5e2d38a4b4bb5ef18c2f</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>7</th>\n", | |
" <td>ER</td>\n", | |
" <td>7521</td>\n", | |
" <td>WARD</td>\n", | |
" <td>SALESMAN</td>\n", | |
" <td>7698</td>\n", | |
" <td>1981-02-22</td>\n", | |
" <td>1250.0</td>\n", | |
" <td>500.0</td>\n", | |
" <td>30</td>\n", | |
" <td>2d20553c798d5e2d38a4b4bb5ef18c2f</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>8</th>\n", | |
" <td>ER</td>\n", | |
" <td>7566</td>\n", | |
" <td>JONES</td>\n", | |
" <td>MANAGER</td>\n", | |
" <td>7839</td>\n", | |
" <td>1981-02-04</td>\n", | |
" <td>2975.0</td>\n", | |
" <td>NaN</td>\n", | |
" <td>20</td>\n", | |
" <td>2d20553c798d5e2d38a4b4bb5ef18c2f</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>9</th>\n", | |
" <td>ER</td>\n", | |
" <td>7654</td>\n", | |
" <td>MARTIN</td>\n", | |
" <td>SALESMAN</td>\n", | |
" <td>7698</td>\n", | |
" <td>1981-09-28</td>\n", | |
" <td>1250.0</td>\n", | |
" <td>1400.0</td>\n", | |
" <td>30</td>\n", | |
" <td>2d20553c798d5e2d38a4b4bb5ef18c2f</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>10</th>\n", | |
" <td>ER</td>\n", | |
" <td>7698</td>\n", | |
" <td>BLAKE</td>\n", | |
" <td>MANAGER</td>\n", | |
" <td>7839</td>\n", | |
" <td>1981-01-05</td>\n", | |
" <td>2850.0</td>\n", | |
" <td>NaN</td>\n", | |
" <td>30</td>\n", | |
" <td>2d20553c798d5e2d38a4b4bb5ef18c2f</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>11</th>\n", | |
" <td>ER</td>\n", | |
" <td>7782</td>\n", | |
" <td>CLARK</td>\n", | |
" <td>MANAGER</td>\n", | |
" <td>7839</td>\n", | |
" <td>1981-09-06</td>\n", | |
" <td>2450.0</td>\n", | |
" <td>NaN</td>\n", | |
" <td>10</td>\n", | |
" <td>2d20553c798d5e2d38a4b4bb5ef18c2f</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>12</th>\n", | |
" <td>ER</td>\n", | |
" <td>7788</td>\n", | |
" <td>SCOTT</td>\n", | |
" <td>ANALYST</td>\n", | |
" <td>7566</td>\n", | |
" <td>1982-09-12</td>\n", | |
" <td>3000.0</td>\n", | |
" <td>NaN</td>\n", | |
" <td>20</td>\n", | |
" <td>2d20553c798d5e2d38a4b4bb5ef18c2f</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>13</th>\n", | |
" <td>ER</td>\n", | |
" <td>7839</td>\n", | |
" <td>KING</td>\n", | |
" <td>PRESIDENT</td>\n", | |
" <td><NA></td>\n", | |
" <td>1981-11-17</td>\n", | |
" <td>5000.0</td>\n", | |
" <td>NaN</td>\n", | |
" <td>10</td>\n", | |
" <td>2d20553c798d5e2d38a4b4bb5ef18c2f</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>14</th>\n", | |
" <td>ER</td>\n", | |
" <td>7844</td>\n", | |
" <td>TURNER</td>\n", | |
" <td>SALESMAN</td>\n", | |
" <td>7698</td>\n", | |
" <td>1981-08-09</td>\n", | |
" <td>1500.0</td>\n", | |
" <td>NaN</td>\n", | |
" <td>30</td>\n", | |
" <td>2d20553c798d5e2d38a4b4bb5ef18c2f</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>15</th>\n", | |
" <td>ER</td>\n", | |
" <td>7876</td>\n", | |
" <td>ADAMS</td>\n", | |
" <td>CLERK</td>\n", | |
" <td>7788</td>\n", | |
" <td>1983-12-01</td>\n", | |
" <td>1100.0</td>\n", | |
" <td>NaN</td>\n", | |
" <td>20</td>\n", | |
" <td>2d20553c798d5e2d38a4b4bb5ef18c2f</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>16</th>\n", | |
" <td>ER</td>\n", | |
" <td>7900</td>\n", | |
" <td>JAMES</td>\n", | |
" <td>CLERK</td>\n", | |
" <td>7698</td>\n", | |
" <td>1981-03-12</td>\n", | |
" <td>950.0</td>\n", | |
" <td>NaN</td>\n", | |
" <td>30</td>\n", | |
" <td>2d20553c798d5e2d38a4b4bb5ef18c2f</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>17</th>\n", | |
" <td>ER</td>\n", | |
" <td>7902</td>\n", | |
" <td>FORD</td>\n", | |
" <td>ANALYST</td>\n", | |
" <td>7566</td>\n", | |
" <td>1981-03-12</td>\n", | |
" <td>3000.0</td>\n", | |
" <td>NaN</td>\n", | |
" <td>20</td>\n", | |
" <td>2d20553c798d5e2d38a4b4bb5ef18c2f</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>18</th>\n", | |
" <td>ER</td>\n", | |
" <td>7934</td>\n", | |
" <td>MILLER</td>\n", | |
" <td>CLERK</td>\n", | |
" <td>7782</td>\n", | |
" <td>1982-01-23</td>\n", | |
" <td>1300.0</td>\n", | |
" <td>NaN</td>\n", | |
" <td>10</td>\n", | |
" <td>2d20553c798d5e2d38a4b4bb5ef18c2f</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" RECID EMPNO ENAME JOB MGR HIREDATE SAL COMM DEPTNO \\\n", | |
"5 ER 7369 SMITH CLERK 7902 1980-12-17 800.0 0.1 20 \n", | |
"6 ER 7499 ALLEN SALESMAN 7698 1981-02-20 1600.0 300.0 30 \n", | |
"7 ER 7521 WARD SALESMAN 7698 1981-02-22 1250.0 500.0 30 \n", | |
"8 ER 7566 JONES MANAGER 7839 1981-02-04 2975.0 NaN 20 \n", | |
"9 ER 7654 MARTIN SALESMAN 7698 1981-09-28 1250.0 1400.0 30 \n", | |
"10 ER 7698 BLAKE MANAGER 7839 1981-01-05 2850.0 NaN 30 \n", | |
"11 ER 7782 CLARK MANAGER 7839 1981-09-06 2450.0 NaN 10 \n", | |
"12 ER 7788 SCOTT ANALYST 7566 1982-09-12 3000.0 NaN 20 \n", | |
"13 ER 7839 KING PRESIDENT <NA> 1981-11-17 5000.0 NaN 10 \n", | |
"14 ER 7844 TURNER SALESMAN 7698 1981-08-09 1500.0 NaN 30 \n", | |
"15 ER 7876 ADAMS CLERK 7788 1983-12-01 1100.0 NaN 20 \n", | |
"16 ER 7900 JAMES CLERK 7698 1981-03-12 950.0 NaN 30 \n", | |
"17 ER 7902 FORD ANALYST 7566 1981-03-12 3000.0 NaN 20 \n", | |
"18 ER 7934 MILLER CLERK 7782 1982-01-23 1300.0 NaN 10 \n", | |
"\n", | |
" FK_BATCH \n", | |
"5 2d20553c798d5e2d38a4b4bb5ef18c2f \n", | |
"6 2d20553c798d5e2d38a4b4bb5ef18c2f \n", | |
"7 2d20553c798d5e2d38a4b4bb5ef18c2f \n", | |
"8 2d20553c798d5e2d38a4b4bb5ef18c2f \n", | |
"9 2d20553c798d5e2d38a4b4bb5ef18c2f \n", | |
"10 2d20553c798d5e2d38a4b4bb5ef18c2f \n", | |
"11 2d20553c798d5e2d38a4b4bb5ef18c2f \n", | |
"12 2d20553c798d5e2d38a4b4bb5ef18c2f \n", | |
"13 2d20553c798d5e2d38a4b4bb5ef18c2f \n", | |
"14 2d20553c798d5e2d38a4b4bb5ef18c2f \n", | |
"15 2d20553c798d5e2d38a4b4bb5ef18c2f \n", | |
"16 2d20553c798d5e2d38a4b4bb5ef18c2f \n", | |
"17 2d20553c798d5e2d38a4b4bb5ef18c2f \n", | |
"18 2d20553c798d5e2d38a4b4bb5ef18c2f " | |
] | |
}, | |
"execution_count": 282, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# show content of a pandas dataframe after converting to proper data types\n", | |
"emp_df" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 283, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Declare trailer columns & widths based on file structure \n", | |
"trailer_columns = ['RECID', 'ROWCOUNT']\n", | |
"trailer_widths = [1, 5]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 284, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Read HR fixed width text file using pandas library read_fwf method\n", | |
"trailer_df = pd.read_fwf(source_file_name, widths=trailer_widths, header=None, names=trailer_columns).tail(1)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 285, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Apply batch key to trailer dataframe (FK stands for Foreign key)\n", | |
"trailer_df['FK_BATCH'] = batch_key" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 286, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"RECID object\n", | |
"ROWCOUNT object\n", | |
"FK_BATCH object\n", | |
"dtype: object" | |
] | |
}, | |
"execution_count": 286, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Check data types\n", | |
"trailer_df.dtypes" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 287, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Convert rowcount column from string to int\n", | |
"trailer_dtypes = {'ROWCOUNT': int}" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 289, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Apply the conversion using astype\n", | |
"trailer_df = trailer_df.astype(trailer_dtypes)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 290, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>RECID</th>\n", | |
" <th>ROWCOUNT</th>\n", | |
" <th>FK_BATCH</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>19</th>\n", | |
" <td>T</td>\n", | |
" <td>18</td>\n", | |
" <td>2d20553c798d5e2d38a4b4bb5ef18c2f</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" RECID ROWCOUNT FK_BATCH\n", | |
"19 T 18 2d20553c798d5e2d38a4b4bb5ef18c2f" | |
] | |
}, | |
"execution_count": 290, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# show content of a pandas dataframe after converting to proper data types\n", | |
"trailer_df" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.6.9" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 4 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment