Last active
August 27, 2018 12:45
-
-
Save noleto/198a526f4c6fd87997acea1af199ba7e to your computer and use it in GitHub Desktop.
UCI Bank Marketing Data Set - reverse engineering date from records (not given in the bank-additional-full data set)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## Data set in https://archive.ics.uci.edu/ml/datasets/bank+marketing does not contain timestamp. | |
## We can "deduce" them from the ordedr of records and use month and day_of_week columns to create a full date | |
## We know the first record was from 05-05-2008 | |
import pandas as pd | |
import numpy as np | |
import datetime as dt | |
import sys | |
# read records from May 2008 to November 2010 | |
bank_df = pd.read_csv("data/bank-additional-full.csv", sep=";") | |
bank_df.rename(index=str, columns={"y": "subscribed"}, inplace=True) | |
def get_first_match_weekday_of_month(year, month, day_of_week): | |
first_day = 1 | |
while dt.datetime(year, month, first_day).strftime('%a').lower() != day_of_week: | |
first_day = first_day + 1 | |
return first_day | |
current_day_of_week = "mon" | |
current_month_str = "may" | |
current_day = 5 | |
current_month = 5 | |
current_year = 2008 | |
dow_map = {"mon": {"mon": 0, "tue": 1, "wed": 2, "thu": 3, "fri": 4}, | |
"tue": {"mon": 6, "tue": 0, "wed": 1, "thu": 2, "fri": 3}, | |
"wed": {"mon": 5, "tue": 6, "wed": 0, "thu": 1, "fri": 2}, | |
"thu": {"mon": 4, "tue": 5, "wed": 6, "thu": 0, "fri": 1}, | |
"fri": {"mon": 3, "tue": 4, "wed": 5, "thu": 6, "fri": 0}} | |
month_names_map = {"jan": 1, "feb": 2, "mar": 3, "apr": 4, "may": 5, "jun": 6, | |
"jul": 7, "aug": 8 ,"sep": 9, "oct": 10, "nov": 11, "dec": 12} | |
parsed_dates = [] | |
for row in bank_df.itertuples(): | |
if row.day_of_week != current_day_of_week: | |
next_day_inc = dow_map[current_day_of_week][row.day_of_week] | |
current_day_of_week = row.day_of_week # advances one day of week | |
current_day = current_day + next_day_inc | |
if row.month != current_month_str: | |
current_month_str = row.month # advances one month | |
previous_month = current_month | |
current_month = month_names_map[current_month_str] | |
if current_month < previous_month: # next year | |
current_year = current_year + 1 | |
# reset of to the 1st of day of week found | |
current_day = get_first_match_weekday_of_month(current_year, current_month, current_day_of_week) | |
try: | |
date = dt.datetime(current_year, current_month, current_day) | |
parsed_dates.append({"index": row.Index, # needed to join with the original data set | |
"date": date, | |
"check_month": date.strftime('%b').lower(), ## just to check | |
"check_day_of_week": date.strftime('%a').lower()}) ## just to check | |
except ValueError: | |
print(f"Unexpected error on index {row.Index}: " | |
f"trying to parse {current_year}-{current_month}-{current_day}", sys.exc_info()[0]) | |
raise | |
## combine both dataframe using index column | |
parsed_date_df = pd.DataFrame(parsed_dates) | |
parsed_date_df.set_index("index", inplace=True) | |
df = bank_df.join(parsed_date_df) | |
## save back the full data set with date reconstructed | |
df[["date"] + bank_df.columns.tolist()].to_csv("data/bank-additional-full-timestamp.csv", index=False, sep=";") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment