Skip to content

Instantly share code, notes, and snippets.

@noleto
Last active August 27, 2018 12:45
Show Gist options
  • Save noleto/198a526f4c6fd87997acea1af199ba7e to your computer and use it in GitHub Desktop.
Save noleto/198a526f4c6fd87997acea1af199ba7e to your computer and use it in GitHub Desktop.
UCI Bank Marketing Data Set - reverse engineering date from records (not given in the bank-additional-full data set)
## Data set in https://archive.ics.uci.edu/ml/datasets/bank+marketing does not contain timestamp.
## We can "deduce" them from the ordedr of records and use month and day_of_week columns to create a full date
## We know the first record was from 05-05-2008
import pandas as pd
import numpy as np
import datetime as dt
import sys
# read records from May 2008 to November 2010
bank_df = pd.read_csv("data/bank-additional-full.csv", sep=";")
bank_df.rename(index=str, columns={"y": "subscribed"}, inplace=True)
def get_first_match_weekday_of_month(year, month, day_of_week):
first_day = 1
while dt.datetime(year, month, first_day).strftime('%a').lower() != day_of_week:
first_day = first_day + 1
return first_day
current_day_of_week = "mon"
current_month_str = "may"
current_day = 5
current_month = 5
current_year = 2008
dow_map = {"mon": {"mon": 0, "tue": 1, "wed": 2, "thu": 3, "fri": 4},
"tue": {"mon": 6, "tue": 0, "wed": 1, "thu": 2, "fri": 3},
"wed": {"mon": 5, "tue": 6, "wed": 0, "thu": 1, "fri": 2},
"thu": {"mon": 4, "tue": 5, "wed": 6, "thu": 0, "fri": 1},
"fri": {"mon": 3, "tue": 4, "wed": 5, "thu": 6, "fri": 0}}
month_names_map = {"jan": 1, "feb": 2, "mar": 3, "apr": 4, "may": 5, "jun": 6,
"jul": 7, "aug": 8 ,"sep": 9, "oct": 10, "nov": 11, "dec": 12}
parsed_dates = []
for row in bank_df.itertuples():
if row.day_of_week != current_day_of_week:
next_day_inc = dow_map[current_day_of_week][row.day_of_week]
current_day_of_week = row.day_of_week # advances one day of week
current_day = current_day + next_day_inc
if row.month != current_month_str:
current_month_str = row.month # advances one month
previous_month = current_month
current_month = month_names_map[current_month_str]
if current_month < previous_month: # next year
current_year = current_year + 1
# reset of to the 1st of day of week found
current_day = get_first_match_weekday_of_month(current_year, current_month, current_day_of_week)
try:
date = dt.datetime(current_year, current_month, current_day)
parsed_dates.append({"index": row.Index, # needed to join with the original data set
"date": date,
"check_month": date.strftime('%b').lower(), ## just to check
"check_day_of_week": date.strftime('%a').lower()}) ## just to check
except ValueError:
print(f"Unexpected error on index {row.Index}: "
f"trying to parse {current_year}-{current_month}-{current_day}", sys.exc_info()[0])
raise
## combine both dataframe using index column
parsed_date_df = pd.DataFrame(parsed_dates)
parsed_date_df.set_index("index", inplace=True)
df = bank_df.join(parsed_date_df)
## save back the full data set with date reconstructed
df[["date"] + bank_df.columns.tolist()].to_csv("data/bank-additional-full-timestamp.csv", index=False, sep=";")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment