noleto · August 27, 2018 12:45
diff --git a/uci_bank_marketing_dataset_extract_timestamp.py b/uci_bank_marketing_dataset_extract_timestamp.py
 ## Data set in https://archive.ics.uci.edu/ml/datasets/bank+marketing does not contain timestamp.
 ## We can "deduce" them from the ordedr of records and use month and day_of_week columns to create a full date
 ## We know the first record was from 05-05-2008

 import pandas as pd
 import numpy as np
 import datetime as dt
 import sys

 # read records from May 2008 to November 2010
 bank_df = pd.read_csv("data/bank-additional-full.csv", sep=";")
 bank_df.rename(index=str, columns={"y": "subscribed"}, inplace=True)

 def get_first_match_weekday_of_month(year, month, day_of_week):
    
    first_day = 1
    while dt.datetime(year, month, first_day).strftime('%a').lower() != day_of_week:
        first_day = first_day + 1
    
    return first_day


 current_day_of_week = "mon"
 current_month_str = "may"
 current_day = 5
 current_month = 5
 current_year = 2008

 dow_map = {"mon": {"mon": 0, "tue": 1, "wed": 2, "thu": 3, "fri": 4}, 
           "tue": {"mon": 6, "tue": 0, "wed": 1, "thu": 2, "fri": 3}, 
           "wed": {"mon": 5, "tue": 6, "wed": 0, "thu": 1, "fri": 2}, 
           "thu": {"mon": 4, "tue": 5, "wed": 6, "thu": 0, "fri": 1}, 
           "fri": {"mon": 3, "tue": 4, "wed": 5, "thu": 6, "fri": 0}}

 month_names_map = {"jan": 1, "feb": 2, "mar": 3, "apr": 4, "may": 5, "jun": 6, 
                   "jul": 7, "aug": 8 ,"sep": 9, "oct": 10, "nov": 11, "dec": 12}

 parsed_dates = []

 for row in bank_df.itertuples():
    if row.day_of_week != current_day_of_week:
        next_day_inc = dow_map[current_day_of_week][row.day_of_week]
        current_day_of_week = row.day_of_week # advances one day of week
        current_day = current_day + next_day_inc
        
    if row.month != current_month_str:
        current_month_str = row.month # advances one month
        previous_month = current_month
        current_month = month_names_map[current_month_str]
            
        if current_month < previous_month: # next year
            current_year = current_year + 1
                
        # reset of to the 1st of day of week found
        current_day = get_first_match_weekday_of_month(current_year, current_month, current_day_of_week) 
            
    try:
        date = dt.datetime(current_year, current_month, current_day)
        parsed_dates.append({"index": row.Index, # needed to join with the original data set
                             "date": date, 
                             "check_month": date.strftime('%b').lower(), ## just to check
                             "check_day_of_week": date.strftime('%a').lower()}) ## just to check
    except ValueError:
        print(f"Unexpected error on index {row.Index}: "
              f"trying to parse {current_year}-{current_month}-{current_day}", sys.exc_info()[0])
        raise

 ## combine both dataframe using index column
 parsed_date_df = pd.DataFrame(parsed_dates)
 parsed_date_df.set_index("index", inplace=True)
 df = bank_df.join(parsed_date_df)

 ## save back the full data set with date reconstructed 
 df[["date"] + bank_df.columns.tolist()].to_csv("data/bank-additional-full-timestamp.csv", index=False, sep=";")
	## Data set in https://archive.ics.uci.edu/ml/datasets/bank+marketing does not contain timestamp.
	## We can "deduce" them from the ordedr of records and use month and day_of_week columns to create a full date
	## We know the first record was from 05-05-2008

	import pandas as pd
	import numpy as np
	import datetime as dt
	import sys

	# read records from May 2008 to November 2010
	bank_df = pd.read_csv("data/bank-additional-full.csv", sep=";")
	bank_df.rename(index=str, columns={"y": "subscribed"}, inplace=True)

	def get_first_match_weekday_of_month(year, month, day_of_week):

	first_day = 1
	while dt.datetime(year, month, first_day).strftime('%a').lower() != day_of_week:
	first_day = first_day + 1

	return first_day


	current_day_of_week = "mon"
	current_month_str = "may"
	current_day = 5
	current_month = 5
	current_year = 2008

	dow_map = {"mon": {"mon": 0, "tue": 1, "wed": 2, "thu": 3, "fri": 4},
	"tue": {"mon": 6, "tue": 0, "wed": 1, "thu": 2, "fri": 3},
	"wed": {"mon": 5, "tue": 6, "wed": 0, "thu": 1, "fri": 2},
	"thu": {"mon": 4, "tue": 5, "wed": 6, "thu": 0, "fri": 1},
	"fri": {"mon": 3, "tue": 4, "wed": 5, "thu": 6, "fri": 0}}

	month_names_map = {"jan": 1, "feb": 2, "mar": 3, "apr": 4, "may": 5, "jun": 6,
	"jul": 7, "aug": 8 ,"sep": 9, "oct": 10, "nov": 11, "dec": 12}

	parsed_dates = []

	for row in bank_df.itertuples():
	if row.day_of_week != current_day_of_week:
	next_day_inc = dow_map[current_day_of_week][row.day_of_week]
	current_day_of_week = row.day_of_week # advances one day of week
	current_day = current_day + next_day_inc

	if row.month != current_month_str:
	current_month_str = row.month # advances one month
	previous_month = current_month
	current_month = month_names_map[current_month_str]

	if current_month < previous_month: # next year
	current_year = current_year + 1

	# reset of to the 1st of day of week found
	current_day = get_first_match_weekday_of_month(current_year, current_month, current_day_of_week)

	try:
	date = dt.datetime(current_year, current_month, current_day)
	parsed_dates.append({"index": row.Index, # needed to join with the original data set
	"date": date,
	"check_month": date.strftime('%b').lower(), ## just to check
	"check_day_of_week": date.strftime('%a').lower()}) ## just to check
	except ValueError:
	print(f"Unexpected error on index {row.Index}: "
	f"trying to parse {current_year}-{current_month}-{current_day}", sys.exc_info()[0])
	raise

	## combine both dataframe using index column
	parsed_date_df = pd.DataFrame(parsed_dates)
	parsed_date_df.set_index("index", inplace=True)
	df = bank_df.join(parsed_date_df)

	## save back the full data set with date reconstructed
	df[["date"] + bank_df.columns.tolist()].to_csv("data/bank-additional-full-timestamp.csv", index=False, sep=";")