snewcomer · January 10, 2023 15:09
diff --git a/cal_time_pred.py b/cal_time_pred.py
 import pandas as pd
 import numpy as np
 from datetime import datetime, time

 #import matplotlib.pyplot as plt

 from sklearn.linear_model import LinearRegression
 from sklearn.model_selection import train_test_split
 from sklearn.metrics import r2_score, mean_squared_error
 from sklearn.ensemble import RandomForestClassifier
 import xgboost as xgb

 cal = pd.read_csv('cal.csv', index_col=False)

 # clean summary
 cal = cal.query("summary not in ('Mid-week Meditation', 'Coffee Chat: Meet our New Hires!', 'PED Lunch & Learn Hold', 'AskPeople')")
 cal = cal[cal['summary'].str.contains("Company Holiday") == False]
 cal = cal[cal['summary'].str.contains("L&L") == False]
 cal = cal.drop_duplicates()

 # clean dtstart
 parsed = pd.to_datetime(cal["dtstart"], errors="coerce").fillna(pd.to_datetime(cal["dtstart"], format="%Y-%d-%m", errors="coerce"))
 ordinal = pd.to_numeric(cal["dtstart"], errors="coerce").apply(lambda x: pd.Timestamp("1899-12-30") + pd.Timedelta(x, unit="D"))
 parsed_e = pd.to_datetime(cal["dtend"], errors="coerce").fillna(pd.to_datetime(cal["dtend"], format="%Y-%d-%m", errors="coerce"))
 ordinal_e = pd.to_numeric(cal["dtend"], errors="coerce").apply(lambda x: pd.Timestamp("1899-12-30")+pd.Timedelta(x, unit="D"))
 
 cal['dtstart'] = parsed.fillna(ordinal)
 cal['dtend'] = parsed_e.fillna(ordinal_e)
 # create unique column we can group and resample over
 cal['DtCombined'] = cal['dtstart'].astype(str) + '_' + cal['dtend'].astype(str)
 # still might have problems, drop Zulu
 cal = cal.dropna()
 cal['dtstart'] = cal['dtstart'].apply(lambda x: datetime.replace(x, tzinfo=None))
 cal['dtend'] = cal['dtend'].apply(lambda x: datetime.replace(x, tzinfo=None))
 cal['MeetingLength'] = cal['dtend'] - cal['dtstart']

 cal['Date'] = cal['dtstart'].dt.date
 cal['Date'] = pd.to_datetime(cal['Date'], errors="coerce")
 cal['StartTime'] = cal['dtstart'].dt.time
 cal['EndTime'] = cal['dtend'].dt.time

 cal = cal.melt(id_vars=['summary', 'Date', 'dtstart', 'dtend', 'DtCombined', 'MeetingLength'], var_name='Start/End', value_name='TimeOfDay')
 cal['Busy'] = 1
 # conditional create column that we will ultimately resample
 cal['DateTime'] = np.where(cal['Start/End'] == 'StartTime', cal['dtstart'], cal['dtend'])

 cal.sort_values(by=['dtstart', 'TimeOfDay'], inplace=True)
 cal.drop(['dtstart', 'dtend', 'Start/End'], axis=1, inplace=True)

 cal = cal.groupby('DtCombined').apply(lambda x: x.drop_duplicates('DateTime').set_index('DateTime').resample('30Min').ffill()).reset_index('DtCombined', drop=True).reset_index()
 cal['TimeOfDay'] = cal['DateTime'].dt.time
 # resample drops non numeric columns
 cal = cal.set_index('DateTime').resample('30Min').mean().reset_index()
 cal['Busy'] = cal['Busy'].fillna(0)
 cal['Weekday'] = cal['DateTime'].dt.weekday

 # convert to int b/c models require it
 #cal['Day'] = cal['DateTime'].apply(lambda time: time.day)
 cal = cal.set_index('DateTime')['2021-05-01' :'2023-01-08'].reset_index()
 cal['Hour'] = cal['DateTime'].apply(lambda time: time.hour)
 cal['Minute'] = cal['DateTime'].apply(lambda time: time.minute)
 #cal['Hour:Minute'] = cal['Hour'].astype(str) + ':' + cal['Minute'].astype(str)
 cal.set_index('DateTime', inplace=True)

 # predict for each weekday
 for weekday in range(0, 6):
 	print("Weekday {} \n".format(weekday))

 	group = cal.groupby('Weekday').get_group(weekday)

 	features = group.drop(['Busy'], axis=1)
 	target = group['Busy']

 	X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.3)
 	
 	xgb_model = xgb.XGBClassifier(n_estimators=1000, learning_rate=0.5, objective='reg:squarederror')
 	xgb_model.fit(X_train, y_train)
 	xgb_pred = xgb_model.predict(X_test)

 	score = r2_score(y_test, xgb_pred)
 	r_squared = xgb_model.score(X_test, y_test)
 	rmse = mean_squared_error(y_test, xgb_pred, squared=False)
 	print("XGB Score: {}".format(score))
 	print("XGB R^2: {}".format(r_squared))
 	print("XGB RMSE: {} \n".format(rmse))

 	# low r2 score.  Best is 1.0
 	# Linear Regression
 	#lin_reg = LinearRegression()
 	#lin_reg.fit(X_train, y_train)
 	#y_pred = lin_reg.predict(X_test)
 	#score = r2_score(y_test, y_pred)
 	#r_squared = lin_reg.score(X_test, y_test)
 	#rmse = mean_squared_error(y_test, y_pred, squared=False)
 	#print("Linear Regression Score: {}".format(score))
 	#print("Linear Regression R^2: {}".format(r_squared))
 	#print("Linear Regression RMSE: {} \n".format(rmse))

 	forest_model = RandomForestClassifier(n_estimators=100, min_samples_split=200, random_state=1)
 	forest_model.fit(X_train, y_train)
 	forest_pred = forest_model.predict(X_test)
 	score = r2_score(y_test, forest_pred)
 	r_squared = forest_model.score(X_test, y_test)
 	rmse = mean_squared_error(y_test, forest_pred, squared=False)
 	print("Random Forest Score: {}".format(score))
 	print("Random Forest R^2: {}".format(r_squared))
 	print("Random Forest RMSE: {} \n".format(rmse))

 	# predict probability
 	xgb_pred_prob = xgb_model.predict_proba(X_test)
 	preds = xgb_pred_prob[:,1]
 	preds = pd.DataFrame(preds, columns=['Busy'])
 	preds['Probability'] = preds['Busy']
 	preds['Busy'] = preds['Probability'].apply(lambda prob: 1 if prob > 0.4 else 0)
 	preds['DateTime'] = pd.to_datetime(y_test.index)
 	preds['Time'] = preds['DateTime'].dt.time#strftime("%H:%M:%S")

 	preds = preds.drop_duplicates('Time')
 	preds.sort_values(by=['Time'], inplace=True)
 	preds.drop('DateTime', axis=1, inplace=True)
 	preds = preds[preds['Time'] > time(9)]
 	preds = preds[preds['Time'] < time(18)]
 	print("Weekday \n {}: {}".format(weekday, preds))
	import pandas as pd
	import numpy as np
	from datetime import datetime, time

	#import matplotlib.pyplot as plt

	from sklearn.linear_model import LinearRegression
	from sklearn.model_selection import train_test_split
	from sklearn.metrics import r2_score, mean_squared_error
	from sklearn.ensemble import RandomForestClassifier
	import xgboost as xgb

	cal = pd.read_csv('cal.csv', index_col=False)

	# clean summary
	cal = cal.query("summary not in ('Mid-week Meditation', 'Coffee Chat: Meet our New Hires!', 'PED Lunch & Learn Hold', 'AskPeople')")
	cal = cal[cal['summary'].str.contains("Company Holiday") == False]
	cal = cal[cal['summary'].str.contains("L&L") == False]
	cal = cal.drop_duplicates()

	# clean dtstart
	parsed = pd.to_datetime(cal["dtstart"], errors="coerce").fillna(pd.to_datetime(cal["dtstart"], format="%Y-%d-%m", errors="coerce"))
	ordinal = pd.to_numeric(cal["dtstart"], errors="coerce").apply(lambda x: pd.Timestamp("1899-12-30") + pd.Timedelta(x, unit="D"))
	parsed_e = pd.to_datetime(cal["dtend"], errors="coerce").fillna(pd.to_datetime(cal["dtend"], format="%Y-%d-%m", errors="coerce"))
	ordinal_e = pd.to_numeric(cal["dtend"], errors="coerce").apply(lambda x: pd.Timestamp("1899-12-30")+pd.Timedelta(x, unit="D"))

	cal['dtstart'] = parsed.fillna(ordinal)
	cal['dtend'] = parsed_e.fillna(ordinal_e)
	# create unique column we can group and resample over
	cal['DtCombined'] = cal['dtstart'].astype(str) + '_' + cal['dtend'].astype(str)
	# still might have problems, drop Zulu
	cal = cal.dropna()
	cal['dtstart'] = cal['dtstart'].apply(lambda x: datetime.replace(x, tzinfo=None))
	cal['dtend'] = cal['dtend'].apply(lambda x: datetime.replace(x, tzinfo=None))
	cal['MeetingLength'] = cal['dtend'] - cal['dtstart']

	cal['Date'] = cal['dtstart'].dt.date
	cal['Date'] = pd.to_datetime(cal['Date'], errors="coerce")
	cal['StartTime'] = cal['dtstart'].dt.time
	cal['EndTime'] = cal['dtend'].dt.time

	cal = cal.melt(id_vars=['summary', 'Date', 'dtstart', 'dtend', 'DtCombined', 'MeetingLength'], var_name='Start/End', value_name='TimeOfDay')
	cal['Busy'] = 1
	# conditional create column that we will ultimately resample
	cal['DateTime'] = np.where(cal['Start/End'] == 'StartTime', cal['dtstart'], cal['dtend'])

	cal.sort_values(by=['dtstart', 'TimeOfDay'], inplace=True)
	cal.drop(['dtstart', 'dtend', 'Start/End'], axis=1, inplace=True)

	cal = cal.groupby('DtCombined').apply(lambda x: x.drop_duplicates('DateTime').set_index('DateTime').resample('30Min').ffill()).reset_index('DtCombined', drop=True).reset_index()
	cal['TimeOfDay'] = cal['DateTime'].dt.time
	# resample drops non numeric columns
	cal = cal.set_index('DateTime').resample('30Min').mean().reset_index()
	cal['Busy'] = cal['Busy'].fillna(0)
	cal['Weekday'] = cal['DateTime'].dt.weekday

	# convert to int b/c models require it
	#cal['Day'] = cal['DateTime'].apply(lambda time: time.day)
	cal = cal.set_index('DateTime')['2021-05-01' :'2023-01-08'].reset_index()
	cal['Hour'] = cal['DateTime'].apply(lambda time: time.hour)
	cal['Minute'] = cal['DateTime'].apply(lambda time: time.minute)
	#cal['Hour:Minute'] = cal['Hour'].astype(str) + ':' + cal['Minute'].astype(str)
	cal.set_index('DateTime', inplace=True)

	# predict for each weekday
	for weekday in range(0, 6):
	print("Weekday {} \n".format(weekday))

	group = cal.groupby('Weekday').get_group(weekday)

	features = group.drop(['Busy'], axis=1)
	target = group['Busy']

	X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.3)

	xgb_model = xgb.XGBClassifier(n_estimators=1000, learning_rate=0.5, objective='reg:squarederror')
	xgb_model.fit(X_train, y_train)
	xgb_pred = xgb_model.predict(X_test)

	score = r2_score(y_test, xgb_pred)
	r_squared = xgb_model.score(X_test, y_test)
	rmse = mean_squared_error(y_test, xgb_pred, squared=False)
	print("XGB Score: {}".format(score))
	print("XGB R^2: {}".format(r_squared))
	print("XGB RMSE: {} \n".format(rmse))

	# low r2 score. Best is 1.0
	# Linear Regression
	#lin_reg = LinearRegression()
	#lin_reg.fit(X_train, y_train)
	#y_pred = lin_reg.predict(X_test)
	#score = r2_score(y_test, y_pred)
	#r_squared = lin_reg.score(X_test, y_test)
	#rmse = mean_squared_error(y_test, y_pred, squared=False)
	#print("Linear Regression Score: {}".format(score))
	#print("Linear Regression R^2: {}".format(r_squared))
	#print("Linear Regression RMSE: {} \n".format(rmse))

	forest_model = RandomForestClassifier(n_estimators=100, min_samples_split=200, random_state=1)
	forest_model.fit(X_train, y_train)
	forest_pred = forest_model.predict(X_test)
	score = r2_score(y_test, forest_pred)
	r_squared = forest_model.score(X_test, y_test)
	rmse = mean_squared_error(y_test, forest_pred, squared=False)
	print("Random Forest Score: {}".format(score))
	print("Random Forest R^2: {}".format(r_squared))
	print("Random Forest RMSE: {} \n".format(rmse))

	# predict probability
	xgb_pred_prob = xgb_model.predict_proba(X_test)
	preds = xgb_pred_prob[:,1]
	preds = pd.DataFrame(preds, columns=['Busy'])
	preds['Probability'] = preds['Busy']
	preds['Busy'] = preds['Probability'].apply(lambda prob: 1 if prob > 0.4 else 0)
	preds['DateTime'] = pd.to_datetime(y_test.index)
	preds['Time'] = preds['DateTime'].dt.time#strftime("%H:%M:%S")

	preds = preds.drop_duplicates('Time')
	preds.sort_values(by=['Time'], inplace=True)
	preds.drop('DateTime', axis=1, inplace=True)
	preds = preds[preds['Time'] > time(9)]
	preds = preds[preds['Time'] < time(18)]
	print("Weekday \n {}: {}".format(weekday, preds))