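# Predict whether a given 30-minute calendar slot is busy, per weekday, from an
# exported calendar CSV (cal.csv) with summary / dtstart / dtend columns.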
import pandas as pd
import numpy as np
from datetime import datetime, time
#import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
cal = pd.read_csv('cal.csv', index_col=False)
# clean summary
cal = cal.query("summary not in ('Mid-week Meditation', 'Coffee Chat: Meet our New Hires!', 'PED Lunch & Learn Hold', 'AskPeople')")
cal = cal[cal['summary'].str.contains("Company Holiday") == False]
cal = cal[cal['summary'].str.contains("L&L") == False]
cal = cal.drop_duplicates()
# clean dtstart / dtend
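# dtstart/dtend arrive in mixed representations: try the default parse first, fall back
# to a %Y-%d-%m parse, and finally treat bare numbers as spreadsheet-style serial day
# counts offset from 1899-12-30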
parsed = pd.to_datetime(cal["dtstart"], errors="coerce").fillna(pd.to_datetime(cal["dtstart"], format="%Y-%d-%m", errors="coerce"))
ordinal = pd.to_numeric(cal["dtstart"], errors="coerce").apply(lambda x: pd.Timestamp("1899-12-30") + pd.Timedelta(x, unit="D"))
parsed_e = pd.to_datetime(cal["dtend"], errors="coerce").fillna(pd.to_datetime(cal["dtend"], format="%Y-%d-%m", errors="coerce"))
ordinal_e = pd.to_numeric(cal["dtend"], errors="coerce").apply(lambda x: pd.Timestamp("1899-12-30")+pd.Timedelta(x, unit="D"))
cal['dtstart'] = parsed.fillna(ordinal)
cal['dtend'] = parsed_e.fillna(ordinal_e)
# create a unique per-meeting key we can group and resample over
cal['DtCombined'] = cal['dtstart'].astype(str) + '_' + cal['dtend'].astype(str)
# may still have parse/timezone issues: drop unparsed rows and strip the UTC ('Zulu') tzinfo
cal = cal.dropna()
cal['dtstart'] = cal['dtstart'].apply(lambda x: x.replace(tzinfo=None))
cal['dtend'] = cal['dtend'].apply(lambda x: x.replace(tzinfo=None))
cal['MeetingLength'] = cal['dtend'] - cal['dtstart']
cal['Date'] = cal['dtstart'].dt.date
cal['Date'] = pd.to_datetime(cal['Date'], errors="coerce")
cal['StartTime'] = cal['dtstart'].dt.time
cal['EndTime'] = cal['dtend'].dt.time
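# melt so each meeting contributes a row at its start and at its end, then resample each
# meeting (keyed by DtCombined) to 30-minute steps with forward-fill, so every half-hour
# slot covered by a meeting is marked Busy = 1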
cal = cal.melt(id_vars=['summary', 'Date', 'dtstart', 'dtend', 'DtCombined', 'MeetingLength'], var_name='Start/End', value_name='TimeOfDay')
cal['Busy'] = 1
# conditionally create the DateTime column we will ultimately resample on
cal['DateTime'] = np.where(cal['Start/End'] == 'StartTime', cal['dtstart'], cal['dtend'])
cal.sort_values(by=['dtstart', 'TimeOfDay'], inplace=True)
cal.drop(['dtstart', 'dtend', 'Start/End'], axis=1, inplace=True)
cal = cal.groupby('DtCombined').apply(lambda x: x.drop_duplicates('DateTime').set_index('DateTime').resample('30Min').ffill()).reset_index('DtCombined', drop=True).reset_index()
cal['TimeOfDay'] = cal['DateTime'].dt.time
# resample().mean() keeps only the numeric columns
cal = cal.set_index('DateTime').resample('30Min').mean().reset_index()
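# the global 30-minute resample adds rows for slots with no meeting; their Busy is NaN,
# so treat them as free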
cal['Busy'] = cal['Busy'].fillna(0)
cal['Weekday'] = cal['DateTime'].dt.weekday
# convert to int b/c models require it
#cal['Day'] = cal['DateTime'].apply(lambda time: time.day)
cal = cal.set_index('DateTime')['2021-05-01':'2023-01-08'].reset_index()
cal['Hour'] = cal['DateTime'].dt.hour
cal['Minute'] = cal['DateTime'].dt.minute
#cal['Hour:Minute'] = cal['Hour'].astype(str) + ':' + cal['Minute'].astype(str)
cal.set_index('DateTime', inplace=True)
# predict for each weekday
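# one model per weekday: features are the clock-derived columns left after resampling
# (Weekday, Hour, Minute) and the target is the 0/1 Busy flag for each 30-minute slot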
for weekday in range(0, 6):
print("Weekday {} \n".format(weekday))
group = cal.groupby('Weekday').get_group(weekday)
features = group.drop(['Busy'], axis=1)
target = group['Busy']
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.3)
    xgb_model = xgb.XGBClassifier(n_estimators=1000, learning_rate=0.5, objective='binary:logistic')
    xgb_model.fit(X_train, y_train)
    xgb_pred = xgb_model.predict(X_test)
    r_squared = r2_score(y_test, xgb_pred)
    accuracy = xgb_model.score(X_test, y_test)  # classifier .score() is accuracy, not R^2
    rmse = mean_squared_error(y_test, xgb_pred, squared=False)
    print("XGB R^2: {}".format(r_squared))
    print("XGB Accuracy: {}".format(accuracy))
    print("XGB RMSE: {} \n".format(rmse))
    # low r2 score. Best is 1.0
    # Linear Regression
    #lin_reg = LinearRegression()
    #lin_reg.fit(X_train, y_train)
    #y_pred = lin_reg.predict(X_test)
    #score = r2_score(y_test, y_pred)
    #r_squared = lin_reg.score(X_test, y_test)
    #rmse = mean_squared_error(y_test, y_pred, squared=False)
    #print("Linear Regression Score: {}".format(score))
    #print("Linear Regression R^2: {}".format(r_squared))
    #print("Linear Regression RMSE: {} \n".format(rmse))
    forest_model = RandomForestClassifier(n_estimators=100, min_samples_split=200, random_state=1)
    forest_model.fit(X_train, y_train)
    forest_pred = forest_model.predict(X_test)
    r_squared = r2_score(y_test, forest_pred)
    accuracy = forest_model.score(X_test, y_test)  # classifier .score() is accuracy, not R^2
    rmse = mean_squared_error(y_test, forest_pred, squared=False)
    print("Random Forest R^2: {}".format(r_squared))
    print("Random Forest Accuracy: {}".format(accuracy))
    print("Random Forest RMSE: {} \n".format(rmse))
    # predict probability
    xgb_pred_prob = xgb_model.predict_proba(X_test)
    preds = xgb_pred_prob[:, 1]
    preds = pd.DataFrame(preds, columns=['Busy'])
    preds['Probability'] = preds['Busy']
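    # 0.4 is a looser cutoff than the usual 0.5, so borderline slots are flagged as busy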
    preds['Busy'] = preds['Probability'].apply(lambda prob: 1 if prob > 0.4 else 0)
    preds['DateTime'] = pd.to_datetime(y_test.index)
    preds['Time'] = preds['DateTime'].dt.time
    preds = preds.drop_duplicates('Time')
    preds.sort_values(by=['Time'], inplace=True)
    preds.drop('DateTime', axis=1, inplace=True)
    preds = preds[preds['Time'] > time(9)]
    preds = preds[preds['Time'] < time(18)]
    print("Weekday \n {}: {}".format(weekday, preds))