Last active
January 10, 2023 15:09
-
-
Save snewcomer/e0cf2de4ae12943ebdad39b3d26afd20 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import numpy as np | |
from datetime import datetime, time | |
#import matplotlib.pyplot as plt | |
from sklearn.linear_model import LinearRegression | |
from sklearn.model_selection import train_test_split | |
from sklearn.metrics import r2_score, mean_squared_error | |
from sklearn.ensemble import RandomForestClassifier | |
import xgboost as xgb | |
cal = pd.read_csv('cal.csv', index_col=False) | |
# clean summary | |
cal = cal.query("summary not in ('Mid-week Meditation', 'Coffee Chat: Meet our New Hires!', 'PED Lunch & Learn Hold', 'AskPeople')") | |
cal = cal[cal['summary'].str.contains("Company Holiday") == False] | |
cal = cal[cal['summary'].str.contains("L&L") == False] | |
cal = cal.drop_duplicates() | |
# clean dtstart | |
parsed = pd.to_datetime(cal["dtstart"], errors="coerce").fillna(pd.to_datetime(cal["dtstart"], format="%Y-%d-%m", errors="coerce")) | |
ordinal = pd.to_numeric(cal["dtstart"], errors="coerce").apply(lambda x: pd.Timestamp("1899-12-30") + pd.Timedelta(x, unit="D")) | |
parsed_e = pd.to_datetime(cal["dtend"], errors="coerce").fillna(pd.to_datetime(cal["dtend"], format="%Y-%d-%m", errors="coerce")) | |
ordinal_e = pd.to_numeric(cal["dtend"], errors="coerce").apply(lambda x: pd.Timestamp("1899-12-30")+pd.Timedelta(x, unit="D")) | |
cal['dtstart'] = parsed.fillna(ordinal) | |
cal['dtend'] = parsed_e.fillna(ordinal_e) | |
# create unique column we can group and resample over | |
cal['DtCombined'] = cal['dtstart'].astype(str) + '_' + cal['dtend'].astype(str) | |
# still might have problems, drop Zulu | |
cal = cal.dropna() | |
cal['dtstart'] = cal['dtstart'].apply(lambda x: datetime.replace(x, tzinfo=None)) | |
cal['dtend'] = cal['dtend'].apply(lambda x: datetime.replace(x, tzinfo=None)) | |
cal['MeetingLength'] = cal['dtend'] - cal['dtstart'] | |
cal['Date'] = cal['dtstart'].dt.date | |
cal['Date'] = pd.to_datetime(cal['Date'], errors="coerce") | |
cal['StartTime'] = cal['dtstart'].dt.time | |
cal['EndTime'] = cal['dtend'].dt.time | |
cal = cal.melt(id_vars=['summary', 'Date', 'dtstart', 'dtend', 'DtCombined', 'MeetingLength'], var_name='Start/End', value_name='TimeOfDay') | |
cal['Busy'] = 1 | |
# conditional create column that we will ultimately resample | |
cal['DateTime'] = np.where(cal['Start/End'] == 'StartTime', cal['dtstart'], cal['dtend']) | |
cal.sort_values(by=['dtstart', 'TimeOfDay'], inplace=True) | |
cal.drop(['dtstart', 'dtend', 'Start/End'], axis=1, inplace=True) | |
cal = cal.groupby('DtCombined').apply(lambda x: x.drop_duplicates('DateTime').set_index('DateTime').resample('30Min').ffill()).reset_index('DtCombined', drop=True).reset_index() | |
cal['TimeOfDay'] = cal['DateTime'].dt.time | |
# resample drops non numeric columns | |
cal = cal.set_index('DateTime').resample('30Min').mean().reset_index() | |
cal['Busy'] = cal['Busy'].fillna(0) | |
cal['Weekday'] = cal['DateTime'].dt.weekday | |
# convert to int b/c models require it | |
#cal['Day'] = cal['DateTime'].apply(lambda time: time.day) | |
cal = cal.set_index('DateTime')['2021-05-01' :'2023-01-08'].reset_index() | |
cal['Hour'] = cal['DateTime'].apply(lambda time: time.hour) | |
cal['Minute'] = cal['DateTime'].apply(lambda time: time.minute) | |
#cal['Hour:Minute'] = cal['Hour'].astype(str) + ':' + cal['Minute'].astype(str) | |
cal.set_index('DateTime', inplace=True) | |
# predict for each weekday | |
for weekday in range(0, 6): | |
print("Weekday {} \n".format(weekday)) | |
group = cal.groupby('Weekday').get_group(weekday) | |
features = group.drop(['Busy'], axis=1) | |
target = group['Busy'] | |
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.3) | |
xgb_model = xgb.XGBClassifier(n_estimators=1000, learning_rate=0.5, objective='reg:squarederror') | |
xgb_model.fit(X_train, y_train) | |
xgb_pred = xgb_model.predict(X_test) | |
score = r2_score(y_test, xgb_pred) | |
r_squared = xgb_model.score(X_test, y_test) | |
rmse = mean_squared_error(y_test, xgb_pred, squared=False) | |
print("XGB Score: {}".format(score)) | |
print("XGB R^2: {}".format(r_squared)) | |
print("XGB RMSE: {} \n".format(rmse)) | |
# low r2 score. Best is 1.0 | |
# Linear Regression | |
#lin_reg = LinearRegression() | |
#lin_reg.fit(X_train, y_train) | |
#y_pred = lin_reg.predict(X_test) | |
#score = r2_score(y_test, y_pred) | |
#r_squared = lin_reg.score(X_test, y_test) | |
#rmse = mean_squared_error(y_test, y_pred, squared=False) | |
#print("Linear Regression Score: {}".format(score)) | |
#print("Linear Regression R^2: {}".format(r_squared)) | |
#print("Linear Regression RMSE: {} \n".format(rmse)) | |
forest_model = RandomForestClassifier(n_estimators=100, min_samples_split=200, random_state=1) | |
forest_model.fit(X_train, y_train) | |
forest_pred = forest_model.predict(X_test) | |
score = r2_score(y_test, forest_pred) | |
r_squared = forest_model.score(X_test, y_test) | |
rmse = mean_squared_error(y_test, forest_pred, squared=False) | |
print("Random Forest Score: {}".format(score)) | |
print("Random Forest R^2: {}".format(r_squared)) | |
print("Random Forest RMSE: {} \n".format(rmse)) | |
# predict probability | |
xgb_pred_prob = xgb_model.predict_proba(X_test) | |
preds = xgb_pred_prob[:,1] | |
preds = pd.DataFrame(preds, columns=['Busy']) | |
preds['Probability'] = preds['Busy'] | |
preds['Busy'] = preds['Probability'].apply(lambda prob: 1 if prob > 0.4 else 0) | |
preds['DateTime'] = pd.to_datetime(y_test.index) | |
preds['Time'] = preds['DateTime'].dt.time#strftime("%H:%M:%S") | |
preds = preds.drop_duplicates('Time') | |
preds.sort_values(by=['Time'], inplace=True) | |
preds.drop('DateTime', axis=1, inplace=True) | |
preds = preds[preds['Time'] > time(9)] | |
preds = preds[preds['Time'] < time(18)] | |
print("Weekday \n {}: {}".format(weekday, preds)) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment