Skip to content

Instantly share code, notes, and snippets.

@snewcomer
Last active January 20, 2023 03:20
Show Gist options
  • Save snewcomer/67aadcd66e2dc1b53fedc84462b187c9 to your computer and use it in GitHub Desktop.
Save snewcomer/67aadcd66e2dc1b53fedc84462b187c9 to your computer and use it in GitHub Desktop.
Microsoft Outlook Export
import pandas as pd
import numpy as np
from datetime import datetime, time
from sklearn.linear_model import Lasso, Ridge, LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb
# Build `cal`: one row per 30-minute slot, indexed by DateTime, with columns
# Busy (1 when a meeting covers the slot, else 0), Weekday, Hour and Minute.

# Load the raw export. The date and time columns are combined explicitly
# further down instead of through read_csv's nested ``parse_dates`` lists,
# which are deprecated in pandas 2.x.
cal = pd.read_csv('emily_cal.csv')

# clean summary: exclude the 'Pick up Momo' entries
cal = cal[cal['Subject'] != 'Pick up Momo']

# Keep timed (non all-day) events only. The flag is normalised to upper-case
# text first because read_csv may already have parsed TRUE/FALSE into real
# booleans, in which case a lookup keyed on the strings 'FALSE'/'TRUE' would
# match nothing and every row would be filtered out.
all_day_flag = cal['All day event'].astype(str).str.strip().str.upper()
cal = cal[all_day_flag == 'FALSE']
cal = cal.drop_duplicates()

# Rows without a date or time cannot be turned into timestamps; drop them
# explicitly so that genuine parse errors below still surface.
cal = cal.dropna(subset=['Start Date', 'Start Time', 'End Time'])

# NOTE(review): the end timestamp is built from 'Start Date' + 'End Time'
# (unchanged from the original), so a meeting running past midnight ends
# before it starts -- confirm whether the export has an end-date column
# that should be used instead.
start_text = cal['Start Date'].astype(str) + ' ' + cal['Start Time'].astype(str)
end_text = cal['Start Date'].astype(str) + ' ' + cal['End Time'].astype(str)
cal = pd.DataFrame({
    'Subject': cal['Subject'],
    'dtstart': pd.to_datetime(start_text),
    'dtend': pd.to_datetime(end_text),
})

# create unique column we can group and resample over
cal['DtCombined'] = cal['dtstart'].astype(str) + '_' + cal['dtend'].astype(str)
# still might have problems: drop any row with a missing value
cal = cal.dropna()
cal['MeetingLength'] = cal['dtend'] - cal['dtstart']
cal['StartTime'] = cal['dtstart'].dt.time
cal['EndTime'] = cal['dtend'].dt.time

# Reshape to two rows per meeting (one for its start, one for its end).
cal = cal.melt(
    id_vars=['Subject', 'dtstart', 'dtend', 'DtCombined', 'MeetingLength'],
    var_name='Start/End',
    value_name='TimeOfDay',
)
cal['Busy'] = 1
# conditional create column that we will ultimately resample
cal['DateTime'] = cal['dtstart'].where(cal['Start/End'] == 'StartTime', cal['dtend'])
cal = cal.sort_values(by=['dtstart', 'TimeOfDay'])
cal = cal.drop(['dtstart', 'dtend', 'Start/End'], axis=1)


def _fill_meeting_slots(meeting):
    """Expand one meeting's start/end rows into forward-filled 30-minute rows."""
    return (
        meeting.drop_duplicates('DateTime')
        .set_index('DateTime')
        .resample('30min')
        .ffill()
    )


cal = (
    cal.groupby('DtCombined')
    .apply(_fill_meeting_slots)
    .reset_index('DtCombined', drop=True)
    .reset_index()
)

# Average Busy per 30-minute slot across the whole calendar. Only the numeric
# Busy column is selected: pandas >= 2.0 raises on mean() of string/time
# columns instead of silently dropping them as older versions did.
cal = cal.set_index('DateTime')[['Busy']].resample('30min').mean().reset_index()
# Slots that no meeting touched come back as NaN; treat them as free.
cal['Busy'] = cal['Busy'].fillna(0)
cal['Weekday'] = cal['DateTime'].dt.weekday

# Restrict the data to the analysed date window.
cal = cal.set_index('DateTime').loc['2021-05-01':'2023-01-08'].reset_index()
# Integer time-of-day features for the models (the .dt accessors avoid a
# lambda whose parameter shadowed the imported ``time`` class).
cal['Hour'] = cal['DateTime'].dt.hour
cal['Minute'] = cal['DateTime'].dt.minute
cal.set_index('DateTime', inplace=True)
# predict for each weekday
# Lookup from weekday number (0 = Monday ... 4 = Friday) to the day name
# used in the printed report headings.
DateIndex = dict(enumerate(('Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday')))
def _fit_and_report(label, model, X_train, X_test, y_train, y_test):
    """Fit ``model``, print its hold-out metrics and return its predictions.

    Prints, each prefixed with ``label``: the R^2 of the predicted labels,
    the classifier's accuracy (what ``model.score`` returns for a classifier)
    and the RMSE of the predicted labels.
    """
    model.fit(X_train, y_train)
    predicted = model.predict(X_test)
    # np.sqrt(MSE) instead of mean_squared_error(..., squared=False): the
    # ``squared`` argument has been removed from recent scikit-learn.
    rmse = np.sqrt(mean_squared_error(y_test, predicted))
    print("{} R^2: {}".format(label, r2_score(y_test, predicted)))
    print("{} Accuracy: {}".format(label, model.score(X_test, y_test)))
    print("{} RMSE: {} \n".format(label, rmse))
    return predicted


# Group once rather than rebuilding the groupby on every iteration.
by_weekday = cal.groupby('Weekday')

for weekday in range(0, 5):
    day_name = DateIndex[weekday]
    print("{}".format(day_name))
    if weekday not in by_weekday.groups:
        print("No calendar rows for {}; skipping".format(day_name))
        continue

    group = by_weekday.get_group(weekday)
    features = group.drop(['Busy'], axis=1)
    # Busy holds 0.0/1.0 floats; the classifiers need discrete class labels.
    target = group['Busy'].astype(int)
    # Seeded so the printed metrics are reproducible from run to run.
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.3, random_state=1
    )

    # A classifier needs a classification objective: with the previous
    # 'reg:squarederror' the "probabilities" from predict_proba were
    # unbounded regression outputs rather than values in [0, 1].
    xgb_model = xgb.XGBClassifier(
        n_estimators=1000, learning_rate=0.5, objective='binary:logistic'
    )
    xgb_pred = _fit_and_report("XGB", xgb_model, X_train, X_test, y_train, y_test)
    # low r2 score. Best is 1.0
    # (A LinearRegression baseline was also tried here and left disabled.)

    forest_model = RandomForestClassifier(
        n_estimators=100, min_samples_split=200, random_state=1
    )
    _fit_and_report("Random Forest", forest_model, X_train, X_test, y_train, y_test)

    knn_model = KNeighborsClassifier(n_neighbors=6)
    knn_pred = _fit_and_report("KNN", knn_model, X_train, X_test, y_train, y_test)

    # predict probability at each 30 min interval
    time_label = '{} Time'.format(day_name)
    busy_prob = xgb_model.predict_proba(X_test)[:, 1]  # slice positive class
    preds = pd.DataFrame({
        'Busy': (busy_prob > 0.5).astype(int),
        'Probability': busy_prob,
        'Time': pd.to_datetime(y_test.index).time,
    })
    # One row per time of day, in chronological order.
    preds = preds.drop_duplicates('Time').sort_values(by=['Time'])
    # Only report the 09:00-18:00 window.
    preds = preds[(preds['Time'] >= time(9)) & (preds['Time'] <= time(18))]
    preds = preds.rename(columns={'Time': time_label}).set_index(time_label)
    print("{}".format(preds))

    logreg = LogisticRegression()
    logreg.fit(X_train, y_train)
    logreg_pred_probs = logreg.predict_proba(X_test)[:, 1]  # slice positive class
    # note LogisticRegression is not as good as predicting a chance model like KNN or XGB
    print("ROC AUC: {}".format(roc_auc_score(y_test, logreg_pred_probs)))
    # A Lasso fit (alpha=0.3) on Weekday/Hour/Minute was also tried: no
    # indicative coefficients, so it and its bar plot are left out.

    print("XGB: Model performance?")
    print(confusion_matrix(y_test, xgb_pred))
    print(classification_report(y_test, xgb_pred))
    print("KNN: Model performance?")
    print(confusion_matrix(y_test, knn_pred))
    print(classification_report(y_test, knn_pred))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment