Microsoft Outlook Export
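# Predict "busy" 30-minute calendar slots from a Microsoft Outlook calendar export.
# Expected CSV columns (per the code below): Subject, Start Date, Start Time,
# End Time, All day event.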
import pandas as pd
import numpy as np
from datetime import time
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.linear_model import Lasso, LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, r2_score, mean_squared_error, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
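# Combining ['Start Date', 'Start Time'] and ['Start Date', 'End Time'] in
# parse_dates yields two parsed datetime columns named
# 'Start Date_Start Time' and 'Start Date_End Time'.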
cal = pd.read_csv('emily_cal.csv', parse_dates=[['Start Date', 'Start Time'], ['Start Date', 'End Time']])
# drop noise events and all-day events
cal = cal.query("Subject not in ('Pick up Momo')")
cal['all_day'] = cal['All day event'].map({'FALSE': False, 'TRUE': True})
cal = cal.query("all_day == False")
cal = cal.drop_duplicates()
cal = cal[['Subject', 'Start Date_Start Time', 'Start Date_End Time']]
cal.rename(columns={'Start Date_Start Time': 'dtstart', 'Start Date_End Time': 'dtend'}, inplace=True)
cal['dtstart'] = pd.to_datetime(cal['dtstart'])
cal['dtend'] = pd.to_datetime(cal['dtend'])
# create a unique column we can group and resample over
cal['DtCombined'] = cal['dtstart'].astype(str) + '_' + cal['dtend'].astype(str)
# timestamps that failed to parse (e.g. Zulu/UTC-suffixed) are NaT; drop them
cal = cal.dropna()
cal['MeetingLength'] = cal['dtend'] - cal['dtstart']
cal['StartTime'] = cal['dtstart'].dt.time
cal['EndTime'] = cal['dtend'].dt.time
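# Melt so each meeting contributes two rows: one for its start time and one
# for its end time; 'Start/End' records which, 'TimeOfDay' holds the value.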
cal = cal.melt(id_vars=['Subject', 'dtstart', 'dtend', 'DtCombined', 'MeetingLength'], var_name='Start/End', value_name='TimeOfDay')
cal['Busy'] = 1
# pick the matching timestamp for each melted row; this is the column we ultimately resample on
cal['DateTime'] = np.where(cal['Start/End'] == 'StartTime', cal['dtstart'], cal['dtend'])
cal.sort_values(by=['dtstart', 'TimeOfDay'], inplace=True)
cal.drop(['dtstart', 'dtend', 'Start/End'], axis=1, inplace=True)
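# Expand each meeting onto a 30-minute grid: within each meeting (keyed by
# DtCombined), resample start..end and forward-fill so every covered
# half-hour slot carries Busy == 1.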
cal = (
    cal.groupby('DtCombined')
    .apply(lambda x: x.drop_duplicates('DateTime').set_index('DateTime').resample('30Min').ffill())
    .reset_index('DtCombined', drop=True)
    .reset_index()
)
cal['TimeOfDay'] = cal['DateTime'].dt.time
# resample().mean() drops non-numeric columns
cal = cal.set_index('DateTime').resample('30Min').mean().reset_index()
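# resampling the full range introduces rows for empty slots with Busy == NaN;
# treat those as free time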
cal['Busy'] = cal['Busy'].fillna(0)
cal['Weekday'] = cal['DateTime'].dt.weekday
# restrict to the date window of interest
cal = cal.set_index('DateTime')['2021-05-01':'2023-01-08'].reset_index()
# numeric time-of-day features; the models require numeric inputs
cal['Hour'] = cal['DateTime'].dt.hour
cal['Minute'] = cal['DateTime'].dt.minute
cal.set_index('DateTime', inplace=True)
# predict for each weekday
DateIndex = {
    0: 'Monday',
    1: 'Tuesday',
    2: 'Wednesday',
    3: 'Thursday',
    4: 'Friday',
}
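# For each weekday: fit several classifiers on (Weekday, Hour, Minute) -> Busy,
# compare metrics, then print the predicted busy probability for every
# 30-minute slot between 9:00 and 18:00.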
for weekday in range(0, 5):
    print("{}".format(DateIndex[weekday]))
    group = cal.groupby('Weekday').get_group(weekday)
    features = group.drop(['Busy'], axis=1)
    target = group['Busy']
    X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.3)

    # 'binary:logistic' is the appropriate objective for a binary XGBClassifier
    xgb_model = xgb.XGBClassifier(n_estimators=1000, learning_rate=0.5, objective='binary:logistic')
    xgb_model.fit(X_train, y_train)
    xgb_pred = xgb_model.predict(X_test)
    score = r2_score(y_test, xgb_pred)
    accuracy = xgb_model.score(X_test, y_test)  # classifier .score() is accuracy, not R^2
    rmse = mean_squared_error(y_test, xgb_pred, squared=False)
    print("XGB R^2: {}".format(score))
    print("XGB accuracy: {}".format(accuracy))
    print("XGB RMSE: {} \n".format(rmse))
    # low R^2 score. Best is 1.0
    # Linear Regression
    #lin_reg = LinearRegression()
    #lin_reg.fit(X_train, y_train)
    #y_pred = lin_reg.predict(X_test)
    #score = r2_score(y_test, y_pred)
    #r_squared = lin_reg.score(X_test, y_test)
    #rmse = mean_squared_error(y_test, y_pred, squared=False)
    #print("Linear Regression Score: {}".format(score))
    #print("Linear Regression R^2: {}".format(r_squared))
    #print("Linear Regression RMSE: {} \n".format(rmse))
    forest_model = RandomForestClassifier(n_estimators=100, min_samples_split=200, random_state=1)
    forest_model.fit(X_train, y_train)
    forest_pred = forest_model.predict(X_test)
    score = r2_score(y_test, forest_pred)
    accuracy = forest_model.score(X_test, y_test)
    rmse = mean_squared_error(y_test, forest_pred, squared=False)
    print("Random Forest R^2: {}".format(score))
    print("Random Forest accuracy: {}".format(accuracy))
    print("Random Forest RMSE: {} \n".format(rmse))

    knn_model = KNeighborsClassifier(n_neighbors=6)
    knn_model.fit(X_train, y_train)
    knn_pred = knn_model.predict(X_test)
    score = r2_score(y_test, knn_pred)
    accuracy = knn_model.score(X_test, y_test)
    rmse = mean_squared_error(y_test, knn_pred, squared=False)
    print("KNN R^2: {}".format(score))
    print("KNN accuracy: {}".format(accuracy))
    print("KNN RMSE: {} \n".format(rmse))
    # predict probability at each 30 min interval
    xgb_pred_prob = xgb_model.predict_proba(X_test)
    preds = pd.DataFrame(xgb_pred_prob[:, 1], columns=['Probability'])  # positive-class probability
    preds['Busy'] = (preds['Probability'] > 0.5).astype(int)
    preds['DateTime'] = pd.to_datetime(y_test.index)
    preds['Time'] = preds['DateTime'].dt.time
    preds = preds.drop_duplicates('Time')
    preds.sort_values(by=['Time'], inplace=True)
    preds.drop('DateTime', axis=1, inplace=True)
    # working hours only
    preds = preds[preds['Time'] >= time(9)]
    preds = preds[preds['Time'] <= time(18)]
    preds.rename(columns={'Time': '{} Time'.format(DateIndex[weekday])}, inplace=True)
    preds = preds.set_index('{} Time'.format(DateIndex[weekday]))
    print("{}".format(preds))
    logreg = LogisticRegression()
    logreg.fit(X_train, y_train)
    logreg_pred = logreg.predict(X_test)
    logreg_pred_probs = logreg.predict_proba(X_test)[:, 1]  # slice positive class
    # note: LogisticRegression is not as good at predicting the busy probability as KNN or XGB
    print("ROC AUC: {}".format(roc_auc_score(y_test, logreg_pred_probs)))

    # Lasso coefficients were not informative
    #lasso = Lasso(alpha=0.3)
    #lasso.fit(features, target)
    #lasso_coef = lasso.coef_
    #print("Lasso coef: {}".format(lasso_coef))
print("XGB: Model performance?") | |
print(confusion_matrix(y_test, xgb_pred)) | |
print(classification_report(y_test, xgb_pred)) | |
print("KNN: Model performance?") | |
print(confusion_matrix(y_test, knn_pred)) | |
print(classification_report(y_test, knn_pred)) | |
#plt.bar(["Weekday", "Hour", "Minute"], lasso_coef) | |
#plt.xticks(rotation=45) | |
#plt.show() | |
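# Usage sketch: after the loop, xgb_model holds the last weekday's (Friday)
# model. Hypothetical example: the probability that a Friday 14:30 slot is
# busy; column names and order must match `features` above (Weekday, Hour, Minute).
slot = pd.DataFrame({'Weekday': [4], 'Hour': [14], 'Minute': [30]})
print("P(busy Friday 14:30): {}".format(xgb_model.predict_proba(slot)[:, 1][0]))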