Skip to content

Instantly share code, notes, and snippets.

@pjankiewicz
Created October 11, 2014 20:24
Show Gist options
  • Save pjankiewicz/85a9fd1f1cb240e3519b to your computer and use it in GitHub Desktop.
Save pjankiewicz/85a9fd1f1cb240e3519b to your computer and use it in GitHub Desktop.
generic function (liberty fire)
def model_generic(data, model_f, n_folds=10, feature_selection_f=None, feature_transform_f=None, save_as=None, make_predictions=False, standard_model=False):
    """Cross-validate a model on ``data["train"]`` and optionally score the leaderboard set.

    Every run (each CV fold, plus one final full-train run when
    ``make_predictions`` is true) goes through the same pipeline:
    ``factors_target_average`` -> optional ``feature_transform_f`` ->
    optional ``feature_selection_f`` -> model fit/predict.

    Args:
        data: Mapping with a ``"train"`` entry (and a ``"leaderboard"`` entry
            when ``make_predictions`` is true). Each entry is indexed with the
            keys ``"X"``, ``"y"`` and ``"w"``. Note that ``"y"`` and ``"w"``
            are read from the leaderboard entry as well, even though the
            leaderboard ``"y"`` is never used for scoring.
        model_f: When ``standard_model`` is true, an object exposing
            ``fit(X, y)`` and ``predict(X)``. Otherwise a callable
            ``model_f(X_tr, y_tr, w_tr, X_te, w_te)`` returning
            ``(model, pred_tr, pred_te)``.
        n_folds: Number of shuffled K-fold splits (fixed ``random_state=13``).
        feature_selection_f: Optional callable ``f(X_tr, y_tr, w_tr)``
            returning the columns to keep for both train and test.
        feature_transform_f: Optional callable
            ``f(X_tr, y_tr, w_tr, X_te, w_te)`` returning ``(X_tr, X_te)``.
        save_as: Optional output path. If the path already exists the run is
            skipped; otherwise it is created via ``touch`` up front and the
            result is written there with ``tightpickle.dump`` at the end.
            ``None`` disables both the skip check and the saving.
        make_predictions: If true, additionally fit on the whole training set
            and predict the leaderboard set.
        standard_model: Selects how ``model_f`` is invoked (see ``model_f``).

    Returns:
        ``None`` when ``save_as`` already exists. Otherwise a dict with the
        out-of-fold predictions, leaderboard predictions (or ``None``),
        per-fold train/test gini scores, the gini of the pooled out-of-fold
        predictions, timing information, the model returned by the last run
        (``None`` for ``standard_model``) and the per-run selected features.
    """
    # Fix: the default save_as=None used to be handed straight to
    # os.path.exists()/touch(), which raises TypeError. Only use the marker
    # file when a path was actually given.
    if save_as is not None:
        if os.path.exists(save_as):
            print("Skipping",save_as)
            return None
        # The file is created before training starts, so a later call with
        # the same path is skipped even while this run is still in progress.
        touch(save_as)

    nobs_tr = data["train"]["X"].shape[0]
    # Placeholder for leaderboard predictions; replaced by the final run.
    if make_predictions:
        ld_predictions = np.empty(data["leaderboard"]["X"].shape[0])
    else:
        ld_predictions = None
    # Out-of-fold predictions, filled fold by fold.
    tr_predictions = np.zeros(nobs_tr)

    cv = cross_validation.KFold(n=nobs_tr, n_folds=n_folds, shuffle=True, random_state=13)

    test_results = []
    train_results = []
    features_selected = []

    # One plan entry per run: (validation?, train set, test set, fold indices).
    # The leaderboard run trains on everything, so it carries no indices.
    runs = [(True, "train", "train", fold) for fold in cv]
    if make_predictions:
        runs.append((False, "train", "leaderboard", None))

    model = None
    start_time = time.time()
    for cv_i, (validation, train_dataset, test_dataset, fold) in enumerate(runs):
        print("cv",cv_i+1)
        train_part, test_part = data[train_dataset], data[test_dataset]

        if validation:
            tr_index, te_index = fold
            # NOTE(review): .ix is kept because the file targets the older
            # pandas/scikit-learn generation; fold indices are positional, so
            # this presumably relies on a default integer index - confirm.
            X_tr, X_te = train_part["X"].ix[tr_index,:], test_part["X"].ix[te_index,:]
            y_tr, y_te = train_part["y"][tr_index], test_part["y"][te_index]
            w_tr, w_te = train_part["w"][tr_index], test_part["w"][te_index]
        else:
            X_tr, X_te = train_part["X"], test_part["X"]
            y_tr, y_te = train_part["y"], test_part["y"]
            w_tr, w_te = train_part["w"], test_part["w"]

        # convert object to target averages
        X_tr, X_te = factors_target_average(X_tr,y_tr,X_te)

        # FEATURE TRANSFORM
        # inputs X_tr,y_tr,w_tr,X_te,w_te
        # outputs X_tr, X_te
        if feature_transform_f is not None:
            X_tr, X_te = feature_transform_f(X_tr,y_tr,w_tr,X_te,w_te)

        # FEATURE SELECTION
        # inputs X_tr, y_tr, w_tr
        # outputs selected_cols
        if feature_selection_f is not None:
            selected_cols = feature_selection_f(X_tr,y_tr,w_tr)
            features_selected.append(selected_cols)
            print("features:",sorted(selected_cols))
            X_tr, X_te = X_tr.ix[:,selected_cols], X_te.ix[:,selected_cols]

        # MODEL
        # inputs X_tr, y_tr, w_tr, X_te, y_te, w_te
        # outputs pred_tr, pred_te
        if standard_model:
            # Sample weights are not passed to fit() on this path.
            model_f.fit(X_tr,y_tr)
            pred_tr = model_f.predict(X_tr)
            pred_te = model_f.predict(X_te)
            model = None
        else:
            model, pred_tr, pred_te = model_f(X_tr, y_tr, w_tr, X_te, w_te)

        if validation:
            gini_test = normalized_weighted_gini(y_te, pred_te, w_te)
            test_results.append(gini_test)
            print("test",gini_test)
            gini_train = normalized_weighted_gini(y_tr, pred_tr, w_tr)
            train_results.append(gini_train)
            print("train",gini_train)
            # save holdout predictions
            tr_predictions[te_index] = pred_te
        else:
            ld_predictions = pred_te

        # Drop this run's frames before the next one is built.
        del X_tr, X_te, y_tr, y_te, w_tr, w_te
        gc.collect()

    # Score the pooled out-of-fold predictions against the full training target.
    final_test_result = normalized_weighted_gini(
        data["train"]["y"], tr_predictions, data["train"]["w"]
    )
    end_time = time.time()

    print("final test (avg)",np.array(test_results).mean())
    print("final train (avg)",np.array(train_results).mean())

    model_obj = {
        "tr_predictions": tr_predictions,
        "ld_predictions": ld_predictions,
        "test_cv_results": test_results,
        "final_test_result": final_test_result,
        "train_cv_results": train_results,
        "start_time": start_time,
        "end_time": end_time,
        "duration": end_time - start_time,
        "model": model,
        "features_selected": features_selected,
    }

    if save_as is not None:
        tightpickle.dump(model_obj,save_as)
    return model_obj
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment