Created
October 11, 2014 20:24
-
-
Save pjankiewicz/85a9fd1f1cb240e3519b to your computer and use it in GitHub Desktop.
generic function (liberty fire)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def model_generic(data, model_f, n_folds=10, feature_selection_f=None,
                  feature_transform_f=None, save_as=None,
                  make_predictions=False, standard_model=False):
    """Cross-validate a model on ``data["train"]`` and optionally predict the leaderboard set.

    Each fold runs the same pipeline: ``factors_target_average`` ->
    optional ``feature_transform_f`` -> optional ``feature_selection_f`` ->
    model fit/predict -> ``normalized_weighted_gini`` scoring. When
    ``make_predictions`` is true, one extra pass fits on the whole of
    ``data["train"]`` and predicts ``data["leaderboard"]``.

    Args:
        data: Mapping with a ``"train"`` entry (and a ``"leaderboard"`` entry
            when ``make_predictions`` is true). Each entry is indexed with
            ``"X"``, ``"y"`` and ``"w"``; ``"X"`` must expose ``.shape`` and
            ``.ix``. (Presumably a pandas DataFrame plus target and weight
            vectors -- confirm against the caller.)
        model_f: If ``standard_model`` is true, an object with
            ``fit(X, y)`` / ``predict(X)``. Otherwise a callable
            ``model_f(X_tr, y_tr, w_tr, X_te, w_te)`` returning
            ``(model, pred_tr, pred_te)``.
        n_folds: Number of folds passed to ``cross_validation.KFold``.
        feature_selection_f: Optional callable ``f(X_tr, y_tr, w_tr)``
            returning the columns to keep.
        feature_transform_f: Optional callable
            ``f(X_tr, y_tr, w_tr, X_te, w_te)`` returning ``(X_tr, X_te)``.
        save_as: Optional output path. If it already exists the run is
            skipped and ``None`` is returned; otherwise the path is touched
            up front and the result is dumped there at the end. ``None``
            disables both the skip check and the dump.
        make_predictions: Whether to run the extra train -> leaderboard pass.
        standard_model: Selects the ``fit``/``predict`` calling convention
            for ``model_f`` (see above). Sample weights are not passed to
            ``fit`` in that mode.

    Returns:
        ``None`` when ``save_as`` already exists, otherwise a dict with the
        out-of-fold predictions, leaderboard predictions (or ``None``),
        per-fold train/test scores, the score of the out-of-fold predictions,
        timing information, the model from the last pass (``None`` for
        standard models) and the feature selections made in each pass.
    """
    # The output file doubles as a "claimed" marker: it is created before any
    # work starts, so a later call with the same path skips the job. Only
    # applies when a path was given -- the default save_as=None used to crash
    # here because os.path.exists(None) raises TypeError.
    if save_as is not None:
        if os.path.exists(save_as):
            print("Skipping", save_as)
            return None
        touch(save_as)

    nobs_tr = data["train"]["X"].shape[0]
    if make_predictions:
        # Looked up before any training so a missing leaderboard set fails early.
        ld_predictions = np.empty(data["leaderboard"]["X"].shape[0])
    else:
        ld_predictions = None

    # Out-of-fold predictions, filled in fold by fold.
    tr_predictions = np.zeros(nobs_tr)
    cv = cross_validation.KFold(n=nobs_tr, n_folds=n_folds, shuffle=True, random_state=13)

    test_results = []
    train_results = []
    features_selected = []

    # One entry per pass: (is_validation, train_key, test_key, fold_indices).
    # The optional final entry trains on everything and predicts the leaderboard.
    my_cv = [(True, "train", "train", (tr_index, te_index)) for tr_index, te_index in cv]
    if make_predictions:
        my_cv.append((False, "train", "leaderboard", None))

    model = None
    start_time = time.time()
    for cv_i, (validation, train_dataset, test_dataset, fold) in enumerate(my_cv):
        print("cv", cv_i + 1)

        train_part = data[train_dataset]
        test_part = data[test_dataset]
        if validation:
            tr_index, te_index = fold
            # NOTE(review): `.ix` was removed in pandas 1.0 and mixes label and
            # positional lookup; kept unchanged so X stays aligned with the
            # y/w indexing below. Confirm index semantics before porting.
            X_tr, X_te = train_part["X"].ix[tr_index, :], test_part["X"].ix[te_index, :]
            y_tr, y_te = train_part["y"][tr_index], test_part["y"][te_index]
            w_tr, w_te = train_part["w"][tr_index], test_part["w"][te_index]
        else:
            X_tr, X_te = train_part["X"], test_part["X"]
            y_tr, y_te = train_part["y"], test_part["y"]
            w_tr, w_te = train_part["w"], test_part["w"]

        # Convert object columns to target averages (per the helper's name;
        # it is handed only the training target, not the test target).
        X_tr, X_te = factors_target_average(X_tr, y_tr, X_te)

        # Feature transform: (X_tr, y_tr, w_tr, X_te, w_te) -> (X_tr, X_te)
        if feature_transform_f is not None:
            X_tr, X_te = feature_transform_f(X_tr, y_tr, w_tr, X_te, w_te)

        # Feature selection: (X_tr, y_tr, w_tr) -> selected columns
        if feature_selection_f is not None:
            selected_cols = feature_selection_f(X_tr, y_tr, w_tr)
            features_selected.append(selected_cols)
            print("features:", sorted(selected_cols))
            X_tr, X_te = X_tr.ix[:, selected_cols], X_te.ix[:, selected_cols]

        # Model: produces predictions for both the training and the test part.
        if standard_model:
            model_f.fit(X_tr, y_tr)
            pred_tr = model_f.predict(X_tr)
            pred_te = model_f.predict(X_te)
            model = None
        else:
            model, pred_tr, pred_te = model_f(X_tr, y_tr, w_tr, X_te, w_te)

        if validation:
            gini_test = normalized_weighted_gini(y_te, pred_te, w_te)
            test_results.append(gini_test)
            print("test", gini_test)

            gini_train = normalized_weighted_gini(y_tr, pred_tr, w_tr)
            train_results.append(gini_train)
            print("train", gini_train)

            # Save holdout predictions for the rows of this fold.
            tr_predictions[te_index] = pred_te
        else:
            ld_predictions = pred_te

        # Drop the per-pass copies before the next pass allocates its own.
        del X_tr, X_te, y_tr, y_te, w_tr, w_te
        gc.collect()

    # Score the stitched-together out-of-fold predictions on the full train set.
    final_test_result = normalized_weighted_gini(
        data["train"]["y"], tr_predictions, data["train"]["w"]
    )
    end_time = time.time()

    print("final test (avg)", np.array(test_results).mean())
    print("final train (avg)", np.array(train_results).mean())

    model_obj = {
        "tr_predictions": tr_predictions,
        "ld_predictions": ld_predictions,
        "test_cv_results": test_results,
        "final_test_result": final_test_result,
        "train_cv_results": train_results,
        "start_time": start_time,
        "end_time": end_time,
        "duration": end_time - start_time,
        "model": model,
        "features_selected": features_selected,
    }

    if save_as is not None:
        tightpickle.dump(model_obj, save_as)

    return model_obj
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment