Last active
November 6, 2018 14:56
-
-
Save hanjae-jea/a105be37e0372144bac76b50414de439 to your computer and use it in GitHub Desktop.
LR 과제
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Define linear regression function
# You may use sklearn.linear_model.LinearRegression
# Your code here
def LinReg(X, y):
    """Fit an ordinary least-squares model on (X, y) and return the fitted estimator."""
    # LinearRegression.fit returns the estimator itself, so fitting and
    # returning collapse into a single expression.
    return LinearRegression().fit(X, y)
# End your code
# Basic settings. DO NOT MODIFY
selected_feature = []
sel_num = 100
valid_split = 1/5
cv = ShuffleSplit(n_splits=5, test_size=valid_split, random_state=0)
selected_train_error = []
selected_valid_error = []

# Greedy forward selection: on each pass, add the single remaining feature
# whose inclusion yields the lowest mean cross-validation error.
for sel in range(sel_num):
    # float('inf') sentinels replace the magic "+1000" cap, which would
    # silently pick feature 0 if every candidate's error exceeded 1000.
    min_train_error = float('inf')
    min_valid_error = float('inf')
    min_feature = 0
    # For each candidate feature
    for i in range(X_dev.shape[1]):
        train_error_ith = []
        valid_error_ith = []
        # Skip features already chosen — selected_feature must stay duplicate-free.
        if i in selected_feature:
            continue
        # Candidate design matrix: current selection plus feature i.
        X_dev_fs = X_dev[:, selected_feature + [i]]
        # Cross validation over the shuffled splits (split only needs row count,
        # so splitting on X_dev and indexing X_dev_fs is consistent).
        for train_index, test_index in cv.split(X_dev):
            X_train, X_valid = X_dev_fs[train_index], X_dev_fs[test_index]
            y_train, y_valid = y_dev[train_index], y_dev[test_index]
            # Fit on the training fold, record MSE on both folds
            # (documented argument order: y_true first).
            reg = LinReg(X_train, y_train)
            train_error_ith.append(mean_squared_error(y_train, reg.predict(X_train)))
            valid_error_ith.append(mean_squared_error(y_valid, reg.predict(X_valid)))
        # Keep the feature with the minimum mean cross-validation error.
        t_e_avg = sum(train_error_ith) / len(train_error_ith)
        v_e_avg = sum(valid_error_ith) / len(valid_error_ith)
        if v_e_avg < min_valid_error:
            min_feature = i
            min_train_error = t_e_avg
            min_valid_error = v_e_avg
    print('='*50)
    print("# of selected feature(s) : {}".format(sel+1))
    print("Selected feature of this iteration : {}".format(min_feature))
    selected_feature.append(min_feature)
    selected_train_error.append(min_train_error)
    selected_valid_error.append(min_valid_error)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Select optimal feature set corresponding to the minimum cross validation error
# Your code here
# selected_valid_error[k] is the CV error obtained with the first k+1 selected
# features, so the optimal prefix must INCLUDE the feature at the argmin index
# (original sliced [0:index], dropping one feature, and referenced an undefined
# name `optimal_feature`).
index = selected_valid_error.index(min(selected_valid_error))
optimal_feature = selected_feature[0:index + 1]
selected_feature = optimal_feature
X_dev_fs = X_dev[:, optimal_feature]
X_test_fs = X_test[:, optimal_feature]
# End your code
# Basic settings. DO NOT MODIFY
min_train_error = 1000
min_valid_error = 1000
optimal_param = np.array([])
# Guard: ensure test_error exists even if no fold beats the 1000 sentinel,
# so the final print cannot raise NameError.
test_error = float('nan')
for train_index, test_index in cv.split(X_dev):
    X_train, X_valid = X_dev_fs[train_index], X_dev_fs[test_index]
    y_train, y_valid = y_dev[train_index], y_dev[test_index]
    # Derive training error, validation error for each fold and compare the
    # fold's validation error against the best seen so far.
    # Your code here
    reg = LinReg(X_train, y_train)
    train_error = mean_squared_error(y_train, reg.predict(X_train))
    valid_error = mean_squared_error(y_valid, reg.predict(X_valid))
    if valid_error < min_valid_error:
        min_train_error = train_error
        min_valid_error = valid_error
        optimal_param = reg.coef_
        # Test error is re-evaluated whenever a better fold model is found,
        # so the last assignment belongs to the overall best model.
        test_error = mean_squared_error(y_test, reg.predict(X_test_fs))
    # End your code
# Report features of final model
print("Results")
print("# of selected features : {}".format(len(selected_feature)))
print("Selected features : ")
print(selected_feature)
# Report test error and accuracy
print("Training error : {}".format(min_train_error))
print("Validation error : {}".format(min_valid_error))
print("Test error : {}".format(test_error))
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Please use below function
# logreg = LogisticRegression(C=coef, solver='lbfgs', max_iter=500)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss

coefs = [0.01, 0.05, 0.1, 0.5, 1, 10, 100]
opt_coef = 1
# To plot losses on training and validation sets with varied parameter settings,
# save them on lists.
loss_tr, loss_va = [], []
# Your code here
# Sweep the regularization strengths; for each C, fit on the training set and
# record the log-loss on both the training and validation sets.
for c in coefs:
    model = LogisticRegression(C=c, solver='lbfgs', max_iter=500)
    model.fit(train, Y_tr)
    train_probs = model.predict_proba(train)
    valid_probs = model.predict_proba(valid)
    loss_tr.append(log_loss(Y_tr, train_probs))
    loss_va.append(log_loss(Y_va, valid_probs))
# End your code
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Your code here
from sklearn.metrics import accuracy_score

# Refit with the chosen regularization strength and score on the test set.
coef = 0.1
final_model = LogisticRegression(C=coef, solver='lbfgs', max_iter=500)
final_model.fit(train, Y_tr)
test_probs = final_model.predict_proba(test)
test_loss = log_loss(Y_te, test_probs)
predictions = final_model.predict(test)
# accuracy_score is symmetric in its two label arguments, so the documented
# (y_true, y_pred) order yields the identical value.
test_acc = accuracy_score(Y_te, predictions)
# End your code
# print regularization parameter of final model and report test loss and accuracy
print("Optimal : {}, Loss : {:2.3f}, Accuracy : {:3.2f}".format(coef, test_loss, test_acc*100))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment