LR Assignment
# Imports required by this snippet
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import ShuffleSplit

# Define linear regression function
# You may use sklearn.linear_model.LinearRegression
# Your code here
def LinReg(X, y):
    r = LinearRegression()
    return r.fit(X, y)
# End your code
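# Quick sanity check of LinReg on synthetic data (illustrative only, not part
# of the assignment; uses only numpy and scikit-learn as imported above).
_X_demo = np.random.rand(20, 3)
_y_demo = _X_demo @ np.array([1.0, 2.0, 3.0])
print(LinReg(_X_demo, _y_demo).coef_)  # should recover roughly [1. 2. 3.]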
# Basic settings. DO NOT MODIFY
selected_feature = []
sel_num = 100
valid_split = 1/5
cv = ShuffleSplit(n_splits=5, test_size=valid_split, random_state=0)
selected_train_error = []
selected_valid_error = []
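# Illustration only (not part of the assignment): cv.split yields n_splits
# (train_index, valid_index) pairs, each holding out valid_split of the rows.
# With a 10-row toy array:
for _tr_idx, _va_idx in cv.split(np.zeros((10, 1))):
    print(len(_tr_idx), len(_va_idx))  # prints "8 2" five times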
# For greedy selection
for sel in range(sel_num):
    min_train_error = 1000
    min_valid_error = 1000
    min_feature = 0
    # For each feature
    for i in range(X_dev.shape[1]):
        train_error_ith = []
        valid_error_ith = []
        # Select features greedily
        # Hint : There should be no duplicated feature in selected_feature
        # Your code here
        if i in selected_feature:
            continue
        X_dev_fs = X_dev[:, selected_feature + [i]]
        # End your code
        # For cross validation
        for train_index, test_index in cv.split(X_dev):
            X_train, X_valid = X_dev_fs[train_index], X_dev_fs[test_index]
            y_train, y_valid = y_dev[train_index], y_dev[test_index]
            # Derive training error, validation error
            # You may use sklearn.metrics.mean_squared_error, model.fit(), model.predict()
            # Your code here
            reg = LinReg(X_train, y_train)
            train_pred = reg.predict(X_train)
            train_err = mean_squared_error(y_train, train_pred)
            valid_pred = reg.predict(X_valid)
            valid_err = mean_squared_error(y_valid, valid_pred)
            train_error_ith.append(train_err)
            valid_error_ith.append(valid_err)
            # End your code
        # Keep the best-performing candidate feature seen so far
        # You should choose the feature which has the minimum mean cross-validation error
        # Your code here
        t_e_avg = sum(train_error_ith) / len(train_error_ith)
        v_e_avg = sum(valid_error_ith) / len(valid_error_ith)
        if min_valid_error > v_e_avg:
            min_feature = i
            min_train_error = t_e_avg
            min_valid_error = v_e_avg
        # End your code
    print('=' * 50)
    print("# of selected feature(s) : {}".format(sel + 1))
    print("Selected feature of this iteration : {}".format(min_feature))
    selected_feature.append(min_feature)
    selected_train_error.append(min_train_error)
    selected_valid_error.append(min_valid_error)
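# Optional visualization (not part of the original gist; assumes matplotlib is
# installed): plot the mean cross-validation errors against the number of
# selected features to see where the validation error bottoms out.
import matplotlib.pyplot as plt
plt.plot(range(1, sel_num + 1), selected_train_error, label='train error')
plt.plot(range(1, sel_num + 1), selected_valid_error, label='validation error')
plt.xlabel('# of selected features')
plt.ylabel('mean squared error')
plt.legend()
plt.show()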
# Select the optimal feature set corresponding to the minimum cross-validation error
# Your code here
index = selected_valid_error.index(min(selected_valid_error))
# Keep every feature up to and including the iteration with the minimum error
selected_feature = selected_feature[:index + 1]
optimal_feature = selected_feature
X_dev_fs = X_dev[:, optimal_feature]
X_test_fs = X_test[:, optimal_feature]
# End your code
# Basic settings. DO NOT MODIFY
min_train_error = 1000
min_valid_error = 1000
optimal_param = np.array([])
for train_index, test_index in cv.split(X_dev):
    X_train, X_valid = X_dev_fs[train_index], X_dev_fs[test_index]
    y_train, y_valid = y_dev[train_index], y_dev[test_index]
    # Derive training error, validation error for each fold
    # For each fold, you need to compare the error with the previous minimum error.
    # Your code here
    reg = LinReg(X_train, y_train)
    train_error = mean_squared_error(y_train, reg.predict(X_train))
    valid_error = mean_squared_error(y_valid, reg.predict(X_valid))
    if min_valid_error > valid_error:
        min_train_error = train_error
        min_valid_error = valid_error
        optimal_param = reg.coef_
        test_error = mean_squared_error(y_test, reg.predict(X_test_fs))
    # End your code
# Find the best model across the folds
# Derive the test error with the best-performing model
# Your code here
# (test_error is already computed inside the loop above whenever a new best
# fold model is found; an equivalent manual prediction, ignoring the
# intercept term, would be:)
# res = [np.dot(optimal_param, X_test_fs[dev, :]) for dev in range(len(X_test_fs))]
# End your code
# Report the features of the final model
print("Results")
print("# of selected features : {}".format(len(selected_feature)))
print("Selected features : ")
print(selected_feature)
# Report training, validation, and test error
print("Training error : {}".format(min_train_error))
print("Validation error : {}".format(min_valid_error))
print("Test error : {}".format(test_error))
# Please use the function below
# logreg = LogisticRegression(C=coef, solver='lbfgs', max_iter=500)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
coefs = [0.01, 0.05, 0.1, 0.5, 1, 10, 100]
opt_coef = 1
# To plot losses on training and validation sets with varied parameter settings,
# save them in lists.
loss_tr, loss_va = [], []
# Your code here
for coef in coefs:
    lgr = LogisticRegression(C=coef, solver='lbfgs', max_iter=500)
    lgr.fit(train, Y_tr)
    loss_tr.append(log_loss(Y_tr, lgr.predict_proba(train)))
    loss_va.append(log_loss(Y_va, lgr.predict_proba(valid)))
# End your code
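# Plot the saved losses against the regularization parameter, as the comment
# above suggests. A minimal sketch assuming matplotlib is available; the x-axis
# is logarithmic since the C values span several orders of magnitude.
import matplotlib.pyplot as plt
plt.plot(coefs, loss_tr, marker='o', label='training loss')
plt.plot(coefs, loss_va, marker='o', label='validation loss')
plt.xscale('log')
plt.xlabel('C (inverse regularization strength)')
plt.ylabel('log loss')
plt.legend()
plt.show()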
# Your code here
from sklearn.metrics import accuracy_score
# Pick the regularization parameter with the lowest validation loss
opt_coef = coefs[loss_va.index(min(loss_va))]
lgr = LogisticRegression(C=opt_coef, solver='lbfgs', max_iter=500)
lgr.fit(train, Y_tr)
test_loss = log_loss(Y_te, lgr.predict_proba(test))
Y_pr = lgr.predict(test)
test_acc = accuracy_score(Y_te, Y_pr)
# End your code
# Print the regularization parameter of the final model, and report test loss and accuracy
print("Optimal : {}, Loss : {:2.3f}, Accuracy : {:3.2f}".format(opt_coef, test_loss, test_acc * 100))