Last active
November 6, 2018 14:56
-
-
Save hanjae-jea/a105be37e0372144bac76b50414de439 to your computer and use it in GitHub Desktop.
LR 과제
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Define linear regression function
# You may use sklearn.linear_model.LinearRegression
# Your code here
def LinReg(X, y):
    """Fit an ordinary least-squares model on (X, y) and return the fitted estimator."""
    # LinearRegression.fit returns the estimator itself, so fitting and
    # returning collapse into a single expression.
    return LinearRegression().fit(X, y)
# End your code
# Basic settings. DO NOT MODIFY
selected_feature = []
sel_num = 100
valid_split = 1/5
cv = ShuffleSplit(n_splits=5, test_size=valid_split, random_state=0)
selected_train_error = []
selected_valid_error = []

# Greedy forward selection: on each pass, add the single remaining feature
# whose inclusion yields the lowest mean cross-validation error.
for sel in range(sel_num):
    # float('inf') sentinels replace the magic "+1000" cap, which would
    # silently pick feature 0 if every candidate's error exceeded 1000.
    min_train_error = float('inf')
    min_valid_error = float('inf')
    min_feature = 0
    # For each candidate feature
    for i in range(X_dev.shape[1]):
        train_error_ith = []
        valid_error_ith = []
        # Skip features already chosen — selected_feature must stay duplicate-free.
        if i in selected_feature:
            continue
        # Candidate design matrix: current selection plus feature i.
        X_dev_fs = X_dev[:, selected_feature + [i]]
        # Cross validation over the shuffled splits (split only needs row count,
        # so splitting on X_dev and indexing X_dev_fs is consistent).
        for train_index, test_index in cv.split(X_dev):
            X_train, X_valid = X_dev_fs[train_index], X_dev_fs[test_index]
            y_train, y_valid = y_dev[train_index], y_dev[test_index]
            # Fit on the training fold, record MSE on both folds
            # (documented argument order: y_true first).
            reg = LinReg(X_train, y_train)
            train_error_ith.append(mean_squared_error(y_train, reg.predict(X_train)))
            valid_error_ith.append(mean_squared_error(y_valid, reg.predict(X_valid)))
        # Keep the feature with the minimum mean cross-validation error.
        t_e_avg = sum(train_error_ith) / len(train_error_ith)
        v_e_avg = sum(valid_error_ith) / len(valid_error_ith)
        if v_e_avg < min_valid_error:
            min_feature = i
            min_train_error = t_e_avg
            min_valid_error = v_e_avg
    print('='*50)
    print("# of selected feature(s) : {}".format(sel+1))
    print("Selected feature of this iteration : {}".format(min_feature))
    selected_feature.append(min_feature)
    selected_train_error.append(min_train_error)
    selected_valid_error.append(min_valid_error)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Select optimal feature set corresponding to the minimum cross validation error
# Your code here
# selected_valid_error[k] is the CV error obtained with the first k+1 selected
# features, so the optimal prefix must INCLUDE the feature at the argmin index
# (original sliced [0:index], dropping one feature, and referenced an undefined
# name `optimal_feature`).
index = selected_valid_error.index(min(selected_valid_error))
optimal_feature = selected_feature[0:index + 1]
selected_feature = optimal_feature
X_dev_fs = X_dev[:, optimal_feature]
X_test_fs = X_test[:, optimal_feature]
# End your code
# Basic settings. DO NOT MODIFY
min_train_error = 1000
min_valid_error = 1000
optimal_param = np.array([])
# Guard: ensure test_error exists even if no fold beats the 1000 sentinel,
# so the final print cannot raise NameError.
test_error = float('nan')
for train_index, test_index in cv.split(X_dev):
    X_train, X_valid = X_dev_fs[train_index], X_dev_fs[test_index]
    y_train, y_valid = y_dev[train_index], y_dev[test_index]
    # Derive training error, validation error for each fold and compare the
    # fold's validation error against the best seen so far.
    # Your code here
    reg = LinReg(X_train, y_train)
    train_error = mean_squared_error(y_train, reg.predict(X_train))
    valid_error = mean_squared_error(y_valid, reg.predict(X_valid))
    if valid_error < min_valid_error:
        min_train_error = train_error
        min_valid_error = valid_error
        optimal_param = reg.coef_
        # Test error is re-evaluated whenever a better fold model is found,
        # so the last assignment belongs to the overall best model.
        test_error = mean_squared_error(y_test, reg.predict(X_test_fs))
    # End your code
# Report features of final model
print("Results")
print("# of selected features : {}".format(len(selected_feature)))
print("Selected features : ")
print(selected_feature)
# Report test error and accuracy
print("Training error : {}".format(min_train_error))
print("Validation error : {}".format(min_valid_error))
print("Test error : {}".format(test_error))
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Please use below function
# logreg = LogisticRegression(C=coef, solver='lbfgs', max_iter=500)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss

coefs = [0.01, 0.05, 0.1, 0.5, 1, 10, 100]
opt_coef = 1
# To plot losses on training and validation sets with varied parameter settings,
# save them on lists.
loss_tr, loss_va = [], []
# Your code here
# Sweep the regularization strengths; for each C, fit on the training set and
# record the log-loss on both the training and validation sets.
for c in coefs:
    model = LogisticRegression(C=c, solver='lbfgs', max_iter=500)
    model.fit(train, Y_tr)
    train_probs = model.predict_proba(train)
    valid_probs = model.predict_proba(valid)
    loss_tr.append(log_loss(Y_tr, train_probs))
    loss_va.append(log_loss(Y_va, valid_probs))
# End your code
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Your code here
from sklearn.metrics import accuracy_score

# Refit with the chosen regularization strength and score on the test set.
coef = 0.1
final_model = LogisticRegression(C=coef, solver='lbfgs', max_iter=500)
final_model.fit(train, Y_tr)
test_probs = final_model.predict_proba(test)
test_loss = log_loss(Y_te, test_probs)
predictions = final_model.predict(test)
# accuracy_score is symmetric in its two label arguments, so the documented
# (y_true, y_pred) order yields the identical value.
test_acc = accuracy_score(Y_te, predictions)
# End your code
# print regularization parameter of final model and report test loss and accuracy
print("Optimal : {}, Loss : {:2.3f}, Accuracy : {:3.2f}".format(coef, test_loss, test_acc*100))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment