Skip to content

Instantly share code, notes, and snippets.

@seahrh
Created July 5, 2020 04:30
Show Gist options
  • Save seahrh/f81dbefa0e00f8a2082691d339b65506 to your computer and use it in GitHub Desktop.
Save seahrh/f81dbefa0e00f8a2082691d339b65506 to your computer and use it in GitHub Desktop.
Simple linear regression example (sklearn, L1 regularization)
import os
import random
import numpy as np
import pandas as pd
from sklearn.linear_model import Lasso
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.preprocessing import RobustScaler
from sklearn.decomposition import PCA
from sklearn import metrics
# Number of cross-validation folds used by the grid search further down.
folds = 3
# Name given to the label column (the column after the features in train.csv).
target = 'target'
# Both files are read with header=None, so the 500 feature columns get
# synthetic names p1..p500.
names = [f'p{i + 1}' for i in range(500)]
# train.csv: feature columns followed by the target; test.csv: features only.
train = pd.read_csv('train.csv', header=None, names=names + [target])
test = pd.read_csv('test.csv', header=None, names=names)
# Print a summary of each frame as a quick sanity check of the load.
train.info()
test.info()
# Features and label are used as-is, without preprocessing. Two alternatives
# were tried and dropped because the model got worse with each of them:
#   - standardization: RobustScaler fit on the train features, applied to test
#   - dimensionality reduction: PCA(n_components=20) fit on the train features
x_train, y_train = train[names], train[target]
x_test = test[names]
# Lasso: linear regression with an L1 penalty on the coefficients.
model = Lasso()
# One-step pipeline; the step name 'model' is the prefix of the grid keys below.
pipe = Pipeline(steps=[('model', model)])
# Search space: a single alpha and two iteration caps.
param_grid = dict(
    model__alpha=[1.0],
    model__max_iter=[1000, 2000],
)
# Candidates are scored by negated mean absolute error over `folds` CV splits.
cv = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring='neg_mean_absolute_error',
    cv=folds,
)
cv.fit(x_train, y_train)
# Report the winning parameter combination and its cross-validated score
# (negated MAE, per the scoring configured on the search).
print(f'best_params_={cv.best_params_!r}\nbest_score_={cv.best_score_!r}')
# Predict on the test features with the fitted search object.
preds = cv.predict(x_test)
submission = pd.DataFrame({target: preds})
print(submission.head())
# One prediction per line: no index column and no header row.
# header=False is the documented way to suppress the header (None only
# worked because it is falsy).
submission.to_csv('prediction.csv', index=False, header=False)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment