This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def catboost_target_encoder(train, test, cols_encode, target): | |
train_new = train.copy() | |
test_new = test.copy() | |
for column in cols_encode: | |
global_mean = train[target].mean() | |
cumulative_sum = train.groupby(column)[target].cumsum() - train[target] | |
cumulative_count = train.groupby(column).cumcount() | |
train_new[column + "_cat_mean_target"] = cumulative_sum/cumulative_count | |
train_new[column + "_cat_mean_target"].fillna(global_mean, inplace=True) | |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sklearn.model_selection import KFold | |
def target_encoder_kfold(train_data, test_data, cols_encode, target, folds=10): | |
""" | |
Mean regularized target encoding based on kfold | |
""" | |
kf = KFold(n_splits=folds, random_state=1) | |
for col in cols_encode: | |
global_mean = train_data[target].mean() | |
for train_index, test_index in kf.split(train_data): |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Simulate 10,000 samples with 7 features, all drawn uniformly from [0, 1].
# Only the first 5 features enter the Friedman #1 target; the remaining 2 are
# independent of y (uninformative). noise=0.0 adds no noise to the target.
X, y = make_friedman1(n_samples=10000, n_features=7, noise=0.0, random_state=11)
# Hold out 20% of the samples for validation (fixed seed for reproducibility).
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.20, random_state=42)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import lightgbm | |
# ********* Sklearn API **********
# Default LightGBM regressor built through the scikit-learn wrapper.
gbm = lightgbm.LGBMRegressor()
# Swap the default "regression" objective for the custom asymmetric objective
# and request MSE and MAE as additional evaluation metrics.
# NOTE(review): custom_asymmetric_train must already be defined when this runs.
gbm.set_params(objective=custom_asymmetric_train, metrics=["mse", "mae"])
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def custom_asymmetric_train(y_true, y_pred):
    """Return the gradient and Hessian of an asymmetric squared-error loss.

    The loss is the squared residual ``(y_true - y_pred) ** 2``, multiplied
    by 10 wherever the residual is negative (i.e. the prediction is above
    the true value).

    Args:
        y_true: Array-like of true target values.
        y_pred: Array-like of predictions, same shape as ``y_true``.

    Returns:
        A ``(grad, hess)`` tuple of float arrays: the first and second
        derivatives of the loss with respect to ``y_pred``.

    Note:
        Keep the signature at exactly two parameters. LightGBM's sklearn
        wrapper chooses how to call a custom objective from the callable's
        parameter count, so an extra parameter (even with a default) would
        receive the sample weights.
    """
    residual = (np.asarray(y_true) - np.asarray(y_pred)).astype("float")
    over_predicted = residual < 0
    # d/dy_pred of residual**2 is -2 * residual; scaled by 10 when over-predicting.
    grad = np.where(over_predicted, -2 * 10.0 * residual, -2 * residual)
    hess = np.where(over_predicted, 2 * 10.0, 2.0)
    return grad, hess
def custom_asymmetric_valid(y_true, y_pred):
    """Evaluate the asymmetric squared-error loss as a validation metric.

    Squared residuals ``(y_true - y_pred) ** 2`` are multiplied by 10
    wherever the residual is negative, then averaged.

    Args:
        y_true: Array-like of true target values.
        y_pred: Array-like of predictions, same shape as ``y_true``.

    Returns:
        A ``(name, value, is_higher_better)`` tuple: the fixed metric name
        ``"custom_asymmetric_eval"``, the mean loss, and ``False`` because
        lower values are better.

    Note:
        Keep the signature at exactly two parameters; LightGBM's sklearn
        wrapper chooses the call form from the callable's parameter count.
    """
    residual = (np.asarray(y_true) - np.asarray(y_pred)).astype("float")
    squared = residual ** 2
    loss = np.where(residual < 0, squared * 10.0, squared)
    return "custom_asymmetric_eval", np.mean(loss), False
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# huber loss | |
def huber(true, pred, delta):
    """Return the total Huber loss between targets and predictions.

    Errors with absolute value below ``delta`` contribute ``0.5 * err**2``;
    larger errors contribute ``delta * |err| - 0.5 * delta**2``. The two
    branches agree at ``|err| == delta``.

    Args:
        true: Array-like of true target values.
        pred: Array-like of predictions, same shape as ``true``.
        delta: Threshold between the quadratic and linear regimes.

    Returns:
        The sum (not the mean) of the per-element losses.
    """
    abs_error = np.abs(np.asarray(true) - np.asarray(pred))
    quadratic = 0.5 * abs_error ** 2
    linear = delta * abs_error - 0.5 * delta ** 2
    return np.sum(np.where(abs_error < delta, quadratic, linear))
# log cosh loss | |
def logcosh(true, pred):
    """Return the total log-cosh loss between targets and predictions.

    Computes ``sum(log(cosh(pred - true)))`` using the identity
    ``log(cosh(x)) = |x| + log1p(exp(-2|x|)) - log(2)``, which stays finite
    for large errors where ``cosh`` itself would overflow to ``inf``.

    Args:
        true: Array-like of true target values.
        pred: Array-like of predictions, same shape as ``true``.

    Returns:
        The sum (not the mean) of the per-element losses.
    """
    abs_error = np.abs(np.asarray(pred, dtype=float) - np.asarray(true, dtype=float))
    # exp(-2|x|) is in (0, 1], so neither exp nor log1p can overflow here.
    loss = abs_error + np.log1p(np.exp(-2.0 * abs_error)) - np.log(2.0)
    return np.sum(loss)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# true: Array of true target variable | |
# pred: Array of predictions | |
def mse(true, pred):
    """Return the sum of squared errors between targets and predictions.

    Note that, despite the name, the result is the sum over all elements,
    not the mean.

    Args:
        true: Array-like of true target values.
        pred: Array-like of predictions, same shape as ``true``.
    """
    error = np.asarray(true) - np.asarray(pred)
    return np.sum(error ** 2)
def mae(true, pred):
    """Return the sum of absolute errors between targets and predictions.

    Note that, despite the name, the result is the sum over all elements,
    not the mean.

    Args:
        true: Array-like of true target values.
        pred: Array-like of predictions, same shape as ``true``.
    """
    error = np.asarray(true) - np.asarray(pred)
    return np.sum(np.abs(error))
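# sklearn.metrics offers mean_squared_error and mean_absolute_error; note that
# those return the mean, whereas mse/mae above return the sum.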
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Features are every column except the target; the target is cast to float32.
x = ratings.drop(['rating'], axis=1)
y = ratings['rating'].astype(np.float32)
# 'userId' and 'movieId' are passed as a column list and 64 as the final
# argument — presumably the categorical fields and the batch size; TODO confirm
# against ColumnarModelData.from_data_frame.
data = ColumnarModelData.from_data_frame(path, val_indx, x, y, ['userId', 'movieId'], 64)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# n_users: number of unique users (671); n_movies: number of unique movies (9066)
model = EmbeddingNet(n_users, n_movies)
# Adam over all model parameters (the weights updated by back-propagation):
# learning rate 1e-3, weight decay 1e-5.
opt = optim.Adam(model.parameters(), 1e-3, weight_decay=1e-5)
# Fit the model with F.mse_loss as the loss function; the literal 3 is
# presumably the number of epochs — TODO confirm against fit().
fit(model, data, 3, opt, F.mse_loss)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# nh = dimension of hidden linear layer | |
# p1 = dropout1 | |
# p2 = dropout2 | |
class EmbeddingNet(nn.Module): | |
def __init__(self, n_users, _n_movies, nh = 10, p1 = 0.05, p2= 0.5): | |
super().__init__() | |
(self.u, self.m, self.ub, self.mb) = [get_emb(*o) for o in [ | |
(n_users, n_factors), (n_movies, n_factors), | |
(n_users,1), (n_movies,1) |