Last active
June 14, 2022 09:23
-
-
Save rkdgusrn1212/1e49bd912042c2bf74b647b68f5dd64d to your computer and use it in GitHub Desktop.
Machine Learning : M-Estimate
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Scan every column of `df` and report the categorical ones that are good
# candidates for target encoding.
for col in df.columns:
    column = df[col]
    # The more categories a feature has, the more it benefits from target
    # encoding; only object-dtype columns with more than 10 distinct values
    # are reported.
    if column.dtype == "object" and column.nunique() > 10:
        print(col)
        # Features containing rare categories (fewer than 5 occurrences)
        # should be encoded with smoothing. The comparison must be applied
        # to the counts *before* reducing with any(); comparing the boolean
        # result of any() against 5 is always true.
        if (column.value_counts() < 5).any():
            print("required smoothing")
from category_encoders import MEstimateEncoder

# Separate the data into an encoding split and a training split so the
# encoder is never fitted on rows it will later transform.

# Encoding split: 20% of the rows, used only to fit the encoder.
X_encode = df.sample(frac=0.20, random_state=0)
y_encode = X_encode.pop("target")

# Training split: every row that was not sampled into the encoding split.
# It is derived from `df` in the same way as the encoding split, so the
# features and the target stay aligned on the same index.
X_pretrain = df.drop(X_encode.index)
y_train = X_pretrain.pop("target")

# Build the encoder for the chosen feature; `m` is the smoothing parameter
# of the M-estimate.
encoder = MEstimateEncoder(cols=["feature_1"], m=5.0)

# Fit on the encoding split.
encoder.fit(X_encode, y_encode)

# Transform the training split with the fitted encoder.
X_train = encoder.transform(X_pretrain)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment