Last active
June 1, 2022 01:46
-
-
Save rkdgusrn1212/05d80e47ad52c0bc57f5c06cb37cbc52 to your computer and use it in GitHub Desktop.
One-Hot encoding
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the low-cardinality categorical columns of X_train / X_valid
# and join the result with the remaining (non-object) columns.
# NOTE(review): X_train, X_valid and object_cols are not defined in this
# snippet; they are assumed to be two DataFrames and a list of categorical
# column names created earlier -- confirm against the surrounding code.

# Only encode columns that will not be split into too many dimensions.
low_cardinality_cols = [col for col in object_cols if X_train[col].nunique() < 10]

# The dense-output keyword was renamed between scikit-learn releases
# (`sparse` -> `sparse_output`), so keep the default sparse output and
# densify explicitly with .toarray(); this works regardless of the version.
encoder = OneHotEncoder(handle_unknown='ignore')
encoded_train = encoder.fit_transform(X_train[low_cardinality_cols]).toarray()
encoded_valid = encoder.transform(X_valid[low_cardinality_cols]).toarray()

# Name each encoded column after the category it represents. Without this the
# columns get ascending integer names, which triggers a warning when a model
# is fitted on the frame.
feature_names = encoder.get_feature_names_out()

# Encoding discards the original row index, so restore it here; otherwise the
# concat below would not line the rows up with the source frames.
OH_cols_train = pd.DataFrame(encoded_train, index=X_train.index, columns=feature_names)
OH_cols_valid = pd.DataFrame(encoded_valid, index=X_valid.index, columns=feature_names)

# Drop every object column (including the high-cardinality ones that were
# deliberately not encoded).
num_X_train = X_train.drop(object_cols, axis=1)
num_X_valid = X_valid.drop(object_cols, axis=1)

# Append the encoded columns to the remaining columns.
OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment