This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import java.io.BufferedReader; | |
import java.io.IOException; | |
import java.io.InputStreamReader; | |
import java.util.Stack; | |
/** | |
* 피연산자 여러개 연산 가능합니다. | |
* | |
* ***예제1*** | |
* 수식을 한 줄에 입력해주세요. |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Scan every column for the two conditions that motivate target encoding.
for col in df.columns:
    # High-cardinality categorical feature: the more categories a feature has,
    # the more it benefits from target encoding.
    if df[col].dtype == "object" and df[col].nunique() > 10:
        print(col)
        # Features containing rare categories (fewer than 5 rows) should be
        # encoded with smoothing. The counts must be compared first and then
        # reduced with any(); `value_counts().any() < 5` compared a bool with
        # 5 and was therefore always true.
        # NOTE(review): nesting this check under the cardinality check is
        # assumed (the original indentation was lost) -- confirm.
        if (df[col].value_counts() < 5).any():
            print("required smoothing")
# Separate the encoding split from the training split.
# Encoding split: a reproducible 20% sample of the rows, used to fit the encoder.
X_encode = df.sample(random_state=0, frac=0.20)
# pop() removes the "target" column from X_encode and returns it, giving the
# target values that go with the encoding split.
y_encode = X_encode.pop("target")
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sklearn.decomposition import PCA | |
# Standardize every column (subtract its mean, divide by its standard
# deviation) before running PCA.
X = (X - X.mean(axis=0)) / X.std(axis=0)
pca = PCA()
X_pca = pca.fit_transform(X)
# X_pca is a 2-D ndarray: rows = samples (the index of X), columns = principal components.
# pca.components_ is a 2-D array: rows = principal components, columns = features.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sklearn.cluster import KMeans | |
# Standardize the `features` columns to mean 0 and standard deviation 1.
X_scaled = X.loc[:, features]  # all rows, only the columns used for clustering
X_scaled = (X_scaled - X_scaled.mean(axis=0)) / X_scaled.std(axis=0)  # per-column mean and std
# n_clusters is the number of clusters; n_init is how many times the algorithm
# is run with different random centroids, of which the best clustering is kept.
kmeans = KMeans(n_clusters=10, n_init=10)
# Store each row's cluster label back on the training data.
X["Cluster"] = kmeans.fit_predict(X_scaled)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Categorical x numerical interaction: one-hot encode the categorical feature,
# then multiply every indicator column row-wise by the numerical feature.
df_new = pd.get_dummies(df.cat_feat, prefix="catxnum_feat").mul(df.num_feat, axis=0)
# Count, for each row, how many of the listed features are greater than 0.
df_new = pd.DataFrame()
df_new["count"] = df[["feat_1", "feat_2"]].gt(0.0).sum(axis=1)
# Split a categorical feature whose values consist of three "_"-separated
# parts into three separate features (n=2 limits the split to two cuts).
# NOTE(review): assumes at least one value contains two or more "_"; otherwise
# the split yields fewer than three columns -- confirm against the data.
df_new = pd.DataFrame()
df_new[["cat_feat_1", "cat_feat_2", "cat_feat_3"]] = df.cat_feat.str.split("_", n=2, expand=True)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import matplotlib.pyplot as plt | |
import numpy as np | |
import pandas as pd | |
import seaborn as sns | |
from sklearn.feature_selection import mutual_info_regression | |
# Set Matplotlib defaults
# Matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8-*" and
# later releases dropped the old names, so use whichever name is available.
plt.style.use(
    "seaborn-v0_8-whitegrid"
    if "seaborn-v0_8-whitegrid" in plt.style.available
    else "seaborn-whitegrid"
)
plt.rc("figure", autolayout=True)
plt.rc( |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sklearn.model_selection import cross_val_score | |
from sklearn.pipeline import Pipeline | |
# A larger MAE means worse performance, so the scorer returns the MAE negated
# (higher score = better model); multiply the scores by -1 to recover the MAE.
scores = cross_val_score(my_pipeline, X, y, cv=5, scoring='neg_mean_absolute_error')
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sklearn.compose import ColumnTransformer | |
from sklearn.pipeline import Pipeline | |
from sklearn.impute import SimpleImputer | |
from sklearn.preprocessing import OneHotEncoder | |
from sklearn.ensemble import RandomForestRegressor | |
from sklearn.model_selection import train_test_split | |
# Read the train and test sets, using the 'Id' column as the row index.
train_data = pd.read_csv('train.csv', index_col='Id')
test_data = pd.read_csv('test.csv', index_col='Id')
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
from sklearn.preprocessing import OneHotEncoder | |
# Only one-hot encode columns with few distinct values, so the encoding does
# not explode into too many dimensions.
low_cardinality_cols = [col for col in object_cols if X_train[col].nunique() < 10]
# scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the old
# name; try the new keyword first and fall back for older releases.
try:
    encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
# Fit on the training data only, then reuse the fitted encoder for validation.
# NOTE(review): both frames get a fresh RangeIndex; realign them with
# X_train.index / X_valid.index before concatenating with the other columns.
OH_cols_train = pd.DataFrame(encoder.fit_transform(X_train[low_cardinality_cols]))
OH_cols_valid = pd.DataFrame(encoder.transform(X_valid[low_cardinality_cols]))
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
from sklearn.preprocessing import OrdinalEncoder | |
# Categorical columns
object_cols = [col for col in train_data.columns if train_data[col].dtype == "object"]
# Columns whose set of values in valid_data is a subset of the values seen in
# train_data; this is a necessary condition for reusing an encoder that was
# fitted on train_data.
good_cols = [col for col in object_cols if set(valid_data[col]).issubset(set(train_data[col]))]
ordinal_encoder = OrdinalEncoder()
NewerOlder