Created
January 4, 2022 23:06
-
-
Save cnmoro/7ba9541af975dbb4fe1eb08658f97641 to your computer and use it in GitHub Desktop.
SKLearn Snippets
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# CLUSTERING
# Davies-Bouldin index -> lower is better when choosing K
# Per-cluster descriptive statistics of the features
df.groupby("cluster").describe()
centroids = kmeans.cluster_centers_
# BUG FIX: original did `max = centroids[0]` and `range(max)`, which passes an
# array to range() (TypeError) and shadows the builtin `max`. The intent is to
# iterate over the feature columns of the centroid matrix.
n_features = centroids.shape[1]
for i in range(n_features):
    # Variance of each feature across the centroids
    print(df.columns.values[i], "{:.4f}".format(centroids[:, i].var()))
# The features with the highest variance across centroids are the ones
# that differentiate the clusters the most
# Example: feature IDADE
df.groupby("cluster")["IDADE"].describe()
# Analyze clusters separately for each feature of interest.
# NOTE: column selection must be a list; tuple indexing
# (["IDADE", "X", "Y", "Z"] without inner brackets) is removed in modern pandas.
description = df.groupby("cluster")[["IDADE", "X", "Y", "Z"]]
num_instances = description.size()
description = description.mean()
description['n_instances'] = num_instances
print(description)
##############################################
# CROSS-VALIDATION (K-FOLD VALIDATION)
# Evaluate the model with 5-fold cross-validation and report the mean test score.
from sklearn.model_selection import cross_validate
results = cross_validate(model, X, y, cv=5, return_train_score=False)
avg_result = results['test_score'].mean()
print(avg_result)
# With randomness: shuffle the rows before splitting into folds.
SEED = 301
np.random.seed(SEED)
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_validate
cv = KFold(n_splits=5, shuffle=True)
results = cross_validate(model, X, y, cv=cv, return_train_score=False)
avg_result = results['test_score'].mean()
print(avg_result)
# With randomness, while preserving the label proportions in every fold.
SEED = 301
np.random.seed(SEED)
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_validate
cv = StratifiedKFold(n_splits=5, shuffle=True)
results = cross_validate(model, X, y, cv=cv, return_train_score=False)
avg_result = results['test_score'].mean()
print(avg_result)
# Pipeline creation: chain scaling + estimator so the scaler is re-fit
# inside each CV fold (avoids leaking test-fold statistics into training).
# BUG FIX: StandardScaler and SVC were used without being imported anywhere
# in this file; also repaired the garbled header comment.
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold, cross_validate
SEED = 301
np.random.seed(SEED)
scaler = StandardScaler()
modelo = SVC()
pipeline = Pipeline([
    ('transformacao', scaler),
    ('estimador', modelo)
])
cv = StratifiedKFold(n_splits=5, shuffle=True)
results = cross_validate(pipeline, X, y, cv=cv, return_train_score=False)
avg_result = results['test_score'].mean()
print(avg_result)
# GridSearchCV: exhaustive search over every hyper-parameter combination.
# BUG FIX: DecisionTreeClassifier and pd were used but never imported in
# this file, which would raise NameError.
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
import pandas as pd
SEED = 301
np.random.seed(SEED)
parameter_grid = {
    "max_depth": [3, 5],
    "min_samples_split": [32, 64, 128],
    "min_samples_leaf": [32, 64, 128],
    "criterion": ["gini", "entropy"]
}
busca = GridSearchCV(DecisionTreeClassifier(),
                     parameter_grid,
                     cv=StratifiedKFold(n_splits=10))
busca.fit(X, y)
print(busca.best_params_)
print(busca.best_score_ * 100)  # mean CV accuracy of the best combination, as %
resultados = pd.DataFrame(busca.cv_results_)
resultados.head()
# RandomizedSearchCV: samples random parameter combinations instead of
# exhaustively trying them all (header previously mislabeled this GridSearchCV).
# BUG FIX: DecisionTreeClassifier, StratifiedKFold and pd were used but
# never imported in this file, which would raise NameError.
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
import pandas as pd
SEED = 301
np.random.seed(SEED)
parameter_grid = {
    "max_depth": [3, 5],  # random distributions/lists of values can be used here
    "min_samples_split": [32, 64, 128],
    "min_samples_leaf": [32, 64, 128],
    "criterion": ["gini", "entropy"]
}
busca = RandomizedSearchCV(DecisionTreeClassifier(),
                           parameter_grid,
                           cv=StratifiedKFold(n_splits=10))
busca.fit(X, y)
print(busca.best_params_)
print(busca.best_score_ * 100)
resultados = pd.DataFrame(busca.cv_results_)
resultados.head()
####################################################
# Multiclass OneVsRest classifier: fits one binary model per label.
from sklearn.multiclass import OneVsRestClassifier
# BUG FIX: sklearn has no `LogisticRegressionClassifier`; the correct
# class name is `LogisticRegression` (the import would raise ImportError).
from sklearn.linear_model import LogisticRegression
reg_log = LogisticRegression()
classificador_onevsrest = OneVsRestClassifier(reg_log)
# y must be a binary indicator matrix (one column per label)
import numpy as np
y = np.asarray(y)
classificador_onevsrest.fit(X, y)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment