Created
April 3, 2025 22:01
-
-
Save ryanpadilha/e95249727cf2f8516c26e019d83590e2 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np
import pandas as pd
from google.colab import drive
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
# Mount Google Drive so the dataset files under /content/drive are readable.
drive.mount('/content/drive')

# First approach: train one decision tree, check it on a validation split
# carved out of the training file, then score it on the separate test file.

# Column names, passed explicitly to read_csv for both files.
columns = [
    "age", "workclass", "fnlwgt", "education", "education-num", "marital-status",
    "occupation", "relationship", "race", "sex", "capital-gain", "capital-loss",
    "hours-per-week", "native-country", "income",
]

# Columns that get one-hot encoded; shared by train and test so the two
# encodings cannot drift apart.
categorical_columns = [
    "workclass", "education", "marital-status", "occupation",
    "relationship", "race", "sex", "native-country",
]

# Load the data. The separator is a regex (comma plus optional whitespace),
# which requires the python engine; it is a raw string so that "\s" is not
# treated as an (invalid) string escape.
df_train = pd.read_csv(
    "/content/drive/MyDrive/mex-data/adult/adult.data",
    names=columns, sep=r",\s*", engine="python",
)
# The first line of the test file is skipped
# (presumably a non-data banner line -- verify against the file).
df_test = pd.read_csv(
    "/content/drive/MyDrive/mex-data/adult/adult.test",
    names=columns, sep=r",\s*", engine="python", skiprows=1,
)

# Strip surrounding whitespace from every string (object-dtype) column.
df_train = df_train.apply(lambda x: x.str.strip() if x.dtype == "object" else x)
df_test = df_test.apply(lambda x: x.str.strip() if x.dtype == "object" else x)

# Treat "?" as a missing value and drop every row that has one.
df_train.replace("?", np.nan, inplace=True)
df_test.replace("?", np.nan, inplace=True)
df_train.dropna(inplace=True)
df_test.dropna(inplace=True)

# Convert the income label to 0/1. The test labels carry a trailing "."
# (">50K."), so it is stripped first and both files share one mapping; this
# also keeps working if the test file is supplied without the trailing period.
income_labels = {">50K": 1, "<=50K": 0}
df_train["income"] = df_train["income"].map(income_labels)
df_test["income"] = df_test["income"].str.rstrip(".").map(income_labels)

# One-hot encode the categorical variables.
# NOTE(review): drop_first is applied to each frame independently, so the
# dropped baseline category is only the same in both if their first category
# matches -- confirm for this dataset.
df_train = pd.get_dummies(df_train, columns=categorical_columns, drop_first=True)
df_test = pd.get_dummies(df_test, columns=categorical_columns, drop_first=True)

# Give the test frame exactly the training columns, in the same order:
# columns missing from the test frame are added filled with 0, and columns
# that exist only in the test frame are dropped.
df_test = df_test.reindex(columns=df_train.columns, fill_value=0)

# Separate predictors and target.
X_train = df_train.drop("income", axis=1)
y_train = df_train["income"]
X_test = df_test.drop("income", axis=1)
y_test = df_test["income"]

# Split the training data into train and validation (80% / 20%).
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

# Create and fit the decision tree model.
tree_model = DecisionTreeClassifier(random_state=42, max_depth=10, criterion="gini")
tree_model.fit(X_train, y_train)

# Predict on and evaluate against the validation set.
y_pred_valid = tree_model.predict(X_valid)
accuracy_valid = accuracy_score(y_valid, y_pred_valid)
print(f"Acurácia da Árvore de Decisão no conjunto de validação: {accuracy_valid:.2f}")

# Predict on and evaluate against the test set.
y_pred_test = tree_model.predict(X_test)
accuracy_test = accuracy_score(y_test, y_pred_test)
print(f"Acurácia da Árvore de Decisão no conjunto de teste: {accuracy_test:.2f}")

# Classification report for the test set.
print("\nRelatório de Classificação:")
print(classification_report(y_test, y_pred_test))

# Confusion matrix for the test set.
print("\nMatriz de Confusão:")
print(confusion_matrix(y_test, y_pred_test))
# New processing approach -- Problem 2
import pandas as pd | |
import numpy as np | |
from sklearn.model_selection import train_test_split | |
from sklearn.preprocessing import StandardScaler | |
from sklearn.tree import DecisionTreeClassifier | |
from sklearn.ensemble import RandomForestClassifier | |
from sklearn.svm import SVC | |
from sklearn.neural_network import MLPClassifier | |
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix | |
from google.colab import drive | |
# Mount Google Drive so the dataset files under /content/drive are readable.
drive.mount('/content/drive')

# Column names, passed explicitly to read_csv for both files.
columns = [
    "age", "workclass", "fnlwgt", "education", "education-num", "marital-status",
    "occupation", "relationship", "race", "sex", "capital-gain", "capital-loss",
    "hours-per-week", "native-country", "income",
]

# Columns that get one-hot encoded; shared by train and test so the two
# encodings cannot drift apart.
categorical_columns = [
    "workclass", "education", "marital-status", "occupation",
    "relationship", "race", "sex", "native-country",
]

# Load the data. The separator is a regex (comma plus optional whitespace),
# which requires the python engine; it is a raw string so that "\s" is not
# treated as an (invalid) string escape.
df_train = pd.read_csv(
    "/content/drive/MyDrive/mex-data/adult/adult.data",
    names=columns, sep=r",\s*", engine="python",
)
# The first line of the test file is skipped
# (presumably a non-data banner line -- verify against the file).
df_test = pd.read_csv(
    "/content/drive/MyDrive/mex-data/adult/adult.test",
    names=columns, sep=r",\s*", engine="python", skiprows=1,
)

# Strip surrounding whitespace from every string (object-dtype) column.
df_train = df_train.apply(lambda x: x.str.strip() if x.dtype == "object" else x)
df_test = df_test.apply(lambda x: x.str.strip() if x.dtype == "object" else x)

# Treat "?" as a missing value and drop every row that has one.
df_train.replace("?", np.nan, inplace=True)
df_test.replace("?", np.nan, inplace=True)
df_train.dropna(inplace=True)
df_test.dropna(inplace=True)

# Convert the income label to 0/1. The test labels carry a trailing "."
# (">50K."), so it is stripped first and both files share one mapping; this
# also keeps working if the test file is supplied without the trailing period.
income_labels = {">50K": 1, "<=50K": 0}
df_train["income"] = df_train["income"].map(income_labels)
df_test["income"] = df_test["income"].str.rstrip(".").map(income_labels)

# One-hot encode the categorical variables.
# NOTE(review): drop_first is applied to each frame independently, so the
# dropped baseline category is only the same in both if their first category
# matches -- confirm for this dataset.
df_train = pd.get_dummies(df_train, columns=categorical_columns, drop_first=True)
df_test = pd.get_dummies(df_test, columns=categorical_columns, drop_first=True)

# Give the test frame exactly the training columns, in the same order:
# columns missing from the test frame are added filled with 0, and columns
# that exist only in the test frame are dropped.
df_test = df_test.reindex(columns=df_train.columns, fill_value=0)

# Separate predictors and target.
X_train = df_train.drop("income", axis=1)
y_train = df_train["income"]
X_test = df_test.drop("income", axis=1)
y_test = df_test["income"]

# Standardize the features for the scale-sensitive models (SVM and neural
# network). The scaler is fitted on the training data only and then applied
# to the test data.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Models to compare, keyed by the display name used in the printed output.
models = {
    "Árvore de Decisão": DecisionTreeClassifier(random_state=42, max_depth=10),
    "Random Forest": RandomForestClassifier(random_state=42, n_estimators=100),
    "SVM": SVC(kernel="rbf", C=1.0),
    "Rede Neural": MLPClassifier(hidden_layer_sizes=(50, 50), max_iter=500, random_state=42),
}

# Train and evaluate each model on the test set, recording its accuracy.
results = {}
for name, model in models.items():
    print(f"\nTreinando {name}...")
    if name in ["SVM", "Rede Neural"]:
        # These models are trained and evaluated on the standardized features.
        model.fit(X_train_scaled, y_train)
        y_pred = model.predict(X_test_scaled)
    else:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    results[name] = accuracy
    print(f"Acurácia de {name}: {accuracy:.2f}")
    print("\nRelatório de Classificação:")
    print(classification_report(y_test, y_pred))
    print("\nMatriz de Confusão:")
    print(confusion_matrix(y_test, y_pred))

# Side-by-side accuracy summary. A separate loop variable is used so the
# `model` estimator bound by the training loop is not overwritten by a string.
print("\nResumo das Acurácias:")
for model_name, acc in results.items():
    print(f"{model_name}: {acc:.2f}")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment