# Gist ryanpadilha/e95249727cf2f8516c26e019d83590e2
# Created April 3, 2025 by @ryanpadilha
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from google.colab import drive
drive.mount('/content/drive')
# First approach (Problem 1)
# Define the dataset column names
columns = [
    "age", "workclass", "fnlwgt", "education", "education-num", "marital-status",
    "occupation", "relationship", "race", "sex", "capital-gain", "capital-loss",
    "hours-per-week", "native-country", "income"
]
# Load the data
df_train = pd.read_csv("/content/drive/MyDrive/mex-data/adult/adult.data", names=columns, sep=r",\s*", engine="python")
df_test = pd.read_csv("/content/drive/MyDrive/mex-data/adult/adult.test", names=columns, sep=r",\s*", engine="python", skiprows=1)
# Strip extra whitespace from string columns
df_train = df_train.apply(lambda x: x.str.strip() if x.dtype == "object" else x)
df_test = df_test.apply(lambda x: x.str.strip() if x.dtype == "object" else x)
# Replace '?' values with NaN and drop rows with missing values
df_train.replace("?", np.nan, inplace=True)
df_test.replace("?", np.nan, inplace=True)
df_train.dropna(inplace=True)
df_test.dropna(inplace=True)
# Convert the income variable to numeric
df_train["income"] = df_train["income"].map({">50K": 1, "<=50K": 0})
df_test["income"] = df_test["income"].map({">50K.": 1, "<=50K.": 0})  # the test set labels end with a trailing "."
# Transform categorical variables into numeric ones (one-hot encoding)
df_train = pd.get_dummies(df_train, columns=["workclass", "education", "marital-status",
                                             "occupation", "relationship", "race",
                                             "sex", "native-country"], drop_first=True)
df_test = pd.get_dummies(df_test, columns=["workclass", "education", "marital-status",
                                           "occupation", "relationship", "race",
                                           "sex", "native-country"], drop_first=True)
# Ensure both datasets have the same columns
missing_cols = set(df_train.columns) - set(df_test.columns)
for col in missing_cols:
    df_test[col] = 0  # add missing columns filled with 0
# Ensure the columns are in the same order
df_test = df_test[df_train.columns]
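# Note (sketch, not part of the original gist): pandas can do the same column alignment in one
# step, adding any missing dummy columns with 0 and reordering to match the training frame:
# df_test = df_test.reindex(columns=df_train.columns, fill_value=0)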
# Separate predictor and target variables
X_train = df_train.drop("income", axis=1)
y_train = df_train["income"]
X_test = df_test.drop("income", axis=1)
y_test = df_test["income"]
# Split the training data into training and validation sets (80% train, 20% validation)
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
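# Note (sketch, not part of the original gist): passing stratify=y_train would keep the
# >50K / <=50K proportions identical in the train and validation splits, which can matter
# for an imbalanced target such as income:
# X_train, X_valid, y_train, y_valid = train_test_split(
#     X_train, y_train, test_size=0.2, random_state=42, stratify=y_train)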
# Create and train the Decision Tree model
tree_model = DecisionTreeClassifier(random_state=42, max_depth=10, criterion="gini")
tree_model.fit(X_train, y_train)
# Make predictions on the validation set
y_pred_valid = tree_model.predict(X_valid)
# Evaluate the model on the validation set
accuracy_valid = accuracy_score(y_valid, y_pred_valid)
print(f"Decision Tree accuracy on the validation set: {accuracy_valid:.2f}")
# Make predictions on the test set
y_pred_test = tree_model.predict(X_test)
# Evaluate the model on the test set
accuracy_test = accuracy_score(y_test, y_pred_test)
print(f"Decision Tree accuracy on the test set: {accuracy_test:.2f}")
# Display the classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred_test))
# Confusion matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_test))
# New processing approach -- Problem 2
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from google.colab import drive
drive.mount('/content/drive')
# Define the dataset column names
columns = [
    "age", "workclass", "fnlwgt", "education", "education-num", "marital-status",
    "occupation", "relationship", "race", "sex", "capital-gain", "capital-loss",
    "hours-per-week", "native-country", "income"
]
# Load the data
df_train = pd.read_csv("/content/drive/MyDrive/mex-data/adult/adult.data", names=columns, sep=r",\s*", engine="python")
df_test = pd.read_csv("/content/drive/MyDrive/mex-data/adult/adult.test", names=columns, sep=r",\s*", engine="python", skiprows=1)
# Strip extra whitespace from string columns
df_train = df_train.apply(lambda x: x.str.strip() if x.dtype == "object" else x)
df_test = df_test.apply(lambda x: x.str.strip() if x.dtype == "object" else x)
# Replace '?' values with NaN and drop rows with missing values
df_train.replace("?", np.nan, inplace=True)
df_test.replace("?", np.nan, inplace=True)
df_train.dropna(inplace=True)
df_test.dropna(inplace=True)
# Convert the income variable to numeric
df_train["income"] = df_train["income"].map({">50K": 1, "<=50K": 0})
df_test["income"] = df_test["income"].map({">50K.": 1, "<=50K.": 0})  # the test set labels end with a trailing "."
# Transform categorical variables into numeric ones (one-hot encoding)
df_train = pd.get_dummies(df_train, columns=["workclass", "education", "marital-status",
                                             "occupation", "relationship", "race",
                                             "sex", "native-country"], drop_first=True)
df_test = pd.get_dummies(df_test, columns=["workclass", "education", "marital-status",
                                           "occupation", "relationship", "race",
                                           "sex", "native-country"], drop_first=True)
# Ensure both datasets have the same columns
missing_cols = set(df_train.columns) - set(df_test.columns)
for col in missing_cols:
    df_test[col] = 0  # add missing columns filled with 0
# Ensure the columns are in the same order
df_test = df_test[df_train.columns]
# Separate predictor and target variables
X_train = df_train.drop("income", axis=1)
y_train = df_train["income"]
X_test = df_test.drop("income", axis=1)
y_test = df_test["income"]
# Scale the data for models that are sensitive to feature scale (SVM and neural networks)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
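# Note (sketch, not part of the original gist): an alternative is to bundle the scaler with each
# scale-sensitive estimator in a scikit-learn Pipeline, so fit/predict apply scaling internally:
# from sklearn.pipeline import make_pipeline
# svm_pipeline = make_pipeline(StandardScaler(), SVC(kernel="rbf", C=1.0))
# svm_pipeline.fit(X_train, y_train)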
# Create the models
models = {
    "Decision Tree": DecisionTreeClassifier(random_state=42, max_depth=10),
    "Random Forest": RandomForestClassifier(random_state=42, n_estimators=100),
    "SVM": SVC(kernel="rbf", C=1.0),
    "Neural Network": MLPClassifier(hidden_layer_sizes=(50, 50), max_iter=500, random_state=42)
}
# Train and evaluate each model
results = {}
for name, model in models.items():
    print(f"\nTraining {name}...")
    if name in ["SVM", "Neural Network"]:
        model.fit(X_train_scaled, y_train)  # models that require scaled inputs
        y_pred = model.predict(X_test_scaled)
    else:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    results[name] = accuracy
    print(f"{name} accuracy: {accuracy:.2f}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
# Model comparison
print("\nAccuracy Summary:")
for model, acc in results.items():
    print(f"{model}: {acc:.2f}")