Skip to content

Instantly share code, notes, and snippets.

@gabfssilva
Last active September 17, 2023 00:12
Show Gist options
  • Save gabfssilva/56876ecfe8cc29211f647b5d60e4443c to your computer and use it in GitHub Desktop.
Save gabfssilva/56876ecfe8cc29211f647b5d60e4443c to your computer and use it in GitHub Desktop.
Playing around with world data
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.metrics import accuracy_score, precision_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA
import seaborn as sns
def load_data(filepath):
    """Read the world-data CSV at *filepath* into a DataFrame."""
    frame = pd.read_csv(filepath)
    return frame
def exploratory_analysis(df, *, show=False):
    """Print summary statistics and render a seaborn pairplot of *df*.

    The original version had an unconditional ``return`` that turned the
    whole body into dead code ("to avoid plotting stuff all the time").
    That behaviour is preserved as the default: with ``show=False`` the
    function is a no-op, so existing callers are unaffected; pass
    ``show=True`` to actually run the analysis.
    """
    if not show:
        return
    print(df.describe())
    print(df.info())
    sns.pairplot(df)
    plt.show()
def preprocess_data(df):
    """Clean the raw world-data frame and derive the GDP target column.

    Steps:
    - Parse the 'GDP' column from strings like ``$1,234`` into floats.
    - Forward-fill remaining missing values.
    - Bucket GDP into three categories (the classification target).
    - One-hot encode all non-numeric columns, keeping the raw
      'GDP_category' column alongside its dummy columns.

    Note: mutates *df* in place before returning the encoded copy.
    """
    # regex=False: '$' must be a literal character. As a regex it anchors
    # end-of-string and strips nothing, which breaks the float() cast on
    # pandas versions where regex=True was the default.
    df['GDP'] = (
        df['GDP']
        .str.replace('$', '', regex=False)
        .str.replace(',', '', regex=False)
        .astype(float)
    )
    # ffill() replaces the deprecated fillna(method='ffill').
    df.ffill(inplace=True)
    bins = [0, 1e10, 1e11, float('inf')]
    labels = ['baixo', 'médio', 'alto']
    df['GDP_category'] = pd.cut(df['GDP'], bins=bins, labels=labels)
    # get_dummies replaces the categorical column with one-hot columns;
    # keep a copy so the raw label survives as the model target.
    category_copy = df['GDP_category'].copy()
    hot_encoded = pd.get_dummies(df, drop_first=False)
    hot_encoded['GDP_category'] = category_copy
    return hot_encoded
def dimensionality_reduction(X_train, X_test):
    """Project both splits onto the fewest PCA components that together
    explain more than 90% of the training variance.

    Returns the transformed (X_train, X_test) pair.
    """
    # Probe fit: learn the full spectrum of explained variance first.
    probe = PCA()
    probe.fit(X_train)
    cumulative = np.cumsum(probe.explained_variance_ratio_)
    # Index of the first component whose cumulative ratio crosses 0.9,
    # converted to a component count (+1 for the 0-based index).
    n_components = int(np.where(cumulative > 0.9)[0][0]) + 1
    # Refit with the chosen dimensionality and transform both splits.
    reducer = PCA(n_components=n_components)
    reduced_train = reducer.fit_transform(X_train)
    reduced_test = reducer.transform(X_test)
    return reduced_train, reduced_test
def choose_models():
    """Instantiate the candidate classifiers to compare."""
    candidates = [
        DecisionTreeClassifier(),
        GaussianNB(),
        SVC(),
        KNeighborsClassifier(n_neighbors=10),
    ]
    return candidates
def train_model(model, X_train, y_train):
    """Fit *model* on the training split and return it for chaining."""
    model.fit(X_train, y_train)
    return model
def evaluate_model(model, X_test, y_test):
    """Score a fitted classifier on the held-out set.

    Returns a (accuracy, precision) pair. Precision uses a weighted
    average to handle the multiclass target; zero_division=1 avoids
    warnings for classes that are never predicted.
    """
    predictions = model.predict(X_test)
    acc = accuracy_score(y_test, predictions)
    prec = precision_score(y_test, predictions, average='weighted', zero_division=1)
    return acc, prec
def main():
    # Pipeline: load -> (skipped) EDA -> preprocess -> split -> train &
    # evaluate four classifiers, then repeat after PCA reduction.
    df = load_data('world-data-2023.csv')
    exploratory_analysis(df)
    df = preprocess_data(df)
    # Features exclude raw GDP and every encoding of the target
    # (the plain category plus its three one-hot columns) so the
    # label does not leak into the inputs.
    X_train, X_test, y_train, y_test = train_test_split(
        df.drop(['GDP', 'GDP_category', 'GDP_category_baixo', 'GDP_category_médio', 'GDP_category_alto'], axis=1),
        df['GDP_category'].copy(), test_size=0.1, random_state=42)
    # Plain numpy arrays for the estimators.
    X_train = X_train.values
    X_test = X_test.values
    y_train = y_train.values
    y_test = y_test.values
    models = choose_models()
    # Order matches the list returned by choose_models().
    model_names = ["Decision Tree", "Naive Bayes", "SVM", "KNN"]
    # Baseline: full feature set.
    for model, name in zip(models, model_names):
        trained_model = train_model(model, X_train, y_train)
        accuracy, precision = evaluate_model(trained_model, X_test, y_test)
        print(f"Accuracy & precision {name}: {accuracy} -- {precision}")
    # Same comparison on PCA-reduced features (>90% variance kept).
    # NOTE(review): models are re-fit in place on the reduced data.
    X_train, X_test = dimensionality_reduction(X_train, X_test)
    for model, name in zip(models, model_names):
        trained_model = train_model(model, X_train, y_train)
        accuracy, precision = evaluate_model(trained_model, X_test, y_test)
        print(f"[PCA] Accuracy & precision {name}: {accuracy} -- {precision}")


if __name__ == "__main__":
    main()
@gabfssilva
Copy link
Author

Accuracy & precision Decision Tree: 0.8 -- 0.825
Accuracy & precision Naive Bayes: 0.35 -- 0.36666666666666664
Accuracy & precision SVM: 0.7 -- 0.8125
Accuracy & precision KNN: 0.85 -- 0.8846153846153847

# Results with PCA retaining >90% of explained variance:

[PCA] Accuracy & precision Decision Tree: 0.65 -- 0.6404761904761904
[PCA] Accuracy & precision Naive Bayes: 0.5 -- 0.6060606060606061
[PCA] Accuracy & precision SVM: 0.8 -- 0.8571428571428571
[PCA] Accuracy & precision KNN: 0.85 -- 0.8846153846153847

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment