Skip to content

Instantly share code, notes, and snippets.

@audhiaprilliant
Created December 24, 2020 02:55
Show Gist options
  • Select an option

  • Save audhiaprilliant/e1864f5225991645be7afbd4056c9f3a to your computer and use it in GitHub Desktop.

Select an option

Save audhiaprilliant/e1864f5225991645be7afbd4056c9f3a to your computer and use it in GitHub Desktop.
How to choose the optimal threshold for imbalanced classification
# Import module for data manipulation
import pandas as pd
# Import module for linear algebra
import numpy as np
# Import module for data simulation
from sklearn.datasets import make_classification # Create a synthetic dataframe
from sklearn.linear_model import LogisticRegression # Classification model
from sklearn.model_selection import train_test_split # Split the dataframe
from sklearn.metrics import roc_curve # Calculate the ROC curve
from sklearn.metrics import precision_recall_curve # Calculate the Precision-Recall curve
from sklearn.metrics import f1_score # Calculate the F-score
# Import module for data visualization
from plotnine import *
import plotnine
# Build a highly imbalanced synthetic dataset: 10k samples, 2 informative
# features, ~99% negatives (weights=[0.99]) and no label noise (flip_y=0).
X, y = make_classification(
    n_samples=10000,
    n_features=2,
    n_redundant=0,
    n_clusters_per_class=1,
    weights=[0.99],
    flip_y=0,
    random_state=0,
)

# Hold out half the data for evaluation; stratify so both splits keep the
# same class ratio (essential with such a rare positive class).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=0, stratify=y
)

# Train a plain logistic-regression classifier on the training split.
reglogModel = LogisticRegression(random_state=0)
reglogModel.fit(X_train, y_train)

# Score the test split and keep only the positive-class probability
# (column 1 of predict_proba) for downstream threshold analysis.
y_pred = reglogModel.predict_proba(X_test)[:, 1]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment