Created
January 6, 2023 15:54
-
-
Save marr75/9a9e7fa2e542b8313e899807943391cb to your computer and use it in GitHub Desktop.
Quick vectorized approach to calculating metrics at every potential breakpoint for a classification problem. Nice way to visualize precision, recall, and f1 in continuous graphs if you're introducing unsupervised machine learning or talking about very shallow decision trees.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def evaluate_threshold_binary_classification(x: pd.Series, y: pd.Series, reverse=False):
    """Calculate confusion-matrix counts and derived quality metrics (True Positive,
    True Negative, False Positive, False Negative, Precision, Recall, and F1) for
    every possible threshold of a continuous predictor x against a binary class y.

    x: the independent variable used as a predictor, a continuous variable
    y: the dependent variable representing the class being predicted, 0/1 or bool
       (must be boolean-typed so `~` inverts it correctly)
    reverse: whether the x and y series have an inverse relationship

    Returns a DataFrame sorted so that each row represents the cut point
    "predict positive for this row and every row sorted before it".
    """
    predictor = pd.DataFrame(
        {
            # Sort descending in x by default (multiply by -1) so rows most likely
            # to be positive come first; reverse=True flips that for inversely
            # related predictors. `1 if reverse else -1` replaces the fragile
            # `1 and reverse or -1` bool-arithmetic idiom.
            'discriminant': x * (1 if reverse else -1),
            'class': y,
        },
        # Derive the index from the input, not from an outer-scope global,
        # so the function works for any aligned x/y pair.
        index=x.index,
    ).sort_values('discriminant')
    break_point_evaluation = predictor.assign(
        # Predict positive for all rows up to the sorted discriminant (0..n-1)
        predicted_positive=predictor['class'].reset_index().index,
        # Predict negative for all other rows, `count - predicted_true`
        predicted_negative=lambda df: (df.predicted_positive * -1) + predictor['class'].count(),
        # All previous 1/True values will be true positives, `cumulative_sum`
        true_positive=predictor['class'].cumsum(),
        # All previous 0/False values will be false positives, `~cumulative_sum`
        false_positive=(~predictor['class']).cumsum(),
        # Reversing the sort to find true values from the negative predictions,
        # `predicted_negative - reverse_cumulative_sum` (subtraction aligns on index)
        true_negative=lambda df: df.predicted_negative - predictor.sort_values('discriminant', ascending=False)['class'].cumsum(),
        # The balance of the negative predictions are false negatives
        false_negative=lambda df: df.predicted_negative - df.true_negative,
        # assign() evaluates keyword arguments in order, so precision/recall
        # are already available when f1 is computed below
        precision=lambda df: df.true_positive / (df.true_positive + df.false_positive),
        recall=lambda df: df.true_positive / (df.true_positive + df.false_negative),
        f1=lambda df: (2 * df.precision * df.recall) / (df.precision + df.recall),
        # Restore the un-negated discriminant value (aligned by index)
        discriminant=x,
    )
    return break_point_evaluation
# Evaluate PC0 as a threshold predictor of the boolean 'ERS U/R' class.
predictor_series = principal_components.PC0
evaluation = evaluate_threshold_binary_classification(
    predictor_series,
    normalized_gdf['ERS U/R'].astype(bool),
)

# Confusion-matrix counts as continuous curves over the discriminant.
count_columns = ['discriminant', 'true_positive', 'true_negative', 'false_positive', 'false_negative']
evaluation[count_columns].plot('discriminant')

# Derived quality metrics (precision/recall/f1) over the same axis.
metric_columns = ['discriminant', 'precision', 'recall', 'f1']
evaluation[metric_columns].plot('discriminant')

# Report the predictor value at the row with the highest f1 score.
best_threshold = predictor_series[evaluation['f1'].idxmax()]
print(
    f"Best performing threshold was: {best_threshold} for {predictor_series.name}."
)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment