Skip to content

Instantly share code, notes, and snippets.

@Brideau
Last active December 12, 2021 23:53
Show Gist options
  • Save Brideau/15ec54668ea15cb0eada554bae7d6f13 to your computer and use it in GitHub Desktop.
Save Brideau/15ec54668ea15cb0eada554bae7d6f13 to your computer and use it in GitHub Desktop.
import faker as f
import numpy as np
from numpy.random import default_rng
from pandas import DataFrame
from scipy.stats import truncnorm
from sklearn.metrics import average_precision_score
# Shared Faker instance; generate_potential_bad_actor uses it to produce the
# fake names and ASCII e-mail addresses.
fake_bad_actor_generator = f.Faker()
# NumPy random Generator created without a seed, so the binomial draws made
# with it differ from run to run.
rng = default_rng()
def generate_potential_bad_actor(position, population_size=1000):
    """
    Generate one random 'potential bad actor' record.

    The details of this function don't really matter. It just generates a
    random 'potential bad actor' and labels it with whether it was actually
    determined to be fraudulent, and how likely the model thought they were
    fraudulent. Both values are driven by
    ``(population_size - position) / population_size``, so records at earlier
    positions are more likely to be fraudulent and can receive higher scores.

    Parameters
    ----------
    position : int
        Rank of the record in the ordered list. Must satisfy
        ``0 <= position < population_size``.
    population_size : int, optional
        Length of the ordered list being generated. Defaults to 1000.

    Returns
    -------
    dict
        A record with the keys ``"name"``, ``"email"``, ``"is_fraudulent"``
        (a single Bernoulli draw, 0 or 1) and ``"probability_of_fraudulent"``
        (a truncated-normal draw rounded to two decimals).

    Raises
    ------
    ValueError
        If ``position`` lies outside ``[0, population_size)``.
    """
    # Outside this range the binomial probability leaves [0, 1] or the
    # truncated normal's upper bound no longer exceeds its lower bound, so
    # fail early with a readable message.
    if not 0 <= position < population_size:
        raise ValueError(
            "position must satisfy 0 <= position < population_size."
        )

    size = float(population_size)
    # Shared by both draws: 1.0 at position 0, shrinking towards 0 as the
    # position approaches the end of the list.
    fraud_likelihood = (size - position) / size

    return {
        "name": fake_bad_actor_generator.name(),
        "email": fake_bad_actor_generator.ascii_email(),
        "is_fraudulent": rng.binomial(1, fraud_likelihood),
        # Standard normal truncated to [0, fraud_likelihood].
        "probability_of_fraudulent": round(
            truncnorm.rvs(a=0.0, b=fraud_likelihood), 2
        ),
    }
# One synthetic record per rank 0..999.
ordered_fraud_list = list(map(generate_potential_bad_actor, np.arange(1000)))
# Sampling with frac=1 returns every row in random order, i.e. a shuffle.
ordered_fraud_df = DataFrame(data=ordered_fraud_list).sample(frac=1)

# Calculate Average Precision
average_precision_score(
    ordered_fraud_df["is_fraudulent"],
    ordered_fraud_df["probability_of_fraudulent"],
)
def precision_at_k(y_true, y_score, k, pos_label=1):
    """
    Compute precision at k.

    This is the fraction of the ``k`` highest-scored samples whose true label
    equals ``pos_label``. The denominator is always ``k``, even when fewer
    than ``k`` samples are supplied.

    Parameters
    ----------
    y_true : array-like
        True labels; must be a binary target.
    y_score : array-like
        Scores used to rank the samples, higher meaning more likely positive.
        Must have the same length as ``y_true``.
    k : int
        Number of top-ranked samples to evaluate. Must be at least 1.
    pos_label : optional
        Label in ``y_true`` that counts as positive. Defaults to 1.

    Returns
    -------
    float
        Number of positives among the top ``k`` samples, divided by ``k``.

    Raises
    ------
    ValueError
        If ``k`` is less than 1, if ``y_true`` is not binary, or if
        ``y_true`` and ``y_score`` differ in length.
    """
    # k == 0 would divide by zero and a negative k would slice from the wrong
    # end of the ranking, so reject both before doing any work.
    if k < 1:
        raise ValueError("k must be a positive integer.")

    from sklearn.utils import column_or_1d
    from sklearn.utils.multiclass import type_of_target

    if type_of_target(y_true) != "binary":
        raise ValueError("y_true must be a binary column.")

    # Makes this compatible with various array types
    y_true_arr = column_or_1d(y_true)
    y_score_arr = column_or_1d(y_score)
    if y_true_arr.shape[0] != y_score_arr.shape[0]:
        raise ValueError("y_true and y_score must have the same length.")

    is_positive = y_true_arr == pos_label

    # A stable sort keeps the ranking deterministic when scores are tied;
    # reversing the ascending order puts the highest scores first.
    desc_sort_order = np.argsort(y_score_arr, kind="stable")[::-1]
    true_positives = is_positive[desc_sort_order[:k]].sum()
    return true_positives / k
# Precision among the 40 highest-scored records of the shuffled data set.
precision_at_k(
    ordered_fraud_df["is_fraudulent"],
    ordered_fraud_df["probability_of_fraudulent"],
    k=40,
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment