Created
November 20, 2019 08:38
-
-
Save Kingkha/be61a3fde837a6a208d6dae5d8ea1709 to your computer and use it in GitHub Desktop.
Calculate roc_auc_score from two numpy ndarrays.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Source: http://ethen8181.github.io/machine-learning/model_selection/auc/auc.html#Implementation
def _binary_clf_curve(y_true, y_score): | |
""" | |
Calculate true and false positives per binary classification | |
threshold (can be used for roc curve or precision/recall curve); | |
the calcuation makes the assumption that the positive case | |
will always be labeled as 1 | |
Parameters | |
---------- | |
y_true : 1d ndarray, shape = [n_samples] | |
True targets/labels of binary classification | |
y_score : 1d ndarray, shape = [n_samples] | |
Estimated probabilities or scores | |
Returns | |
------- | |
tps : 1d ndarray | |
True positives counts, index i records the number | |
of positive samples that got assigned a | |
score >= thresholds[i]. | |
The total number of positive samples is equal to | |
tps[-1] (thus false negatives are given by tps[-1] - tps) | |
fps : 1d ndarray | |
False positives counts, index i records the number | |
of negative samples that got assigned a | |
score >= thresholds[i]. | |
The total number of negative samples is equal to | |
fps[-1] (thus true negatives are given by fps[-1] - fps) | |
thresholds : 1d ndarray | |
Predicted score sorted in decreasing order | |
References | |
---------- | |
Github: scikit-learn _binary_clf_curve | |
- https://github.com/scikit-learn/scikit-learn/blob/ab93d65/sklearn/metrics/ranking.py#L263 | |
""" | |
# sort predicted scores in descending order | |
# and also reorder corresponding truth values | |
desc_score_indices = np.argsort(y_score)[::-1] | |
y_score = y_score[desc_score_indices] | |
y_true = y_true[desc_score_indices] | |
# y_score typically consists of tied values. Here we extract | |
# the indices associated with the distinct values. We also | |
# concatenate a value for the end of the curve | |
distinct_indices = np.where(np.diff(y_score))[0] | |
end = np.array([y_true.size - 1]) | |
threshold_indices = np.hstack((distinct_indices, end)) | |
thresholds = y_score[threshold_indices] | |
tps = np.cumsum(y_true)[threshold_indices] | |
# (1 + threshold_indices) = the number of positives | |
# at each index, thus number of data points minus true | |
# positives = false positives | |
fps = (1 + threshold_indices) - tps | |
return tps, fps, thresholds | |
def _roc_auc_score(y_true, y_score):
    """
    Compute Area Under the ROC Curve (AUC) from prediction scores.

    Parameters
    ----------
    y_true : 1d ndarray, shape = [n_samples]
        True targets/labels of binary classification

    y_score : 1d ndarray, shape = [n_samples]
        Estimated probabilities or scores

    Returns
    -------
    auc : float

    Raises
    ------
    ValueError
        If y_true does not contain exactly two distinct classes.
    """
    # ensure the target is binary
    if np.unique(y_true).size != 2:
        raise ValueError('Only two classes should be present in y_true. ROC AUC score '
                         'is not defined in that case.')

    tps, fps, _ = _binary_clf_curve(y_true, y_score)

    # Convert counts to rates and prepend the (0, 0) origin of the ROC
    # curve. Without the origin the curve starts at the first threshold,
    # which silently drops area whenever that threshold already has
    # tps > 0 and fps > 0 — e.g. with all scores tied the old code
    # returned 0.0 where the correct (sklearn) answer is 0.5.
    tpr = np.hstack(([0.0], tps / tps[-1]))
    fpr = np.hstack(([0.0], fps / fps[-1]))

    # trapezoidal rule: sum of interval width times mean segment height
    widths = np.diff(fpr)
    heights = (tpr[1:] + tpr[:-1]) / 2.0
    auc = np.dot(widths, heights)
    return auc
if __name__ == '__main__':
    # numpy was never imported anywhere in this file, so running the
    # script died with NameError before any computation; importing here
    # (module level, inside the guard) binds `np` in the module globals
    # that the functions above resolve at call time
    import numpy as np

    # sanity check against the classic sklearn doc example (expected: 0.75)
    y_true = np.array([0, 0, 1, 1])
    y_scores = np.array([0.1, 0.4, 0.35, 0.8])
    print(_roc_auc_score(y_true, y_scores))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment