Created May 23, 2018 23:42
Reliability diagram code
import torch
import numpy as np
from matplotlib import pyplot as plt


def make_model_diagrams(outputs, labels, n_bins=10):
    """
    outputs - a torch tensor (size n x num_classes) with the outputs from the final linear layer
            - NOT the softmaxes
    labels - a torch tensor (size n) with the labels
    """
    softmaxes = torch.nn.functional.softmax(outputs, 1)
    confidences, predictions = softmaxes.max(1)
    accuracies = torch.eq(predictions, labels)

    f, rel_ax = plt.subplots(figsize=(4, 2.5))

    # Reliability diagram
    bins = torch.linspace(0, 1, n_bins + 1)
    bins[-1] = 1.0001  # so confidences of exactly 1.0 land in the last bin
    width = bins[1] - bins[0]
    bin_indices = [confidences.ge(bin_lower) * confidences.lt(bin_upper)
                   for bin_lower, bin_upper in zip(bins[:-1], bins[1:])]
    # NOTE: accuracies is a bool tensor, so torch.mean raises on it in newer
    # PyTorch versions -- see the discussion and the fixed version below
    bin_corrects = [torch.mean(accuracies[bin_index]) for bin_index in bin_indices]
    bin_scores = [torch.mean(confidences[bin_index]) for bin_index in bin_indices]

    # NOTE: bin_corrects and bin_scores are Python lists here, so the .numpy()
    # calls and the subtraction below fail -- see the fixed version below
    confs = rel_ax.bar(bins[:-1], bin_corrects.numpy(), width=width)
    gaps = rel_ax.bar(bins[:-1], (bin_scores - bin_corrects).numpy(), bottom=bin_corrects.numpy(),
                      color=[1, 0.7, 0.7], alpha=0.5, width=width, hatch='//', edgecolor='r')
    rel_ax.plot([0, 1], [0, 1], '--', color='gray')
    rel_ax.legend([confs, gaps], ['Outputs', 'Gap'], loc='best', fontsize='small')

    # Clean up
    rel_ax.set_ylabel('Accuracy')
    rel_ax.set_xlabel('Confidence')
    f.tight_layout()
    return f
accuracies[bin_index].float()
should create a floating-point tensor, so I'm confused as to how this line could be causing the error.
Hi guys, I just tried this code (including the modification from @rsilveira79).
Everything went pretty smoothly; however, I found that I was unable to plot the gap between confidence and accuracy.
In my case, the problem was that bin_corrects cannot be passed directly as the bottom parameter of plt.bar().
I made the following modification, which solves the problem for me; you may also want to try it if you are facing the same issue
(with minor style changes):
def make_model_diagrams(outputs, labels, n_bins=10):
    """
    outputs - a torch tensor (size n x num_classes) with the outputs from the final linear layer
            - NOT the softmaxes
    labels - a torch tensor (size n) with the labels
    """
    softmaxes = torch.nn.functional.softmax(outputs, 1)
    confidences, predictions = softmaxes.max(1)
    accuracies = torch.eq(predictions, labels)
    overall_accuracy = (predictions == labels).sum().item() / len(labels)

    # Reliability diagram
    bins = torch.linspace(0, 1, n_bins + 1)
    width = 1.0 / n_bins
    bin_centers = np.linspace(0, 1.0 - width, n_bins) + width / 2
    bin_indices = [confidences.ge(bin_lower) * confidences.lt(bin_upper)
                   for bin_lower, bin_upper in zip(bins[:-1], bins[1:])]

    # .item() moves each scalar to the CPU, so this also works for CUDA tensors
    bin_corrects = np.array([torch.mean(accuracies[bin_index].float()).item() for bin_index in bin_indices])
    bin_scores = np.array([torch.mean(confidences[bin_index]).item() for bin_index in bin_indices])
    # Empty bins produce NaN means; treat them as zero
    bin_corrects = np.nan_to_num(bin_corrects)
    bin_scores = np.nan_to_num(bin_scores)

    plt.figure(0, figsize=(8, 8))
    gap = bin_scores - bin_corrects
    confs = plt.bar(bin_centers, bin_corrects, color=[0, 0, 1], width=width, ec='black')
    gaps = plt.bar(bin_centers, gap, bottom=bin_corrects, color=[1, 0.7, 0.7],
                   alpha=0.5, width=width, hatch='//', edgecolor='r')
    plt.plot([0, 1], [0, 1], '--', color='gray')
    plt.legend([confs, gaps], ['Accuracy', 'Gap'], loc='upper left', fontsize='x-large')

    ece = _calculate_ece(outputs, labels)

    # Clean up
    bbox_props = dict(boxstyle="square", fc="lightgrey", ec="gray", lw=1.5)
    plt.text(0.17, 0.82, "ECE: {:.4f}".format(ece), ha="center", va="center",
             size=20, weight='normal', bbox=bbox_props)

    plt.title("Reliability Diagram", size=22)
    plt.ylabel("Accuracy", size=18)
    plt.xlabel("Confidence", size=18)
    plt.xlim(0, 1)
    plt.ylim(0, 1)
    plt.savefig('reliability_diagram.png')
    plt.show()
    return ece
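For anyone who wants to try it quickly, here is a minimal sketch of how the function might be called (the random logits and labels are just placeholders for real model outputs, and _calculate_ece below must also be defined):

    import torch

    # Placeholder inputs -- substitute the logits and labels from your own model
    logits = torch.randn(1000, 10)           # n = 1000 samples, 10 classes
    labels = torch.randint(0, 10, (1000,))   # ground-truth class indices
    ece = make_model_diagrams(logits, labels, n_bins=10)
    print("ECE: {:.4f}".format(ece))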
Using the unchanged ECE calculation method provided by @rsilveira79.
import torch.nn.functional as F  # needed for F.softmax below


def _calculate_ece(logits, labels, n_bins=10):
    """
    Calculates the Expected Calibration Error of a model.
    (This isn't necessary for temperature scaling, just a cool metric.)

    The input to this loss is the logits of a model, NOT the softmax scores.

    This divides the confidence outputs into equally-sized interval bins.
    In each bin, we compute the confidence gap:

        bin_gap = | avg_confidence_in_bin - accuracy_in_bin |

    We then return a weighted average of the gaps, based on the number
    of samples in each bin.

    See: Naeini, Mahdi Pakdaman, Gregory F. Cooper, and Milos Hauskrecht.
    "Obtaining Well Calibrated Probabilities Using Bayesian Binning." AAAI.
    2015.
    """
    bin_boundaries = torch.linspace(0, 1, n_bins + 1)
    bin_lowers = bin_boundaries[:-1]
    bin_uppers = bin_boundaries[1:]

    softmaxes = F.softmax(logits, dim=1)
    confidences, predictions = torch.max(softmaxes, 1)
    accuracies = predictions.eq(labels)

    ece = torch.zeros(1, device=logits.device)
    for bin_lower, bin_upper in zip(bin_lowers, bin_uppers):
        # Calculate |confidence - accuracy| in each bin
        in_bin = confidences.gt(bin_lower.item()) * confidences.le(bin_upper.item())
        prop_in_bin = in_bin.float().mean()
        if prop_in_bin.item() > 0:
            accuracy_in_bin = accuracies[in_bin].float().mean()
            avg_confidence_in_bin = confidences[in_bin].mean()
            ece += torch.abs(avg_confidence_in_bin - accuracy_in_bin) * prop_in_bin
    return ece.item()
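As a quick sanity check (my own toy example, not from the gist): a model that is always correct and essentially 100% confident should give an ECE close to zero.

    import torch

    labels = torch.randint(0, 10, (1000,))
    logits = torch.full((1000, 10), -10.0)
    logits[torch.arange(1000), labels] = 10.0  # ~1.0 confidence on the true class
    print(_calculate_ece(logits, labels))      # should print a value near 0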
The result of

    accuracies = torch.eq(predictions, labels)

is a boolean tensor. How, in the line

    bin_corrects = np.array([torch.mean(accuracies[bin_index].float()) for bin_index in bin_indices])

are we calculating the mean of a boolean tensor? I am getting this specific error:

    RuntimeError: Can only calculate the mean of floating types. Got Bool instead.

Do you have any idea how I should resolve that? Thank you.
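For what it's worth, torch.mean is only defined for floating-point dtypes, so a boolean tensor has to be cast before averaging. A minimal reproduction, assuming a recent PyTorch version:

    import torch

    accuracies = torch.eq(torch.tensor([1, 2, 3]), torch.tensor([1, 2, 0]))
    print(accuracies.dtype)           # torch.bool
    # torch.mean(accuracies)          # RuntimeError: Can only calculate the mean of floating types
    print(accuracies.float().mean())  # tensor(0.6667)

Since the quoted line already includes .float(), it may be worth checking that the code actually being run applies the cast before torch.mean rather than after.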