Skip to content

Instantly share code, notes, and snippets.

@e-mon
Last active December 25, 2020 05:51
Show Gist options
  • Save e-mon/12aac08c4546f2981a804c515d4b3ff5 to your computer and use it in GitHub Desktop.
Save e-mon/12aac08c4546f2981a804c515d4b3ff5 to your computer and use it in GitHub Desktop.
def qwk(a1, a2, max_rat=3):
    """
    Quadratic Weighted Kappa (QWK) between two integer rating vectors.

    Source: https://www.kaggle.com/c/data-science-bowl-2019/discussion/114133#latest-660168

    :param a1: first rating sequence, integer values in [0, max_rat]
    :param a2: second rating sequence, same length as a1
    :param max_rat: maximum rating value (default 3, i.e. 4 classes);
        previously hardcoded even though the docstring advertised it
    :return: QWK score, 1.0 for perfect agreement
    :raises ValueError: on empty input or ratings outside [0, max_rat]
    """
    a1 = np.asarray(a1, dtype=int)
    a2 = np.asarray(a2, dtype=int)
    if a1.shape[0] == 0:
        raise ValueError("qwk: input arrays must be non-empty")
    # The original indexed histograms directly, so negative ratings silently
    # wrapped around (corrupting counts) and too-large ones raised IndexError.
    if a1.min() < 0 or a2.min() < 0 or a1.max() > max_rat or a2.max() > max_rat:
        raise ValueError("qwk: ratings must lie in [0, %d]" % max_rat)
    n = a1.shape[0]
    n_cls = max_rat + 1
    # Class histograms, vectorized (replaces the per-element Python loop).
    hist1 = np.bincount(a1, minlength=n_cls)
    hist2 = np.bincount(a2, minlength=n_cls)
    # Observed total squared disagreement.
    o = float(np.sum((a1 - a2) ** 2))
    # Expected squared disagreement under independence of the two raters.
    levels = np.arange(n_cls)
    w = (levels[:, None] - levels[None, :]) ** 2
    e = float((hist1[:, None] * hist2[None, :] * w).sum()) / n
    if e == 0:
        # e == 0 implies both vectors are the same single constant class,
        # which forces o == 0 as well: perfect agreement, not 0/0 -> nan.
        return 1.0
    return 1 - o / e
class OptimizedRounderWithHistgram(object):
    """
    Rounder whose thresholds make the predicted label distribution match
    the label histogram observed during fit (histogram matching).
    """

    def __init__(self):
        self.coef_ = None  # thresholds computed by the last predict() call
        self.dist = None   # label -> relative frequency, filled by fit()

    def fit(self, X, y):
        """Record the relative frequency of each label in y (X is unused)."""
        total = y.shape[0]
        self.dist = Counter({label: cnt / total
                             for label, cnt in Counter(y).items()})

    def classify(self, bound, x):
        """Map x to the first bin whose upper bound it does not exceed (else 3)."""
        for label, upper in enumerate(bound):
            if x <= upper:
                return label
        return 3

    def predict(self, preds):
        """Threshold preds at the percentiles implied by the fitted distribution."""
        bound = np.zeros(shape=(3,))
        cum = 0
        for label in range(3):
            cum += self.dist[label]
            bound[label] = np.percentile(preds, cum * 100)
        self.coef_ = bound
        return np.array([self.classify(bound, p) for p in preds])

    def coefficients(self):
        """Return the thresholds from the most recent predict() call."""
        return self.coef_
def to_bins(x, borders):
    """Return the index of the first border that x does not exceed,
    or len(borders) when x is above every border.

    Note: a linear scan (not bisect) on purpose — callers may pass
    borders that are not sorted.
    """
    for idx, upper in enumerate(borders):
        if x <= upper:
            return idx
    return len(borders)
class OptimizedRounder(object):
    """
    An optimizer for rounding thresholds
    to maximize Quadratic Weighted Kappa (QWK) score.
    Runs a per-coordinate golden-section search over three thresholds.
    # https://www.kaggle.com/naveenasaithambi/optimizedrounder-improved
    """

    def __init__(self):
        self.coef_ = 0  # populated by fit() as {'x': [t0, t1, t2]}

    def _kappa_loss(self, coef, X, y, idx):
        """Negative QWK of X binned at the thresholds in coef (lower is better)."""
        binned = np.array([to_bins(pred, coef) for pred in X])
        return -qwk(y, binned)

    def fit(self, X, y):
        """Find three thresholds minimizing -QWK of binned X against y."""
        thresholds = [0.5, 1.5, 2.5]
        phi = 0.618      # golden-ratio fraction
        phi_c = 1 - phi
        search_ranges = [(0.5, 1.5), (1, 2), (1.5, 2.5)]
        for _sweep in range(10):
            for pos in range(len(thresholds)):
                # Golden-section search on threshold `pos`, others held fixed.
                lo, hi = search_ranges[pos]
                thresholds[pos] = lo
                loss_lo = self._kappa_loss(thresholds, X, y, pos)
                thresholds[pos] = hi
                loss_hi = self._kappa_loss(thresholds, X, y, pos)
                for _step in range(20):
                    # Shrink the bracket toward the lower-loss endpoint.
                    if loss_lo > loss_hi:
                        lo = hi - (hi - lo) * phi
                        thresholds[pos] = lo
                        loss_lo = self._kappa_loss(thresholds, X, y, pos)
                    else:
                        hi = hi - (hi - lo) * phi_c
                        thresholds[pos] = hi
                        loss_hi = self._kappa_loss(thresholds, X, y, pos)
        self.coef_ = {'x': thresholds}

    def predict(self, X, coef=None):
        """
        Make predictions with specified thresholds
        :param X: The raw predictions
        :param coef: A list of coefficients that will be used for rounding
        """
        if coef is None:
            coef = self.coefficients()
        edges = [-np.inf] + list(np.sort(coef)) + [np.inf]
        return pd.cut(X, edges, labels=[0, 1, 2, 3])

    def coefficients(self):
        """
        Return the optimized coefficients
        """
        return self.coef_['x']
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment