import numpy as np
import pandas as pd
from collections import Counter


def qwk(a1, a2):
    """
    Quadratic Weighted Kappa (QWK) computed from rating histograms.

    Source: https://www.kaggle.com/c/data-science-bowl-2019/discussion/114133#latest-660168

    :param a1: first array of integer ratings on the 0..3 scale
    :param a2: second array of integer ratings on the 0..3 scale
    :return: the QWK score (1.0 means perfect agreement)
    """
    max_rat = 3  # ratings range from 0 to max_rat
    a1 = np.asarray(a1, dtype=int)
    a2 = np.asarray(a2, dtype=int)

    hist1 = np.zeros((max_rat + 1, ))
    hist2 = np.zeros((max_rat + 1, ))

    # Observed disagreement: accumulate the rating histograms and the
    # squared difference between each pair of ratings.
    o = 0
    for k in range(a1.shape[0]):
        i, j = a1[k], a2[k]
        hist1[i] += 1
        hist2[j] += 1
        o += (i - j) * (i - j)

    # Expected disagreement under independence of the two rating
    # distributions (outer product of the histograms).
    e = 0
    for i in range(max_rat + 1):
        for j in range(max_rat + 1):
            e += hist1[i] * hist2[j] * (i - j) * (i - j)
    e = e / a1.shape[0]

    return 1 - o / e
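
# Hypothetical sanity check (not from the original gist): identical rating
# vectors give a QWK of 1.0, and a single off-by-one disagreement lowers it:
#   qwk([0, 1, 2, 3], [0, 1, 2, 3])  # -> 1.0
#   qwk([0, 1, 2, 3], [0, 1, 2, 2])  # -> 0.875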


class OptimizedRounderWithHistgram(object):
    """
    Histogram-based rounder: chooses class boundaries so that the predicted
    class distribution matches the empirical distribution of the labels.
    """

    def __init__(self):
        self.coef_ = None
        self.dist = None

    def fit(self, X, y):
        # Empirical class frequencies of the training labels.
        self.dist = Counter(y)
        for k in self.dist:
            self.dist[k] /= y.shape[0]

    def classify(self, bound, x):
        # Map a raw prediction to a class by comparing it against the bounds.
        if x <= bound[0]:
            return 0
        elif x <= bound[1]:
            return 1
        elif x <= bound[2]:
            return 2
        else:
            return 3

    def predict(self, preds):
        # Place the boundaries at the percentiles of the raw predictions
        # that reproduce the label distribution seen in fit().
        acum = 0
        bound = np.zeros(shape=(3,))
        for i in range(3):
            acum += self.dist[i]
            bound[i] = np.percentile(preds, acum * 100)
        self.coef_ = bound
        return np.array(list(map(lambda x: self.classify(bound, x), preds)))

    def coefficients(self):
        return self.coef_
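
# Illustration (not from the gist): if the training labels are 10% class 0,
# 30% class 1, 40% class 2 and 20% class 3, predict() places the boundaries
# at the 10th, 40th and 80th percentiles of the raw predictions, so the
# rounded output reproduces that class balance.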


def to_bins(x, borders):
    # Return the index of the first border that x does not exceed.
    for i in range(len(borders)):
        if x <= borders[i]:
            return i
    return len(borders)


class OptimizedRounder(object):
    """
    An optimizer for rounding thresholds
    to maximize the Quadratic Weighted Kappa (QWK) score.

    Reference: https://www.kaggle.com/naveenasaithambi/optimizedrounder-improved
    """

    def __init__(self):
        self.coef_ = 0

    def _kappa_loss(self, coef, X, y, idx):
        # Negative QWK of the binned predictions; lower is better.
        X_p = np.array([to_bins(pred, coef) for pred in X])
        ll = -qwk(y, X_p)
        return ll

    def fit(self, X, y):
        coef = [0.5, 1.5, 2.5]
        golden1 = 0.618
        golden2 = 1 - golden1
        ab_start = [(0.5, 1.5), (1, 2), (1.5, 2.5)]
        for it1 in range(10):
            for idx in range(len(coef)):
                # Golden-section search over one threshold at a time,
                # keeping the other thresholds fixed.
                a, b = ab_start[idx]
                # Losses at the interval endpoints.
                coef[idx] = a
                la = self._kappa_loss(coef, X, y, idx)
                coef[idx] = b
                lb = self._kappa_loss(coef, X, y, idx)
                for it in range(20):
                    # Shrink the interval toward the endpoint with the lower loss.
                    if la > lb:
                        a = b - (b - a) * golden1
                        coef[idx] = a
                        la = self._kappa_loss(coef, X, y, idx)
                    else:
                        b = b - (b - a) * golden2
                        coef[idx] = b
                        lb = self._kappa_loss(coef, X, y, idx)
        self.coef_ = {'x': coef}

    def predict(self, X, coef=None):
        """
        Make predictions with the specified thresholds.

        :param X: the raw predictions
        :param coef: a list of coefficients that will be used for rounding
        """
        if coef is None:
            coef = self.coefficients()
        return pd.cut(X, [-np.inf] + list(np.sort(coef)) + [np.inf], labels=[0, 1, 2, 3])

    def coefficients(self):
        """
        Return the optimized coefficients.
        """
        return self.coef_['x']
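

Below is a minimal usage sketch, not part of the gist itself. It assumes noisy regression-style outputs (oof_preds) for integer labels (y_train) on the 0-3 scale; both names and the synthetic data are illustrative only.

if __name__ == '__main__':
    rng = np.random.default_rng(0)
    y_train = rng.integers(0, 4, size=1000)                 # hypothetical labels in {0, 1, 2, 3}
    oof_preds = y_train + rng.normal(0.0, 0.7, size=1000)   # hypothetical raw model outputs

    # Threshold search that directly maximizes QWK.
    opt = OptimizedRounder()
    opt.fit(oof_preds, y_train)
    rounded = np.asarray(opt.predict(oof_preds), dtype=int)
    print('thresholds:', opt.coefficients())
    print('QWK (optimized):', qwk(y_train, rounded))

    # Histogram-matching alternative: boundaries taken from the label distribution.
    hist_rounder = OptimizedRounderWithHistgram()
    hist_rounder.fit(None, y_train)
    print('QWK (histogram):', qwk(y_train, hist_rounder.predict(oof_preds)))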