-
-
Save ahlusar1989/a1e1c34f6b03c48e85315e3c9f82a1e6 to your computer and use it in GitHub Desktop.
Yeo-Johnson Transformation
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: UTF-8 -*- | |
import warnings | |
import numpy as np | |
import pandas as pd | |
import sys | |
__author__ = "Mohsen Mesgarpour" | |
__copyright__ = "Copyright 2016, https://github.com/mesgarpour" | |
__credits__ = ["Mohsen Mesgarpour"] | |
__license__ = "GPL" | |
__version__ = "1.0" | |
__maintainer__ = "Mohsen Mesgarpour" | |
__email__ = "[email protected]" | |
class YeoJohnson: | |
""" | |
Computing Yeo-Johnson transofrmation, which is an extension of Box-Cox transformation | |
but can handle both positive and negative values. | |
References: | |
Weisberg, S. (2001). Yeo-Johnson Power Transformations. | |
Department of Applied Statistics, University of Minnesota. Retrieved June, 1, 2003. | |
https://www.stat.umn.edu/arc/yjpower.pdf | |
Adapted from CRAN - Package VGAM | |
""" | |
def fit(self, y, lmbda, derivative=0, epsilon=np.finfo(np.float).eps, inverse=False): | |
""" | |
:param y: The variable to be transformed (numeric array). | |
:param lmbda: The function's Lambda value (numeric value or array). | |
:param derivative: The derivative with respect to lambda | |
(non-negative integer; default: ordinary function evaluation). | |
:param epsilon: The lambda's tolerance (positive value). | |
:param inverse: The inverse transformation option (logical value). | |
:return: The Yeo-Johnson transformation or its inverse, or its derivatives with respect to lambda, of y. | |
""" | |
# Validate arguments | |
self.__validate(y, lmbda, derivative, epsilon, inverse) | |
# initialise | |
y = np.array(y, dtype=float) | |
result = y | |
if not (isinstance(lmbda, list) or isinstance(lmbda, np.ndarray)): | |
lmbda, y = np.broadcast_arrays(lmbda, y) | |
lmbda = np.array(lmbda, dtype=float) | |
l0 = np.abs(lmbda) > epsilon | |
l2 = np.abs(lmbda - 2) > epsilon | |
# Inverse | |
with warnings.catch_warnings(): # suppress warnings | |
warnings.simplefilter("ignore") | |
if inverse is True: | |
mask = np.where(((y >= 0) & l0) == True) | |
result[mask] = np.power(np.multiply(y[mask], lmbda[mask]) + 1, 1 / lmbda[mask]) - 1 | |
mask = np.where(((y >= 0) & ~l0) == True) | |
result[mask] = np.expm1(y[mask]) | |
mask = np.where(((y < 0) & l2) == True) | |
result[mask] = 1 - np.power(np.multiply(-(2 - lmbda[mask]), y[mask]) + 1, 1 / (2 - lmbda[mask])) | |
mask = np.where(((y < 0) & ~l2) == True) | |
result[mask] = -np.expm1(-y[mask]) | |
# Derivative | |
else: | |
if derivative == 0: | |
mask = np.where(((y >= 0) & l0) == True) | |
result[mask] = np.divide(np.power(y[mask] + 1, lmbda[mask]) - 1, lmbda[mask]) | |
mask = np.where(((y >= 0) & ~l0) == True) | |
result[mask] = np.log1p(y[mask]) | |
mask = np.where(((y < 0) & l2) == True) | |
result[mask] = np.divide(-(np.power(-y[mask] + 1, 2 - lmbda[mask]) - 1), 2 - lmbda[mask]) | |
mask = np.where(((y < 0) & ~l2) == True) | |
result[mask] = -np.log1p(-y[mask]) | |
# Not Derivative | |
else: | |
p = self.fit(y, lmbda, derivative=derivative - 1, epsilon=epsilon, inverse=inverse) | |
mask = np.where(((y >= 0) & l0) == True) | |
result[mask] = np.divide(np.multiply(np.power(y[mask] + 1, lmbda[mask]), | |
np.power(np.log1p(y[mask]), derivative)) - | |
np.multiply(derivative, p[mask]), lmbda[mask]) | |
mask = np.where(((y >= 0) & ~l0) == True) | |
result[mask] = np.divide(np.power(np.log1p(y[mask]), derivative + 1), derivative + 1) | |
mask = np.where(((y < 0) & l2) == True) | |
result[mask] = np.divide(-(np.multiply(np.power(-y[mask] + 1, 2 - lmbda[mask]), | |
np.power(-np.log1p(-y[mask]), derivative)) - | |
np.multiply(derivative, p[mask])), 2 - lmbda[mask]) | |
mask = np.where(((y < 0) & ~l2) == True) | |
result[mask] = np.divide(np.power(-np.log1p(-y[mask]), derivative + 1), derivative + 1) | |
return result | |
@staticmethod | |
def __validate(y, lmbda, derivative, epsilon, inverse): | |
try: | |
if not isinstance(y, (list, np.ndarray, pd.Series)): | |
raise Exception("Argument 'y' must be a list!") | |
if not isinstance(lmbda, (int, float, np.int, np.float)): | |
if not isinstance(lmbda, (list, np.ndarray, pd.Series)) or len(lmbda) != len(y): | |
raise Exception("Argument 'lmbda' must be a number " | |
"or a list, which its length matches 'y' argument!") | |
if not isinstance(derivative, (int, float, np.int, np.float)) or derivative < 0: | |
raise Exception("Argument 'derivative' must be a non-negative integer!") | |
if not isinstance(epsilon, (int, float, np.int, np.float)) or epsilon <= 0: | |
raise Exception("Argument 'epsilon' must be a positive number!") | |
if not isinstance(inverse, bool): | |
raise Exception("Argument 'inverse' must be boolean!") | |
if inverse is True and derivative != 0: | |
raise Exception("Argument 'derivative' must be zero " | |
"when argument 'inverse' is 'True'!") | |
except (): | |
sys.exit() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment