Last active
March 6, 2024 00:03
-
-
Save cheeseonamonkey/20e610ec9b0e9ec2450442fe41de6c98 to your computer and use it in GitHub Desktop.
Python scalers and transforms cheatsheet
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, MaxAbsScaler, Normalizer

# Cheatsheet: draw a grid of histograms showing how each scikit-learn scaler
# reshapes several differently-distributed datasets.
# Grid layout: one row per dataset, one column per scaler.

# Generate demonstration data ensuring positivity.
# Every dataset has 100 samples (rows) and 2 features (columns).
better_datasets = [
    np.random.rand(100, 2) * 20 + 10,
    np.random.rand(100, 2) * 100,
    np.random.beta(100, 2, size=(100, 2)),
    np.random.f(100, 2, size=(100, 2)),
    np.random.normal(100, 2, size=(100, 2)),
    np.random.binomial(100, 0.7, size=(100, 2)),
    np.random.gamma(100, 2, size=(100, 2)),
    np.random.exponential(10, size=(100, 2)) + 1,  # Adding 1 to ensure positivity
    np.random.uniform(low=1, high=60, size=(100, 2)),  # Ensuring positivity
]

# Define scalers (insertion order determines the column order of the grid).
scalers = {
    'Standard Scaler': StandardScaler(),
    'Min-Max Scaler': MinMaxScaler(),
    'Robust Scaler': RobustScaler(),
    'MaxAbs Scaler': MaxAbsScaler(),
    # NOTE: unlike the scalers above, Normalizer rescales each sample (row)
    # to unit norm instead of rescaling each feature (column).
    'Normalizer': Normalizer(),
}

# Derive the grid shape from the data instead of hard-coding the row count,
# so no empty row is reserved and the layout follows the lists above.
n_rows = len(better_datasets)
n_cols = len(scalers)

plt.figure(figsize=(15, 10))  # Adjusted figsize

# Plot histograms for all scalers for each dataset
for dataset_index, dataset in enumerate(better_datasets):
    for i, (scaler_name, scaler) in enumerate(scalers.items(), start=1):
        # plt.subplot uses 1-based, row-major cell indices.
        ax = plt.subplot(n_rows, n_cols, dataset_index * n_cols + i)

        # Each scaler is re-fitted on every dataset it is applied to.
        scaled_data = scaler.fit_transform(dataset)

        # The labels are attached to the artists only; no legend is drawn.
        ax.hist(scaled_data[:, 0], bins=20, alpha=0.75, color='darkred', label='Feature 1')
        ax.hist(scaled_data[:, 1], bins=20, alpha=0.75, color='darkblue', label='Feature 2')
        ax.grid(True)

        # Column headers go on the top row only.
        if dataset_index == 0:
            ax.set_title(scaler_name, fontsize=12, fontweight='bold')

        # Tick labels are dropped: only the histogram shapes are compared.
        ax.set_xticks([])
        ax.set_yticks([])

plt.tight_layout(w_pad=0.05, h_pad=0.3)
plt.show()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import boxcox
from sklearn.preprocessing import PowerTransformer

# Cheatsheet: draw a grid of histograms showing how common variance-stabilising
# transforms reshape several differently-distributed datasets.
# Grid layout: one row per dataset, one column per transform.

# Generate demonstration data ensuring positivity (Box-Cox rejects values <= 0).
# Every dataset has 100 samples (rows) and 2 features (columns).
better_datasets = [
    np.random.rand(100, 2) * 20 + 10,
    np.random.rand(100, 2) * 100,
    np.random.beta(100, 2, size=(100, 2)),
    np.random.f(100, 2, size=(100, 2)),
    np.random.normal(100, 2, size=(100, 2)),
    np.random.binomial(100, 0.7, size=(100, 2)),
    np.random.gamma(100, 2, size=(100, 2)),
    np.random.exponential(10, size=(100, 2)) + 1,  # Adding 1 to ensure positivity
    np.random.uniform(low=1, high=60, size=(100, 2)),  # Ensuring positivity
]


def _box_cox(dataset):
    """Return *dataset* Box-Cox transformed with a single shared lambda.

    The array is flattened before fitting, so one lambda is estimated over
    both features together; the result is reshaped back to the input shape.
    """
    flat_transformed, _ = boxcox(dataset.flatten())
    return flat_transformed.reshape(dataset.shape)


def _yeo_johnson(dataset):
    """Return *dataset* Yeo-Johnson transformed by a freshly fitted PowerTransformer."""
    fitted = PowerTransformer(method='yeo-johnson').fit(dataset)
    return fitted.transform(dataset)


# Define transformers. Every value is a callable taking a (samples, features)
# array and returning the transformed array of the same shape, so the plotting
# loop needs no per-name special cases.
transformers = {
    'Log Transformation': np.log1p,
    'Square Root Transformation': np.sqrt,
    'Exponential Transformation': np.exp,
    'Box-Cox Transformation': _box_cox,
    'Yeo-Johnson Transformation': _yeo_johnson,
}

# Derive the grid shape from the data instead of hard-coding the row count,
# so no empty row is reserved and the layout follows the lists above.
n_rows = len(better_datasets)
n_cols = len(transformers)

plt.figure(figsize=(12, 8))

# Plot histograms for all transformers for each dataset
for dataset_index, dataset in enumerate(better_datasets):
    for i, (transformer_name, transformer) in enumerate(transformers.items(), start=1):
        # plt.subplot uses 1-based, row-major cell indices.
        ax = plt.subplot(n_rows, n_cols, dataset_index * n_cols + i)
        transformed_data = transformer(dataset)

        # np.exp can overflow to inf on the heavy-tailed F-distributed data,
        # and hist() raises ValueError on a non-finite range, so only finite
        # values are plotted. The labels are attached to the artists only;
        # no legend is drawn.
        for column, color, label in (
            (0, 'darkred', 'Feature 1'),
            (1, 'darkblue', 'Feature 2'),
        ):
            values = transformed_data[:, column]
            ax.hist(values[np.isfinite(values)], bins=20, alpha=0.75, color=color, label=label)
        ax.grid(True)

        # Column headers go on the top row only.
        if dataset_index == 0:
            ax.set_title(transformer_name, fontsize=12, fontweight='bold')

        # Tick labels are dropped: only the histogram shapes are compared.
        ax.set_xticks([])
        ax.set_yticks([])

plt.tight_layout(w_pad=0.05, h_pad=0.3)
plt.show()
Author
cheeseonamonkey
commented
Mar 6, 2024
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment