Last active
June 26, 2024 11:13
-
-
Save jukujala/cbaadd3233b2637a12ccdf6e3be464d7 to your computer and use it in GitHub Desktop.
Generate data from different distributions, fit a log-normal and show how much fitted mean overvaluates data mean.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""
Generate data from different distributions, fit a log-normal to each and
show how much the fitted mean overvalues (overestimates) the data mean.
Overvaluation 0.05 = 5% higher fitted log-norm mean than data mean.
Copy-paste of output (NOTE: the sample below appears to come from a different
revision of this script -- its dataset list and printed fields differ from
what the current code prints):
* log normal data
- mean(empirical data): 4.22, mean(fitted long-norm): 4.26, overvaluation: 0.01
* uniform data
- mean(empirical data): 0.50, mean(fitted long-norm): 0.64, overvaluation: 0.29
* positive normal data
- mean(empirical data): 1.33, mean(fitted long-norm): 1.53, overvaluation: 0.15
* truncated positive normal data
- mean(empirical data): 1.18, mean(fitted long-norm): 1.34, overvaluation: 0.14
* ratio of normals data
- mean(empirical data): 6.13, mean(fitted long-norm): 2.68, overvaluation: -0.56
* exp data
- mean(empirical data): 0.50, mean(fitted long-norm): 0.64, overvaluation: 0.29
* gamma data
- mean(empirical data): 2.03, mean(fitted long-norm): 2.50, overvaluation: 0.23
* power law data
- mean(empirical data): 0.08, mean(fitted long-norm): 1976909492656863.75, overvaluation: 23310471692441804.00
* poisson data
- mean(empirical data): 3.95, mean(fitted long-norm): 4.01, overvaluation: 0.01
* log normal data + few big outliers
- mean(empirical data): 4.78, mean(fitted long-norm): 4.53, overvaluation: -0.05
* constant 85.0 data
- mean(empirical data): 85.00, mean(fitted long-norm): 85.00, overvaluation: -0.00
* 50% 1.0 and 50% 85.0 data
- mean(empirical data): 43.00, mean(fitted long-norm): 108.69, overvaluation: 1.53
"""
import numpy as np | |
import scipy.stats as stats | |
def fit_lognorm(some_data):
    """Fit log-normal distributions to ``some_data`` and print the mean bias.

    Two fits are made with ``scipy.stats.lognorm.fit``, both with the
    location fixed at 0 (``floc=0``):

    1. shape (sigma) and scale both free;
    2. shape fixed to 1.0 (``fs=1.0``), only the scale free.

    For each fit four values are printed: sigma, mu (``log(scale)``), the
    empirical mean of the data next to the fitted log-normal mean
    ``exp(mu + sigma**2 / 2)``, and the "overvaluation"
    ``fitted_mean / empirical_mean - 1`` (0.05 means the fitted mean is 5%
    above the data mean).

    Args:
        some_data: Sequence or array of numeric samples. Because the location
            is fixed at 0 the values are expected to be strictly positive
            (the support of a zero-located log-normal is x > 0).

    Returns:
        The ``(shape, loc, scale)`` tuple of the *second* fit, i.e. the one
        with the shape fixed to 1.0.
    """
    # The empirical mean is the same for both fits, so compute it once.
    empirical_mean = np.mean(some_data)

    # log-norm distribution parameter explanations:
    # https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.lognorm.html
    # First pass: free shape. Second pass: shape pinned to 1.0 through the
    # ``fs`` ("fix shape parameter s") keyword.
    for extra_fit_kwargs in ({}, {"fs": 1.0}):
        shape, loc, scale = stats.lognorm.fit(some_data, floc=0, **extra_fit_kwargs)
        sigma = shape
        mu = np.log(scale)
        lognorm_mean = np.exp(mu + sigma**2 / 2)
        overval = lognorm_mean / empirical_mean - 1
        print(f" - sigma: {sigma}, mu: {mu}")
        print(
            f" - mean(empirical data): {empirical_mean:.2f}, "
            f"mean(fitted long-norm): {lognorm_mean:.2f}, "
            f"overvaluation: {overval:.2f}"
        )
        # sanity check for log norm fitting:
        # lognorm_samples = stats.lognorm.rvs(shape, loc=loc, scale=scale, size=1000)
        # print(f"log-norm sample mean {np.mean(lognorm_samples)}")

    # Parameters of the last (fixed-shape) fit, as in the original code.
    return shape, loc, scale
# ---------------------------------------------------------------------------
# Experiment script.
#
# Each section draws 1000 samples from one distribution (or loads one column
# from a CSV), prints a header and passes the data to fit_lognorm(), which
# prints the fitted parameters and the overvaluation. No random seed is set
# in this script, so the printed numbers vary from run to run.
# ---------------------------------------------------------------------------
print("Generate data from different distributions, fit a log-normal and show how much fitted mean overvaluates data mean.")
print("Overvaluation 0.05 = 5% higher log-normal mean vs data\n\n")

print(" * log normal data")
some_data = np.random.lognormal(mean=1.0, sigma=1.0, size=1000)
fit_lognorm(some_data)

print(" * uniform data")
some_data = np.random.uniform(low=0.0, high=1.0, size=1000)
fit_lognorm(some_data)

print(" * positive normal data")
# Rejection sampling: draw from N(1, 1) and keep only strictly positive draws.
some_data = []
while len(some_data) < 1000:
    sample = np.random.normal(1.0, 1.0)
    if sample > 0:
        some_data.append(sample)
fit_lognorm(some_data)

print(" * truncated positive normal data")
# Same as above, but every draw is first capped at 2.0 from above.
some_data = []
while len(some_data) < 1000:
    sample = np.random.normal(1.0, 1.0)
    sample = min(sample, 2.0)
    if sample > 0:
        some_data.append(sample)
fit_lognorm(some_data)

print(" * ratio of normals data")
# Ratio of two independent N(1, 1) draws; only positive ratios are kept.
some_data = []
while len(some_data) < 1000:
    sample1 = np.random.normal(1.0, 1.0)
    sample2 = np.random.normal(1.0, 1.0)
    sample = sample1/sample2
    if sample > 0:
        some_data.append(sample)
fit_lognorm(some_data)

print(" * exp data")
mean = 2.0  # for example
scale = 1.0 / mean  # value passed as numpy's ``scale`` argument below
some_data = np.random.exponential(scale, 1000)
fit_lognorm(some_data)

print(" * gamma data")
shape = 1.0  # Example value for k (alpha)
scale = 2.0  # Example value for theta (beta)
some_data = np.random.gamma(shape, scale, 1000)
fit_lognorm(some_data)

print(" * power law data")
shape_parameter = 0.1
some_data = np.random.power(shape_parameter, 1000)
fit_lognorm(some_data)

print(" * poisson data")
lambda_param = 3.0
# Every count is shifted up by one, so the smallest possible value is 1
# rather than 0 (a zero cannot be fitted by a log-normal located at 0).
some_data = [x+1 for x in np.random.poisson(lambda_param, 1000)]
fit_lognorm(some_data)

print(" * log normal data + few big outliers")
some_data = np.random.lognormal(mean=1.0, sigma=1.0, size=1000)
# Overwrite three of the samples with a large value to act as outliers.
some_data[5] = 100.0
some_data[500] = 100.0
some_data[678] = 100.0
fit_lognorm(some_data)

# Optional histogram of the most recent ``some_data`` (disabled):
# import matplotlib.pyplot as plt
# plt.hist(some_data, bins=30, label='some data')
# plt.yscale('log')
# plt.legend()
# plt.xlabel('Value')
# plt.ylabel('Frequency (log scale)')
# plt.title('Histogram with Log Scale on Y-Axis')
# plt.show()

print(" * Dream games ios US IAP values (uncapped)")
# pandas is imported only here, so the synthetic experiments above run
# without it.
import pandas as pd

# NOTE(review): the CSV is read via a relative path, i.e. from the current
# working directory; pd.read_csv raises FileNotFoundError if it is absent.
df = pd.read_csv("dreamgames_ios_us_values_v2.csv")
some_data = df["label_final"].tolist()
fit_lognorm(some_data)

print(" * Dream games ios US IAP values (capped)")
some_data = df["label_value"].tolist()
fit_lognorm(some_data)

print(" * Dream games ios US IAP values (capped, normalized)")
some_data = df["normalized_label_value"].tolist()
fit_lognorm(some_data)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment