Last active
October 14, 2023 19:04
-
-
Save erikbern/23d299a95fc00bea306e775d4e9ffd68 to your computer and use it in GitHub Desktop.
Kaplan-Meier for multiple revenue events
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from matplotlib import pyplot | |
import random | |
import time | |
pyplot.style.use("ggplot") | |
now = time.time() | |
def generate_user(censor=now): | |
# Pick some point in time the user was created | |
t_created = t = now - random.random() * 1e7 | |
events = [] | |
# Simulate revenue/death events | |
while True: | |
if random.random() < 0.4: | |
break # death | |
# Let some random time pass | |
t += random.expovariate(1 / 1e6) | |
if t >= now: | |
break # censoring | |
# Revenue event | |
events.append((t - t_created, random.lognormvariate(5, 1))) | |
# Append the censoring event | |
events.append((now - t_created, None)) | |
return events | |
# Get users | |
users = [generate_user() for i in range(1000)] | |
# Now, do a form of Kaplan-Meier. | |
# First, unify all events and sort them | |
unified_events = sum(users, []) | |
unified_events.sort() | |
# Now, let's go through all events and compute the average contribution | |
n_users = len(users) | |
total_rev = 0 | |
ts = [] | |
ys = [] | |
for t, revenue in unified_events: | |
if revenue is None: | |
# Censoring event | |
n_users -= 1 | |
else: | |
# Some user had a revenue event | |
# Compute the average contribution over all non-censored users remaining in the dataset | |
total_rev += revenue / n_users | |
ts.append(t) | |
ys.append(total_rev) | |
# Plot Kaplan-Meier | |
pyplot.plot(ts, ys, label="Kaplan-Meier") | |
# Generate some non-censored data and average over a much larger dataset to compute the truth | |
# This is just so that we can compare the Kaplan-Meier estimate with the "real" estimate | |
fake_events = [] | |
n_users = 10000 | |
for i in range(n_users): | |
t = 0 | |
dead = False | |
while t < 1e7: | |
if random.random() < 0.4: | |
dead = True | |
# Let some random time pass | |
t += random.expovariate(1 / 1e6) | |
# Revenue event | |
if dead: | |
fake_events.append((t, 0)) | |
else: | |
fake_events.append((t, random.lognormvariate(5, 1))) | |
fake_events.sort() | |
ts = [] | |
ys = [] | |
total_rev = 0.0 | |
for t, rev in fake_events: | |
total_rev += rev / n_users | |
ts.append(t) | |
ys.append(total_rev) | |
# Plot the simulated version | |
pyplot.plot(ts, ys, label="Simulated non-censored") | |
pyplot.xlabel("Time (seconds)") | |
pyplot.ylabel("Average cumulative revenue from user") | |
pyplot.title("Kaplan-Meier modified to handle multiple revenue events") | |
pyplot.xlim([0, 1e7]) | |
pyplot.legend() | |
pyplot.show() |
Author
erikbern
commented
Jul 11, 2022
•
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment