Skip to content

Instantly share code, notes, and snippets.

@erikbern
Last active October 14, 2023 19:04
Show Gist options
  • Save erikbern/23d299a95fc00bea306e775d4e9ffd68 to your computer and use it in GitHub Desktop.
Save erikbern/23d299a95fc00bea306e775d4e9ffd68 to your computer and use it in GitHub Desktop.
Kaplan-Meier for multiple revenue events
from matplotlib import pyplot
import random
import time
pyplot.style.use("ggplot")
now = time.time()
def generate_user(censor=now):
# Pick some point in time the user was created
t_created = t = now - random.random() * 1e7
events = []
# Simulate revenue/death events
while True:
if random.random() < 0.4:
break # death
# Let some random time pass
t += random.expovariate(1 / 1e6)
if t >= now:
break # censoring
# Revenue event
events.append((t - t_created, random.lognormvariate(5, 1)))
# Append the censoring event
events.append((now - t_created, None))
return events
# Get users
users = [generate_user() for i in range(1000)]
# Now, do a form of Kaplan-Meier.
# First, unify all events and sort them
unified_events = sum(users, [])
unified_events.sort()
# Now, let's go through all events and compute the average contribution
n_users = len(users)
total_rev = 0
ts = []
ys = []
for t, revenue in unified_events:
if revenue is None:
# Censoring event
n_users -= 1
else:
# Some user had a revenue event
# Compute the average contribution over all non-censored users remaining in the dataset
total_rev += revenue / n_users
ts.append(t)
ys.append(total_rev)
# Plot Kaplan-Meier
pyplot.plot(ts, ys, label="Kaplan-Meier")
# Generate some non-censored data and average over a much larger dataset to compute the truth
# This is just so that we can compare the Kaplan-Meier estimate with the "real" estimate
fake_events = []
n_users = 10000
for i in range(n_users):
t = 0
dead = False
while t < 1e7:
if random.random() < 0.4:
dead = True
# Let some random time pass
t += random.expovariate(1 / 1e6)
# Revenue event
if dead:
fake_events.append((t, 0))
else:
fake_events.append((t, random.lognormvariate(5, 1)))
fake_events.sort()
ts = []
ys = []
total_rev = 0.0
for t, rev in fake_events:
total_rev += rev / n_users
ts.append(t)
ys.append(total_rev)
# Plot the simulated version
pyplot.plot(ts, ys, label="Simulated non-censored")
pyplot.xlabel("Time (seconds)")
pyplot.ylabel("Average cumulative revenue from user")
pyplot.title("Kaplan-Meier modified to handle multiple revenue events")
pyplot.xlim([0, 1e7])
pyplot.legend()
pyplot.show()
@erikbern
Copy link
Author

erikbern commented Jul 11, 2022

image

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment