Skip to content

Instantly share code, notes, and snippets.

@CamDavidsonPilon
Last active August 29, 2015 14:08
Show Gist options
  • Save CamDavidsonPilon/e78fddff916dc47145f9 to your computer and use it in GitHub Desktop.
Save CamDavidsonPilon/e78fddff916dc47145f9 to your computer and use it in GitHub Desktop.
The Class Imbalance Problem
import numpy as np
from numpy.random import binomial, beta
import pandas as pd
# Total participant counts to simulate: 750, 1500, ..., 21750 (29 sizes).
N = 750 * np.arange(1, 30)
# Probability passed to the binomial draw that splits participants into
# the two groups (the drawn count becomes the size of group A).
p_assignment = 0.5
def sample_beta_posterior(N, C, size=None):
    """Draw from a Beta(1 + C, 1 + N - C) distribution.

    Under a uniform Beta(1, 1) prior this is the posterior of a conversion
    rate after observing C conversions among N participants.

    Parameters
    ----------
    N : total number of participants (scalar or array-like).
    C : number of conversions among them (scalar or array-like).
    size : optional output shape forwarded to ``numpy.random.beta``; the
        default ``None`` returns a single draw for scalar inputs.

    Returns
    -------
    A single sample, or an array of samples when ``size`` is given or the
    inputs are arrays.
    """
    return beta(1 + C, N - C + 1, size=size)
# Number of simulated experiments per (participants, conversion) pair.
trials = 10000
# Conversion probabilities to sweep: 20 evenly spaced values, 0.01 to 0.5.
conversion = np.linspace(0.01, 0.5, num=20)
def pprint(results, p, N):
    """Print a short summary of one batch of simulated deltas.

    Parameters
    ----------
    results : array of simulated deltas; must provide ``.var()`` and
        ``.mean()``.
    p : conversion probability used for the batch.
    N : total number of participants used for the batch.

    Writes five lines to stdout: the experiment settings, the sample
    variance, the sample mean, a separator, and a blank line.
    """
    # Single-argument print(...) calls behave the same under Python 2's
    # print statement and Python 3's print function.
    print("Total participants: %d, conversion probability: %.3f" % (N, p))
    # NOTE: %.3f shows any variance below 0.0005 as 0.000.
    print("Sample var. of delta: %.3f" % results.var())
    print("Sample mean of delta: %.4f" % results.mean())
    print("---------------------------------")
    # print("") rather than print(): the latter prints "()" on Python 2.
    print("")
# Sample variance of the simulated difference in conversion rates, one cell
# per (total participants, conversion probability) pair.
var_results = pd.DataFrame(
    np.zeros((len(N), len(conversion))), index=N, columns=conversion
)
for row, n in enumerate(N):
    for col, p in enumerate(conversion):
        # Draw all `trials` experiments at once instead of one per Python
        # loop iteration; each element is an independent experiment.
        # Randomly split the n participants between groups A and B.
        N_A = binomial(n, p_assignment, size=trials)
        N_B = n - N_A
        # Both groups convert with the same probability p, so every
        # non-zero delta is pure sampling noise.
        C_A = binomial(N_A, p)
        C_B = binomial(N_B, p)
        # NOTE: an empty group (N_A == 0 or N_B == 0) would yield nan/inf
        # here; with n >= 750 and p_assignment = 0.5 that is not a
        # practical concern.
        _results = 1.0 * C_A / N_A - 1.0 * C_B / N_B
        pprint(_results, p, n)
        # Positional write: `.ix` was removed from pandas, and the chained
        # form `var_results[p].ix[n] = ...` may assign to a temporary copy.
        var_results.iloc[row, col] = _results.var()
# Relabel the columns with two-decimal strings for the axis ticks. A list is
# used because map() returns a lazy iterator on Python 3.
var_results.columns = ["%.2f" % r for r in var_results.columns]
# NOTE(review): `plt` is not imported in the visible part of this file;
# presumably matplotlib.pyplot is provided by the session (e.g. pylab) --
# confirm, or add `import matplotlib.pyplot as plt` to the imports.
# Heat map of the log-variance: rows are participant counts, columns are
# conversion probabilities.
plt.imshow(np.log(var_results.values), cmap=plt.cm.gist_heat, interpolation='none')
# imshow centers cell (i, j) on the integer coordinates (j, i), so ticks go
# on integers; an offset of 0.5 would put the labels on the cell edges.
row_ticks = np.arange(len(var_results.index))[::4]
col_ticks = np.arange(len(var_results.columns))[::3]
plt.yticks(row_ticks, var_results.index[::4])
plt.xticks(col_ticks, var_results.columns[::3])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment