Working toy model of a simple signal + background model with pseudodata generated from the background model
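To try it out, a minimal way to run the script (the filename toy_model.py is hypothetical; matplotlib is assumed to be installed alongside pyhf):

pip install pyhf matplotlib
python toy_model.py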
import matplotlib.pyplot as plt
import numpy as np
import pyhf
def generate_background(n_events, tau=50.0):
    """
    Sample events from an exponential distribution

    Args:
        n_events: The number of events to sample
        tau: The scale parameter (mean) of the exponential

    Returns:
        Numpy array
    """
    return np.random.exponential(tau, n_events)
def generate_signal(n_events, mean=96.0, std=2.0):
    """
    Sample events from a normal distribution

    Args:
        n_events: The number of events to sample
        mean: The mean of the normal distribution
        std: The standard deviation of the normal distribution

    Returns:
        Numpy array
    """
    return np.random.normal(mean, std, n_events)
def generate_pseudodata(n_sig, n_bkg):
    """Generate pseudodata as a mixture of signal and background events"""
    signal_data = generate_signal(n_sig)
    background_data = generate_background(n_bkg)
    return np.concatenate((background_data, signal_data))
def get_mc_counts(pdf, pars):
    """Compute the per-sample expected event counts for the given parameters
    (reaches into pyhf internals, so it is tied to the pyhf version used)"""
    deltas, factors = pdf._modifications(pars)
    allsum = pyhf.tensorlib.concatenate(deltas + [pyhf.tensorlib.astensor(pdf.thenom)])
    nom_plus_delta = pyhf.tensorlib.sum(allsum, axis=0)
    nom_plus_delta = pyhf.tensorlib.reshape(
        nom_plus_delta, (1,) + pyhf.tensorlib.shape(nom_plus_delta)
    )
    allfac = pyhf.tensorlib.concatenate(factors + [nom_plus_delta])
    return pyhf.tensorlib.product(allfac, axis=0)
# N.B.: order is the way that it is as there are only 2 samples
# Here be hacky hacky crap, turn and weep at your sins
def plot(
    obs_data, pdf, par_name_dict, ax=None, order=(1, 0), labels=None, **par_settings
):
    """
    Stacked histograms are hard, so cheat and use bar plots instead
    """
    if ax is None:
        ax = plt.gca()
    if labels is None:
        labels = [None] * len(order)
    pars = pyhf.tensorlib.astensor(pdf.config.suggested_init())
    for k, v in par_settings.items():
        pars[par_name_dict[k]] = v
    mc_counts = get_mc_counts(pdf, pars)
    bottom = None
    # nb: bar_data[0] because evaluating only one parset
    bin_width = 2
    for i, sample_index in enumerate(order):
        data = mc_counts[sample_index][0]
        x = np.arange(0, bin_width * len(data), bin_width)
        ax.bar(
            x,
            data,
            bin_width,
            bottom=bottom,
            align='edge',
            alpha=1.0,
            label=labels[sample_index],
        )
        bottom = data if i == 0 else bottom + data
    # print(f'x: {x}\nlength: {len(x)}\n')
    # print(f'obs_data: {obs_data}\nlength: {len(obs_data)}\n')
    ax.scatter(
        x + bin_width / 2, obs_data, c='k', alpha=1.0, label=labels[-1], zorder=99
    )
def fit_example(n_bkg=10000, n_sig=25, frac_signal_embedded=0):
    np.random.seed(0)
    # Think about the counting experiment you want to do
    # Example 1:
    # n_bkg = 10000
    # n_sig = 25  # Hard to tease out of the background
    # frac_signal_embedded = 0  # Data is background only
    # The signal can't be resolved given the uncertainties, so CLs should be
    # consistent with background only
    # The best fit should be uninteresting as well
    # Example 2:
    # n_bkg = 10000
    # n_sig = 100  # Obvious bump
    # frac_signal_embedded = 0  # Data is background only
    # The S+B model poorly models the data
    # As the pseudodata normalization is drawn from n_expected_events (which
    # includes n_sig), the background-only model does poorly as well
    # The results tell us that the modeling is just bad
    # Example 3:
    # n_bkg = 10000
    # n_sig = 100  # Obvious bump
    # frac_signal_embedded = 1  # Data includes signal
    # The S+B model describes the data well (break out the champagne)
    # The fit nicely pulls out the signal component
    # Example 4:
    # n_bkg = 10000
    # n_sig = 100  # Obvious bump
    # frac_signal_embedded = 1.2  # Signal enhancement (some new physics)
    # The fit is inconsistent with mu=1
    n_expected_events = int((1.0 * n_bkg) + (1.0 * n_sig))

    # Set binning
    bin_width = 2  # GeV
    mass_range = [0, 150]  # GeV
    # numpy/matplotlib expect a right edge too
    bin_edges = np.arange(mass_range[0], mass_range[1] + bin_width, bin_width)

    # Build models
    bkg_data = generate_background(n_bkg)
    sig_data = generate_signal(n_sig)
    bkg_model = np.histogram(bkg_data, bins=bin_edges)[0].tolist()
    bkg_uncerts = np.sqrt(bkg_model).tolist()
    sig_model = np.histogram(sig_data, bins=bin_edges)[0].tolist()
    pdf = pyhf.simplemodels.hepdata_like(
        signal_data=sig_model, bkg_data=bkg_model, bkg_uncerts=bkg_uncerts
    )

    # ...and simulate Nature with those models
    n_signal_embedded = int(frac_signal_embedded * n_sig)
    n_pseudodata = int(
        np.random.normal(n_expected_events, np.sqrt(n_expected_events), 1)[0]
    )
    pseudodata = generate_pseudodata(
        n_signal_embedded, n_pseudodata - n_signal_embedded
    )
    pseudodata_hist = np.histogram(pseudodata, bins=bin_edges)[0].tolist()

    # Visualize
    # Plot models
    plt.figure(figsize=(10, 7))
    plt.hist(bkg_data, bins=bin_edges, label='Background model')
    plt.hist(sig_data, bins=bin_edges, label='Signal model')
    plt.xlim(mass_range[0], mass_range[1])
    plt.xlabel('Invariant Mass [GeV]', fontsize=16)
    plt.ylabel(f'Number of Candidates [Count / {bin_width} GeV]', fontsize=16)
    plt.xticks(fontsize=14)
    plt.yticks(fontsize=14)
    plt.legend()
    plt.savefig('bkg_plus_signal.pdf')

    # Plot data with model
    plt.hist(pseudodata, bins=bin_edges, alpha=0.6, label='Data')
    plt.legend()
    plt.savefig('data_overlaid.pdf')

    # Ask what is the p-value to have observed the data given the
    # presence of the signal model with signal strength of mu=1 (CLs_obs)
    # ...and the p-value under the background-only model (mu=0) (CLs_exp)
    CLs_obs, CLs_exp = pyhf.utils.hypotest(
        poi_test=1.0,
        data=pseudodata_hist + pdf.config.auxdata,
        pdf=pdf,
        return_expected=True,
    )
    print(f'Observed: {CLs_obs}, Expected: {CLs_exp}')

    # Given the observed data, what model parameters maximize the likelihood?
    best_fit = pyhf.optimizer.unconstrained_bestfit(
        pyhf.utils.loglambdav,
        pseudodata_hist + pdf.config.auxdata,
        pdf,
        pdf.config.suggested_init(),
        pdf.config.suggested_bounds(),
    )
    par_name_dict = {k: v['slice'].start for k, v in pdf.config.par_map.items()}
    best_fit_values = {k: best_fit[v] for k, v in par_name_dict.items()}

    fig, ax = plt.subplots(1, 1, figsize=(10, 7))
    plot(
        pseudodata_hist,
        pdf,
        par_name_dict,
        ax=ax,
        labels=[
            'Signal model (post fit)',
            'Background model (post fit)',
            'Observed data',
        ],
        **best_fit_values,
    )
    ax.plot(
        [], [], ' ', label=r'Signal strength $\mu$:' + f" {best_fit_values['mu']:.3f}"
    )
    plt.xlim(mass_range[0], mass_range[1])
    plt.xlabel('Invariant Mass [GeV]', fontsize=16)
    plt.ylabel(f'Number of Candidates [Count / {bin_width} GeV]', fontsize=16)
    plt.xticks(fontsize=14)
    plt.yticks(fontsize=14)
    # manipulate the ordering as I forget how to do this properly
    handles, labels = plt.gca().get_legend_handles_labels()
    bump_head_to_tail = lambda lst: [*lst[1:], lst[0]]
    plt.legend(bump_head_to_tail(handles), bump_head_to_tail(labels))
    # plt.show()
    fig.savefig('best_fit.pdf')


if __name__ == '__main__':
    fit_example()
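The four example scenarios sketched in the comments of fit_example can be reproduced by calling it with the corresponding arguments, e.g. from a Python session with the script imported. A minimal sketch (note that successive calls overwrite the saved PDFs, as the output filenames are fixed):

fit_example(n_bkg=10000, n_sig=25, frac_signal_embedded=0)     # Example 1: background-only data, faint signal
fit_example(n_bkg=10000, n_sig=100, frac_signal_embedded=0)    # Example 2: background-only data, obvious bump
fit_example(n_bkg=10000, n_sig=100, frac_signal_embedded=1)    # Example 3: signal embedded in the data
fit_example(n_bkg=10000, n_sig=100, frac_signal_embedded=1.2)  # Example 4: enhanced signal (mu > 1)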
@stephensekula, if you find that NumPy is running into issues and hitting iteration limits, then I would suggest switching over to one of the tensorlibs that actually has autograd in it. For example, to do this with TensorFlow you'd just have to switch out the backend (after first installing the required additional libraries with

pip install pyhf[tensorflow]

) and then remember to have the session values get printed properly. The code stays the same; only the engine doing the calculations changes. 👍
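For illustration, here is a minimal sketch of what that backend switch could look like with the pyhf API of this era (~v0.0.x), reusing pseudodata_hist and pdf from the script above. The session keyword to tensorflow_backend is an assumption for that version, so check the docs of the release you have installed (newer pyhf releases use pyhf.set_backend('tensorflow') instead):

# Hedged sketch: switch pyhf to the TensorFlow backend (pyhf ~v0.0.x era).
# The session keyword is an assumption for this version of pyhf.
import tensorflow as tf
import pyhf

pyhf.set_backend(pyhf.tensor.tensorflow_backend(session=tf.Session()))

# Results now come back as TensorFlow tensors, so convert them before printing
CLs_obs, CLs_exp = pyhf.utils.hypotest(
    poi_test=1.0,
    data=pseudodata_hist + pdf.config.auxdata,
    pdf=pdf,
    return_expected=True,
)
print(f'Observed: {pyhf.tensorlib.tolist(CLs_obs)}, Expected: {pyhf.tensorlib.tolist(CLs_exp)}')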