Skip to content

Instantly share code, notes, and snippets.

@andrzejnovak
Last active July 29, 2019 10:18
Show Gist options
  • Save andrzejnovak/137f7dd490f2bf767dea475373200966 to your computer and use it in GitHub Desktop.
Save andrzejnovak/137f7dd490f2bf767dea475373200966 to your computer and use it in GitHub Desktop.
def cut(tdf, ptlow=300, pthigh=2000, mlow = 20, mhigh = 220):
cdf = tdf[(tdf.fj_pt < pthigh) & (tdf.fj_pt>ptlow) &(tdf.fj_sdmass < mhigh) & (tdf.fj_sdmass>mlow)]
return cdf
def roc_input(frame, signal=["HCC"], include = ["HCC", "Light", "gBB", "gCC", "HBB"], norm=False, tagger='fj_doubleb'):
# Bkg def - filter unwanted
bkg = np.zeros(frame.shape[0])
for label in include:
bkg = np.add(bkg, frame['truth'+label].values )
bkg = [bool(x) for x in bkg]
tdf = frame[bkg] #tdf for temporary df
# Signal
truth = np.zeros(tdf.shape[0])
predict = np.zeros(tdf.shape[0])
prednorm = np.zeros(tdf.shape[0])
for label in signal:
truth += tdf['truth'+label].values
predict += tdf['predict'+label].values
for label in include:
prednorm += tdf['predict'+label].values
tag_vals = tdf[tagger].values
if norm == False:
return truth, predict, tag_vals
else:
return truth, np.divide(predict, prednorm), tag_vals
def compare_rocs(dfs=[], names=[], sigs=[["Hcc"]], bkgs=[["Hbb"]], norm=False, pt=[300,2000],
tagger_names=None,
flip = None,
title=None,
ignore = None,
plotname="", colors=[0,1], styles=['-','-'], year='2016', use_tagger=[False], log=True):
c_list = ['darkorange', 'steelblue', 'firebrick', 'purple', 'orangered']
f, ax = plt.subplots(figsize=(10, 10))
if tagger_names == None: tagger_names = ['fj_doubleb']*len(dfs)
if ignore == None: ignore = [False]*len(dfs)
if flip == None: flip = [False]*len(dfs)
for frame, tagger, name, sig, bkg, col, sty, db_id, skip, flip_this in zip(dfs, tagger_names, names, sigs, bkgs, colors, styles, use_tagger*len(colors), ignore, flip):
if skip: continue
mlow = 40; mhigh = 200
frame = cut(frame, ptlow=pt[0] , pthigh =pt[1], mlow = mlow, mhigh = mhigh)
truth, predict, db = roc_input(frame, signal=sig, include = sig+bkg, norm=norm, tagger=tagger)
if flip_this: predict = 1 - predict
if not db_id:
fpr, tpr, threshold = roc_curve(truth, predict)
else:
fpr, tpr, threshold = roc_curve(truth, db)
if type(col) == int: color_toplot = c_list[col]
else: color_toplot = col
if not name.startswith("raw:"): lab = "DeepDouble{}, AUC = {:.1f}\%".format(name, auc(fpr,tpr)*100)
else: lab = "{}, AUC = {:.1f}\%".format(name[len('raw:'):], auc(fpr,tpr)*100)
ax.plot(tpr, fpr, lw=4, label=lab, color=color_toplot, linestyle=sty)
ax.set_xlim(0,1)
ax.set_ylim(0.001,1)
sigs = sorted(list(set([item for sublist in sigs for item in sublist])))
bkgs = sorted(list(set([item for sublist in bkgs for item in sublist])))
print sigs
print bkgs
if len(sigs) == 1 and len(sigs[0]) == 3 and sigs[0][0] in ["H", "Z", "g"]:
xlab = '{} \\rightarrow {}'.format(sigs[0][0], sigs[0][-2]+'\\bar{'+sigs[0][-1]+'}')
ax.set_xlabel(r'Tagging efficiency ($\mathrm{}$)'.format('{'+xlab+'}'), ha='right', x=1.0)
else:
xlab = ['{} \\rightarrow {}'.format(l[0], l[-2]+'\\bar{'+l[-1]+'}') if l[0][0] in ["H", "Z", "g"] else l for l in sigs ]
ax.set_xlabel(r'Tagging efficiency ($\mathrm{}$)'.format("{"+", ".join(xlab)+"}"), ha='right', x=1.0)
if len(bkgs) == 1 and len(bkgs[0]) == 3 and bkgs[0][0] in ["H", "Z", "g"]:
ylab = '{} \\rightarrow {}'.format(bkgs[0][0], bkgs[0][-2]+'\\bar{'+bkgs[0][-1]+'}')
ax.set_ylabel(r'Mistagging rate ($\mathrm{}$)'.format('{'+ylab+'}'), ha='right', y=1.0)
else:
ylab = ['{} \\rightarrow {}'.format(l[0], l[-2]+'\\bar{'+l[-1]+'}') if l[0][0] in ["H", "Z", "g"] else l for l in bkgs ]
ax.set_ylabel(r'Mistagging rate ($\mathrm{}$)'.format("{"+", ".join(ylab)+"}"), ha='right', y=1.0)
import matplotlib.ticker as plticker
ax.xaxis.set_major_locator(plticker.MultipleLocator(base=0.1))
ax.xaxis.set_minor_locator(plticker.MultipleLocator(base=0.02))
#ax.yaxis.set_minor_locator(plticker.MultipleLocator(base=0.02))
ax.tick_params(direction='in', axis='both', which='major', labelsize=18, length=12 )
ax.tick_params(direction='in', axis='both', which='minor' , length=6)
ax.xaxis.set_ticks_position('both')
ax.yaxis.set_ticks_position('both')
if log: ax.semilogy()
ax.grid(which='minor', alpha=0.5, axis='y', linestyle='dotted')
ax.grid(which='major', alpha=0.9, linestyle='dotted')
leg = ax.legend(borderpad=1, frameon=False, loc=2, fontsize=18,
title = ""+str(int(round((pt[0]))))+" $\mathrm{<\ jet\ p_T\ <}$ "+str(int(round((pt[1]))))+" GeV" \
+ "\n "+str(int(round(mlow)))+" $\mathrm{<\ jet\ m_{sd}\ <}$ "+str(int(round(mhigh)))+" GeV"
)
leg._legend_box.align = "left"
ax.annotate(r'{} (13 TeV)'.format(year), xy=(1, 1.015), xycoords='axes fraction', fontsize=22, fontname='Helvetica',
ha='right', annotation_clip=False)
ax.annotate('$\mathbf{CMS}$', xy=(0.001, 1.015), xycoords='axes fraction', fontname='Helvetica', fontsize=28,
ha='left', annotation_clip=False)
ax.annotate('$Simulation\ Preliminary$', xy=(0.12, 1.015), xycoords='axes fraction', fontsize=22, fontname='Helvetica',
fontstyle='italic', ha='left', annotation_clip=False)
if title != None: ax.set_title(title)
if log: ax.semilogy()
if len(plotname) > 1:
f.savefig(os.path.join(savedir, "ROCComparison_"+plotname+"pt{}-{}".format(pt[0], pt[1])+".pdf"), dpi=400, transparent=True)
f.savefig(os.path.join(savedir, "ROCComparison_"+plotname+"pt{}-{}".format(pt[0], pt[1])+".png"), dpi=400, transparent=True)
else:
if norm: f.savefig(os.path.join(savedir, "ROCNormComparison_"+"+".join(names)+"pt{}-{}".format(pt[0], pt[1])+".pdf"), dpi=400, transparent=True)
else: f.savefig(os.path.join(savedir, "ROCComparison_"+"+".join(names)+"pt{}-{}".format(pt[0], pt[1])+".pdf"), dpi=400, transparent=True)
if norm: f.savefig(os.path.join(savedir, "ROCNormComparison_"+"+".join(names)+"pt{}-{}".format(pt[0], pt[1])+".png"), dpi=400, transparent=True)
else: f.savefig(os.path.join(savedir, "ROCComparison_"+"+".join(names)+"pt{}-{}".format(pt[0], pt[1])+".png"), dpi=400 , transparent=True)
plt.show()
for plow, phigh in zip([300,1000],[2000,1400]):
compare_rocs(dfs=[df2, df2, df2],
names=["raw:double-b (2016)", "BvL", "raw:DeepAK8:ZHbb"],
tagger_names = ['fj_doubleb', 'fj_DDBvL_mi', 'fj_DeepAK8HbbvsQCD_mi'],
use_tagger = [True, True, True],
colors=[0,1,'black'],
pt = [plow, phigh],
styles=['--', '-', '-'],
sigs=[["Zbb"], ["Zbb"], ["Zbb"] ],
bkgs=[["QCD"], ["QCD"], ["QCD"]], plotname="Zbb", year='2017')
import seaborn as sns
import scipy
smap = [(0, ()), #solid',
#(0, (5, 10)), #loosely dashed',
(0, (5, 5)), #dashed',
#(0, (5, 1)), #densely dashed',
#(0, (3, 10, 1, 10)), #loosely dashdotted',
#(0, (3, 5, 1, 5)), #dashdotted',
(0, (3, 1, 1, 1)), #densely dashdotted',
#(0, (3, 10, 1, 10, 1, 10)), #loosely dashdotdotted',
#(0, (3, 5, 1, 5, 1, 5)), #dashdotdotted',
(0, (3, 1, 1, 1, 1, 1)), #densely dashdotdotted',
#(0, (1, 10)), #loosely dotted',
(0, (1, 1)), #densely dotted',
(0, (1, 5)) #dotted',
]
cmap=sns.color_palette(sns.cubehelix_palette(7, reverse=True))
def sculpting3(frame, siglab="Hcc", sculp_label='Light', fname="", savedir="", pt=[300,2000],
onlyQCD=False, useTagger=None, tagger=" ", debug=False, year=2017, wps=None):
def cut(tdf, ptlow=800, pthigh=2000):
mlow, mhigh = 20, 220
cdf = tdf[(tdf.fj_pt < pthigh) & (tdf.fj_pt>ptlow) &(tdf.fj_sdmass < mhigh) & (tdf.fj_sdmass>mlow)]
return cdf
frame = cut(frame, ptlow=pt[0], pthigh=pt[1])
if siglab == sculp_label: return
def find_nearest(array,value):
idx = (np.abs(array-value)).argmin()
return idx, array[idx]
labels = frame.columns.values
labels = [label[len("truth"):] for label in labels if label.startswith("truth")]
if debug:
for label in labels:print(label, np.sum(frame['truth'+label]))
if useTagger != None :
truth, predict, use_tags = roc_input(frame, signal=[siglab], include = [siglab, sculp_label], tagger=useTagger)
predict = use_tags
else:
truth, predict, use_tags = roc_input(frame, signal=[siglab], include = [siglab, sculp_label])
fpr, tpr, threshold = roc_curve(truth, predict)
cuts = {}
if wps == None: wps [0.003,0.005, 0.01, 0.02, 0.05, 1.0]
effs = []
#for wp in [0.01, 0.05, 0.1, 0.25, 0.5, 1.0]: # % mistag rate
for wp in wps: # % mistag rate
idx, val = find_nearest(fpr, wp)
cuts[str(wp)] = threshold[idx] # threshold for deep double-b corresponding to ~1% mistag rate
effs.append(tpr[idx])
if debug: print( "Cuts", cuts)
print( fname, pt)
print( "mts =", wps[:-1])
print("efs =", np.round(effs[:-1],3))
print("cts =", np.round([cuts[str(key)] for key in wps[:-1]],2))
f, (ax1, ax2) = plt.subplots(2,1, figsize=(10,10), gridspec_kw = {'height_ratios':[3, 1]}, sharex=True)
f.subplots_adjust(hspace=0)
bins = np.linspace(20,220,11)
for i, (wp, cut) in enumerate(reversed(sorted(cuts.items()))):
#print wp
if debug: print( wp, cut)
if useTagger!= None: ctdf = frame[frame[useTagger].values > cut]
else: ctdf = frame[frame['predict'+siglab].values > cut]
if onlyQCD: weight = ctdf['truthQCD'].values
else: weight = ctdf['truth'+sculp_label].values
if float(wp) == 1.0: label = 'No tagging applied'
else: label = 'at {:3.1f}\% mistag rate'.format(float(wp)*100.)
if debug: print( 'Weights', np.sum(weight), len(weight), weight)
if debug: print( "Filling {} events".format(len(ctdf['fj_sdmass'].values[weight > 0.5])))
#n, bins = np.histogram(ctdf['fj_sdmass'].values, bins=bins, weights = weight, density=True)
evts = ctdf['fj_sdmass'].values[weight > 0.5]
grid = np.linspace(40,200, 501)
sns.kdeplot(evts, clip=(40,200), ax=ax1, label=label, color=cmap[i], lw=2.5, linestyle=smap[i])
if float(wp) == 1.:
kde0 = scipy.stats.gaussian_kde(evts)
ax2.plot(grid, np.zeros(len(grid)), color=cmap[i], lw=2.5, linestyle=smap[i])
else:
kde = scipy.stats.gaussian_kde(evts)
ax2.plot(grid, kde(grid)-kde0(grid), color=cmap[i], lw=2.5, linestyle=smap[i])
ax2.set_xlabel(r'$\mathrm{m_{SD}\ [GeV]}$', ha='right', x=1.0)
if onlyQCD:
ax1.set_ylabel(r'Normalized scale ({})'.format('QCD'), ha='right', y=1.0)
ax2.set_ylabel(r'Tagged - Nominal', ha='center')
else:
ax1.set_ylabel(r'Normalized scale ({})'.format(sculp_label.replace("Hcc", r"$\mathrm{H \rightarrow c\bar{c}}$").replace("Hbb", r"$\mathrm{H \rightarrow b\bar{b}}$")), ha='right', y=1.0)
import matplotlib.ticker as plticker
for ax in [ax1,ax2]:
ax.xaxis.set_major_locator(plticker.MultipleLocator(base=20))
ax.xaxis.set_minor_locator(plticker.MultipleLocator(base=10))
ax.tick_params(direction='in', axis='both', which='major', labelsize=15, length=12, labelleft=False )
ax.tick_params(direction='in', axis='both', which='minor' , length=6)
ax.xaxis.set_ticks_position('both')
ax.yaxis.set_ticks_position('both')
ax.yaxis.set_major_locator(plticker.MultipleLocator(base=0.002))
ax.yaxis.set_minor_locator(plticker.MultipleLocator(base=0.0004))
ax.grid(which='major', alpha=0.9, linestyle='dotted')
ax2.grid(which='minor', alpha=0.6, axis='y', linestyle='dotted')
# ax1.yaxis.set_major_locator(plticker.MultipleLocator(base=0.002))
# ax1.yaxis.set_minor_locator(plticker.MultipleLocator(base=0.0004))
# ax2.yaxis.set_major_locator(plticker.MultipleLocator(base=0.0008))
# ax2.yaxis.set_minor_locator(plticker.MultipleLocator(base=0.0004))
ax1.set_xlim(40, 200)
ymax = max(kde0(grid))
ax1.set_ylim(0,ymax*1.25)
subylim = max(max(abs(min((kde(grid[100:400])-kde0(grid[100:400])))), abs(max((kde(grid[100:400])-kde0(grid[100:400])))))*1.05, 0.004)
ax2.set_ylim(-subylim, subylim)
leg = ax1.legend(borderpad=1, frameon=False, loc='best', fontsize='x-small', \
#title = ""+str(int(round((min(frame.fj_pt)))))+" $\mathrm{<\ jet\ p_T\ <}$ "+str(int(round((max(frame.fj_pt)))))+" GeV" \
#+ "\n "+str(int(round((min(frame.fj_sdmass)))))+" $\mathrm{<\ jet\ m_{SD}\ <}$ "+str(int(round((max(frame.fj_sdmass)))))+" GeV"
title= tagger\
+ "\n" + r"Tagging efficiency (H $\rightarrow$ {})".format(siglab[-2:]) )
plt.setp(leg.get_title(), multialignment='center', fontsize='x-small')
leg._legend_box.align = "center"
mpt = ""+str(int(round((min(frame.fj_pt)))))+" $\mathrm{<\ jet\ p_T\ <}$ "+str(int(round((max(frame.fj_pt)))))+" GeV" \
+ "\n "+str(int(round((40))))+" $\mathrm{<\ jet\ m_{SD}\ <}$ "+str(int(round((200))))+" GeV" \
#+ "\n "+ tagger
ax1.annotate( mpt, xy=(0.05, 0.92), xycoords='axes fraction', ha='left', va='top',
annotation_clip=False, multialignment='center', fontsize='x-small')
# ax1.annotate(r'{} (13 TeV)'.format(year), xy=(1, 1.018), xycoords='axes fraction', fontsize=22, fontname='Helvetica',
# ha='right', annotation_clip=False)
# ax1.annotate(r'$\mathbf{CMS}$', xy=(0.001, 1.018), xycoords='axes fraction', fontname='Helvetica', fontsize=28,
# ha='left', annotation_clip=False)
# ax1.annotate(r'$Simulation\ Preliminary$', xy=(0.12, 1.018), xycoords='axes fraction', fontsize=22, fontname='Helvetica',
# fontstyle='italic', ha='left', annotation_clip=False)
ax1.annotate(r'Gaussian kde smoothing', xy=(0.03, 0.05), xycoords='axes fraction', fontsize=16, fontname='Helvetica',
fontstyle='italic', ha='left', annotation_clip=False)
import cmsstyle as cms
ax1 = cms.cms_annot(ax=ax1)
f.savefig(os.path.join(savedir,'kdeM_sculpting_'+fname+"_tag"+siglab+"_"+sculp_label+"_pt{}-{}".format(pt[0], pt[1])+'.png'), dpi=400, transparent=True)
f.savefig(os.path.join(savedir,'kdeM_sculpting_'+fname+"_tag"+siglab+"_"+sculp_label+"_pt{}-{}".format(pt[0], pt[1])+'.pdf'), dpi=400, transparent=True)
for plow, phigh in zip([350],[1200]):
sculpting3(df2, pt=[plow, phigh], siglab="Hbb", sculp_label='QCD', fname='doubleb', wps=[0.01,0.03, 0.06, 0.1, 1.0],
tagger='double-B', savedir=savedir, onlyQCD=True, useTagger='fj_doubleb')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment