Last active
July 29, 2019 10:18
-
-
Save andrzejnovak/137f7dd490f2bf767dea475373200966 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def cut(tdf, ptlow=300, pthigh=2000, mlow = 20, mhigh = 220): | |
cdf = tdf[(tdf.fj_pt < pthigh) & (tdf.fj_pt>ptlow) &(tdf.fj_sdmass < mhigh) & (tdf.fj_sdmass>mlow)] | |
return cdf | |
def roc_input(frame, signal=["HCC"], include = ["HCC", "Light", "gBB", "gCC", "HBB"], norm=False, tagger='fj_doubleb'): | |
# Bkg def - filter unwanted | |
bkg = np.zeros(frame.shape[0]) | |
for label in include: | |
bkg = np.add(bkg, frame['truth'+label].values ) | |
bkg = [bool(x) for x in bkg] | |
tdf = frame[bkg] #tdf for temporary df | |
# Signal | |
truth = np.zeros(tdf.shape[0]) | |
predict = np.zeros(tdf.shape[0]) | |
prednorm = np.zeros(tdf.shape[0]) | |
for label in signal: | |
truth += tdf['truth'+label].values | |
predict += tdf['predict'+label].values | |
for label in include: | |
prednorm += tdf['predict'+label].values | |
tag_vals = tdf[tagger].values | |
if norm == False: | |
return truth, predict, tag_vals | |
else: | |
return truth, np.divide(predict, prednorm), tag_vals | |
def compare_rocs(dfs=[], names=[], sigs=[["Hcc"]], bkgs=[["Hbb"]], norm=False, pt=[300,2000], | |
tagger_names=None, | |
flip = None, | |
title=None, | |
ignore = None, | |
plotname="", colors=[0,1], styles=['-','-'], year='2016', use_tagger=[False], log=True): | |
c_list = ['darkorange', 'steelblue', 'firebrick', 'purple', 'orangered'] | |
f, ax = plt.subplots(figsize=(10, 10)) | |
if tagger_names == None: tagger_names = ['fj_doubleb']*len(dfs) | |
if ignore == None: ignore = [False]*len(dfs) | |
if flip == None: flip = [False]*len(dfs) | |
for frame, tagger, name, sig, bkg, col, sty, db_id, skip, flip_this in zip(dfs, tagger_names, names, sigs, bkgs, colors, styles, use_tagger*len(colors), ignore, flip): | |
if skip: continue | |
mlow = 40; mhigh = 200 | |
frame = cut(frame, ptlow=pt[0] , pthigh =pt[1], mlow = mlow, mhigh = mhigh) | |
truth, predict, db = roc_input(frame, signal=sig, include = sig+bkg, norm=norm, tagger=tagger) | |
if flip_this: predict = 1 - predict | |
if not db_id: | |
fpr, tpr, threshold = roc_curve(truth, predict) | |
else: | |
fpr, tpr, threshold = roc_curve(truth, db) | |
if type(col) == int: color_toplot = c_list[col] | |
else: color_toplot = col | |
if not name.startswith("raw:"): lab = "DeepDouble{}, AUC = {:.1f}\%".format(name, auc(fpr,tpr)*100) | |
else: lab = "{}, AUC = {:.1f}\%".format(name[len('raw:'):], auc(fpr,tpr)*100) | |
ax.plot(tpr, fpr, lw=4, label=lab, color=color_toplot, linestyle=sty) | |
ax.set_xlim(0,1) | |
ax.set_ylim(0.001,1) | |
sigs = sorted(list(set([item for sublist in sigs for item in sublist]))) | |
bkgs = sorted(list(set([item for sublist in bkgs for item in sublist]))) | |
print sigs | |
print bkgs | |
if len(sigs) == 1 and len(sigs[0]) == 3 and sigs[0][0] in ["H", "Z", "g"]: | |
xlab = '{} \\rightarrow {}'.format(sigs[0][0], sigs[0][-2]+'\\bar{'+sigs[0][-1]+'}') | |
ax.set_xlabel(r'Tagging efficiency ($\mathrm{}$)'.format('{'+xlab+'}'), ha='right', x=1.0) | |
else: | |
xlab = ['{} \\rightarrow {}'.format(l[0], l[-2]+'\\bar{'+l[-1]+'}') if l[0][0] in ["H", "Z", "g"] else l for l in sigs ] | |
ax.set_xlabel(r'Tagging efficiency ($\mathrm{}$)'.format("{"+", ".join(xlab)+"}"), ha='right', x=1.0) | |
if len(bkgs) == 1 and len(bkgs[0]) == 3 and bkgs[0][0] in ["H", "Z", "g"]: | |
ylab = '{} \\rightarrow {}'.format(bkgs[0][0], bkgs[0][-2]+'\\bar{'+bkgs[0][-1]+'}') | |
ax.set_ylabel(r'Mistagging rate ($\mathrm{}$)'.format('{'+ylab+'}'), ha='right', y=1.0) | |
else: | |
ylab = ['{} \\rightarrow {}'.format(l[0], l[-2]+'\\bar{'+l[-1]+'}') if l[0][0] in ["H", "Z", "g"] else l for l in bkgs ] | |
ax.set_ylabel(r'Mistagging rate ($\mathrm{}$)'.format("{"+", ".join(ylab)+"}"), ha='right', y=1.0) | |
import matplotlib.ticker as plticker | |
ax.xaxis.set_major_locator(plticker.MultipleLocator(base=0.1)) | |
ax.xaxis.set_minor_locator(plticker.MultipleLocator(base=0.02)) | |
#ax.yaxis.set_minor_locator(plticker.MultipleLocator(base=0.02)) | |
ax.tick_params(direction='in', axis='both', which='major', labelsize=18, length=12 ) | |
ax.tick_params(direction='in', axis='both', which='minor' , length=6) | |
ax.xaxis.set_ticks_position('both') | |
ax.yaxis.set_ticks_position('both') | |
if log: ax.semilogy() | |
ax.grid(which='minor', alpha=0.5, axis='y', linestyle='dotted') | |
ax.grid(which='major', alpha=0.9, linestyle='dotted') | |
leg = ax.legend(borderpad=1, frameon=False, loc=2, fontsize=18, | |
title = ""+str(int(round((pt[0]))))+" $\mathrm{<\ jet\ p_T\ <}$ "+str(int(round((pt[1]))))+" GeV" \ | |
+ "\n "+str(int(round(mlow)))+" $\mathrm{<\ jet\ m_{sd}\ <}$ "+str(int(round(mhigh)))+" GeV" | |
) | |
leg._legend_box.align = "left" | |
ax.annotate(r'{} (13 TeV)'.format(year), xy=(1, 1.015), xycoords='axes fraction', fontsize=22, fontname='Helvetica', | |
ha='right', annotation_clip=False) | |
ax.annotate('$\mathbf{CMS}$', xy=(0.001, 1.015), xycoords='axes fraction', fontname='Helvetica', fontsize=28, | |
ha='left', annotation_clip=False) | |
ax.annotate('$Simulation\ Preliminary$', xy=(0.12, 1.015), xycoords='axes fraction', fontsize=22, fontname='Helvetica', | |
fontstyle='italic', ha='left', annotation_clip=False) | |
if title != None: ax.set_title(title) | |
if log: ax.semilogy() | |
if len(plotname) > 1: | |
f.savefig(os.path.join(savedir, "ROCComparison_"+plotname+"pt{}-{}".format(pt[0], pt[1])+".pdf"), dpi=400, transparent=True) | |
f.savefig(os.path.join(savedir, "ROCComparison_"+plotname+"pt{}-{}".format(pt[0], pt[1])+".png"), dpi=400, transparent=True) | |
else: | |
if norm: f.savefig(os.path.join(savedir, "ROCNormComparison_"+"+".join(names)+"pt{}-{}".format(pt[0], pt[1])+".pdf"), dpi=400, transparent=True) | |
else: f.savefig(os.path.join(savedir, "ROCComparison_"+"+".join(names)+"pt{}-{}".format(pt[0], pt[1])+".pdf"), dpi=400, transparent=True) | |
if norm: f.savefig(os.path.join(savedir, "ROCNormComparison_"+"+".join(names)+"pt{}-{}".format(pt[0], pt[1])+".png"), dpi=400, transparent=True) | |
else: f.savefig(os.path.join(savedir, "ROCComparison_"+"+".join(names)+"pt{}-{}".format(pt[0], pt[1])+".png"), dpi=400 , transparent=True) | |
plt.show() | |
for plow, phigh in zip([300,1000],[2000,1400]): | |
compare_rocs(dfs=[df2, df2, df2], | |
names=["raw:double-b (2016)", "BvL", "raw:DeepAK8:ZHbb"], | |
tagger_names = ['fj_doubleb', 'fj_DDBvL_mi', 'fj_DeepAK8HbbvsQCD_mi'], | |
use_tagger = [True, True, True], | |
colors=[0,1,'black'], | |
pt = [plow, phigh], | |
styles=['--', '-', '-'], | |
sigs=[["Zbb"], ["Zbb"], ["Zbb"] ], | |
bkgs=[["QCD"], ["QCD"], ["QCD"]], plotname="Zbb", year='2017') | |
import seaborn as sns | |
import scipy | |
smap = [(0, ()), #solid', | |
#(0, (5, 10)), #loosely dashed', | |
(0, (5, 5)), #dashed', | |
#(0, (5, 1)), #densely dashed', | |
#(0, (3, 10, 1, 10)), #loosely dashdotted', | |
#(0, (3, 5, 1, 5)), #dashdotted', | |
(0, (3, 1, 1, 1)), #densely dashdotted', | |
#(0, (3, 10, 1, 10, 1, 10)), #loosely dashdotdotted', | |
#(0, (3, 5, 1, 5, 1, 5)), #dashdotdotted', | |
(0, (3, 1, 1, 1, 1, 1)), #densely dashdotdotted', | |
#(0, (1, 10)), #loosely dotted', | |
(0, (1, 1)), #densely dotted', | |
(0, (1, 5)) #dotted', | |
] | |
cmap=sns.color_palette(sns.cubehelix_palette(7, reverse=True)) | |
def sculpting3(frame, siglab="Hcc", sculp_label='Light', fname="", savedir="", pt=[300,2000], | |
onlyQCD=False, useTagger=None, tagger=" ", debug=False, year=2017, wps=None): | |
def cut(tdf, ptlow=800, pthigh=2000): | |
mlow, mhigh = 20, 220 | |
cdf = tdf[(tdf.fj_pt < pthigh) & (tdf.fj_pt>ptlow) &(tdf.fj_sdmass < mhigh) & (tdf.fj_sdmass>mlow)] | |
return cdf | |
frame = cut(frame, ptlow=pt[0], pthigh=pt[1]) | |
if siglab == sculp_label: return | |
def find_nearest(array,value): | |
idx = (np.abs(array-value)).argmin() | |
return idx, array[idx] | |
labels = frame.columns.values | |
labels = [label[len("truth"):] for label in labels if label.startswith("truth")] | |
if debug: | |
for label in labels:print(label, np.sum(frame['truth'+label])) | |
if useTagger != None : | |
truth, predict, use_tags = roc_input(frame, signal=[siglab], include = [siglab, sculp_label], tagger=useTagger) | |
predict = use_tags | |
else: | |
truth, predict, use_tags = roc_input(frame, signal=[siglab], include = [siglab, sculp_label]) | |
fpr, tpr, threshold = roc_curve(truth, predict) | |
cuts = {} | |
if wps == None: wps [0.003,0.005, 0.01, 0.02, 0.05, 1.0] | |
effs = [] | |
#for wp in [0.01, 0.05, 0.1, 0.25, 0.5, 1.0]: # % mistag rate | |
for wp in wps: # % mistag rate | |
idx, val = find_nearest(fpr, wp) | |
cuts[str(wp)] = threshold[idx] # threshold for deep double-b corresponding to ~1% mistag rate | |
effs.append(tpr[idx]) | |
if debug: print( "Cuts", cuts) | |
print( fname, pt) | |
print( "mts =", wps[:-1]) | |
print("efs =", np.round(effs[:-1],3)) | |
print("cts =", np.round([cuts[str(key)] for key in wps[:-1]],2)) | |
f, (ax1, ax2) = plt.subplots(2,1, figsize=(10,10), gridspec_kw = {'height_ratios':[3, 1]}, sharex=True) | |
f.subplots_adjust(hspace=0) | |
bins = np.linspace(20,220,11) | |
for i, (wp, cut) in enumerate(reversed(sorted(cuts.items()))): | |
#print wp | |
if debug: print( wp, cut) | |
if useTagger!= None: ctdf = frame[frame[useTagger].values > cut] | |
else: ctdf = frame[frame['predict'+siglab].values > cut] | |
if onlyQCD: weight = ctdf['truthQCD'].values | |
else: weight = ctdf['truth'+sculp_label].values | |
if float(wp) == 1.0: label = 'No tagging applied' | |
else: label = 'at {:3.1f}\% mistag rate'.format(float(wp)*100.) | |
if debug: print( 'Weights', np.sum(weight), len(weight), weight) | |
if debug: print( "Filling {} events".format(len(ctdf['fj_sdmass'].values[weight > 0.5]))) | |
#n, bins = np.histogram(ctdf['fj_sdmass'].values, bins=bins, weights = weight, density=True) | |
evts = ctdf['fj_sdmass'].values[weight > 0.5] | |
grid = np.linspace(40,200, 501) | |
sns.kdeplot(evts, clip=(40,200), ax=ax1, label=label, color=cmap[i], lw=2.5, linestyle=smap[i]) | |
if float(wp) == 1.: | |
kde0 = scipy.stats.gaussian_kde(evts) | |
ax2.plot(grid, np.zeros(len(grid)), color=cmap[i], lw=2.5, linestyle=smap[i]) | |
else: | |
kde = scipy.stats.gaussian_kde(evts) | |
ax2.plot(grid, kde(grid)-kde0(grid), color=cmap[i], lw=2.5, linestyle=smap[i]) | |
ax2.set_xlabel(r'$\mathrm{m_{SD}\ [GeV]}$', ha='right', x=1.0) | |
if onlyQCD: | |
ax1.set_ylabel(r'Normalized scale ({})'.format('QCD'), ha='right', y=1.0) | |
ax2.set_ylabel(r'Tagged - Nominal', ha='center') | |
else: | |
ax1.set_ylabel(r'Normalized scale ({})'.format(sculp_label.replace("Hcc", r"$\mathrm{H \rightarrow c\bar{c}}$").replace("Hbb", r"$\mathrm{H \rightarrow b\bar{b}}$")), ha='right', y=1.0) | |
import matplotlib.ticker as plticker | |
for ax in [ax1,ax2]: | |
ax.xaxis.set_major_locator(plticker.MultipleLocator(base=20)) | |
ax.xaxis.set_minor_locator(plticker.MultipleLocator(base=10)) | |
ax.tick_params(direction='in', axis='both', which='major', labelsize=15, length=12, labelleft=False ) | |
ax.tick_params(direction='in', axis='both', which='minor' , length=6) | |
ax.xaxis.set_ticks_position('both') | |
ax.yaxis.set_ticks_position('both') | |
ax.yaxis.set_major_locator(plticker.MultipleLocator(base=0.002)) | |
ax.yaxis.set_minor_locator(plticker.MultipleLocator(base=0.0004)) | |
ax.grid(which='major', alpha=0.9, linestyle='dotted') | |
ax2.grid(which='minor', alpha=0.6, axis='y', linestyle='dotted') | |
# ax1.yaxis.set_major_locator(plticker.MultipleLocator(base=0.002)) | |
# ax1.yaxis.set_minor_locator(plticker.MultipleLocator(base=0.0004)) | |
# ax2.yaxis.set_major_locator(plticker.MultipleLocator(base=0.0008)) | |
# ax2.yaxis.set_minor_locator(plticker.MultipleLocator(base=0.0004)) | |
ax1.set_xlim(40, 200) | |
ymax = max(kde0(grid)) | |
ax1.set_ylim(0,ymax*1.25) | |
subylim = max(max(abs(min((kde(grid[100:400])-kde0(grid[100:400])))), abs(max((kde(grid[100:400])-kde0(grid[100:400])))))*1.05, 0.004) | |
ax2.set_ylim(-subylim, subylim) | |
leg = ax1.legend(borderpad=1, frameon=False, loc='best', fontsize='x-small', \ | |
#title = ""+str(int(round((min(frame.fj_pt)))))+" $\mathrm{<\ jet\ p_T\ <}$ "+str(int(round((max(frame.fj_pt)))))+" GeV" \ | |
#+ "\n "+str(int(round((min(frame.fj_sdmass)))))+" $\mathrm{<\ jet\ m_{SD}\ <}$ "+str(int(round((max(frame.fj_sdmass)))))+" GeV" | |
title= tagger\ | |
+ "\n" + r"Tagging efficiency (H $\rightarrow$ {})".format(siglab[-2:]) ) | |
plt.setp(leg.get_title(), multialignment='center', fontsize='x-small') | |
leg._legend_box.align = "center" | |
mpt = ""+str(int(round((min(frame.fj_pt)))))+" $\mathrm{<\ jet\ p_T\ <}$ "+str(int(round((max(frame.fj_pt)))))+" GeV" \ | |
+ "\n "+str(int(round((40))))+" $\mathrm{<\ jet\ m_{SD}\ <}$ "+str(int(round((200))))+" GeV" \ | |
#+ "\n "+ tagger | |
ax1.annotate( mpt, xy=(0.05, 0.92), xycoords='axes fraction', ha='left', va='top', | |
annotation_clip=False, multialignment='center', fontsize='x-small') | |
# ax1.annotate(r'{} (13 TeV)'.format(year), xy=(1, 1.018), xycoords='axes fraction', fontsize=22, fontname='Helvetica', | |
# ha='right', annotation_clip=False) | |
# ax1.annotate(r'$\mathbf{CMS}$', xy=(0.001, 1.018), xycoords='axes fraction', fontname='Helvetica', fontsize=28, | |
# ha='left', annotation_clip=False) | |
# ax1.annotate(r'$Simulation\ Preliminary$', xy=(0.12, 1.018), xycoords='axes fraction', fontsize=22, fontname='Helvetica', | |
# fontstyle='italic', ha='left', annotation_clip=False) | |
ax1.annotate(r'Gaussian kde smoothing', xy=(0.03, 0.05), xycoords='axes fraction', fontsize=16, fontname='Helvetica', | |
fontstyle='italic', ha='left', annotation_clip=False) | |
import cmsstyle as cms | |
ax1 = cms.cms_annot(ax=ax1) | |
f.savefig(os.path.join(savedir,'kdeM_sculpting_'+fname+"_tag"+siglab+"_"+sculp_label+"_pt{}-{}".format(pt[0], pt[1])+'.png'), dpi=400, transparent=True) | |
f.savefig(os.path.join(savedir,'kdeM_sculpting_'+fname+"_tag"+siglab+"_"+sculp_label+"_pt{}-{}".format(pt[0], pt[1])+'.pdf'), dpi=400, transparent=True) | |
for plow, phigh in zip([350],[1200]): | |
sculpting3(df2, pt=[plow, phigh], siglab="Hbb", sculp_label='QCD', fname='doubleb', wps=[0.01,0.03, 0.06, 0.1, 1.0], | |
tagger='double-B', savedir=savedir, onlyQCD=True, useTagger='fj_doubleb') | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment