andrzejnovak · July 29, 2019 10:18
diff --git a/ROC comparison from DF b/ROC comparison from DF
 def cut(tdf, ptlow=300, pthigh=2000, mlow = 20, mhigh = 220):
    cdf = tdf[(tdf.fj_pt < pthigh) & (tdf.fj_pt>ptlow) &(tdf.fj_sdmass < mhigh) & (tdf.fj_sdmass>mlow)]
    return cdf
    
 def roc_input(frame, signal=["HCC"], include = ["HCC", "Light", "gBB", "gCC", "HBB"], norm=False, tagger='fj_doubleb'):       
    # Bkg def - filter unwanted
    bkg = np.zeros(frame.shape[0])    
    for label in include:
        bkg = np.add(bkg,  frame['truth'+label].values )
    bkg = [bool(x) for x in bkg]
    tdf = frame[bkg] #tdf for temporary df   
    
    # Signal
    truth = np.zeros(tdf.shape[0])
    predict = np.zeros(tdf.shape[0])
    prednorm = np.zeros(tdf.shape[0])
    for label in signal:
        truth   += tdf['truth'+label].values
        predict += tdf['predict'+label].values
    for label in include:
        prednorm += tdf['predict'+label].values
    tag_vals = tdf[tagger].values
    if norm == False:
        return truth, predict, tag_vals
    else:
        return truth, np.divide(predict, prednorm), tag_vals
        
 def compare_rocs(dfs=[], names=[], sigs=[["Hcc"]], bkgs=[["Hbb"]], norm=False, pt=[300,2000],
                 tagger_names=None,
                 flip = None,
                 title=None, 
                 ignore = None, 
                 plotname="", colors=[0,1], styles=['-','-'], year='2016', use_tagger=[False], log=True):
    c_list = ['darkorange', 'steelblue', 'firebrick', 'purple', 'orangered']
    f, ax = plt.subplots(figsize=(10, 10))
    if tagger_names == None: tagger_names = ['fj_doubleb']*len(dfs)
    if ignore == None: ignore = [False]*len(dfs)
    if flip == None: flip = [False]*len(dfs)
    for frame, tagger, name, sig, bkg, col, sty, db_id, skip, flip_this in zip(dfs, tagger_names, names, sigs, bkgs, colors, styles, use_tagger*len(colors), ignore, flip):
        if skip: continue
        mlow = 40; mhigh = 200
        frame = cut(frame,  ptlow=pt[0] , pthigh =pt[1], mlow = mlow, mhigh = mhigh)       
        truth, predict, db =  roc_input(frame, signal=sig, include = sig+bkg, norm=norm, tagger=tagger)
        if flip_this: predict = 1 - predict
        if not db_id:
            fpr, tpr, threshold = roc_curve(truth, predict)
        else:
            fpr, tpr, threshold = roc_curve(truth, db)
        if type(col) == int: color_toplot = c_list[col]
        else: color_toplot = col
        if not name.startswith("raw:"): lab = "DeepDouble{}, AUC = {:.1f}\%".format(name, auc(fpr,tpr)*100)
        else: lab = "{}, AUC = {:.1f}\%".format(name[len('raw:'):], auc(fpr,tpr)*100)
        ax.plot(tpr, fpr, lw=4, label=lab, color=color_toplot, linestyle=sty)
        
    ax.set_xlim(0,1)
    ax.set_ylim(0.001,1)
    sigs = sorted(list(set([item for sublist in sigs for item in sublist])))
    bkgs = sorted(list(set([item for sublist in bkgs for item in sublist])))
    print sigs
    print bkgs
    if len(sigs) == 1 and len(sigs[0]) == 3 and sigs[0][0] in ["H", "Z", "g"]:
            xlab = '{} \\rightarrow {}'.format(sigs[0][0], sigs[0][-2]+'\\bar{'+sigs[0][-1]+'}') 
            ax.set_xlabel(r'Tagging efficiency ($\mathrm{}$)'.format('{'+xlab+'}'), ha='right', x=1.0)
    else: 
        xlab = ['{} \\rightarrow {}'.format(l[0], l[-2]+'\\bar{'+l[-1]+'}') if l[0][0] in ["H", "Z", "g"] else l for l in sigs ]
        ax.set_xlabel(r'Tagging efficiency ($\mathrm{}$)'.format("{"+", ".join(xlab)+"}"), ha='right', x=1.0)
    if len(bkgs) == 1 and len(bkgs[0]) == 3 and bkgs[0][0] in ["H", "Z", "g"]:
            ylab = '{} \\rightarrow {}'.format(bkgs[0][0], bkgs[0][-2]+'\\bar{'+bkgs[0][-1]+'}') 
            ax.set_ylabel(r'Mistagging rate ($\mathrm{}$)'.format('{'+ylab+'}'), ha='right', y=1.0)
    else:
        ylab = ['{} \\rightarrow {}'.format(l[0], l[-2]+'\\bar{'+l[-1]+'}') if l[0][0] in ["H", "Z", "g"] else l for l in bkgs ]
        ax.set_ylabel(r'Mistagging rate ($\mathrm{}$)'.format("{"+", ".join(ylab)+"}"), ha='right', y=1.0)
    import matplotlib.ticker as plticker
    ax.xaxis.set_major_locator(plticker.MultipleLocator(base=0.1))
    ax.xaxis.set_minor_locator(plticker.MultipleLocator(base=0.02))
    #ax.yaxis.set_minor_locator(plticker.MultipleLocator(base=0.02))
    ax.tick_params(direction='in', axis='both', which='major', labelsize=18, length=12 )
    ax.tick_params(direction='in', axis='both', which='minor' , length=6)
    ax.xaxis.set_ticks_position('both')
    ax.yaxis.set_ticks_position('both')    
    if log: ax.semilogy()
    ax.grid(which='minor', alpha=0.5, axis='y', linestyle='dotted')
    ax.grid(which='major', alpha=0.9, linestyle='dotted')
    leg = ax.legend(borderpad=1, frameon=False, loc=2, fontsize=18,
        title = ""+str(int(round((pt[0]))))+" $\mathrm{<\ jet\ p_T\ <}$ "+str(int(round((pt[1]))))+" GeV" \
      + "\n "+str(int(round(mlow)))+" $\mathrm{<\ jet\ m_{sd}\ <}$ "+str(int(round(mhigh)))+" GeV"
                   )
    leg._legend_box.align = "left"
    ax.annotate(r'{} (13 TeV)'.format(year), xy=(1, 1.015), xycoords='axes fraction', fontsize=22,  fontname='Helvetica', 
                ha='right', annotation_clip=False)
    ax.annotate('$\mathbf{CMS}$', xy=(0.001, 1.015), xycoords='axes fraction', fontname='Helvetica', fontsize=28,
                ha='left', annotation_clip=False)
    ax.annotate('$Simulation\ Preliminary$', xy=(0.12, 1.015), xycoords='axes fraction', fontsize=22, fontname='Helvetica',
                fontstyle='italic', ha='left', annotation_clip=False)
    
    if title != None: ax.set_title(title)
    
    if log: ax.semilogy()
    if len(plotname) > 1:
        f.savefig(os.path.join(savedir, "ROCComparison_"+plotname+"pt{}-{}".format(pt[0], pt[1])+".pdf"), dpi=400, transparent=True)
        f.savefig(os.path.join(savedir, "ROCComparison_"+plotname+"pt{}-{}".format(pt[0], pt[1])+".png"), dpi=400, transparent=True)
    else:
        if norm: f.savefig(os.path.join(savedir, "ROCNormComparison_"+"+".join(names)+"pt{}-{}".format(pt[0], pt[1])+".pdf"), dpi=400, transparent=True)
        else: f.savefig(os.path.join(savedir, "ROCComparison_"+"+".join(names)+"pt{}-{}".format(pt[0], pt[1])+".pdf"), dpi=400, transparent=True)
        if norm: f.savefig(os.path.join(savedir, "ROCNormComparison_"+"+".join(names)+"pt{}-{}".format(pt[0], pt[1])+".png"), dpi=400, transparent=True)
        else: f.savefig(os.path.join(savedir, "ROCComparison_"+"+".join(names)+"pt{}-{}".format(pt[0], pt[1])+".png"), dpi=400 , transparent=True)
    plt.show()

 for plow, phigh in zip([300,1000],[2000,1400]):
    compare_rocs(dfs=[df2, df2, df2], 
             names=["raw:double-b (2016)", "BvL", "raw:DeepAK8:ZHbb"],
             tagger_names = ['fj_doubleb', 'fj_DDBvL_mi', 'fj_DeepAK8HbbvsQCD_mi'],
             use_tagger = [True, True, True],
             colors=[0,1,'black'],
             pt = [plow, phigh],
             styles=['--', '-', '-'], 
             sigs=[["Zbb"], ["Zbb"], ["Zbb"] ], 
             bkgs=[["QCD"], ["QCD"], ["QCD"]],  plotname="Zbb", year='2017')


 import seaborn as sns
 import scipy

 smap = [(0, ()), #solid',               
     #(0, (5, 10)), #loosely dashed',      
     (0, (5, 5)), #dashed',              
     #(0, (5, 1)), #densely dashed',      
        
     #(0, (3, 10, 1, 10)), #loosely dashdotted',  
     #(0, (3, 5, 1, 5)), #dashdotted',          
     (0, (3, 1, 1, 1)), #densely dashdotted',  
    
     #(0, (3, 10, 1, 10, 1, 10)), #loosely dashdotdotted', 
     #(0, (3, 5, 1, 5, 1, 5)), #dashdotdotted',         
     (0, (3, 1, 1, 1, 1, 1)), #densely dashdotdotted', 

     #(0, (1, 10)), #loosely dotted',      
     (0, (1, 1)), #densely dotted',      
     (0, (1, 5))   #dotted',              
       ]
 cmap=sns.color_palette(sns.cubehelix_palette(7, reverse=True))

 def sculpting3(frame, siglab="Hcc", sculp_label='Light', fname="", savedir="", pt=[300,2000],
              onlyQCD=False, useTagger=None, tagger=" ", debug=False, year=2017, wps=None):
    def cut(tdf, ptlow=800, pthigh=2000):
        mlow, mhigh = 20, 220
        cdf = tdf[(tdf.fj_pt < pthigh) & (tdf.fj_pt>ptlow) &(tdf.fj_sdmass < mhigh) & (tdf.fj_sdmass>mlow)]
        return cdf
    frame = cut(frame, ptlow=pt[0], pthigh=pt[1])
    if siglab == sculp_label: return 
    def find_nearest(array,value):
        idx = (np.abs(array-value)).argmin()
        return idx, array[idx]
    
    labels = frame.columns.values
    labels = [label[len("truth"):] for label in labels if label.startswith("truth")]
    if debug: 
        for label in labels:print(label, np.sum(frame['truth'+label]))
    
    
    if useTagger != None : 
        truth, predict, use_tags = roc_input(frame, signal=[siglab], include = [siglab, sculp_label], tagger=useTagger)
        predict = use_tags
    else:
        truth, predict, use_tags = roc_input(frame, signal=[siglab], include = [siglab, sculp_label])
    fpr, tpr, threshold = roc_curve(truth, predict)

    cuts = {}
    if wps == None: wps [0.003,0.005, 0.01, 0.02, 0.05, 1.0]
    effs = []
    #for wp in [0.01, 0.05, 0.1, 0.25, 0.5, 1.0]: # % mistag rate
    for wp in wps: # % mistag rate
        idx, val = find_nearest(fpr, wp)
        cuts[str(wp)] = threshold[idx] # threshold for deep double-b corresponding to ~1% mistag rate
        effs.append(tpr[idx])
    if debug: print( "Cuts", cuts)
    print( fname, pt)
    print( "mts =", wps[:-1])
    print("efs =", np.round(effs[:-1],3))
    print("cts =", np.round([cuts[str(key)] for key in wps[:-1]],2))
        
    f, (ax1, ax2) = plt.subplots(2,1, figsize=(10,10), gridspec_kw = {'height_ratios':[3, 1]}, sharex=True)
    f.subplots_adjust(hspace=0)
    bins = np.linspace(20,220,11)
    for i, (wp, cut) in enumerate(reversed(sorted(cuts.items()))):
        #print wp
        if debug: print( wp, cut)
        if useTagger!= None: ctdf = frame[frame[useTagger].values > cut]
        else: ctdf = frame[frame['predict'+siglab].values > cut]
        if onlyQCD: weight = ctdf['truthQCD'].values
        else: weight = ctdf['truth'+sculp_label].values
        if float(wp) ==  1.0: label = 'No tagging applied'
        else: label = 'at {:3.1f}\% mistag rate'.format(float(wp)*100.)
        if debug: print( 'Weights', np.sum(weight), len(weight), weight)
        if debug: print( "Filling {} events".format(len(ctdf['fj_sdmass'].values[weight > 0.5])))
        #n, bins = np.histogram(ctdf['fj_sdmass'].values, bins=bins, weights = weight, density=True)
        evts = ctdf['fj_sdmass'].values[weight > 0.5]
        grid = np.linspace(40,200, 501)
        sns.kdeplot(evts, clip=(40,200), ax=ax1, label=label, color=cmap[i], lw=2.5, linestyle=smap[i])
        if float(wp) == 1.: 
            kde0 = scipy.stats.gaussian_kde(evts)
            ax2.plot(grid, np.zeros(len(grid)), color=cmap[i], lw=2.5, linestyle=smap[i])
        else:
            kde = scipy.stats.gaussian_kde(evts)
            ax2.plot(grid, kde(grid)-kde0(grid),  color=cmap[i], lw=2.5, linestyle=smap[i])
        
        ax2.set_xlabel(r'$\mathrm{m_{SD}\ [GeV]}$', ha='right', x=1.0)
    if onlyQCD: 
        ax1.set_ylabel(r'Normalized scale ({})'.format('QCD'), ha='right', y=1.0)
        ax2.set_ylabel(r'Tagged - Nominal', ha='center')
    else:
        ax1.set_ylabel(r'Normalized scale ({})'.format(sculp_label.replace("Hcc", r"$\mathrm{H \rightarrow c\bar{c}}$").replace("Hbb", r"$\mathrm{H \rightarrow b\bar{b}}$")), ha='right', y=1.0)
    import matplotlib.ticker as plticker
    for ax in [ax1,ax2]:
        ax.xaxis.set_major_locator(plticker.MultipleLocator(base=20))
        ax.xaxis.set_minor_locator(plticker.MultipleLocator(base=10))
        ax.tick_params(direction='in', axis='both', which='major', labelsize=15, length=12, labelleft=False )
        ax.tick_params(direction='in', axis='both', which='minor' , length=6)
        ax.xaxis.set_ticks_position('both')
        ax.yaxis.set_ticks_position('both')    
        ax.yaxis.set_major_locator(plticker.MultipleLocator(base=0.002))
        ax.yaxis.set_minor_locator(plticker.MultipleLocator(base=0.0004))
        ax.grid(which='major', alpha=0.9, linestyle='dotted')
    ax2.grid(which='minor', alpha=0.6, axis='y', linestyle='dotted')
    
 #     ax1.yaxis.set_major_locator(plticker.MultipleLocator(base=0.002))
 #     ax1.yaxis.set_minor_locator(plticker.MultipleLocator(base=0.0004))
 #     ax2.yaxis.set_major_locator(plticker.MultipleLocator(base=0.0008))
 #     ax2.yaxis.set_minor_locator(plticker.MultipleLocator(base=0.0004))

    ax1.set_xlim(40, 200)
    ymax = max(kde0(grid))
    ax1.set_ylim(0,ymax*1.25)
    subylim = max(max(abs(min((kde(grid[100:400])-kde0(grid[100:400])))), abs(max((kde(grid[100:400])-kde0(grid[100:400])))))*1.05, 0.004)
    ax2.set_ylim(-subylim, subylim)
   
    leg = ax1.legend(borderpad=1, frameon=False, loc='best', fontsize='x-small', \
        #title = ""+str(int(round((min(frame.fj_pt)))))+" $\mathrm{<\ jet\ p_T\ <}$ "+str(int(round((max(frame.fj_pt)))))+" GeV" \
        #+ "\n "+str(int(round((min(frame.fj_sdmass)))))+" $\mathrm{<\ jet\ m_{SD}\ <}$ "+str(int(round((max(frame.fj_sdmass)))))+" GeV"
        title= tagger\
        + "\n" + r"Tagging efficiency (H $\rightarrow$ {})".format(siglab[-2:]) )
    plt.setp(leg.get_title(), multialignment='center', fontsize='x-small')
    leg._legend_box.align = "center"
    
    mpt = ""+str(int(round((min(frame.fj_pt)))))+" $\mathrm{<\ jet\ p_T\ <}$ "+str(int(round((max(frame.fj_pt)))))+" GeV" \
      + "\n "+str(int(round((40))))+" $\mathrm{<\ jet\ m_{SD}\ <}$ "+str(int(round((200))))+" GeV" \
      #+ "\n "+ tagger 
    ax1.annotate( mpt, xy=(0.05, 0.92), xycoords='axes fraction', ha='left', va='top',
                annotation_clip=False, multialignment='center', fontsize='x-small')
 #     ax1.annotate(r'{} (13 TeV)'.format(year), xy=(1, 1.018), xycoords='axes fraction', fontsize=22,  fontname='Helvetica', 
 #                 ha='right', annotation_clip=False)
 #     ax1.annotate(r'$\mathbf{CMS}$', xy=(0.001, 1.018), xycoords='axes fraction', fontname='Helvetica', fontsize=28,
 #                 ha='left', annotation_clip=False)
 #     ax1.annotate(r'$Simulation\ Preliminary$', xy=(0.12, 1.018), xycoords='axes fraction', fontsize=22, fontname='Helvetica',
 #                 fontstyle='italic', ha='left', annotation_clip=False)
    ax1.annotate(r'Gaussian kde smoothing', xy=(0.03, 0.05), xycoords='axes fraction', fontsize=16, fontname='Helvetica',
                fontstyle='italic', ha='left', annotation_clip=False)
    import cmsstyle as cms
    ax1 = cms.cms_annot(ax=ax1)
    
    f.savefig(os.path.join(savedir,'kdeM_sculpting_'+fname+"_tag"+siglab+"_"+sculp_label+"_pt{}-{}".format(pt[0], pt[1])+'.png'), dpi=400, transparent=True)
    f.savefig(os.path.join(savedir,'kdeM_sculpting_'+fname+"_tag"+siglab+"_"+sculp_label+"_pt{}-{}".format(pt[0], pt[1])+'.pdf'), dpi=400, transparent=True)
    

 for plow, phigh in zip([350],[1200]):
     sculpting3(df2, pt=[plow, phigh], siglab="Hbb", sculp_label='QCD', fname='doubleb', wps=[0.01,0.03, 0.06, 0.1, 1.0],
                tagger='double-B', savedir=savedir, onlyQCD=True, useTagger='fj_doubleb')
	def cut(tdf, ptlow=300, pthigh=2000, mlow = 20, mhigh = 220):
	cdf = tdf[(tdf.fj_pt < pthigh) & (tdf.fj_pt>ptlow) &(tdf.fj_sdmass < mhigh) & (tdf.fj_sdmass>mlow)]
	return cdf

	def roc_input(frame, signal=["HCC"], include = ["HCC", "Light", "gBB", "gCC", "HBB"], norm=False, tagger='fj_doubleb'):
	# Bkg def - filter unwanted
	bkg = np.zeros(frame.shape[0])
	for label in include:
	bkg = np.add(bkg, frame['truth'+label].values )
	bkg = [bool(x) for x in bkg]
	tdf = frame[bkg] #tdf for temporary df

	# Signal
	truth = np.zeros(tdf.shape[0])
	predict = np.zeros(tdf.shape[0])
	prednorm = np.zeros(tdf.shape[0])
	for label in signal:
	truth += tdf['truth'+label].values
	predict += tdf['predict'+label].values
	for label in include:
	prednorm += tdf['predict'+label].values
	tag_vals = tdf[tagger].values
	if norm == False:
	return truth, predict, tag_vals
	else:
	return truth, np.divide(predict, prednorm), tag_vals

	def compare_rocs(dfs=[], names=[], sigs=[["Hcc"]], bkgs=[["Hbb"]], norm=False, pt=[300,2000],
	tagger_names=None,
	flip = None,
	title=None,
	ignore = None,
	plotname="", colors=[0,1], styles=['-','-'], year='2016', use_tagger=[False], log=True):
	c_list = ['darkorange', 'steelblue', 'firebrick', 'purple', 'orangered']
	f, ax = plt.subplots(figsize=(10, 10))
	if tagger_names == None: tagger_names = ['fj_doubleb']*len(dfs)
	if ignore == None: ignore = [False]*len(dfs)
	if flip == None: flip = [False]*len(dfs)
	for frame, tagger, name, sig, bkg, col, sty, db_id, skip, flip_this in zip(dfs, tagger_names, names, sigs, bkgs, colors, styles, use_tagger*len(colors), ignore, flip):
	if skip: continue
	mlow = 40; mhigh = 200
	frame = cut(frame, ptlow=pt[0] , pthigh =pt[1], mlow = mlow, mhigh = mhigh)
	truth, predict, db = roc_input(frame, signal=sig, include = sig+bkg, norm=norm, tagger=tagger)
	if flip_this: predict = 1 - predict
	if not db_id:
	fpr, tpr, threshold = roc_curve(truth, predict)
	else:
	fpr, tpr, threshold = roc_curve(truth, db)
	if type(col) == int: color_toplot = c_list[col]
	else: color_toplot = col
	if not name.startswith("raw:"): lab = "DeepDouble{}, AUC = {:.1f}\%".format(name, auc(fpr,tpr)*100)
	else: lab = "{}, AUC = {:.1f}\%".format(name[len('raw:'):], auc(fpr,tpr)*100)
	ax.plot(tpr, fpr, lw=4, label=lab, color=color_toplot, linestyle=sty)

	ax.set_xlim(0,1)
	ax.set_ylim(0.001,1)
	sigs = sorted(list(set([item for sublist in sigs for item in sublist])))
	bkgs = sorted(list(set([item for sublist in bkgs for item in sublist])))
	print sigs
	print bkgs
	if len(sigs) == 1 and len(sigs[0]) == 3 and sigs[0][0] in ["H", "Z", "g"]:
	xlab = '{} \\rightarrow {}'.format(sigs[0][0], sigs[0][-2]+'\\bar{'+sigs[0][-1]+'}')
	ax.set_xlabel(r'Tagging efficiency ($\mathrm{}$)'.format('{'+xlab+'}'), ha='right', x=1.0)
	else:
	xlab = ['{} \\rightarrow {}'.format(l[0], l[-2]+'\\bar{'+l[-1]+'}') if l[0][0] in ["H", "Z", "g"] else l for l in sigs ]
	ax.set_xlabel(r'Tagging efficiency ($\mathrm{}$)'.format("{"+", ".join(xlab)+"}"), ha='right', x=1.0)
	if len(bkgs) == 1 and len(bkgs[0]) == 3 and bkgs[0][0] in ["H", "Z", "g"]:
	ylab = '{} \\rightarrow {}'.format(bkgs[0][0], bkgs[0][-2]+'\\bar{'+bkgs[0][-1]+'}')
	ax.set_ylabel(r'Mistagging rate ($\mathrm{}$)'.format('{'+ylab+'}'), ha='right', y=1.0)
	else:
	ylab = ['{} \\rightarrow {}'.format(l[0], l[-2]+'\\bar{'+l[-1]+'}') if l[0][0] in ["H", "Z", "g"] else l for l in bkgs ]
	ax.set_ylabel(r'Mistagging rate ($\mathrm{}$)'.format("{"+", ".join(ylab)+"}"), ha='right', y=1.0)
	import matplotlib.ticker as plticker
	ax.xaxis.set_major_locator(plticker.MultipleLocator(base=0.1))
	ax.xaxis.set_minor_locator(plticker.MultipleLocator(base=0.02))
	#ax.yaxis.set_minor_locator(plticker.MultipleLocator(base=0.02))
	ax.tick_params(direction='in', axis='both', which='major', labelsize=18, length=12 )
	ax.tick_params(direction='in', axis='both', which='minor' , length=6)
	ax.xaxis.set_ticks_position('both')
	ax.yaxis.set_ticks_position('both')
	if log: ax.semilogy()
	ax.grid(which='minor', alpha=0.5, axis='y', linestyle='dotted')
	ax.grid(which='major', alpha=0.9, linestyle='dotted')
	leg = ax.legend(borderpad=1, frameon=False, loc=2, fontsize=18,
	title = ""+str(int(round((pt[0]))))+" $\mathrm{<\ jet\ p_T\ <}$ "+str(int(round((pt[1]))))+" GeV" \
	+ "\n "+str(int(round(mlow)))+" $\mathrm{<\ jet\ m_{sd}\ <}$ "+str(int(round(mhigh)))+" GeV"
	)
	leg._legend_box.align = "left"
	ax.annotate(r'{} (13 TeV)'.format(year), xy=(1, 1.015), xycoords='axes fraction', fontsize=22, fontname='Helvetica',
	ha='right', annotation_clip=False)
	ax.annotate('$\mathbf{CMS}$', xy=(0.001, 1.015), xycoords='axes fraction', fontname='Helvetica', fontsize=28,
	ha='left', annotation_clip=False)
	ax.annotate('$Simulation\ Preliminary$', xy=(0.12, 1.015), xycoords='axes fraction', fontsize=22, fontname='Helvetica',
	fontstyle='italic', ha='left', annotation_clip=False)

	if title != None: ax.set_title(title)

	if log: ax.semilogy()
	if len(plotname) > 1:
	f.savefig(os.path.join(savedir, "ROCComparison_"+plotname+"pt{}-{}".format(pt[0], pt[1])+".pdf"), dpi=400, transparent=True)
	f.savefig(os.path.join(savedir, "ROCComparison_"+plotname+"pt{}-{}".format(pt[0], pt[1])+".png"), dpi=400, transparent=True)
	else:
	if norm: f.savefig(os.path.join(savedir, "ROCNormComparison_"+"+".join(names)+"pt{}-{}".format(pt[0], pt[1])+".pdf"), dpi=400, transparent=True)
	else: f.savefig(os.path.join(savedir, "ROCComparison_"+"+".join(names)+"pt{}-{}".format(pt[0], pt[1])+".pdf"), dpi=400, transparent=True)
	if norm: f.savefig(os.path.join(savedir, "ROCNormComparison_"+"+".join(names)+"pt{}-{}".format(pt[0], pt[1])+".png"), dpi=400, transparent=True)
	else: f.savefig(os.path.join(savedir, "ROCComparison_"+"+".join(names)+"pt{}-{}".format(pt[0], pt[1])+".png"), dpi=400 , transparent=True)
	plt.show()

	for plow, phigh in zip([300,1000],[2000,1400]):
	compare_rocs(dfs=[df2, df2, df2],
	names=["raw:double-b (2016)", "BvL", "raw:DeepAK8:ZHbb"],
	tagger_names = ['fj_doubleb', 'fj_DDBvL_mi', 'fj_DeepAK8HbbvsQCD_mi'],
	use_tagger = [True, True, True],
	colors=[0,1,'black'],
	pt = [plow, phigh],
	styles=['--', '-', '-'],
	sigs=[["Zbb"], ["Zbb"], ["Zbb"] ],
	bkgs=[["QCD"], ["QCD"], ["QCD"]], plotname="Zbb", year='2017')


	import seaborn as sns
	import scipy

	smap = [(0, ()), #solid',
	#(0, (5, 10)), #loosely dashed',
	(0, (5, 5)), #dashed',
	#(0, (5, 1)), #densely dashed',

	#(0, (3, 10, 1, 10)), #loosely dashdotted',
	#(0, (3, 5, 1, 5)), #dashdotted',
	(0, (3, 1, 1, 1)), #densely dashdotted',

	#(0, (3, 10, 1, 10, 1, 10)), #loosely dashdotdotted',
	#(0, (3, 5, 1, 5, 1, 5)), #dashdotdotted',
	(0, (3, 1, 1, 1, 1, 1)), #densely dashdotdotted',

	#(0, (1, 10)), #loosely dotted',
	(0, (1, 1)), #densely dotted',
	(0, (1, 5)) #dotted',
	]
	cmap=sns.color_palette(sns.cubehelix_palette(7, reverse=True))

	def sculpting3(frame, siglab="Hcc", sculp_label='Light', fname="", savedir="", pt=[300,2000],
	onlyQCD=False, useTagger=None, tagger=" ", debug=False, year=2017, wps=None):
	def cut(tdf, ptlow=800, pthigh=2000):
	mlow, mhigh = 20, 220
	cdf = tdf[(tdf.fj_pt < pthigh) & (tdf.fj_pt>ptlow) &(tdf.fj_sdmass < mhigh) & (tdf.fj_sdmass>mlow)]
	return cdf
	frame = cut(frame, ptlow=pt[0], pthigh=pt[1])
	if siglab == sculp_label: return
	def find_nearest(array,value):
	idx = (np.abs(array-value)).argmin()
	return idx, array[idx]

	labels = frame.columns.values
	labels = [label[len("truth"):] for label in labels if label.startswith("truth")]
	if debug:
	for label in labels:print(label, np.sum(frame['truth'+label]))


	if useTagger != None :
	truth, predict, use_tags = roc_input(frame, signal=[siglab], include = [siglab, sculp_label], tagger=useTagger)
	predict = use_tags
	else:
	truth, predict, use_tags = roc_input(frame, signal=[siglab], include = [siglab, sculp_label])
	fpr, tpr, threshold = roc_curve(truth, predict)

	cuts = {}
	if wps == None: wps [0.003,0.005, 0.01, 0.02, 0.05, 1.0]
	effs = []
	#for wp in [0.01, 0.05, 0.1, 0.25, 0.5, 1.0]: # % mistag rate
	for wp in wps: # % mistag rate
	idx, val = find_nearest(fpr, wp)
	cuts[str(wp)] = threshold[idx] # threshold for deep double-b corresponding to ~1% mistag rate
	effs.append(tpr[idx])
	if debug: print( "Cuts", cuts)
	print( fname, pt)
	print( "mts =", wps[:-1])
	print("efs =", np.round(effs[:-1],3))
	print("cts =", np.round([cuts[str(key)] for key in wps[:-1]],2))

	f, (ax1, ax2) = plt.subplots(2,1, figsize=(10,10), gridspec_kw = {'height_ratios':[3, 1]}, sharex=True)
	f.subplots_adjust(hspace=0)
	bins = np.linspace(20,220,11)
	for i, (wp, cut) in enumerate(reversed(sorted(cuts.items()))):
	#print wp
	if debug: print( wp, cut)
	if useTagger!= None: ctdf = frame[frame[useTagger].values > cut]
	else: ctdf = frame[frame['predict'+siglab].values > cut]
	if onlyQCD: weight = ctdf['truthQCD'].values
	else: weight = ctdf['truth'+sculp_label].values
	if float(wp) == 1.0: label = 'No tagging applied'
	else: label = 'at {:3.1f}\% mistag rate'.format(float(wp)*100.)
	if debug: print( 'Weights', np.sum(weight), len(weight), weight)
	if debug: print( "Filling {} events".format(len(ctdf['fj_sdmass'].values[weight > 0.5])))
	#n, bins = np.histogram(ctdf['fj_sdmass'].values, bins=bins, weights = weight, density=True)
	evts = ctdf['fj_sdmass'].values[weight > 0.5]
	grid = np.linspace(40,200, 501)
	sns.kdeplot(evts, clip=(40,200), ax=ax1, label=label, color=cmap[i], lw=2.5, linestyle=smap[i])
	if float(wp) == 1.:
	kde0 = scipy.stats.gaussian_kde(evts)
	ax2.plot(grid, np.zeros(len(grid)), color=cmap[i], lw=2.5, linestyle=smap[i])
	else:
	kde = scipy.stats.gaussian_kde(evts)
	ax2.plot(grid, kde(grid)-kde0(grid), color=cmap[i], lw=2.5, linestyle=smap[i])

	ax2.set_xlabel(r'$\mathrm{m_{SD}\ [GeV]}$', ha='right', x=1.0)
	if onlyQCD:
	ax1.set_ylabel(r'Normalized scale ({})'.format('QCD'), ha='right', y=1.0)
	ax2.set_ylabel(r'Tagged - Nominal', ha='center')
	else:
	ax1.set_ylabel(r'Normalized scale ({})'.format(sculp_label.replace("Hcc", r"$\mathrm{H \rightarrow c\bar{c}}$").replace("Hbb", r"$\mathrm{H \rightarrow b\bar{b}}$")), ha='right', y=1.0)
	import matplotlib.ticker as plticker
	for ax in [ax1,ax2]:
	ax.xaxis.set_major_locator(plticker.MultipleLocator(base=20))
	ax.xaxis.set_minor_locator(plticker.MultipleLocator(base=10))
	ax.tick_params(direction='in', axis='both', which='major', labelsize=15, length=12, labelleft=False )
	ax.tick_params(direction='in', axis='both', which='minor' , length=6)
	ax.xaxis.set_ticks_position('both')
	ax.yaxis.set_ticks_position('both')
	ax.yaxis.set_major_locator(plticker.MultipleLocator(base=0.002))
	ax.yaxis.set_minor_locator(plticker.MultipleLocator(base=0.0004))
	ax.grid(which='major', alpha=0.9, linestyle='dotted')
	ax2.grid(which='minor', alpha=0.6, axis='y', linestyle='dotted')

	# ax1.yaxis.set_major_locator(plticker.MultipleLocator(base=0.002))
	# ax1.yaxis.set_minor_locator(plticker.MultipleLocator(base=0.0004))
	# ax2.yaxis.set_major_locator(plticker.MultipleLocator(base=0.0008))
	# ax2.yaxis.set_minor_locator(plticker.MultipleLocator(base=0.0004))

	ax1.set_xlim(40, 200)
	ymax = max(kde0(grid))
	ax1.set_ylim(0,ymax*1.25)
	subylim = max(max(abs(min((kde(grid[100:400])-kde0(grid[100:400])))), abs(max((kde(grid[100:400])-kde0(grid[100:400])))))*1.05, 0.004)
	ax2.set_ylim(-subylim, subylim)

	leg = ax1.legend(borderpad=1, frameon=False, loc='best', fontsize='x-small', \
	#title = ""+str(int(round((min(frame.fj_pt)))))+" $\mathrm{<\ jet\ p_T\ <}$ "+str(int(round((max(frame.fj_pt)))))+" GeV" \
	#+ "\n "+str(int(round((min(frame.fj_sdmass)))))+" $\mathrm{<\ jet\ m_{SD}\ <}$ "+str(int(round((max(frame.fj_sdmass)))))+" GeV"
	title= tagger\
	+ "\n" + r"Tagging efficiency (H $\rightarrow$ {})".format(siglab[-2:]) )
	plt.setp(leg.get_title(), multialignment='center', fontsize='x-small')
	leg._legend_box.align = "center"

	mpt = ""+str(int(round((min(frame.fj_pt)))))+" $\mathrm{<\ jet\ p_T\ <}$ "+str(int(round((max(frame.fj_pt)))))+" GeV" \
	+ "\n "+str(int(round((40))))+" $\mathrm{<\ jet\ m_{SD}\ <}$ "+str(int(round((200))))+" GeV" \
	#+ "\n "+ tagger
	ax1.annotate( mpt, xy=(0.05, 0.92), xycoords='axes fraction', ha='left', va='top',
	annotation_clip=False, multialignment='center', fontsize='x-small')
	# ax1.annotate(r'{} (13 TeV)'.format(year), xy=(1, 1.018), xycoords='axes fraction', fontsize=22, fontname='Helvetica',
	# ha='right', annotation_clip=False)
	# ax1.annotate(r'$\mathbf{CMS}$', xy=(0.001, 1.018), xycoords='axes fraction', fontname='Helvetica', fontsize=28,
	# ha='left', annotation_clip=False)
	# ax1.annotate(r'$Simulation\ Preliminary$', xy=(0.12, 1.018), xycoords='axes fraction', fontsize=22, fontname='Helvetica',
	# fontstyle='italic', ha='left', annotation_clip=False)
	ax1.annotate(r'Gaussian kde smoothing', xy=(0.03, 0.05), xycoords='axes fraction', fontsize=16, fontname='Helvetica',
	fontstyle='italic', ha='left', annotation_clip=False)
	import cmsstyle as cms
	ax1 = cms.cms_annot(ax=ax1)

	f.savefig(os.path.join(savedir,'kdeM_sculpting_'+fname+"_tag"+siglab+"_"+sculp_label+"_pt{}-{}".format(pt[0], pt[1])+'.png'), dpi=400, transparent=True)
	f.savefig(os.path.join(savedir,'kdeM_sculpting_'+fname+"_tag"+siglab+"_"+sculp_label+"_pt{}-{}".format(pt[0], pt[1])+'.pdf'), dpi=400, transparent=True)


	for plow, phigh in zip([350],[1200]):
	sculpting3(df2, pt=[plow, phigh], siglab="Hbb", sculp_label='QCD', fname='doubleb', wps=[0.01,0.03, 0.06, 0.1, 1.0],
	tagger='double-B', savedir=savedir, onlyQCD=True, useTagger='fj_doubleb')