mtzl · October 11, 2016 17:19
diff --git a/Orca PID Test b/Orca PID Test
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from km3pipe.io import H5Chain\n",
    "\n",
    "prefix = '/home/mlotze/km3net/orca-9m/uncut/'\n",
    "n_events = {\n",
    "    prefix + 'numuon_cc.h5': 2400,\n",
    "    prefix + 'nuelec_cc.h5': 1200,\n",
    "    prefix + 'nuelec_nc.h5': 1200,\n",
    "    prefix + 'atmuon.h5': 2400,\n",
    "}\n",
    "files = list(n_events.keys())\n",
    "with H5Chain(files) as c:\n",
    "    df = c(n_events)['mva']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "plt.style.use('foursigma')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "df = df.fillna(0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from km3pipe.io import guess_mc_feats\n",
    "\n",
    "mc_feats = guess_mc_feats(df)\n",
    "reco_feats = [feat for feat in df.columns if feat not in mc_feats]\n",
    "reco_feats\n",
    "X = df[reco_feats]\n",
    "mc_info = df[mc_feats]\n",
    "del(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from trawler import TargetGen\n",
    "\n",
    "mc_type = mc_info['dusj_MCtype']\n",
    "is_cc = np.array([True] * mc_type.shape[0])\n",
    "tg = TargetGen(is_cc, mc_type)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Counter({'cscd': 2400, 'atmu': 2400, 'track': 2400})\n",
      "Counter({0: 2400, 1: 2400, 2: 2400})\n",
      "['atmu' 'cscd' 'track']\n"
     ]
    }
   ],
   "source": [
    "from collections import Counter \n",
    "from sklearn.preprocessing import LabelEncoder\n",
    "\n",
    "target = tg.atmu_track_cscd\n",
    "le = LabelEncoder()\n",
    "y = le.fit_transform(target)\n",
    "\n",
    "print(Counter(target))\n",
    "print(Counter(y))\n",
    "print(le.classes_)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'atmu': 0, 'cscd': 1, 'track': 2}"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "num2tag = {num: le.inverse_transform(num) for num in np.unique(y)}\n",
    "tag2num = {num2tag[num]: num for num, tag in num2tag.items()}\n",
    "tag2num"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# make energy binning\n",
    "e_min = 0\n",
    "e_max = 100\n",
    "nbins = 25\n",
    "\n",
    "binlims = np.linspace(e_min, e_max, nbins+1)\n",
    "log_binlims = np.logspace(0, 2, nbins+1)\n",
    "energy = mc_info['dusj_MCen']\n",
    "log_energy = np.log10(energy) \n",
    "r = np.sqrt(np.power(X['tf_r_x'], 2) + np.power(X['tf_r_y'], 2))\n",
    "e_rec_cscd = X['dusj_d_E']\n",
    "e_rec_trak = X['tf_r_NuE']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['EventID',\n",
       " 'dusj_EventID',\n",
       " 'dusj_GlobalWeight',\n",
       " 'dusj_LivetimeSec',\n",
       " 'dusj_MCCh',\n",
       " 'dusj_MC_phi',\n",
       " 'dusj_MC_theta',\n",
       " 'dusj_MC_x',\n",
       " 'dusj_MC_y',\n",
       " 'dusj_MC_z',\n",
       " 'dusj_MCby',\n",
       " 'dusj_MCen',\n",
       " 'dusj_MCpr',\n",
       " 'dusj_MCtype',\n",
       " 'dusj_NEvents',\n",
       " 'dusj_OneWeight',\n",
       " 'tf_EventID',\n",
       " 'tf_MCen']"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sorted(mc_info.columns)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "tm = y == tag2num['track']\n",
    "cm = y == tag2num['cscd']\n",
    "mm = y == tag2num['atmu']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "lab = {'y': y}\n",
    "lab['w2'] = mc_info['dusj_OneWeight']\n",
    "lab['n_evts'] = mc_info['dusj_NEvents']\n",
    "lab['livetime'] = mc_info['dusj_LivetimeSec']\n",
    "lab['target'] = target\n",
    "lab['mc_bjorken_y'] = mc_info['dusj_MCby']\n",
    "lab['mc_energy'] = mc_info['dusj_MCen']\n",
    "lab['log_mc_energy'] = np.log10(mc_info['dusj_MCen'])\n",
    "lab = pd.DataFrame.from_dict(lab)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from honda_2015 import HondaFlux\n",
    "zenith = mc_info['dusj_MC_theta']\n",
    "hf = HondaFlux('pkg/honda_2015/honda_2015/honda2015_frejus_solarmin.h5')\n",
    "# TODO: currently, atmu weights are = 1\n",
    "wgt_honda = np.ones_like(lab['mc_energy'])\n",
    "wgt_honda[tm] = hf('nu_mu', zenith[tm], energy[tm]) * lab['w2'][tm] / lab['n_evts'][tm]\n",
    "wgt_honda[cm] = hf('nu_e', zenith[cm], energy[cm]) * lab['w2'][cm] / lab['n_evts'][cm]\n",
    "wgt_honda[mm] = 1 / lab['livetime'][mm]\n",
    "lab['honda'] = wgt_honda"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "for target in ['track', 'atmu', 'cscd']:\n",
    "    plt.hist('mc_energy', weights='honda', data=lab[lab.target == target], histtype='stepfilled', alpha=.5, bins=log_binlims, label=target, )\n",
    "plt.legend()\n",
    "#plt.yscale('log')\n",
    "#plt.xscale('log')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "lab['mc_energy_bin'] = pd.cut(lab['mc_energy'], binlims, labels=False)\n",
    "lab['log_mc_energy_bin'] = pd.cut(lab['log_mc_energy'], binlims, labels=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "X_train, X_test, lab_train, lab_test, = train_test_split(X, lab, stratify=lab['y'], test_size=.33, random_state=42)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from sklearn.metrics import roc_auc_score, make_scorer\n",
    "clf_pipe = Pipeline([\n",
    "    ('remove_constant', VarianceThreshold()),    \n",
    "    ('quantile_scaler', RobustScaler()),\n",
    "    #('pca', PCA(svd_solver='auto')),\n",
    "    ('rf', RandomForestClassifier(\n",
    "                n_estimators=319, \n",
    "                max_features='auto',\n",
    "                criterion='entropy',\n",
    "                class_weight='balanced', \n",
    "                n_jobs=-1,\n",
    "                verbose=1\n",
    "                                )),\n",
    "])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "#skf = StratifiedKFold(n_splits=3, shuffle=True)\n",
    "#grid = GridSearchCV(\n",
    "#    clf_pipe, \n",
    "#    param_grid={\n",
    "#        'rf__criterion': ['entropy', 'gini'],\n",
    "#        #'rf__max_features': ['auto', None, 'log2']\n",
    "#        }, \n",
    "#    refit=True, \n",
    "#    #scoring='roc_auc', \n",
    "#    scoring=roc_auc_score(), \n",
    "#    cv=skf, verbose=1,\n",
    "#    #fit_params={'rf__sample_weight': wgt_train,},\n",
    "#)\n",
    "grid = clf_pipe"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "clf = grid.fit(X_train, lab_train['y'])\n",
    "score = clf.predict_proba(X_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "for t in ['cscd', 'track', 'atmu']:\n",
    "    lab_test[t] = score[:, tag2num[t]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "lab_test.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "lab_test['pred'] = lab_test[['track', 'cscd', 'atmu']].idxmax(axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "binlims\n",
    "binwidth = (binlims[1] - binlims[0])\n",
    "x = binlims[:-1]\n",
    "lines = 'steps-post'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "#lab_test.groupby(pd.cut(lab_test['mc_energy'], binlims)).query()\n",
    "\n",
    "col = []\n",
    "for i in range(binlims.shape[0] - 1):\n",
    "    low = binlims[i]\n",
    "    up = binlims[i+1]\n",
    "    dat = lab_test.query('%f <= mc_energy < %f' % (low, up))\n",
    "    c = {}\n",
    "    for pred in ['cscd', 'track', 'atmu']:\n",
    "        n = dat.query('pred == \"%s\"' % pred).honda.sum()\n",
    "        for true in ['cscd', 'track', 'atmu']:\n",
    "            res = dat.query('target == \"%s\" and pred == \"%s\"' % (true, pred))\n",
    "            try:\n",
    "                co = res.honda.sum()/n\n",
    "            except ZeroDivisionError:\n",
    "                co = 0\n",
    "            c[\"%s_as_%s\" % (true, pred)] = co\n",
    "    col.append(c)\n",
    "col = pd.DataFrame(col)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "col"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "col.atmu_as_track.plot(ls='steps-post')\n",
    "col.track_as_track.plot(ls='steps-post')\n",
    "col.cscd_as_track.plot(ls='steps-post')\n",
    "plt.title('Prediction: Track')\n",
    "plt.ylim(-0.05, 1.05)\n",
    "plt.legend(loc=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "col.atmu_as_cscd.plot(ls='steps-post')\n",
    "col.track_as_cscd.plot(ls='steps-post')\n",
    "col.cscd_as_cscd.plot(ls='steps-post')\n",
    "plt.title('Prediction: Cascade')\n",
    "plt.ylim(-0.05, 1.05)\n",
    "plt.legend(loc=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "fig, (ax1, ax2, ax3) = plt.subplots(ncols=3, figsize=(10, 3))\n",
    "bins = (\n",
    "    np.linspace(0, 2, 11),\n",
    "    np.linspace(0, 1, 11), \n",
    ")\n",
    "ax1.hist2d('log_mc_energy', 'track', weights='honda', data=lab_test.query('target==\"track\"'), bins=bins)\n",
    "ax2.hist2d('log_mc_energy', 'track', weights='honda', data=lab_test.query('target==\"cscd\"'), bins=bins)\n",
    "ax3.hist2d('log_mc_energy', 'track', weights='honda', data=lab_test.query('target==\"atmu\"'))\n",
    "ax2.set_title('pred = track')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "fig, (ax1, ax2, ax3) = plt.subplots(ncols=3, figsize=(10, 3))\n",
    "bins = (\n",
    "    np.linspace(0, 2, 11),\n",
    "    np.linspace(0, 1, 11), \n",
    ")\n",
    "ax1.hist2d('log_mc_energy', 'cscd', weights='honda', data=lab_test.query('target==\"track\"'), bins=bins)\n",
    "ax2.hist2d('log_mc_energy', 'cscd', weights='honda', data=lab_test.query('target==\"cscd\"'), bins=bins)\n",
    "ax3.hist2d('log_mc_energy', 'cscd', weights='honda', data=lab_test.query('target==\"atmu\"'))\n",
    "ax2.set_title('pred = cscd')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "fig, (ax1, ax2, ax3) = plt.subplots(ncols=3, figsize=(10, 3))\n",
    "bins = (\n",
    "    np.linspace(0, 2, 11),\n",
    "    np.linspace(0, 1, 11), \n",
    ")\n",
    "ax1.hist2d('log_mc_energy', 'atmu', weights='honda', data=lab_test.query('target==\"track\"'), bins=bins)\n",
    "ax2.hist2d('log_mc_energy', 'atmu', weights='honda', data=lab_test.query('target==\"cscd\"'), bins=bins)\n",
    "ax3.hist2d('log_mc_energy', 'atmu', weights='honda', data=lab_test.query('target==\"atmu\"'))\n",
    "ax2.set_title('pred = atmu')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "plt.title('''Prediction: \"It's a Track\"''')\n",
    "norm = np.sum(np.array((track_as_track, cscd_as_track)), axis=0)\n",
    "plt.plot(x, track_as_track/norm, label=r'$\\nu_\\mu$', ls=lines)\n",
    "plt.plot(x, cscd_as_track/norm, label=r'$\\nu_e$', ls=lines)\n",
    "plt.ylim(-0.05, 1.05)\n",
    "plt.ylabel('Fraction of \"Track\" population')\n",
    "plt.legend(loc=0)\n",
    "plt.xlabel('Energy / GeV')\n",
    "plt.savefig('plots/pred_track_composition.svg')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "plt.title('''Prediction: \"It's a Cascade\"''')\n",
    "norm = np.sum(np.array((track_as_cscd, cscd_as_cscd)), axis=0)\n",
    "plt.plot(x, track_as_cscd/norm, label=r'$\\nu_\\mu$', ls=lines)\n",
    "plt.plot(x, cscd_as_cscd/norm, label=r'$\\nu_e$', ls=lines)\n",
    "plt.ylim(-0.05, 1.05)\n",
    "plt.ylabel('Fraction of \"Cascade\" population')\n",
    "plt.legend(loc=0)\n",
    "plt.xlabel('Energy / GeV')\n",
    "plt.savefig('plots/pred_cscd_composition.svg')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "plt.title(r'''How get $\\nu_\\mu$ labeled?''')\n",
    "norm = np.sum(np.array((track_as_track, track_as_cscd)), axis=0)\n",
    "plt.plot(x, track_as_track/norm, label='Tagged: Track', ls=lines)\n",
    "plt.plot(x, track_as_cscd/norm, label='Tagged: Cascade', ls=lines)\n",
    "plt.ylim(-0.05, 1.05)\n",
    "plt.ylabel(r'% of $\\nu_\\mu$ ')\n",
    "plt.legend(loc=0)\n",
    "plt.xlabel('Energy / GeV')\n",
    "plt.savefig('plots/nu_mu_predictions.svg')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "plt.title(r'''How get $\\nu_e$ labeled?''')\n",
    "norm = np.sum(np.array((cscd_as_track, cscd_as_cscd)), axis=0)\n",
    "plt.plot(x, cscd_as_track/norm, label='Tagged: Track', ls=lines)\n",
    "plt.plot(x, cscd_as_cscd/norm, label='Tagged: Cascade', ls=lines)\n",
    "plt.ylim(-0.05, 1.05)\n",
    "plt.ylabel(r'% of $\\nu_e$ ')\n",
    "plt.legend(loc=0)\n",
    "plt.xlabel('Energy / GeV')\n",
    "plt.savefig('plots/nu_e_and_NC_predictions.svg')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "plt.plot(x, aucs, ls=lines)\n",
    "plt.ylim(0, 1)\n",
    "plt.title('Overall (Cut-Independent) PID Performance') \n",
    "plt.ylabel('Area under ROC Curve (AUC)')\n",
    "plt.xlabel('Energy / GeV')\n",
    "plt.savefig('plots/auc.svg')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "print('Hi!')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "plt.title(r'How do $\\nu$ get labeled?')\n",
    "plt.hist(score[y_test == 0], bins=np.linspace(0, 1, 50+1), histtype='stepfilled', alpha=.5, label=r'$\\nu_e$')\n",
    "plt.hist(score[y_test == 1], bins=np.linspace(0, 1, 50+1), histtype='stepfilled', alpha=.5, label=r'$\\nu_\\mu$')\n",
    "plt.legend()\n",
    "plt.xlabel('Confidence of \"Event is Track\"')\n",
    "plt.yscale('log')\n",
    "plt.savefig('plots/rf_score.svg')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "print(sorted(mc_test.columns))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "out = mc_test.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "df = pd.DataFrame.from_dict({\n",
    "        'zenith_cscd': X_test['dusj_d_theta'],\n",
    "        'zenith_track': np.arccos(-1 * X_test['tf_r_dir_z']) * 180 / np.pi,\n",
    "        'zenith_mc': mc_test['dusj_MC_theta'],\n",
    "        'azimuth_cscd': X_test['dusj_d_phi'],\n",
    "        'azimuth_track': np.arctan2(X_test['tf_r_dir_x'], X_test['tf_r_dir_y']) * 180 / np.pi,\n",
    "        'azimuth_mc': mc_test['dusj_MC_phi'],\n",
    "        'energy_cscd': X_test['dusj_d_E'],\n",
    "        'energy_track': X_test['tf_r_NuE'],\n",
    "        'energy_mc': mc_test['dusj_MCen'], \n",
    "        'bjorken_y': X_test['tf_r_by'], \n",
    "        'bjorken_y_mc': mc_test['dusj_MCby'],\n",
    "        'proba_track': score, \n",
    "        'w2': mc_test['dusj_OneWeight'],\n",
    "        'n_evts': mc_test['dusj_NEvents'],\n",
    "        'livetime_sec': mc_test['dusj_LivetimeSec'],\n",
    "        'type_mc': mc_test['dusj_MCtype'],\n",
    "        'true_label': target_test,\n",
    "        'true_y': y_test,\n",
    "        'flux_honda2015': honda_test,\n",
    "    })"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "#h5 = h5py.File('df_test.h5', 'a')\n",
    "#h5.create_dataset('pid_out', data=df.to_records())\n",
    "#h5.close()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "e_max = 10"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "cut = 'energy_mc < %d and true_label == \"cscd\"' % e_max\n",
    "plt.hexbin('proba_track', 'bjorken_y_mc', data=df.query(cut), gridsize=10, extent=(0, 1, 0, 1))\n",
    "plt.ylabel('bjorken_y_mc')\n",
    "plt.xlabel('confidence \"Event is Track\"')\n",
    "plt.title(cut)\n",
    "plt.colorbar()\n",
    "plt.savefig('plots/pid_vs_mc_bjorken-y_cscd_%d.png' % e_max)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "cut = 'energy_mc < %d and true_label == \"track\"'  % e_max\n",
    "plt.hexbin('proba_track', 'bjorken_y_mc', data=df.query(cut), gridsize=10, extent=(0, 1, 0, 1))\n",
    "plt.ylabel('bjorken_y_mc')\n",
    "plt.xlabel('confidence \"Event is Track\"')\n",
    "plt.title(cut)\n",
    "plt.colorbar()\n",
    "plt.savefig('plots/pid_vs_mc_bjorken-y_track_%d.png' % e_max)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "    #variances = clf.best_estimator_.steps[0][1].variances_\n",
    "#rf = clf.best_estimator_.steps[-1][1]\n",
    "#imp = pd.DataFrame.from_dict({\n",
    "#        'features': X_test.columns[variances != 0],\n",
    "#        'importance': rf.feature_importances_\n",
    "#})\n",
    "#imp.sort_values(by='importance')[::-1].head(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "fig, (ax_cscd, ax_track) = plt.subplots(ncols=2, figsize=(10, 4))\n",
    "from matplotlib.colors import LogNorm\n",
    "lims = (0, 1, 0, 2)\n",
    "p1 = ax_cscd.hexbin(score[y_test == 0], np.log10(mc_test['dusj_MCen'][y_test == 0]), gridsize=25, norm=LogNorm(), extent=lims)\n",
    "p2 = ax_track.hexbin(score[y_test == 1], np.log10(mc_test['dusj_MCen'][y_test == 1]), gridsize=25, norm=LogNorm(), extent=lims)\n",
    "fig.suptitle('Unweighted Input')\n",
    "plt.colorbar(p1, ax=ax_cscd)\n",
    "plt.colorbar(p2, ax=ax_track)\n",
    "ax_cscd.set_title('true cscd')\n",
    "ax_track.set_title('true track')\n",
    "ax_cscd.set_ylabel('log10(Energy)')\n",
    "ax_cscd.set_xlabel('Probability(Track)')\n",
    "plt.tight_layout()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "fig, (ax_cscd, ax_track) = plt.subplots(ncols=2, figsize=(10, 4))\n",
    "from matplotlib.colors import LogNorm\n",
    "lims = (0, 1, 0, 2)\n",
    "p1 = ax_cscd.hexbin(score[y_test == 0], np.log10(mc_test['dusj_MCen'][y_test == 0]), gridsize=25, norm=LogNorm(), extent=lims)\n",
    "p2 = ax_track.hexbin(score[y_test == 1], np.log10(mc_test['dusj_MCen'][y_test == 1]), gridsize=25, norm=LogNorm(), extent=lims)\n",
    "fig.suptitle('Unweighted Input')\n",
    "plt.colorbar(p1, ax=ax_cscd)\n",
    "plt.colorbar(p2, ax=ax_track)\n",
    "ax_cscd.set_title('true cscd')\n",
    "ax_track.set_title('true track')\n",
    "ax_cscd.set_ylabel('log10(Energy)')\n",
    "ax_cscd.set_xlabel('Probability(Track)')\n",
    "plt.tight_layout()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(8, 3))\n",
    "p1 = ax1.hexbin(pred_atmu, np.log10(X['tf_r_NuE'][Y_abs == 13]), gridsize=25, norm=LogNorm()\n",
    "                )\n",
    "                #, extent=lims\n",
    "p2 = ax2.hexbin(pred_atmu, np.log10(X['dusj_d_E'][Y_abs == 13]), gridsize=25, norm=LogNorm()\n",
    "                #, extent=lims\n",
    "                )\n",
    "plt.colorbar(p1, ax=ax1)\n",
    "plt.colorbar(p2, ax=ax2)\n",
    "ax1.set_ylabel('log10(NuE)')\n",
    "ax2.set_ylabel('log10(d_E)')\n",
    "ax1.set_xlabel('Probability(Track)')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "fig, (ax_cscd, ax_trak) = plt.subplots(ncols=2, figsize=(8, 3))\n",
    "ax_cscd.hist(energy[y == 1], bins=lin_bins, histtype='stepfilled', alpha=.5, label='mc')\n",
    "ax_cscd.hist(e_rec_cscd[y == 1], bins=lin_bins, histtype='stepfilled', alpha=.5, label='shower reco')\n",
    "ax_trak.hist(energy[y == 2], bins=lin_bins, histtype='stepfilled', alpha=.5, label='mc')\n",
    "ax_trak.hist(e_rec_trak[y == 2], bins=lin_bins, histtype='stepfilled', alpha=.5, label='track reco')\n",
    "ax_cscd.set_title('cscd')\n",
    "ax_trak.set_title('nu_mu cc')\n",
    "for ax in ax_trak, ax_cscd:\n",
    "    #ax.set_xscale('log')\n",
    "    ax.set_xlabel('Energy / GeV')\n",
    "    ax.legend()\n",
    "    #ax.set_yscale('log')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(8, 3))\n",
    "lims = (0, 100, 0, 100)\n",
    "p1 = ax1.hexbin(energy[Y_abs == 12], e_rec_cscd[Y_abs == 12], gridsize=25, norm=LogNorm(), extent=lims)\n",
    "p2 = ax2.hexbin(energy[Y_abs == 12], e_rec_cscd[Y_abs == 12], gridsize=25, extent=lims)\n",
    "plt.colorbar(p1, ax=ax1)\n",
    "plt.colorbar(p2, ax=ax2)\n",
    "for ax in ax1, ax2:\n",
    "    ax.set_xlabel('e_mc')\n",
    "    ax.set_ylabel('e_reco')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(8, 3))\n",
    "p1 = ax1.hexbin(energy[Y_abs == 14], e_rec_trak[Y_abs == 14], gridsize=25, norm=LogNorm())\n",
    "p2 = ax2.hexbin(energy[Y_abs == 14], e_rec_trak[Y_abs == 14], gridsize=25,)\n",
    "plt.colorbar(p1, ax=ax1)\n",
    "plt.colorbar(p2, ax=ax2)\n",
    "for ax in ax1, ax2:\n",
    "    ax.set_xlabel('e_mc')\n",
    "    ax.set_ylabel('e_reco')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
 }