Created
October 14, 2016 15:17
-
-
Save mtzl/1954d9663ebc1d773f799c125b3347c7 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"from km3pipe.io import H5Chain\n", | |
"\n", | |
"prefix = '/home/mlotze/km3net/orca-9m/uncut/'\n", | |
"n_events = {\n", | |
" prefix + 'numuon_cc.h5': 2400,\n", | |
" prefix + 'nuelec_cc.h5': 1200,\n", | |
" prefix + 'nuelec_nc.h5': 1200,\n", | |
" prefix + 'atmuon.h5': 2400,\n", | |
"}\n", | |
"files = list(n_events.keys())\n", | |
"with H5Chain(files) as c:\n", | |
" df = c(n_events)['mva']" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"plt.style.use('foursigma')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"df = df.fillna(0)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"from km3pipe.io import guess_mc_feats\n", | |
"\n", | |
"mc_feats = guess_mc_feats(df)\n", | |
"reco_feats = [feat for feat in df.columns if feat not in mc_feats]\n", | |
"reco_feats\n", | |
"X = df[reco_feats]\n", | |
"mc_info = df[mc_feats]\n", | |
"del(df)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>dusj_geoCoverage_R105h10dusj_angle6dusj_lmin2dusj_best_DusjOrcaUsingProbabilitiesFinalFit_FitResult</th>\n", | |
" <th>dusj_Slopeo100dusj_0</th>\n", | |
" <th>dusj_best_SecondDusjOrcaVertexFit_OUTVicinityNumber</th>\n", | |
" <th>dusj_CoverageR105h10dusj_geoCoverage_angle45_lmin2dusj_RecoLNSRecoParticle</th>\n", | |
" <th>dusj_CoverageR105h10dusj_geoCoverage_angle6dusj_lmin2dusj_RecoLNSRecoParticle</th>\n", | |
" <th>tf_r_by</th>\n", | |
" <th>dusj_Slope5dusj_1000</th>\n", | |
" <th>dusj_best_FirstDusjOrcaVertexFit_OUTVicinityNumber</th>\n", | |
" <th>dusj_Chi220dusj_1000</th>\n", | |
" <th>dusj_best_SecondDusjOrcaVertexFit_OUTFiducalNumber</th>\n", | |
" <th>...</th>\n", | |
" <th>tf_r_NuE</th>\n", | |
" <th>dusj_Slope25_1000</th>\n", | |
" <th>dusj_Chi25dusj_1000</th>\n", | |
" <th>dusj_best_DusjOrcaUsingProbabilitiesFinalFit_BjorkenY</th>\n", | |
" <th>dusj_TimeResidualMean</th>\n", | |
" <th>dusj_MiddleInertia</th>\n", | |
" <th>dusj_HitPatternCharge_finalShowerHits_10degAroundCherAngle_Charge_scalarProduct</th>\n", | |
" <th>dusj_Slopedusj_1000</th>\n", | |
" <th>tf_r_z</th>\n", | |
" <th>dusj_CounterStartOverStartTrack</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>0.85</td>\n", | |
" <td>0.000379</td>\n", | |
" <td>6.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.875</td>\n", | |
" <td>0.000356</td>\n", | |
" <td>11.0</td>\n", | |
" <td>0.007329</td>\n", | |
" <td>6.0</td>\n", | |
" <td>...</td>\n", | |
" <td>3.253906</td>\n", | |
" <td>0.000357</td>\n", | |
" <td>0.008111</td>\n", | |
" <td>0.90</td>\n", | |
" <td>-58.201857</td>\n", | |
" <td>1042.242152</td>\n", | |
" <td>0.791955</td>\n", | |
" <td>0.000358</td>\n", | |
" <td>39.326953</td>\n", | |
" <td>0.000000</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>0.90</td>\n", | |
" <td>0.000412</td>\n", | |
" <td>1.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>-1234.000</td>\n", | |
" <td>0.000383</td>\n", | |
" <td>2.0</td>\n", | |
" <td>0.014976</td>\n", | |
" <td>6.0</td>\n", | |
" <td>...</td>\n", | |
" <td>-1234.000000</td>\n", | |
" <td>0.000384</td>\n", | |
" <td>0.016860</td>\n", | |
" <td>0.05</td>\n", | |
" <td>-48.548806</td>\n", | |
" <td>1151.558136</td>\n", | |
" <td>0.811148</td>\n", | |
" <td>0.000385</td>\n", | |
" <td>-1234.000000</td>\n", | |
" <td>0.000000</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>1.00</td>\n", | |
" <td>0.000409</td>\n", | |
" <td>1.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.875</td>\n", | |
" <td>0.000353</td>\n", | |
" <td>2.0</td>\n", | |
" <td>0.010872</td>\n", | |
" <td>9.0</td>\n", | |
" <td>...</td>\n", | |
" <td>3.534375</td>\n", | |
" <td>0.000355</td>\n", | |
" <td>0.016733</td>\n", | |
" <td>0.15</td>\n", | |
" <td>-85.728439</td>\n", | |
" <td>1313.900324</td>\n", | |
" <td>0.977715</td>\n", | |
" <td>0.000357</td>\n", | |
" <td>-35.791298</td>\n", | |
" <td>0.090909</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>0.95</td>\n", | |
" <td>0.000383</td>\n", | |
" <td>4.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.875</td>\n", | |
" <td>0.000409</td>\n", | |
" <td>4.0</td>\n", | |
" <td>0.007701</td>\n", | |
" <td>8.0</td>\n", | |
" <td>...</td>\n", | |
" <td>2.112500</td>\n", | |
" <td>0.000407</td>\n", | |
" <td>0.007904</td>\n", | |
" <td>0.15</td>\n", | |
" <td>0.178051</td>\n", | |
" <td>1298.805544</td>\n", | |
" <td>0.838437</td>\n", | |
" <td>0.000406</td>\n", | |
" <td>-50.005799</td>\n", | |
" <td>0.100000</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>1.00</td>\n", | |
" <td>0.000414</td>\n", | |
" <td>2.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>-1234.000</td>\n", | |
" <td>0.000420</td>\n", | |
" <td>5.0</td>\n", | |
" <td>0.001300</td>\n", | |
" <td>8.0</td>\n", | |
" <td>...</td>\n", | |
" <td>-1234.000000</td>\n", | |
" <td>0.000419</td>\n", | |
" <td>0.001795</td>\n", | |
" <td>0.75</td>\n", | |
" <td>-6.026219</td>\n", | |
" <td>1596.577549</td>\n", | |
" <td>0.812488</td>\n", | |
" <td>0.000417</td>\n", | |
" <td>-1234.000000</td>\n", | |
" <td>0.285714</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"<p>5 rows × 78 columns</p>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" dusj_geoCoverage_R105h10dusj_angle6dusj_lmin2dusj_best_DusjOrcaUsingProbabilitiesFinalFit_FitResult \\\n", | |
"0 0.85 \n", | |
"1 0.90 \n", | |
"2 1.00 \n", | |
"3 0.95 \n", | |
"4 1.00 \n", | |
"\n", | |
" dusj_Slopeo100dusj_0 dusj_best_SecondDusjOrcaVertexFit_OUTVicinityNumber \\\n", | |
"0 0.000379 6.0 \n", | |
"1 0.000412 1.0 \n", | |
"2 0.000409 1.0 \n", | |
"3 0.000383 4.0 \n", | |
"4 0.000414 2.0 \n", | |
"\n", | |
" dusj_CoverageR105h10dusj_geoCoverage_angle45_lmin2dusj_RecoLNSRecoParticle \\\n", | |
"0 0.0 \n", | |
"1 0.0 \n", | |
"2 0.0 \n", | |
"3 0.0 \n", | |
"4 0.0 \n", | |
"\n", | |
" dusj_CoverageR105h10dusj_geoCoverage_angle6dusj_lmin2dusj_RecoLNSRecoParticle \\\n", | |
"0 0.0 \n", | |
"1 0.0 \n", | |
"2 0.0 \n", | |
"3 0.0 \n", | |
"4 0.0 \n", | |
"\n", | |
" tf_r_by dusj_Slope5dusj_1000 \\\n", | |
"0 0.875 0.000356 \n", | |
"1 -1234.000 0.000383 \n", | |
"2 0.875 0.000353 \n", | |
"3 0.875 0.000409 \n", | |
"4 -1234.000 0.000420 \n", | |
"\n", | |
" dusj_best_FirstDusjOrcaVertexFit_OUTVicinityNumber dusj_Chi220dusj_1000 \\\n", | |
"0 11.0 0.007329 \n", | |
"1 2.0 0.014976 \n", | |
"2 2.0 0.010872 \n", | |
"3 4.0 0.007701 \n", | |
"4 5.0 0.001300 \n", | |
"\n", | |
" dusj_best_SecondDusjOrcaVertexFit_OUTFiducalNumber \\\n", | |
"0 6.0 \n", | |
"1 6.0 \n", | |
"2 9.0 \n", | |
"3 8.0 \n", | |
"4 8.0 \n", | |
"\n", | |
" ... tf_r_NuE dusj_Slope25_1000 \\\n", | |
"0 ... 3.253906 0.000357 \n", | |
"1 ... -1234.000000 0.000384 \n", | |
"2 ... 3.534375 0.000355 \n", | |
"3 ... 2.112500 0.000407 \n", | |
"4 ... -1234.000000 0.000419 \n", | |
"\n", | |
" dusj_Chi25dusj_1000 dusj_best_DusjOrcaUsingProbabilitiesFinalFit_BjorkenY \\\n", | |
"0 0.008111 0.90 \n", | |
"1 0.016860 0.05 \n", | |
"2 0.016733 0.15 \n", | |
"3 0.007904 0.15 \n", | |
"4 0.001795 0.75 \n", | |
"\n", | |
" dusj_TimeResidualMean dusj_MiddleInertia \\\n", | |
"0 -58.201857 1042.242152 \n", | |
"1 -48.548806 1151.558136 \n", | |
"2 -85.728439 1313.900324 \n", | |
"3 0.178051 1298.805544 \n", | |
"4 -6.026219 1596.577549 \n", | |
"\n", | |
" dusj_HitPatternCharge_finalShowerHits_10degAroundCherAngle_Charge_scalarProduct \\\n", | |
"0 0.791955 \n", | |
"1 0.811148 \n", | |
"2 0.977715 \n", | |
"3 0.838437 \n", | |
"4 0.812488 \n", | |
"\n", | |
" dusj_Slopedusj_1000 tf_r_z dusj_CounterStartOverStartTrack \n", | |
"0 0.000358 39.326953 0.000000 \n", | |
"1 0.000385 -1234.000000 0.000000 \n", | |
"2 0.000357 -35.791298 0.090909 \n", | |
"3 0.000406 -50.005799 0.100000 \n", | |
"4 0.000417 -1234.000000 0.285714 \n", | |
"\n", | |
"[5 rows x 78 columns]" | |
] | |
}, | |
"execution_count": 9, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"X[:5]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"from trawler import TargetGen\n", | |
"\n", | |
"mc_type = mc_info['dusj_MCtype']\n", | |
"is_cc = np.array([True] * mc_type.shape[0])\n", | |
"tg = TargetGen(is_cc, mc_type)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Counter({'cscd': 2400, 'atmu': 2400, 'track': 2400})\n", | |
"Counter({0: 2400, 1: 2400, 2: 2400})\n", | |
"['atmu' 'cscd' 'track']\n" | |
] | |
} | |
], | |
"source": [ | |
"from collections import Counter \n", | |
"from sklearn.preprocessing import LabelEncoder\n", | |
"\n", | |
"target = tg.atmu_track_cscd\n", | |
"le = LabelEncoder()\n", | |
"y = le.fit_transform(target)\n", | |
"\n", | |
"print(Counter(target))\n", | |
"print(Counter(y))\n", | |
"print(le.classes_)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"{'atmu': 0, 'cscd': 1, 'track': 2}" | |
] | |
}, | |
"execution_count": 7, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"num2tag = {num: le.inverse_transform(num) for num in np.unique(y)}\n", | |
"tag2num = {num2tag[num]: num for num, tag in num2tag.items()}\n", | |
"tag2num" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# make energy binning\n", | |
"e_min = 0\n", | |
"e_max = 100\n", | |
"nbins = 25\n", | |
"\n", | |
"binlims = np.linspace(e_min, e_max, nbins+1)\n", | |
"log_binlims = np.logspace(0, 2, nbins+1)\n", | |
"energy = mc_info['dusj_MCen']\n", | |
"log_energy = np.log10(energy) \n", | |
"r = np.sqrt(np.power(X['tf_r_x'], 2) + np.power(X['tf_r_y'], 2))\n", | |
"e_rec_cscd = X['dusj_d_E']\n", | |
"e_rec_trak = X['tf_r_NuE']" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"['EventID',\n", | |
" 'dusj_EventID',\n", | |
" 'dusj_GlobalWeight',\n", | |
" 'dusj_LivetimeSec',\n", | |
" 'dusj_MCCh',\n", | |
" 'dusj_MC_phi',\n", | |
" 'dusj_MC_theta',\n", | |
" 'dusj_MC_x',\n", | |
" 'dusj_MC_y',\n", | |
" 'dusj_MC_z',\n", | |
" 'dusj_MCby',\n", | |
" 'dusj_MCen',\n", | |
" 'dusj_MCpr',\n", | |
" 'dusj_MCtype',\n", | |
" 'dusj_NEvents',\n", | |
" 'dusj_OneWeight',\n", | |
" 'tf_EventID',\n", | |
" 'tf_MCen']" | |
] | |
}, | |
"execution_count": 9, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"sorted(mc_info.columns)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"tm = y == tag2num['track']\n", | |
"cm = y == tag2num['cscd']\n", | |
"mm = y == tag2num['atmu']" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"lab = {'y': y}\n", | |
"lab['w2'] = mc_info['dusj_OneWeight']\n", | |
"lab['n_evts'] = mc_info['dusj_NEvents']\n", | |
"lab['livetime'] = mc_info['dusj_LivetimeSec']\n", | |
"lab['target'] = target\n", | |
"lab['mc_bjorken_y'] = mc_info['dusj_MCby']\n", | |
"lab['mc_energy'] = mc_info['dusj_MCen']\n", | |
"lab['log_mc_energy'] = np.log10(mc_info['dusj_MCen'])\n", | |
"lab = pd.DataFrame.from_dict(lab)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"from honda_2015 import HondaFlux\n", | |
"zenith = mc_info['dusj_MC_theta']\n", | |
"hf = HondaFlux('pkg/honda_2015/honda_2015/honda2015_frejus_solarmin.h5')\n", | |
"# TODO: currently, atmu weights are = 1\n", | |
"wgt_honda = np.ones_like(lab['mc_energy'])\n", | |
"wgt_honda[tm] = hf('nu_mu', zenith[tm], energy[tm]) * lab['w2'][tm] / lab['n_evts'][tm]\n", | |
"wgt_honda[cm] = hf('nu_e', zenith[cm], energy[cm]) * lab['w2'][cm] / lab['n_evts'][cm]\n", | |
"wgt_honda[mm] = 1 / lab['livetime'][mm]\n", | |
"lab['honda'] = wgt_honda" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false, | |
"scrolled": false | |
}, | |
"outputs": [], | |
"source": [ | |
"for target in ['track', 'atmu', 'cscd']:\n", | |
" plt.hist('mc_energy', weights='honda', data=lab[lab.target == target], histtype='stepfilled', alpha=.5, bins=log_binlims, label=target, )\n", | |
"plt.legend()\n", | |
"#plt.yscale('log')\n", | |
"#plt.xscale('log')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"lab['mc_energy_bin'] = pd.cut(lab['mc_energy'], binlims, labels=False)\n", | |
"lab['log_mc_energy_bin'] = pd.cut(lab['log_mc_energy'], binlims, labels=False)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"X_train, X_test, lab_train, lab_test, = train_test_split(X, lab, stratify=lab['y'], test_size=.33, random_state=42)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"from sklearn.metrics import roc_auc_score, make_scorer\n", | |
"clf_pipe = Pipeline([\n", | |
" ('remove_constant', VarianceThreshold()), \n", | |
" ('quantile_scaler', RobustScaler()),\n", | |
" #('pca', PCA(svd_solver='auto')),\n", | |
" ('rf', RandomForestClassifier(\n", | |
" n_estimators=319, \n", | |
" max_features='auto',\n", | |
" criterion='entropy',\n", | |
" class_weight='balanced', \n", | |
" n_jobs=-1,\n", | |
" verbose=1\n", | |
" )),\n", | |
"])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"#skf = StratifiedKFold(n_splits=3, shuffle=True)\n", | |
"#grid = GridSearchCV(\n", | |
"# clf_pipe, \n", | |
"# param_grid={\n", | |
"# 'rf__criterion': ['entropy', 'gini'],\n", | |
"# #'rf__max_features': ['auto', None, 'log2']\n", | |
"# }, \n", | |
"# refit=True, \n", | |
"# #scoring='roc_auc', \n", | |
"# scoring=roc_auc_score(), \n", | |
"# cv=skf, verbose=1,\n", | |
"# #fit_params={'rf__sample_weight': wgt_train,},\n", | |
"#)\n", | |
"grid = clf_pipe" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"clf = grid.fit(X_train, lab_train['y'])\n", | |
"score = clf.predict_proba(X_test)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"for t in ['cscd', 'track', 'atmu']:\n", | |
" lab_test[t] = score[:, tag2num[t]]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"lab_test.head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"lab_test['pred'] = lab_test[['track', 'cscd', 'atmu']].idxmax(axis=1)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"binlims\n", | |
"binwidth = (binlims[1] - binlims[0])\n", | |
"x = binlims[:-1]\n", | |
"lines = 'steps-post'" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"#lab_test.groupby(pd.cut(lab_test['mc_energy'], binlims)).query()\n", | |
"\n", | |
"col = []\n", | |
"for i in range(binlims.shape[0] - 1):\n", | |
" low = binlims[i]\n", | |
" up = binlims[i+1]\n", | |
" dat = lab_test.query('%f <= mc_energy < %f' % (low, up))\n", | |
" c = {}\n", | |
" for pred in ['cscd', 'track', 'atmu']:\n", | |
" n = dat.query('pred == \"%s\"' % pred).honda.sum()\n", | |
" for true in ['cscd', 'track', 'atmu']:\n", | |
" res = dat.query('target == \"%s\" and pred == \"%s\"' % (true, pred))\n", | |
" try:\n", | |
" co = res.honda.sum()/n\n", | |
" except ZeroDivisionError:\n", | |
" co = 0\n", | |
" c[\"%s_as_%s\" % (true, pred)] = co\n", | |
" col.append(c)\n", | |
"col = pd.DataFrame(col)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"col" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"col.atmu_as_track.plot(ls='steps-post')\n", | |
"col.track_as_track.plot(ls='steps-post')\n", | |
"col.cscd_as_track.plot(ls='steps-post')\n", | |
"plt.title('Prediction: Track')\n", | |
"plt.ylim(-0.05, 1.05)\n", | |
"plt.legend(loc=0)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"col.atmu_as_cscd.plot(ls='steps-post')\n", | |
"col.track_as_cscd.plot(ls='steps-post')\n", | |
"col.cscd_as_cscd.plot(ls='steps-post')\n", | |
"plt.title('Prediction: Cascade')\n", | |
"plt.ylim(-0.05, 1.05)\n", | |
"plt.legend(loc=0)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"fig, (ax1, ax2, ax3) = plt.subplots(ncols=3, figsize=(10, 3))\n", | |
"bins = (\n", | |
" np.linspace(0, 2, 11),\n", | |
" np.linspace(0, 1, 11), \n", | |
")\n", | |
"ax1.hist2d('log_mc_energy', 'track', weights='honda', data=lab_test.query('target==\"track\"'), bins=bins)\n", | |
"ax2.hist2d('log_mc_energy', 'track', weights='honda', data=lab_test.query('target==\"cscd\"'), bins=bins)\n", | |
"ax3.hist2d('log_mc_energy', 'track', weights='honda', data=lab_test.query('target==\"atmu\"'))\n", | |
"ax2.set_title('pred = track')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"fig, (ax1, ax2, ax3) = plt.subplots(ncols=3, figsize=(10, 3))\n", | |
"bins = (\n", | |
" np.linspace(0, 2, 11),\n", | |
" np.linspace(0, 1, 11), \n", | |
")\n", | |
"ax1.hist2d('log_mc_energy', 'cscd', weights='honda', data=lab_test.query('target==\"track\"'), bins=bins)\n", | |
"ax2.hist2d('log_mc_energy', 'cscd', weights='honda', data=lab_test.query('target==\"cscd\"'), bins=bins)\n", | |
"ax3.hist2d('log_mc_energy', 'cscd', weights='honda', data=lab_test.query('target==\"atmu\"'))\n", | |
"ax2.set_title('pred = cscd')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"fig, (ax1, ax2, ax3) = plt.subplots(ncols=3, figsize=(10, 3))\n", | |
"bins = (\n", | |
" np.linspace(0, 2, 11),\n", | |
" np.linspace(0, 1, 11), \n", | |
")\n", | |
"ax1.hist2d('log_mc_energy', 'atmu', weights='honda', data=lab_test.query('target==\"track\"'), bins=bins)\n", | |
"ax2.hist2d('log_mc_energy', 'atmu', weights='honda', data=lab_test.query('target==\"cscd\"'), bins=bins)\n", | |
"ax3.hist2d('log_mc_energy', 'atmu', weights='honda', data=lab_test.query('target==\"atmu\"'))\n", | |
"ax2.set_title('pred = atmu')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"plt.title('''Prediction: \"It's a Track\"''')\n", | |
"norm = np.sum(np.array((track_as_track, cscd_as_track)), axis=0)\n", | |
"plt.plot(x, track_as_track/norm, label=r'$\\nu_\\mu$', ls=lines)\n", | |
"plt.plot(x, cscd_as_track/norm, label=r'$\\nu_e$', ls=lines)\n", | |
"plt.ylim(-0.05, 1.05)\n", | |
"plt.ylabel('Fraction of \"Track\" population')\n", | |
"plt.legend(loc=0)\n", | |
"plt.xlabel('Energy / GeV')\n", | |
"plt.savefig('plots/pred_track_composition.svg')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"plt.title('''Prediction: \"It's a Cascade\"''')\n", | |
"norm = np.sum(np.array((track_as_cscd, cscd_as_cscd)), axis=0)\n", | |
"plt.plot(x, track_as_cscd/norm, label=r'$\\nu_\\mu$', ls=lines)\n", | |
"plt.plot(x, cscd_as_cscd/norm, label=r'$\\nu_e$', ls=lines)\n", | |
"plt.ylim(-0.05, 1.05)\n", | |
"plt.ylabel('Fraction of \"Cascade\" population')\n", | |
"plt.legend(loc=0)\n", | |
"plt.xlabel('Energy / GeV')\n", | |
"plt.savefig('plots/pred_cscd_composition.svg')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"plt.title(r'''How get $\\nu_\\mu$ labeled?''')\n", | |
"norm = np.sum(np.array((track_as_track, track_as_cscd)), axis=0)\n", | |
"plt.plot(x, track_as_track/norm, label='Tagged: Track', ls=lines)\n", | |
"plt.plot(x, track_as_cscd/norm, label='Tagged: Cascade', ls=lines)\n", | |
"plt.ylim(-0.05, 1.05)\n", | |
"plt.ylabel(r'% of $\\nu_\\mu$ ')\n", | |
"plt.legend(loc=0)\n", | |
"plt.xlabel('Energy / GeV')\n", | |
"plt.savefig('plots/nu_mu_predictions.svg')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"plt.title(r'''How get $\\nu_e$ labeled?''')\n", | |
"norm = np.sum(np.array((cscd_as_track, cscd_as_cscd)), axis=0)\n", | |
"plt.plot(x, cscd_as_track/norm, label='Tagged: Track', ls=lines)\n", | |
"plt.plot(x, cscd_as_cscd/norm, label='Tagged: Cascade', ls=lines)\n", | |
"plt.ylim(-0.05, 1.05)\n", | |
"plt.ylabel(r'% of $\\nu_e$ ')\n", | |
"plt.legend(loc=0)\n", | |
"plt.xlabel('Energy / GeV')\n", | |
"plt.savefig('plots/nu_e_and_NC_predictions.svg')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"plt.plot(x, aucs, ls=lines)\n", | |
"plt.ylim(0, 1)\n", | |
"plt.title('Overall (Cut-Independent) PID Performance') \n", | |
"plt.ylabel('Area under ROC Curve (AUC)')\n", | |
"plt.xlabel('Energy / GeV')\n", | |
"plt.savefig('plots/auc.svg')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"print('Hi!')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"plt.title(r'How do $\\nu$ get labeled?')\n", | |
"plt.hist(score[y_test == 0], bins=np.linspace(0, 1, 50+1), histtype='stepfilled', alpha=.5, label=r'$\\nu_e$')\n", | |
"plt.hist(score[y_test == 1], bins=np.linspace(0, 1, 50+1), histtype='stepfilled', alpha=.5, label=r'$\\nu_\\mu$')\n", | |
"plt.legend()\n", | |
"plt.xlabel('Confidence of \"Event is Track\"')\n", | |
"plt.yscale('log')\n", | |
"plt.savefig('plots/rf_score.svg')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"print(sorted(mc_test.columns))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"out = mc_test.copy()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"df = pd.DataFrame.from_dict({\n", | |
" 'zenith_cscd': X_test['dusj_d_theta'],\n", | |
" 'zenith_track': np.arccos(-1 * X_test['tf_r_dir_z']) * 180 / np.pi,\n", | |
" 'zenith_mc': mc_test['dusj_MC_theta'],\n", | |
" 'azimuth_cscd': X_test['dusj_d_phi'],\n", | |
" 'azimuth_track': np.arctan2(X_test['tf_r_dir_x'], X_test['tf_r_dir_y']) * 180 / np.pi,\n", | |
" 'azimuth_mc': mc_test['dusj_MC_phi'],\n", | |
" 'energy_cscd': X_test['dusj_d_E'],\n", | |
" 'energy_track': X_test['tf_r_NuE'],\n", | |
" 'energy_mc': mc_test['dusj_MCen'], \n", | |
" 'bjorken_y': X_test['tf_r_by'], \n", | |
" 'bjorken_y_mc': mc_test['dusj_MCby'],\n", | |
" 'proba_track': score, \n", | |
" 'w2': mc_test['dusj_OneWeight'],\n", | |
" 'n_evts': mc_test['dusj_NEvents'],\n", | |
" 'livetime_sec': mc_test['dusj_LivetimeSec'],\n", | |
" 'type_mc': mc_test['dusj_MCtype'],\n", | |
" 'true_label': target_test,\n", | |
" 'true_y': y_test,\n", | |
" 'flux_honda2015': honda_test,\n", | |
" })" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"#h5 = h5py.File('df_test.h5', 'a')\n", | |
"#h5.create_dataset('pid_out', data=df.to_records())\n", | |
"#h5.close()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"e_max = 10" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"cut = 'energy_mc < %d and true_label == \"cscd\"' % e_max\n", | |
"plt.hexbin('proba_track', 'bjorken_y_mc', data=df.query(cut), gridsize=10, extent=(0, 1, 0, 1))\n", | |
"plt.ylabel('bjorken_y_mc')\n", | |
"plt.xlabel('confidence \"Event is Track\"')\n", | |
"plt.title(cut)\n", | |
"plt.colorbar()\n", | |
"plt.savefig('plots/pid_vs_mc_bjorken-y_cscd_%d.png' % e_max)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"cut = 'energy_mc < %d and true_label == \"track\"' % e_max\n", | |
"plt.hexbin('proba_track', 'bjorken_y_mc', data=df.query(cut), gridsize=10, extent=(0, 1, 0, 1))\n", | |
"plt.ylabel('bjorken_y_mc')\n", | |
"plt.xlabel('confidence \"Event is Track\"')\n", | |
"plt.title(cut)\n", | |
"plt.colorbar()\n", | |
"plt.savefig('plots/pid_vs_mc_bjorken-y_track_%d.png' % e_max)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
" #variances = clf.best_estimator_.steps[0][1].variances_\n", | |
"#rf = clf.best_estimator_.steps[-1][1]\n", | |
"#imp = pd.DataFrame.from_dict({\n", | |
"# 'features': X_test.columns[variances != 0],\n", | |
"# 'importance': rf.feature_importances_\n", | |
"#})\n", | |
"#imp.sort_values(by='importance')[::-1].head(5)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"fig, (ax_cscd, ax_track) = plt.subplots(ncols=2, figsize=(10, 4))\n", | |
"from matplotlib.colors import LogNorm\n", | |
"lims = (0, 1, 0, 2)\n", | |
"p1 = ax_cscd.hexbin(score[y_test == 0], np.log10(mc_test['dusj_MCen'][y_test == 0]), gridsize=25, norm=LogNorm(), extent=lims)\n", | |
"p2 = ax_track.hexbin(score[y_test == 1], np.log10(mc_test['dusj_MCen'][y_test == 1]), gridsize=25, norm=LogNorm(), extent=lims)\n", | |
"fig.suptitle('Unweighted Input')\n", | |
"plt.colorbar(p1, ax=ax_cscd)\n", | |
"plt.colorbar(p2, ax=ax_track)\n", | |
"ax_cscd.set_title('true cscd')\n", | |
"ax_track.set_title('true track')\n", | |
"ax_cscd.set_ylabel('log10(Energy)')\n", | |
"ax_cscd.set_xlabel('Probability(Track)')\n", | |
"plt.tight_layout()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"fig, (ax_cscd, ax_track) = plt.subplots(ncols=2, figsize=(10, 4))\n", | |
"from matplotlib.colors import LogNorm\n", | |
"lims = (0, 1, 0, 2)\n", | |
"p1 = ax_cscd.hexbin(score[y_test == 0], np.log10(mc_test['dusj_MCen'][y_test == 0]), gridsize=25, norm=LogNorm(), extent=lims)\n", | |
"p2 = ax_track.hexbin(score[y_test == 1], np.log10(mc_test['dusj_MCen'][y_test == 1]), gridsize=25, norm=LogNorm(), extent=lims)\n", | |
"fig.suptitle('Unweighted Input')\n", | |
"plt.colorbar(p1, ax=ax_cscd)\n", | |
"plt.colorbar(p2, ax=ax_track)\n", | |
"ax_cscd.set_title('true cscd')\n", | |
"ax_track.set_title('true track')\n", | |
"ax_cscd.set_ylabel('log10(Energy)')\n", | |
"ax_cscd.set_xlabel('Probability(Track)')\n", | |
"plt.tight_layout()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(8, 3))\n", | |
"p1 = ax1.hexbin(pred_atmu, np.log10(X['tf_r_NuE'][Y_abs == 13]), gridsize=25, norm=LogNorm()\n", | |
" )\n", | |
" #, extent=lims\n", | |
"p2 = ax2.hexbin(pred_atmu, np.log10(X['dusj_d_E'][Y_abs == 13]), gridsize=25, norm=LogNorm()\n", | |
" #, extent=lims\n", | |
" )\n", | |
"plt.colorbar(p1, ax=ax1)\n", | |
"plt.colorbar(p2, ax=ax2)\n", | |
"ax1.set_ylabel('log10(NuE)')\n", | |
"ax2.set_ylabel('log10(d_E)')\n", | |
"ax1.set_xlabel('Probability(Track)')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"fig, (ax_cscd, ax_trak) = plt.subplots(ncols=2, figsize=(8, 3))\n", | |
"ax_cscd.hist(energy[y == 1], bins=lin_bins, histtype='stepfilled', alpha=.5, label='mc')\n", | |
"ax_cscd.hist(e_rec_cscd[y == 1], bins=lin_bins, histtype='stepfilled', alpha=.5, label='shower reco')\n", | |
"ax_trak.hist(energy[y == 2], bins=lin_bins, histtype='stepfilled', alpha=.5, label='mc')\n", | |
"ax_trak.hist(e_rec_trak[y == 2], bins=lin_bins, histtype='stepfilled', alpha=.5, label='track reco')\n", | |
"ax_cscd.set_title('cscd')\n", | |
"ax_trak.set_title('nu_mu cc')\n", | |
"for ax in ax_trak, ax_cscd:\n", | |
" #ax.set_xscale('log')\n", | |
" ax.set_xlabel('Energy / GeV')\n", | |
" ax.legend()\n", | |
" #ax.set_yscale('log')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(8, 3))\n", | |
"lims = (0, 100, 0, 100)\n", | |
"p1 = ax1.hexbin(energy[Y_abs == 12], e_rec_cscd[Y_abs == 12], gridsize=25, norm=LogNorm(), extent=lims)\n", | |
"p2 = ax2.hexbin(energy[Y_abs == 12], e_rec_cscd[Y_abs == 12], gridsize=25, extent=lims)\n", | |
"plt.colorbar(p1, ax=ax1)\n", | |
"plt.colorbar(p2, ax=ax2)\n", | |
"for ax in ax1, ax2:\n", | |
" ax.set_xlabel('e_mc')\n", | |
" ax.set_ylabel('e_reco')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(8, 3))\n", | |
"p1 = ax1.hexbin(energy[Y_abs == 14], e_rec_trak[Y_abs == 14], gridsize=25, norm=LogNorm())\n", | |
"p2 = ax2.hexbin(energy[Y_abs == 14], e_rec_trak[Y_abs == 14], gridsize=25,)\n", | |
"plt.colorbar(p1, ax=ax1)\n", | |
"plt.colorbar(p2, ax=ax2)\n", | |
"for ax in ax1, ax2:\n", | |
" ax.set_xlabel('e_mc')\n", | |
" ax.set_ylabel('e_reco')" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.5.1" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 0 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment