Last active
February 25, 2020 22:59
-
-
Save jonathan-taylor/58c74f7f8cb463e79fc9633d4e9bf5fc to your computer and use it in GitHub Desktop.
Segfault with pandas conversion and joblib?
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from functools import partial\n", | |
"\n", | |
"import numpy as np, pandas as pd\n", | |
"\n", | |
"import rpy2.robjects as rpy\n", | |
"from rpy2.robjects.packages import importr\n", | |
"from rpy2.robjects import pandas2ri\n", | |
"from rpy2.robjects.conversion import localconverter\n", | |
"\n", | |
"from sklearn.base import BaseEstimator, RegressorMixin\n", | |
"from sklearn.model_selection import cross_validate, check_cv\n", | |
"from sklearn.metrics import mean_squared_error\n", | |
"from joblib.parallel import Parallel, delayed\n", | |
"\n", | |
"rpy.r('library(leaps)')\n", | |
"\n", | |
"class Subset(BaseEstimator, RegressorMixin):\n", | |
"\n", | |
" def __init__(self,\n", | |
" formula_str,\n", | |
" nvar,\n", | |
" method='exhaustive'):\n", | |
"\n", | |
" self.formula_str = formula_str\n", | |
" self.formula = rpy.r(formula_str)\n", | |
" self.method = method\n", | |
" self.nvar = nvar\n", | |
"\n", | |
" def fit(self, X, y):\n", | |
" \"\"\"Fit best subsets regression for a given `nvar` \n", | |
" \n", | |
" Parameters \n", | |
" ---------- \n", | |
" X : {array-like}, shape (n_samples, n_features) \n", | |
" Training data. Pass directly as Fortran-contiguous data \n", | |
" to avoid unnecessary memory duplication. If y is mono-output, \n", | |
" X can be sparse. \n", | |
" \n", | |
" y : array-like, shape (n_samples,) or (n_samples, n_targets) \n", | |
" Target values \n", | |
" \"\"\"\n", | |
" leaps = importr('leaps')\n", | |
" base = importr('base')\n", | |
"\n", | |
" D = _convert_df(pd.concat([X, y], axis=1).copy()) # reconstitute data \n", | |
" _regfit = rpy.r['regsubsets'](self.formula,\n", | |
" data=D,\n", | |
" method=self.method,\n", | |
" nvmax=self.nvar)\n", | |
" self._which = rpy.r['summary'](_regfit).rx2('which').astype(np.bool)\n", | |
" _names = _regfit.rx2('xnames')\n", | |
" self._nz_coef = rpy.r['coef'](_regfit, self.nvar)\n", | |
" self.coef_ = pd.Series(self._nz_coef, index=_names[self._which[self.nvar-1]])\n", | |
"\n", | |
" def predict(self, X):\n", | |
" stats = importr('stats')\n", | |
" with localconverter(rpy.default_converter + pandas2ri.converter):\n", | |
" _X = stats.model_matrix(self.formula, data=X)\n", | |
" _X = _X[:, self._which[self.nvar-1]] # 0-based indexing \n", | |
" return _X.dot(self._nz_coef)\n", | |
" \n", | |
"class SubsetCV(BaseEstimator, RegressorMixin): \n", | |
"\n", | |
" def __init__(self, \n", | |
" formula_str, \n", | |
" nvmax, \n", | |
" method='exhaustive', \n", | |
" cv=3, \n", | |
" n_jobs=-1,\n", | |
" verbose=False):\n", | |
" self.formula_str = formula_str\n", | |
" self.formula = rpy.r(formula_str)\n", | |
" self.method = method\n", | |
" self.cv = cv\n", | |
" self.nvmax = nvmax\n", | |
" self.n_jobs = n_jobs\n", | |
" self.verbose = verbose\n", | |
"\n", | |
" def fit(self, X, y):\n", | |
" \"\"\"Fit cross-validated best subsets regression\n", | |
"\n", | |
" Fit is over a grid of sizes up to `nvmax`.\n", | |
"\n", | |
" Parameters\n", | |
" ----------\n", | |
" X : {array-like}, shape (n_samples, n_features)\n", | |
" Training data. Pass directly as Fortran-contiguous data\n", | |
" to avoid unnecessary memory duplication. If y is mono-output,\n", | |
" X can be sparse.\n", | |
"\n", | |
" y : array-like, shape (n_samples,) or (n_samples, n_targets)\n", | |
" Target values\n", | |
" \"\"\"\n", | |
"\n", | |
" if X.shape[0] != y.shape[0]:\n", | |
" raise ValueError(\"X and y have inconsistent dimensions (%d != %d)\"\n", | |
" % (X.shape[0], y.shape[0]))\n", | |
"\n", | |
" cv = check_cv(self.cv)\n", | |
"\n", | |
" # Compute path for all folds and compute MSE to get the best alpha\n", | |
" folds = list(cv.split(X, y))\n", | |
" best_mse = np.inf\n", | |
"\n", | |
" # Loop over folds, computing mse path\n", | |
" # for each (train, test)\n", | |
" D = pd.concat([X, y], axis=1) # reconstitute data\n", | |
"\n", | |
" jobs = (delayed(_regsubsets_MSE)(X,\n", | |
" y,\n", | |
" self.formula, \n", | |
" self.method, \n", | |
" self.nvmax, \n", | |
" train, \n", | |
" test)\n", | |
" for train, test in folds)\n", | |
"\n", | |
" # Execute the jobs in parallel using joblib\n", | |
" MSEs = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,\n", | |
" backend=\"threading\")(jobs)\n", | |
" MSEs = np.asarray(MSEs)\n", | |
" self.mse_path = MSEs.mean(0)\n", | |
"\n", | |
" # Find best index by minimizing MSEs\n", | |
"\n", | |
" self.best_index = int(np.argmin(self.mse_path) + 1)\n", | |
"\n", | |
" # Refit with best index hyperparameter \n", | |
" self.best_estimator = Subset(self.formula_str,\n", | |
" self.best_index,\n", | |
" method=self.method)\n", | |
" print(self.best_index)\n", | |
" self.best_estimator.fit(X, y)\n", | |
" self.coef_ = self.best_estimator.coef_\n", | |
"\n", | |
" \n", | |
" return self\n", | |
"\n", | |
" def predict(self, D):\n", | |
" return self.best_estimator.predict(D)\n", | |
" \n", | |
"\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def _regsubsets_MSE(X,\n", | |
" y,\n", | |
" formula, \n", | |
" method, \n", | |
" nvar, \n", | |
" train, \n", | |
" test):\n", | |
"\n", | |
" D = pd.concat([X, y], axis=1) # reconstitute data\n", | |
"\n", | |
" Dtrain = _convert_df(D.loc[D.index[train]].copy())\n", | |
" Dtest = _convert_df(D.loc[D.index[test]].copy())\n", | |
"\n", | |
" _regfit = rpy.r['regsubsets'](formula, \n", | |
" data=Dtrain, \n", | |
" method=method,\n", | |
" nvmax=nvar)\n", | |
" _which = np.asarray(rpy.r['summary'](_regfit).rx2('which')).astype(np.bool)\n", | |
" _Xtest = np.asarray(rpy.r['model.matrix'](formula, data=Dtest))\n", | |
" _coef = np.zeros(_Xtest.shape[1])\n", | |
"\n", | |
" _MSEs = []\n", | |
"\n", | |
" _y_test = y.loc[y.index[test]]\n", | |
" for ivar in range(1, nvar+1):\n", | |
" _nz_coef = rpy.r['coef'](_regfit, ivar)\n", | |
" _mask = _which[ivar-1]\n", | |
" _coef *= 0\n", | |
" _coef[_mask] = _nz_coef\n", | |
" _y_hat = _Xtest.dot(_coef)\n", | |
" _MSEs.append(((_y_test - _y_hat)**2).mean())\n", | |
"\n", | |
" return _MSEs\n", | |
"\n", | |
"def _convert_df(pd_df):\n", | |
" with localconverter(rpy.default_converter + pandas2ri.converter):\n", | |
" r_from_pd_df = rpy.conversion.py2rpy(pd_df)\n", | |
" return r_from_pd_df\n", | |
"\n", | |
"def _predict(formula, D, _which, nvar, _nz_coef):\n", | |
" D_ = _convert_df(D)\n", | |
" _X = np.asarray(rpy.r['model_matrix'](formula, data=D_))\n", | |
" _X = _X[:, _which[nvar-1]] # 0-based indexing \n", | |
" return _X.dot(_nz_coef)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Index(['X0', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X7', 'X8', 'X9', 'Y'], dtype='object')\n", | |
"[1.1044207361816287, 1.1127754845670763, 1.1162538883158355, 1.1969607794847699, 1.1971463482157412, 1.2241621673072327, 1.2152457283584275, 1.231690457342606]\n" | |
] | |
} | |
], | |
"source": [ | |
"D = pd.DataFrame(dict([('X%d' % i, np.random.standard_normal(200)) for i in range(10)] + \n", | |
" [('Y', np.random.standard_normal(200))]))\n", | |
"print(D.columns)\n", | |
"test = np.ones(200, np.bool)\n", | |
"test[:50] = 0\n", | |
"train = ~test\n", | |
"subset_CV = SubsetCV('Y ~ .', 12, method='exhaustive', cv=10)\n", | |
"print(_regsubsets_MSE(D, D['Y'], subset_CV.formula, 'exhaustive', 8, train, test))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"10\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/plain": [ | |
"SubsetCV(cv=10, formula_str='Y ~ .', method='exhaustive', n_jobs=-1, nvmax=10,\n", | |
" verbose=False)" | |
] | |
}, | |
"execution_count": 4, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"subset_CV = SubsetCV('Y ~ .', 10, method='exhaustive', cv=10)\n", | |
"subset_CV.fit(D, D['Y'])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.7.4" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment