Created
March 31, 2020 10:15
-
-
Save MaxHalford/47cd83f7cb8e23d2db5616ba9b177ea9 to your computer and use it in GitHub Desktop.
Improving scikit-learn's single prediction speed
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Speeding up scikit-learn for single predictions" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 44, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"'0.22.2.post1'" | |
] | |
}, | |
"execution_count": 44, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"import sklearn\n", | |
"\n", | |
"sklearn.__version__" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Linear regression" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 31, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from sklearn import datasets\n", | |
"\n", | |
"# Regression data for the linear-model benchmark below.\n", | |
"X, y = datasets.load_boston(\n", | |
"    return_X_y=True,  # get the (features, target) arrays directly\n", | |
")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 32, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"44.4 µs ± 3.7 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)\n" | |
] | |
} | |
], | |
"source": [ | |
"from sklearn import linear_model\n", | |
"\n", | |
"# Baseline: the stock estimator, timed on a one-row batch.\n", | |
"lin_reg = linear_model.LinearRegression().fit(X, y)\n", | |
"%timeit lin_reg.predict(X[[0]])[0]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 34, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"1.38 µs ± 31.7 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)\n" | |
] | |
} | |
], | |
"source": [ | |
"import numpy as np\n", | |
"\n", | |
"class BarebonesLinearRegression(linear_model.LinearRegression):\n", | |
"    \"\"\"LinearRegression with an extra entry point for scoring one sample.\"\"\"\n", | |
"\n", | |
"    def predict_single(self, x):\n", | |
"        \"\"\"Return the prediction for one feature vector ``x``.\n", | |
"\n", | |
"        Evaluates ``coef_ . x + intercept_`` directly instead of calling ``predict``.\n", | |
"        \"\"\"\n", | |
"        weighted_sum = self.coef_.dot(x)\n", | |
"        return weighted_sum + self.intercept_\n", | |
"\n", | |
"bb_lin_reg = BarebonesLinearRegression().fit(X, y)\n", | |
"%timeit bb_lin_reg.predict_single(X[0])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 35, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Compare with a tolerance rather than ==: the batch and single-sample paths\n", | |
"# compute the dot product differently and may round differently.\n", | |
"for xi in X:\n", | |
"    assert np.isclose(lin_reg.predict([xi])[0], bb_lin_reg.predict_single(xi))" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Logistic regression" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 137, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from sklearn import datasets, preprocessing\n", | |
"\n", | |
"# Multi-class data for the logistic-regression benchmark, standardised up front.\n", | |
"X, y = datasets.load_digits(return_X_y=True)\n", | |
"X = preprocessing.scale(X)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 138, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"71.3 µs ± 3.59 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)\n" | |
] | |
} | |
], | |
"source": [ | |
"# Baseline: stock predict_proba, timed on a one-row batch.\n", | |
"log_reg = linear_model.LogisticRegression().fit(X, y)\n", | |
"%timeit log_reg.predict_proba(X[[0]])[0]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 145, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
" " | |
] | |
}, | |
{ | |
"data": { | |
"text/plain": [ | |
" 4400004 function calls in 4.414 seconds\n", | |
"\n", | |
" Ordered by: internal time\n", | |
" List reduced from 34 to 10 due to restriction <10>\n", | |
"\n", | |
" ncalls tottime percall cumtime percall filename:lineno(function)\n", | |
" 100000 1.068 0.000 3.504 0.000 _logsumexp.py:9(logsumexp)\n", | |
" 200000 0.530 0.000 0.530 0.000 {method 'reduce' of 'numpy.ufunc' objects}\n", | |
" 300000 0.315 0.000 1.376 0.000 {built-in method numpy.core._multiarray_umath.implement_array_function}\n", | |
" 100000 0.257 0.000 3.761 0.000 _logsumexp.py:132(softmax)\n", | |
" 100000 0.254 0.000 4.316 0.000 <ipython-input-145-3bbdfc107533>:5(predict_proba_single)\n", | |
" 200000 0.245 0.000 0.578 0.000 _ufunc_config.py:39(seterr)\n", | |
" 100000 0.232 0.000 0.383 0.000 _util.py:200(_asarray_validated)\n", | |
" 200000 0.226 0.000 0.868 0.000 fromnumeric.py:73(_wrapreduction)\n", | |
" 200000 0.201 0.000 0.222 0.000 _ufunc_config.py:139(geterr)\n", | |
" 100000 0.095 0.000 0.520 0.000 fromnumeric.py:2092(sum)" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"from scipy import special\n", | |
"\n", | |
"class BarebonesLogisticRegression(linear_model.LogisticRegression):\n", | |
"    \"\"\"First attempt: single-sample probabilities via scipy's softmax.\"\"\"\n", | |
"    def predict_proba_single(self, x):\n", | |
"        \"\"\"Return the softmax of the linear scores for one feature vector ``x``.\"\"\"\n", | |
"        scores = np.dot(self.coef_, x) + self.intercept_\n", | |
"        return special.softmax(scores)\n", | |
"\n", | |
"bb_log_reg = BarebonesLogisticRegression().fit(X, y)\n", | |
"# Profile 100000 calls and show the 10 most expensive entries.\n", | |
"%prun -l 10 [bb_log_reg.predict_proba_single(X[0]) for _ in range(100000)]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 147, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"14.7 µs ± 682 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)\n" | |
] | |
} | |
], | |
"source": [ | |
"def custom_softmax(x):\n", | |
"    \"\"\"Softmax of a 1-D score vector, shifted by its maximum before exponentiating.\"\"\"\n", | |
"    exps = np.exp(x - max(x))\n", | |
"    return exps / np.sum(exps)\n", | |
"\n", | |
"class BarebonesLogisticRegression(linear_model.LogisticRegression):\n", | |
"    \"\"\"LogisticRegression with a lean single-sample ``predict_proba``.\"\"\"\n", | |
"\n", | |
"    def predict_proba_single(self, x):\n", | |
"        \"\"\"Return class probabilities for one feature vector ``x``.\n", | |
"\n", | |
"        NOTE(review): softmax is applied unconditionally; that looks right only for\n", | |
"        multinomial fits with more than two classes -- confirm before reusing this\n", | |
"        on binary or one-vs-rest models.\n", | |
"        \"\"\"\n", | |
"        scores = np.dot(self.coef_, x) + self.intercept_\n", | |
"        return custom_softmax(scores)\n", | |
"\n", | |
"bb_log_reg = BarebonesLogisticRegression().fit(X, y)\n", | |
"%timeit bb_log_reg.predict_proba_single(X[0])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 148, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Both code paths must give numerically matching probabilities for every sample.\n", | |
"for xi in X:\n", | |
"    reference = log_reg.predict_proba([xi])[0]\n", | |
"    assert np.allclose(reference, bb_log_reg.predict_proba_single(xi))" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Standard scaling" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 36, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"43 µs ± 1.07 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)\n" | |
] | |
} | |
], | |
"source": [ | |
"from sklearn import preprocessing\n", | |
"\n", | |
"# Baseline: stock transform, timed on a one-row batch.\n", | |
"scaler = preprocessing.StandardScaler().fit(X)\n", | |
"%timeit scaler.transform(X[[0]])[0]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 37, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"1.7 µs ± 34.6 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)\n" | |
] | |
} | |
], | |
"source": [ | |
"class BarebonesStandardScaler(preprocessing.StandardScaler):\n", | |
"    \"\"\"StandardScaler with a lean single-sample transform.\"\"\"\n", | |
"\n", | |
"    def transform_single(self, x):\n", | |
"        \"\"\"Standardise one feature vector ``x`` with the fitted statistics.\n", | |
"\n", | |
"        Divides by the fitted ``scale_`` rather than ``var_ ** .5``: ``scale_`` is\n", | |
"        the divisor ``transform`` itself uses, it is computed once at fit time, and\n", | |
"        it is 1 for constant features, so they do not cause a division by zero.\n", | |
"        \"\"\"\n", | |
"        return (x - self.mean_) / self.scale_\n", | |
"\n", | |
"bb_scaler = BarebonesStandardScaler()\n", | |
"bb_scaler.fit(X)\n", | |
"%timeit bb_scaler.transform_single(X[0])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 41, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# The single-sample transform must reproduce the stock output exactly.\n", | |
"for xi in X:\n", | |
"    reference = scaler.transform([xi])[0]\n", | |
"    assert np.array_equal(reference, bb_scaler.transform_single(xi))" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Pipeline" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 48, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"97.6 µs ± 4.19 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)\n" | |
] | |
} | |
], | |
"source": [ | |
"from sklearn import pipeline\n", | |
"\n", | |
"# Baseline: a stock Pipeline built from the `scaler` and `lin_reg` objects above.\n", | |
"# Fitting the pipeline refits those same objects in place.\n", | |
"pp = pipeline.Pipeline([\n", | |
"    ('scaler', scaler),\n", | |
"    ('lin_reg', lin_reg),\n", | |
"]).fit(X, y)\n", | |
"%timeit pp.predict(X[[0]])[0]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 56, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"3.96 µs ± 184 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)\n" | |
] | |
} | |
], | |
"source": [ | |
"class BarebonesPipeline(pipeline.Pipeline):\n", | |
"    \"\"\"Pipeline that chains the ``*_single`` entry points of its steps.\"\"\"\n", | |
"\n", | |
"    def _transform_single(self, x):\n", | |
"        \"\"\"Run ``x`` through every step except the last and return the result.\"\"\"\n", | |
"        for _, transformer in self.steps[:-1]:\n", | |
"            # Like the stock Pipeline, treat None / 'passthrough' steps as no-ops.\n", | |
"            if transformer is None or transformer == 'passthrough':\n", | |
"                continue\n", | |
"            x = transformer.transform_single(x)\n", | |
"        return x\n", | |
"\n", | |
"    def predict_single(self, x):\n", | |
"        \"\"\"Transform ``x``, then call the final step's ``predict_single`` on it.\"\"\"\n", | |
"        return self.steps[-1][1].predict_single(self._transform_single(x))\n", | |
"\n", | |
"    def predict_proba_single(self, x):\n", | |
"        \"\"\"Transform ``x``, then call the final step's ``predict_proba_single`` on it.\"\"\"\n", | |
"        return self.steps[-1][1].predict_proba_single(self._transform_single(x))\n", | |
"\n", | |
"bb_pp = BarebonesPipeline([('bb_scaler', bb_scaler), ('bb_lin_reg', bb_lin_reg)])\n", | |
"bb_pp.fit(X, y)\n", | |
"%timeit bb_pp.predict_single(X[0])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 57, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Compare with a tolerance rather than ==: the batch and single-sample paths\n", | |
"# compute the dot product differently and may round differently.\n", | |
"for xi in X:\n", | |
"    assert np.isclose(pp.predict([xi])[0], bb_pp.predict_single(xi))" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Decision tree" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 128, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"49 µs ± 1.41 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)\n" | |
] | |
} | |
], | |
"source": [ | |
"from sklearn import tree\n", | |
"\n", | |
"X, y = datasets.fetch_california_housing(return_X_y=True)\n", | |
"# Baseline: stock tree prediction, timed on a one-row batch.\n", | |
"dtree = tree.DecisionTreeRegressor(max_depth=7).fit(X, y)\n", | |
"%timeit dtree.predict(X[[0]])[0]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 130, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"42.9 µs ± 1.34 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)\n" | |
] | |
} | |
], | |
"source": [ | |
"class BarebonesDecisionTreeRegressor(tree.DecisionTreeRegressor):\n", | |
"    \"\"\"DecisionTreeRegressor with a direct leaf lookup for one sample.\"\"\"\n", | |
"\n", | |
"    def predict_single(self, x):\n", | |
"        \"\"\"Return the value stored at the leaf that ``x`` is routed to.\"\"\"\n", | |
"        leaf = self.apply([x])[0]\n", | |
"        leaf_values = self.tree_.value[leaf]\n", | |
"        return leaf_values[0, 0]\n", | |
"\n", | |
"bb_dtree = BarebonesDecisionTreeRegressor(max_depth=7).fit(X, y)\n", | |
"%timeit bb_dtree.predict_single(X[0])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 135, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# The leaf lookup must agree with the stock predict for every sample.\n", | |
"for xi in X:\n", | |
"    reference = dtree.predict([xi])[0]\n", | |
"    assert np.isclose(reference, bb_dtree.predict_single(xi))" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.7.4" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 4 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment