Skip to content

Instantly share code, notes, and snippets.

@stsievert
Last active July 28, 2018 02:42
Show Gist options
  • Save stsievert/facdf6c1427c810cafecd30848fe20d9 to your computer and use it in GitHub Desktop.
Save stsievert/facdf6c1427c810cafecd30848fe20d9 to your computer and use it in GitHub Desktop.
Successive Halving with _incremental.fit
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Incremental Model Selection\n"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"\n",
"from sklearn.linear_model import SGDClassifier\n",
"\n",
"import dask\n",
"from dask.distributed import Client\n",
"from dask_ml.datasets import make_classification\n",
"from dask_ml.model_selection._incremental import fit"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<table style=\"border: 2px solid white;\">\n",
"<tr>\n",
"<td style=\"vertical-align: top; border: 0px solid white\">\n",
"<h3>Client</h3>\n",
"<ul>\n",
" <li><b>Scheduler: </b>tcp://127.0.0.1:56260\n",
" <li><b>Dashboard: </b><a href='http://127.0.0.1:8787/status' target='_blank'>http://127.0.0.1:8787/status</a>\n",
"</ul>\n",
"</td>\n",
"<td style=\"vertical-align: top; border: 0px solid white\">\n",
"<h3>Cluster</h3>\n",
"<ul>\n",
" <li><b>Workers: </b>8</li>\n",
" <li><b>Cores: </b>8</li>\n",
" <li><b>Memory: </b>17.18 GB</li>\n",
"</ul>\n",
"</td>\n",
"</tr>\n",
"</table>"
],
"text/plain": [
"<Client: scheduler='tcp://127.0.0.1:56260' processes=8 cores=8>"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"client = Client(processes=True)\n",
"client"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Make data"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"X, y = make_classification(n_samples=5000000, n_features=20,\n",
" chunks=100000)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Incremental.fit"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"X, y = make_classification(n_samples=5000000, n_features=20,\n",
" chunks=100000)\n",
"model = SGDClassifier(tol=1e-3, penalty='elasticnet')"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"max_iter = 100\n",
"patience = 10\n",
"tol = 0.001\n",
"\n",
"def adapt(info):\n",
" [info] = info.values()\n",
" if max_iter is not None and len(info) > max_iter:\n",
" return {0: 0}\n",
"\n",
" if len(info) > patience:\n",
" old = info[-patience]['score']\n",
" if all(d['score'] < old + tol for d in info[-patience:]):\n",
" return {0: 0}\n",
"\n",
" return {0: 1}\n",
"\n",
"from dask_ml.model_selection._incremental import fit\n",
"X_test = X.blocks[-1]\n",
"X = X.blocks[:-1]\n",
"y_test = y.blocks[-1]\n",
"y = y.blocks[:-1]\n",
"info, models, history = fit(model, [{}], X, y, X_test, y_test, adapt,\n",
" {'classes': [0, 1]})\n"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[{'model_id': 0, 'params': {}, 'partial_fit_calls': 19, 'score': 0.58955},\n",
" {'model_id': 0, 'params': {}, 'partial_fit_calls': 20, 'score': 0.58888},\n",
" {'model_id': 0, 'params': {}, 'partial_fit_calls': 21, 'score': 0.59109},\n",
" {'model_id': 0, 'params': {}, 'partial_fit_calls': 22, 'score': 0.58677},\n",
" {'model_id': 0, 'params': {}, 'partial_fit_calls': 23, 'score': 0.58385}]"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"info[0][-5:]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### RandomSearch"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"all_models = {}"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{0: 1, 1: 1, 2: 1, 3: 1, 4: 1}\n",
"{0: 1, 1: 1, 2: 1, 3: 1, 4: 1}\n",
"{0: 1, 1: 1, 2: 1, 3: 1, 4: 1}\n",
"{0: 1, 1: 1, 2: 1, 3: 1, 4: 1}\n",
"{0: 1, 1: 1, 2: 1, 3: 1, 4: 1}\n",
"{0: 1, 1: 1, 2: 1, 3: 1, 4: 1}\n",
"{0: 1, 1: 1, 2: 1, 3: 1, 4: 1}\n",
"{0: 1, 1: 1, 2: 1, 3: 1, 4: 1}\n",
"{0: 1, 1: 1, 2: 1, 3: 1, 4: 1}\n",
"{0: 1, 1: 1, 2: 1, 3: 1, 4: 1}\n",
"{0: 0, 1: 1, 2: 0, 3: 1, 4: 0}\n",
"{0: 0, 1: 1, 2: 0, 3: 1, 4: 0}\n",
"{0: 0, 1: 1, 2: 0, 3: 1, 4: 0}\n",
"{0: 0, 1: 1, 2: 0, 3: 1, 4: 0}\n",
"{0: 0, 1: 1, 2: 0, 3: 1, 4: 0}\n",
"{0: 0, 1: 1, 2: 0, 3: 1, 4: 0}\n",
"{0: 0, 1: 1, 2: 0, 3: 1, 4: 0}\n",
"{0: 0, 1: 1, 2: 0, 3: 1, 4: 0}\n",
"{0: 0, 1: 1, 2: 0, 3: 1, 4: 0}\n",
"{0: 0, 1: 1, 2: 0, 3: 0, 4: 0}\n",
"{0: 0, 1: 1, 2: 0, 3: 0, 4: 0}\n",
"{0: 0, 1: 1, 2: 0, 3: 0, 4: 0}\n",
"{0: 0, 1: 1, 2: 0, 3: 0, 4: 0}\n",
"{0: 0, 1: 1, 2: 0, 3: 0, 4: 0}\n",
"{0: 0, 1: 1, 2: 0, 3: 0, 4: 0}\n",
"{0: 0, 1: 1, 2: 0, 3: 0, 4: 0}\n",
"{0: 0, 1: 1, 2: 0, 3: 0, 4: 0}\n",
"{0: 0, 1: 0, 2: 0, 3: 0, 4: 0}\n"
]
}
],
"source": [
"max_iter = 81\n",
"patience = 10\n",
"tol = 0.001\n",
"\n",
"def adapt(info):\n",
" out = {}\n",
" for ident, records in info.items():\n",
" if max_iter is not None and len(records) > max_iter:\n",
" out[ident] = 0\n",
"\n",
" elif len(records) > patience:\n",
" old = records[-patience]['score']\n",
" if all(d['score'] < old + tol for d in records[-patience:]):\n",
" out[ident] = 0\n",
" else:\n",
" out[ident] = 1\n",
" \n",
" else:\n",
" out[ident] = 1\n",
" print(out)\n",
" return out\n",
"\n",
"X, y = make_classification(n_samples=5000000, n_features=20,\n",
" chunks=100000)\n",
"model = SGDClassifier(tol=1e-3, penalty='elasticnet')\n",
"\n",
"params = {'alpha': np.logspace(-2, 1, num=1000),\n",
" 'l1_ratio': np.linspace(0, 1, num=1000),\n",
" 'average': [True, False]}\n",
"\n",
"from sklearn.model_selection import ParameterSampler\n",
"params_list = list(ParameterSampler(params, 5))\n",
"\n",
"from dask_ml.model_selection._incremental import fit\n",
"X_test = X.blocks[-1]\n",
"X = X.blocks[:-1]\n",
"y_test = y.blocks[-1]\n",
"y = y.blocks[:-1]\n",
"info, models, history = fit(model, params_list, X, y, X_test, y_test, adapt,\n",
" {'classes': [0, 1]})\n",
"all_models['random'] = info"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[{'model_id': 3,\n",
" 'params': {'l1_ratio': 0.5495495495495496,\n",
" 'average': True,\n",
" 'alpha': 0.26510836019085376},\n",
" 'partial_fit_calls': 15,\n",
" 'score': 0.64588},\n",
" {'model_id': 3,\n",
" 'params': {'l1_ratio': 0.5495495495495496,\n",
" 'average': True,\n",
" 'alpha': 0.26510836019085376},\n",
" 'partial_fit_calls': 16,\n",
" 'score': 0.64604},\n",
" {'model_id': 3,\n",
" 'params': {'l1_ratio': 0.5495495495495496,\n",
" 'average': True,\n",
" 'alpha': 0.26510836019085376},\n",
" 'partial_fit_calls': 17,\n",
" 'score': 0.64607},\n",
" {'model_id': 3,\n",
" 'params': {'l1_ratio': 0.5495495495495496,\n",
" 'average': True,\n",
" 'alpha': 0.26510836019085376},\n",
" 'partial_fit_calls': 18,\n",
" 'score': 0.6459},\n",
" {'model_id': 3,\n",
" 'params': {'l1_ratio': 0.5495495495495496,\n",
" 'average': True,\n",
" 'alpha': 0.26510836019085376},\n",
" 'partial_fit_calls': 19,\n",
" 'score': 0.64594}]"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"best = max(info, key=lambda k: info[k][-1]['score'])\n",
"info[best][-5:]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Successive halving"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"import toolz\n",
"import math\n",
"\n",
"class SHA:\n",
" def __init__(self, n, r, eta=3):\n",
" self.steps = 0\n",
" self.n = n\n",
" self.r = r\n",
" self.eta = eta\n",
" \n",
" def fit(self, info):\n",
" n = self.n\n",
" r = self.r\n",
" eta = self.eta\n",
" \n",
" n_i = math.floor(n * eta ** -self.steps)\n",
" r_i = r * eta**self.steps\n",
" iters = {v[-1]['partial_fit_calls'] for v in info.values()}\n",
" if self.steps == 0:\n",
" self.steps = 1\n",
" return {k: r_i for k in info}\n",
" \n",
" best = toolz.topk(n_i // eta, info, key=lambda k: info[k][-1]['score'])\n",
"\n",
" if len(best) == 1:\n",
" self._best_arm = best\n",
" elif len(best) == 0:\n",
" [best] = self._best_arm\n",
" print({best: info[best][-1]['partial_fit_calls']})\n",
" return {best: 0}\n",
"\n",
" out = {k: r_i - info[k][-1]['partial_fit_calls']\n",
" for k in best}\n",
"\n",
" print(\"iters =\", iters)\n",
" print(out)\n",
" self.steps += 1\n",
" return out\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(81, 1)"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"model = SGDClassifier(tol=1e-3, penalty='elasticnet')\n",
"\n",
"params = {'alpha': np.logspace(-2, 1, num=1000),\n",
" 'l1_ratio': np.linspace(0, 1, num=1000),\n",
" 'average': [True, False]}\n",
"params = {'alpha': np.logspace(-2, 1, num=1000),\n",
" 'l1_ratio': np.linspace(0, 1, num=1000),\n",
" 'average': [True, False]}\n",
"\n",
"from sklearn.model_selection import ParameterSampler\n",
"\n",
"R = 81\n",
"eta = 3\n",
"# def hyperband(...):\n",
"s_max = math.floor(math.log(R, eta))\n",
"B = (s_max + 1) * R\n",
"# for s in [...]:\n",
"s = s_max # pick the most exploratory bracket\n",
"n = math.ceil(B / R * eta**s / (s + 1))\n",
"r = int(R * eta**-s)\n",
"n, r"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"iters = {1}\n",
"{46: 2, 8: 2, 36: 2, 28: 2, 64: 2, 24: 2, 21: 2, 27: 2, 40: 2}\n",
"iters = {3}\n",
"{8: 6, 24: 6, 36: 6}\n",
"iters = {9}\n",
"{24: 18}\n",
"{24: 27}\n"
]
}
],
"source": [
"alg = SHA(n, r)\n",
"\n",
"params_list = list(ParameterSampler(params, n))\n",
"\n",
"from dask_ml.model_selection._incremental import fit\n",
"X_test = X.blocks[-1]\n",
"X = X.blocks[:-1]\n",
"y_test = y.blocks[-1]\n",
"y = y.blocks[:-1]\n",
"\n",
"info, models, history = fit(model, params_list, X, y, X_test, y_test,\n",
" alg.fit, {'classes': [0, 1]})\n",
"# all_models['hyperband'] = info"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[{'model_id': 27,\n",
" 'params': {'l1_ratio': 0.34634634634634637,\n",
" 'average': False,\n",
" 'alpha': 0.26510836019085376},\n",
" 'partial_fit_calls': 3,\n",
" 'score': 0.66754},\n",
" {'model_id': 24,\n",
" 'params': {'l1_ratio': 0.9159159159159159,\n",
" 'average': False,\n",
" 'alpha': 0.18891927762076663},\n",
" 'partial_fit_calls': 9,\n",
" 'score': 0.6679},\n",
" {'model_id': 8,\n",
" 'params': {'l1_ratio': 0.7927927927927928,\n",
" 'average': False,\n",
" 'alpha': 0.12563166024741207},\n",
" 'partial_fit_calls': 9,\n",
" 'score': 0.66776},\n",
" {'model_id': 36,\n",
" 'params': {'l1_ratio': 0.1931931931931932,\n",
" 'average': False,\n",
" 'alpha': 0.08890965989529158},\n",
" 'partial_fit_calls': 9,\n",
" 'score': 0.66755},\n",
" {'model_id': 24,\n",
" 'params': {'l1_ratio': 0.9159159159159159,\n",
" 'average': False,\n",
" 'alpha': 0.18891927762076663},\n",
" 'partial_fit_calls': 27,\n",
" 'score': 0.66742}]"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"history[-5:]"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{0, 1, 3, 9, 27}"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"{v['partial_fit_calls'] for v in history}"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment