Created
September 3, 2019 19:55
-
-
Save pentschev/bf9bea35f0093b2021b8b5f8bfef53bf to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from dask_cuda import LocalCUDACluster\n", | |
"from distributed import Client\n", | |
"\n", | |
"import cudf\n", | |
"import dask_cudf\n", | |
"import pandas as pd\n", | |
"\n", | |
"from cuml.dask.ensemble import RandomForestRegressor as cumlRandomForestRegressor\n", | |
"from sklearn.ensemble import RandomForestRegressor as sklRandomForestRegressor" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Create a Dask CUDA cluster with one worker per device" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"cluster = LocalCUDACluster()\n", | |
"client = Client(cluster)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Read CSV on CPU with pandas" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 25.4 s, sys: 2.61 s, total: 28 s\n", | |
"Wall time: 27 s\n" | |
] | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"pdf = pd.read_csv(\"/home/pentschev/datasets/yellow_tripdata_2015-01.csv\")" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Read CSV on GPU with cuDF" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 2.09 s, sys: 446 ms, total: 2.53 s\n", | |
"Wall time: 3 s\n" | |
] | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"df = cudf.read_csv(\"/home/pentschev/datasets/yellow_tripdata_2015-01.csv\")" | |
] | |
}, | |
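{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Read CSV file in parallel across workers on GPU\n", | |
"\n", | |
"Note: `dask_cudf.read_csv` is lazy. This cell does not load the full file; the data is parsed on the workers when it is first computed, which in this notebook happens during the GPU `fit` below, so that timing includes the CSV read." | |
] | |
}, | |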
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"ddf = dask_cudf.read_csv(\"/home/pentschev/datasets/yellow_tripdata_2015-01.csv\")" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Fit RandomForestRegressor on GPU (cuML, distributed across the Dask CUDA workers)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Pin the number of trees explicitly so the GPU and CPU models are comparable\n", | |
"# (the scikit-learn run recorded in this notebook used n_estimators=10).\n", | |
"rfg = cumlRandomForestRegressor(n_estimators=10)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 298 ms, sys: 62.8 ms, total: 361 ms\n", | |
"Wall time: 4.38 s\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/plain": [ | |
"<cuml.dask.ensemble.randomforestregressor.RandomForestRegressor at 0x7f7f711de048>" | |
] | |
}, | |
"execution_count": 8, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"rfg.fit(ddf[[\"trip_distance\", \"passenger_count\"]], ddf[\"total_amount\"])" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Fit RandomForestRegressor on CPU (scikit-learn, `n_jobs=-1` to use all cores)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Pin n_estimators explicitly: scikit-learn's default changes from 10 to 100 in 0.22\n", | |
"# (see the FutureWarning recorded for the fit below), which would otherwise silently\n", | |
"# change the amount of work being timed when this notebook is re-run.\n", | |
"sklrfg = sklRandomForestRegressor(n_estimators=10, n_jobs=-1)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"/home/pentschev/miniconda3/envs/rapids-nightly-0.10/lib/python3.7/site-packages/sklearn/ensemble/forest.py:245: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.\n", | |
" \"10 in version 0.20 to 100 in 0.22.\", FutureWarning)\n" | |
] | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 3min 3s, sys: 2.97 s, total: 3min 6s\n", | |
"Wall time: 20 s\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/plain": [ | |
"RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,\n", | |
" max_features='auto', max_leaf_nodes=None,\n", | |
" min_impurity_decrease=0.0, min_impurity_split=None,\n", | |
" min_samples_leaf=1, min_samples_split=2,\n", | |
" min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,\n", | |
" oob_score=False, random_state=None, verbose=0,\n", | |
" warm_start=False)" | |
] | |
}, | |
"execution_count": 10, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"sklrfg.fit(pdf[[\"trip_distance\", \"passenger_count\"]], pdf[\"total_amount\"])" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.7.3" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 4 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment