pentschev · September 3, 2019 19:55
diff --git a/dask_cudf_cuml.ipynb b/dask_cudf_cuml.ipynb
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from dask_cuda import LocalCUDACluster\n",
    "from distributed import Client\n",
    "\n",
    "import cudf\n",
    "import dask_cudf\n",
    "import pandas as pd\n",
    "\n",
    "from cuml.dask.ensemble import RandomForestRegressor as cumlRandomForestRegressor\n",
    "from sklearn.ensemble import RandomForestRegressor as sklRandomForestRegressor"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Create a Dask CUDA cluster with one worker per device"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "cluster = LocalCUDACluster()\n",
    "client = Client(cluster)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Read CSV on CPU with pandas"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 25.4 s, sys: 2.61 s, total: 28 s\n",
      "Wall time: 27 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "pdf = pd.read_csv(\"/home/pentschev/datasets/yellow_tripdata_2015-01.csv\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Read CSV on GPU with cuDF"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 2.09 s, sys: 446 ms, total: 2.53 s\n",
      "Wall time: 3 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "df = cudf.read_csv(\"/home/pentschev/datasets/yellow_tripdata_2015-01.csv\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Read CSV in parallel across GPU workers with dask_cudf"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "ddf = dask_cudf.read_csv(\"/home/pentschev/datasets/yellow_tripdata_2015-01.csv\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Fit cuML (Dask) RandomForestRegressor on GPU"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "rfg = cumlRandomForestRegressor()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 298 ms, sys: 62.8 ms, total: 361 ms\n",
      "Wall time: 4.38 s\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "<cuml.dask.ensemble.randomforestregressor.RandomForestRegressor at 0x7f7f711de048>"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "rfg.fit(ddf[[\"trip_distance\", \"passenger_count\"]], ddf[\"total_amount\"])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Fit scikit-learn RandomForestRegressor on CPU"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "sklrfg = sklRandomForestRegressor(n_jobs=-1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/pentschev/miniconda3/envs/rapids-nightly-0.10/lib/python3.7/site-packages/sklearn/ensemble/forest.py:245: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.\n",
      "  \"10 in version 0.20 to 100 in 0.22.\", FutureWarning)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 3min 3s, sys: 2.97 s, total: 3min 6s\n",
      "Wall time: 20 s\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,\n",
       "                      max_features='auto', max_leaf_nodes=None,\n",
       "                      min_impurity_decrease=0.0, min_impurity_split=None,\n",
       "                      min_samples_leaf=1, min_samples_split=2,\n",
       "                      min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,\n",
       "                      oob_score=False, random_state=None, verbose=0,\n",
       "                      warm_start=False)"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "sklrfg.fit(pdf[[\"trip_distance\", \"passenger_count\"]], pdf[\"total_amount\"])"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
 }
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {},
	"outputs": [],
	"source": [
	"from dask_cuda import LocalCUDACluster\n",
	"from distributed import Client\n",
	"\n",
	"import cudf\n",
	"import dask_cudf\n",
	"import pandas as pd\n",
	"\n",
	"from cuml.dask.ensemble import RandomForestRegressor as cumlRandomForestRegressor\n",
	"from sklearn.ensemble import RandomForestRegressor as sklRandomForestRegressor"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### Create a Dask CUDA cluster with one worker per device"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {},
	"outputs": [],
	"source": [
	"cluster = LocalCUDACluster()\n",
	"client = Client(cluster)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### Read CSV on CPU with pandas"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"CPU times: user 25.4 s, sys: 2.61 s, total: 28 s\n",
	"Wall time: 27 s\n"
	]
	}
	],
	"source": [
	"%%time\n",
	"pdf = pd.read_csv(\"/home/pentschev/datasets/yellow_tripdata_2015-01.csv\")"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### Read CSV on GPU with cuDF"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"CPU times: user 2.09 s, sys: 446 ms, total: 2.53 s\n",
	"Wall time: 3 s\n"
	]
	}
	],
	"source": [
	"%%time\n",
	"df = cudf.read_csv(\"/home/pentschev/datasets/yellow_tripdata_2015-01.csv\")"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### Read CSV in parallel across GPU workers with dask_cudf"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"metadata": {},
	"outputs": [],
	"source": [
	"ddf = dask_cudf.read_csv(\"/home/pentschev/datasets/yellow_tripdata_2015-01.csv\")"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### Fit cuML (Dask) RandomForestRegressor on GPU"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 7,
	"metadata": {},
	"outputs": [],
	"source": [
	"rfg = cumlRandomForestRegressor()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 8,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"CPU times: user 298 ms, sys: 62.8 ms, total: 361 ms\n",
	"Wall time: 4.38 s\n"
	]
	},
	{
	"data": {
	"text/plain": [
	"<cuml.dask.ensemble.randomforestregressor.RandomForestRegressor at 0x7f7f711de048>"
	]
	},
	"execution_count": 8,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"%%time\n",
	"rfg.fit(ddf[[\"trip_distance\", \"passenger_count\"]], ddf[\"total_amount\"])"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### Fit scikit-learn RandomForestRegressor on CPU"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 9,
	"metadata": {},
	"outputs": [],
	"source": [
	"sklrfg = sklRandomForestRegressor(n_jobs=-1)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 10,
	"metadata": {},
	"outputs": [
	{
	"name": "stderr",
	"output_type": "stream",
	"text": [
	"/home/pentschev/miniconda3/envs/rapids-nightly-0.10/lib/python3.7/site-packages/sklearn/ensemble/forest.py:245: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.\n",
	"  \"10 in version 0.20 to 100 in 0.22.\", FutureWarning)\n"
	]
	},
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"CPU times: user 3min 3s, sys: 2.97 s, total: 3min 6s\n",
	"Wall time: 20 s\n"
	]
	},
	{
	"data": {
	"text/plain": [
	"RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,\n",
	"                      max_features='auto', max_leaf_nodes=None,\n",
	"                      min_impurity_decrease=0.0, min_impurity_split=None,\n",
	"                      min_samples_leaf=1, min_samples_split=2,\n",
	"                      min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,\n",
	"                      oob_score=False, random_state=None, verbose=0,\n",
	"                      warm_start=False)"
	]
	},
	"execution_count": 10,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"%%time\n",
	"sklrfg.fit(pdf[[\"trip_distance\", \"passenger_count\"]], pdf[\"total_amount\"])"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.7.3"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 4
	}