Skip to content

Instantly share code, notes, and snippets.

@rnyak
Last active April 3, 2020 17:50
Show Gist options
  • Save rnyak/7e021bafae1bb16420e27679c95d90cd to your computer and use it in GitHub Desktop.
Save rnyak/7e021bafae1bb16420e27679c95d90cd to your computer and use it in GitHub Desktop.
import pandas as pd
import math
import cudf
import dask, dask_cudf
import xgboost as xgb
from dask.distributed import Client, wait
from dask_cuda import LocalCUDACluster
# Create a LocalCUDACluster and attach a Dask distributed client to it.
# NOTE(review): the original comment said this connects to a Dask cluster
# created at Dataproc startup time, but the code below instantiates a new
# LocalCUDACluster itself -- confirm which of the two is intended.
cluster = LocalCUDACluster()
client = Client(cluster)
# Restart the workers before doing any work; per the author this is useful to
# ensure GPU memory is clear.
client.restart()
# Bare expression, presumably left over from a notebook cell where it displays
# the client; it has no effect when the file runs as a plain script.
client
# Seven-row toy taxi dataset held in a single cuDF DataFrame.
# 'fare_amount' is the regression target; the other two columns are features.
cdf = cudf.DataFrame(
    {
        'passengers': [2, 1, 1, 1, 1, 3, 2],
        'trip_distance': [1.59, 3.30, 1.80, 0.50, 3.00, 6.00, 12.00],
        'fare_amount': [12.0, 14.5, 9.5, 3.5, 15.0, 17.00, 18.50],
    }
)
# Wrap the cuDF frame as a dask_cudf DataFrame with one partition.
ddf = dask_cudf.from_cudf(cdf, npartitions=1)
def _split_features_target(rows, target='fare_amount'):
    """Persist *rows* and split it into feature and target frames.

    Returns ``(features, target_frame)``. ``target_frame`` is a persisted
    single-column frame holding *target*; ``features`` is a (non-persisted)
    selection of every other column of the persisted *rows*.
    """
    rows = rows.persist()
    target_frame = rows[[target]].persist()
    features = rows[rows.columns.difference([target])]
    return features, target_frame


# Training set: rows with trip_distance below 6.
X_train, Y_train = _split_features_target(ddf.query('trip_distance <6'))
# Block until the persisted work behind both frames has finished
# (author's note: this won't return until all data is in GPU memory).
done = wait([X_train, Y_train])

# Test set: rows with trip_distance of 6 or more.
# NOTE: the original left a drop_empty_partitions(X_test) call disabled here.
X_test, Y_test = _split_features_target(ddf.query('trip_distance >= 6'))
# Same barrier as for the training frames.
done = wait([X_test, Y_test])
# Wrap the training features and labels for distributed XGBoost training.
dtrain = xgb.dask.DaskDMatrix(client, X_train, Y_train)

# Booster parameters.
# Changes from the original dict:
#   * 'silent' is deprecated; 'verbosity': 0 is the supported way to silence
#     library logging.
#   * 'n_gpus' is deprecated/removed (multi-GPU training is driven by the Dask
#     cluster instead), so it is dropped.
#   * 'verbose_eval' is not a booster parameter -- inside the params dict it
#     was ignored -- so it is passed to train() as a keyword below.
xgb_params = {
    'learning_rate': 0.3,
    'max_depth': 8,
    'objective': 'reg:squarederror',
    'subsample': 0.6,
    'gamma': 1,
    'verbosity': 0,
    'tree_method': 'gpu_hist',
}

# Train the model. The training matrix doubles as the evaluation set, so the
# reported metric is the training error. The later prediction step reads the
# fitted model from trained_model['booster'].
trained_model = xgb.dask.train(
    client,
    xgb_params,
    dtrain,
    num_boost_round=100,
    evals=[(dtrain, 'train')],
    verbose_eval=True,
)
# Generate predictions on the test set; X_test is fed in as a dataframe.
booster = trained_model['booster']
prediction = xgb.dask.predict(client, booster, X_test).persist()
# Block until the persisted predictions have been computed.
wait(prediction)
# Notebook output recorded by the author: dask.dataframe.core.Series
type(prediction)
# Convert the predictions to a dask_cudf.core.Series.
pred = dask_cudf.from_dask_dataframe(prediction)
true = Y_test['fare_amount']

# RMSE formula: sqrt(mean((pred - true) ** 2))
#
# Why the original attempt returned only nulls: arithmetic between two Series
# pairs rows by index label, not by position. `true` still carries the labels
# of the rows kept by the 'trip_distance >= 6' filter (5 and 6), whereas the
# predictions are labelled from 0, so no label matches. The author's recorded
# output of ((pred - true) ** 2).compute() was:
#     0 null
#     1 null
#     5 null
#     6 null
#     dtype: float64
#
# Dropping both indexes makes the subtraction pair rows by position instead.
# NOTE(review): this assumes the predictions keep the partitioning and row
# order of X_test (and therefore of Y_test, which comes from the same frame)
# -- confirm before relying on it with multi-partition data.
pred = pred.reset_index(drop=True)
true = true.reset_index(drop=True)

# Per-row squared error, gathered from the workers.
SE = ((pred - true) ** 2).compute()
rmse = math.sqrt(SE.mean())
# Bare expression so a notebook cell displays the result.
rmse
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment