rnyak · April 3, 2020 17:50
diff --git a/xgb-dask b/xgb-dask
 import pandas as pd
 import math
 import cudf
 import dask, dask_cudf
 import xgboost as xgb
 from dask.distributed import Client, wait
 from dask_cuda import LocalCUDACluster

 # Start a local CUDA cluster and connect a Dask client to it.
 # NOTE(review): the original comment said this connects to a cluster created
 # at Dataproc startup time, but the code below creates its own
 # LocalCUDACluster -- confirm which is intended.
 cluster = LocalCUDACluster()
 client = Client(cluster)
 # forces workers to restart. useful to ensure GPU memory is clear
 client.restart()
 # presumably the last line of a notebook cell, to display the client
 client


 # small in-memory sample: 7 trips with passenger count, distance and fare
 cdf = cudf.DataFrame({'passengers': [2, 1, 1, 1, 1, 3, 2],
                    'trip_distance': [1.59, 3.30, 1.80, 0.50, 3.00, 6.00, 12.00],
                    'fare_amount': [12.0, 14.5, 9.5, 3.5, 15.0, 17.00, 18.50]})

 # wrap the cuDF frame as a single-partition dask_cudf DataFrame
 ddf= dask_cudf.from_cudf(cdf, npartitions=1)

 # training rows: trips with trip_distance below 6 (the first five sample rows)
 X_train = ddf.query('trip_distance <6').persist()

 # create a Y_train ddf with just the target variable
 Y_train = X_train[['fare_amount']].persist()
 # drop the target variable from the training ddf
 X_train = X_train[X_train.columns.difference(['fare_amount'])]

 # this won't return until all data is in GPU memory
 done = wait([X_train, Y_train])

 # test rows: trips with trip_distance of 6 or more (the last two sample rows)
 X_test = ddf.query('trip_distance >= 6').persist()
 #X_test = drop_empty_partitions(X_test)

 # Create Y_test with just the fare amount
 Y_test = X_test[['fare_amount']].persist()

 # Drop the fare amount from X_test
 X_test = X_test[X_test.columns.difference(['fare_amount'])]

 # this won't return until all data is in GPU memory
 done = wait([X_test, Y_test])

 # build the distributed training matrix from the feature and target ddfs
 dtrain = xgb.dask.DaskDMatrix(client, X_train, Y_train)

 #train model
 # The second argument is the booster parameter dict; the result is indexed
 # with 'booster' further down to get the trained model.
 # NOTE(review): 'verbose_eval' is normally a keyword argument of train()
 # rather than a booster parameter, and 'silent' / 'n_gpus' are deprecated in
 # recent XGBoost releases -- confirm these three entries have any effect.
 trained_model = xgb.dask.train(client,
                         {
                          'learning_rate': 0.3,
                           'max_depth': 8,
                           'objective': 'reg:squarederror',
                           'subsample': 0.6,
                           'gamma': 1,
                           'silent': True,
                           'verbose_eval': True,
                           'tree_method':'gpu_hist',
                           'n_gpus': 1
                         },
                         dtrain,
                         num_boost_round=100, evals=[(dtrain, 'train')])

 # generate predictions on the test set
 '''feed X_test as a dataframe'''
 # persist() starts computing the predictions; wait() blocks until they finish
 prediction = xgb.dask.predict(client, trained_model['booster'], X_test).persist()
 wait(prediction)


 type(prediction)
 dask.dataframe.core.Series

 #convert prediction to dask_cudf.core.Series
 pred = dask_cudf.from_dask_dataframe(prediction)
 true = Y_test['fare_amount']

 # RMSE = sqrt(mean((pred - true) ** 2)).
 # pred is labelled 0..n-1, whereas true keeps the row labels of the filtered
 # test rows (5 and 6 here). Subtracting the two directly aligns on those
 # labels, no label matches, and every row comes back null. Materialize both
 # series and drop their indexes so the subtraction is positional instead.
 # This relies on pred and true being in the same row order, which holds
 # because both derive from the same X_test partitions.
 pred_values = pred.compute().reset_index(drop=True)
 true_values = true.compute().reset_index(drop=True)
 SE = (pred_values - true_values) ** 2
 math.sqrt(SE.mean())

 #this gives wrong results
 ((pred -true)**2).compute()
 0    null
 1    null
 5    null
 6    null
 dtype: float64
	import pandas as pd
	import math
	import cudf
	import dask, dask_cudf
	import xgboost as xgb
	from dask.distributed import Client, wait
	from dask_cuda import LocalCUDACluster

	# Start a local CUDA cluster and connect a Dask client to it.
	# NOTE(review): the original comment said this connects to a cluster created
	# at Dataproc startup time, but the code below creates its own
	# LocalCUDACluster -- confirm which is intended.
	cluster = LocalCUDACluster()
	client = Client(cluster)
	# forces workers to restart. useful to ensure GPU memory is clear
	client.restart()
	# presumably the last line of a notebook cell, to display the client
	client


	# small in-memory sample: 7 trips with passenger count, distance and fare
	cdf = cudf.DataFrame({'passengers': [2, 1, 1, 1, 1, 3, 2],
	'trip_distance': [1.59, 3.30, 1.80, 0.50, 3.00, 6.00, 12.00],
	'fare_amount': [12.0, 14.5, 9.5, 3.5, 15.0, 17.00, 18.50]})

	# wrap the cuDF frame as a single-partition dask_cudf DataFrame
	ddf= dask_cudf.from_cudf(cdf, npartitions=1)

	# training rows: trips with trip_distance below 6 (the first five sample rows)
	X_train = ddf.query('trip_distance <6').persist()

	# create a Y_train ddf with just the target variable
	Y_train = X_train[['fare_amount']].persist()
	# drop the target variable from the training ddf
	X_train = X_train[X_train.columns.difference(['fare_amount'])]

	# this won't return until all data is in GPU memory
	done = wait([X_train, Y_train])

	# test rows: trips with trip_distance of 6 or more (the last two sample rows)
	X_test = ddf.query('trip_distance >= 6').persist()
	#X_test = drop_empty_partitions(X_test)

	# Create Y_test with just the fare amount
	Y_test = X_test[['fare_amount']].persist()

	# Drop the fare amount from X_test
	X_test = X_test[X_test.columns.difference(['fare_amount'])]

	# this won't return until all data is in GPU memory
	done = wait([X_test, Y_test])

	# build the distributed training matrix from the feature and target ddfs
	dtrain = xgb.dask.DaskDMatrix(client, X_train, Y_train)

	#train model
	# The second argument is the booster parameter dict; the result is indexed
	# with 'booster' further down to get the trained model.
	# NOTE(review): 'verbose_eval' is normally a keyword argument of train()
	# rather than a booster parameter, and 'silent' / 'n_gpus' are deprecated in
	# recent XGBoost releases -- confirm these three entries have any effect.
	trained_model = xgb.dask.train(client,
	{
	'learning_rate': 0.3,
	'max_depth': 8,
	'objective': 'reg:squarederror',
	'subsample': 0.6,
	'gamma': 1,
	'silent': True,
	'verbose_eval': True,
	'tree_method':'gpu_hist',
	'n_gpus': 1
	},
	dtrain,
	num_boost_round=100, evals=[(dtrain, 'train')])

	# generate predictions on the test set
	'''feed X_test as a dataframe'''
	# persist() starts computing the predictions; wait() blocks until they finish
	prediction = xgb.dask.predict(client, trained_model['booster'], X_test).persist()
	wait(prediction)


	type(prediction)
	dask.dataframe.core.Series

	#convert prediction to dask_cudf.core.Series
	pred = dask_cudf.from_dask_dataframe(prediction)
	true = Y_test['fare_amount']

	# RMSE = sqrt(mean((pred - true) ** 2)).
	# pred is labelled 0..n-1, whereas true keeps the row labels of the filtered
	# test rows (5 and 6 here). Subtracting the two directly aligns on those
	# labels, no label matches, and every row comes back null. Materialize both
	# series and drop their indexes so the subtraction is positional instead.
	# This relies on pred and true being in the same row order, which holds
	# because both derive from the same X_test partitions.
	pred_values = pred.compute().reset_index(drop=True)
	true_values = true.compute().reset_index(drop=True)
	SE = (pred_values - true_values) ** 2
	math.sqrt(SE.mean())

	#this gives wrong results
	((pred -true)**2).compute()
	0 null
	1 null
	5 null
	6 null
	dtype: float64