-- Prepare the Iris data as described in https://support.treasuredata.com/hc/en-us/articles/360001260787-Iris-Multiclass-Classification-by-RandomForest
-- Build the XGBoost input format from the `original` table and overwrite `input`
-- (see https://xgboost.readthedocs.io/en/latest/tutorials/input_format.html).
INSERT OVERWRITE TABLE input
SELECT
    rowid,
    indexed_features(sepal_length, sepal_width, petal_length, petal_width) AS features,
    -- The label needs to be 0 or 1: 1 for 'Iris-setosa', 0 for everything else.
    CASE
        WHEN class = 'Iris-setosa' THEN 1
        ELSE 0
    END AS label
FROM original;
-- Train the XGBoost classifier on the (features, label) rows of `input` and
-- overwrite `model` with the function's output, aliased as (model_id, model).
INSERT OVERWRITE TABLE model
SELECT
    train_xgboost_classifier(features, label) AS (model_id, model)
FROM (
    SELECT
        features,
        label
    FROM input
    CLUSTER BY rand(43) -- shuffle the training rows
) shuffled;
-- Run xgboost_predict over the rows of `input` joined with the rows of `model`,
-- then average the predicted values per rowid.
SELECT
    rowid,
    avg(predicted) AS predicted
FROM (
    SELECT
        xgboost_predict(rowid, features, model_id, model) AS (rowid, predicted)
    FROM model l
    -- Intentionally written without an ON clause: per the original note this is
    -- effectively a cross join, with the model table taken as the left side.
    LEFT OUTER JOIN input r
) t
GROUP BY rowid;
-- See https://github.com/dmlc/xgboost/blob/master/jvm-packages/xgboost4j/src/main/scala/ml/dmlc/xgboost4j/LabeledPoint.scala
-- Since LabeledPoint is a Scala class, it requires the Scala library as a dependency.