Skip to content

Instantly share code, notes, and snippets.

@myui
Created August 23, 2019 06:08
Show Gist options
  • Save myui/aa6e142a95ca8f995cc8e49146dbe2eb to your computer and use it in GitHub Desktop.
Save myui/aa6e142a95ca8f995cc8e49146dbe2eb to your computer and use it in GitHub Desktop.
xgboost_iris.md

xgboost binary classification example

prepare data

Prepare the iris data as described in https://support.treasuredata.com/hc/en-us/articles/360001260787-Iris-Multiclass-Classification-by-RandomForest. The first query below reads it from a table named `original`.

-- Create the XGBoost input format
-- (see https://xgboost.readthedocs.io/en/latest/tutorials/input_format.html).
-- The four measurements are packed into one `features` column and the species
-- is reduced to a binary label.
INSERT OVERWRITE TABLE input
SELECT
  rowid,
  indexed_features(sepal_length, sepal_width, petal_length, petal_width) AS features,
  -- The label needs to be 0 or 1: Iris-setosa maps to 1, every other class to 0.
  CASE WHEN class = 'Iris-setosa' THEN 1 ELSE 0 END AS label
FROM
  original;

train

-- Train on the shuffled input; the (model_id, model) rows emitted by
-- train_xgboost_classifier overwrite the `model` table.
INSERT OVERWRITE TABLE model
SELECT
  train_xgboost_classifier(features, label)
    AS (model_id, model)
FROM (
  SELECT
    features,
    label
  FROM
    input
  -- Shuffle the rows before training.
  CLUSTER BY
    rand(43)
) shuffled_input;

predict

-- Run xgboost_predict over every (model row, input row) pair, then average the
-- resulting `predicted` values per rowid.
SELECT
  scored.rowid,
  avg(scored.predicted) AS predicted
FROM (
  SELECT
    xgboost_predict(inp.rowid, inp.features, mdl.model_id, mdl.model)
      AS (rowid, predicted)
  FROM
    model mdl
    -- No ON clause: this is actually a cross join, written as LEFT OUTER JOIN
    -- so that the model table is taken as the left side.
    LEFT OUTER JOIN input inp
) scored
GROUP BY
  scored.rowid;
@myui
Copy link
Author

myui commented Aug 23, 2019

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment