|
;; Stacking modeling library
;; Main exports:
;;   (make-stack dataset-id) - to create the stacked models
;;   (make-stack-prediction models meta-model input-data) -
;;     to use the above to make predictions
;;
|
|
|
;; Auxiliary function splitting a dataset.
;; Issues two create-dataset requests over ds-id that share the same
;; sample rate and the same randomly generated seed, and differ only in
;; "out_of_bag" (false for the first, true for the second).
;; Returns a two-element list with the results of both requests, in
;; that order, without waiting for them.
(define (split-dataset ds-id rate)
  (let (seed (str (rand-int 1000) (rand-int 1000))
        common {"sample_rate" rate
                "origin_dataset" ds-id
                "seed" seed})
    [(create-dataset (merge common {"out_of_bag" false}))
     (create-dataset (merge common {"out_of_bag" true}))]))
|
|
|
;; Split a dataset with split-dataset and wait (via wait*) until both
;; resulting datasets are finished. Returns whatever wait* returns for
;; the pair.
(define (split-dataset-and-wait ds-id rate)
  (let (halves (split-dataset ds-id rate))
    (wait* halves)))
|
|
|
;; Extract, for a batchprediction, its associated dataset of results:
;; fetches the batchprediction, reads its "output_dataset_resource"
;; entry and waits on that resource before returning it.
(define (batch-dataset id)
  (let (batch (fetch id)
        output-ds (get batch "output_dataset_resource"))
    (wait output-ds)))
|
|
|
;; Create a batchprediction for the given model and dataset, using
;; defaults appropriate for model stacking: all fields are kept, an
;; output dataset is requested, and the prediction column is named
;; after the model's resource type.
;; `opts` is a map of additional options merged over those defaults
;; (pass {} for none); on a key collision the value in `opts` is
;; expected to win, following merge semantics.
;; Waits for the model before issuing the request, but does not wait
;; for the batchprediction itself.
;; NOTE(review): the model id is always sent under the "model" key,
;; whatever its resource type is -- confirm this is accepted for
;; ensembles and logistic regressions.
(define (make-batch ds-id mod-id opts)
  (let (defaults {"all_fields" true
                  "output_dataset" true
                  "dataset" ds-id
                  "prediction_name" (resource-type mod-id)
                  "model" (wait mod-id)})
    (create-batchprediction (merge defaults opts))))
|
|
|
;; Auxiliary function extracting the inputs of a model: fetches the
;; resource and returns the value stored under its "input_fields" key.
(define (model-inputs mod-id)
  (let (resource (fetch mod-id))
    (get resource "input_fields")))
|
|
|
;; Auxiliary function to create the set of stack models over train-id.
;; Requests, in this order: a single model, a 20-model ensemble with
;; "randomize" false, a 20-model ensemble with "randomize" true, and a
;; logistic regression. Returns the list of the four creation results
;; without waiting for them.
(define (create-stack-models train-id)
  (let (base {"dataset" train-id}
        make-ensemble (lambda (randomize?)
                        (create-ensemble {"dataset" train-id
                                          "number_of_models" 20
                                          "randomize" randomize?})))
    [(create-model base)
     (make-ensemble false)
     (make-ensemble true)
     (create-logisticregression base)]))
|
|
|
;; Auxiliary function to successively create batchpredictions using
;; the given models, starting from the initial dataset ds-id: each
;; model is applied to the output dataset of the previous
;; batchprediction. Returns the final dataset id (ds-id itself when
;; models is empty).
(define (create-stack-predictions models ds-id)
  (let (stack-one (lambda (current-ds model-id)
                    (let (batch (make-batch current-ds model-id {})
                          finished (wait batch))
                      (batch-dataset finished))))
    (reduce stack-one ds-id models)))
|
|
|
;; Splits the given dataset in two (sample rate 0.5), using the first
;; half to create a heterogeneous collection of models and the second
;; half, extended with those models' batch predictions, to train a
;; metamodel. The metamodel excludes the input fields of the first
;; stack model and uses the training dataset's objective field.
;; Returns a map with the collection of models (under the key
;; "models"), the metamodel id (under "metamodel") and, under
;; "result", a boolean flag indicating whether the metamodel resource
;; is done.
(define (make-stack dataset-id)
  (let (halves (split-dataset-and-wait dataset-id 0.5)
        train-id (nth halves 0)
        hold-id (nth halves 1)
        models (create-stack-models train-id)
        ;; Holdout half extended with one prediction column per model.
        stacked-ds (create-stack-predictions models hold-id)
        ;; Read only after the batch predictions above, which wait on
        ;; every model.
        raw-inputs (model-inputs (head models))
        objective (dataset-get-objective-id train-id)
        meta-params {"dataset" stacked-ds
                     "excluded_fields" raw-inputs
                     "objective_field" objective}
        meta-id (create-and-wait-model meta-params)
        done? (resource-done? (fetch meta-id)))
    {"models" models
     "metamodel" meta-id
     "result" done?}))
|
|
|
;; Use the models and metamodel computed by make-stack to make a
;; prediction on the input-data map.
;;   models     - list of model ids (the "models" entry of make-stack)
;;   meta-model - metamodel id (the "metamodel" entry of make-stack)
;;   input-data - map of input values passed to every base model
;; Creates one prediction per base model, waits for each of them,
;; takes the first value of its "prediction" entry and pairs those
;; values with the metamodel's input fields to build the metamodel's
;; input data. Returns the identifier of the final prediction object
;; (not waited for).
;; NOTE(review): the pairing assumes the metamodel's "input_fields"
;; come in the same order as `models` -- confirm.
(define (make-stack-prediction models meta-model input-data)
  (let (pred-ids (map (lambda (m)
                        (create-prediction {"model" m
                                            "input_data" input-data}))
                      models)
        ;; All predictions are requested first; each one is then
        ;; waited for so its result is available when fetched.
        outcomes (map (lambda (p)
                        (head (values (get (fetch (wait p)) "prediction"))))
                      pred-ids)
        meta-input (make-map (model-inputs meta-model) outcomes))
    (create-prediction {"model" meta-model "input_data" meta-input})))