whizzmler · May 10, 2016 21:22
diff --git a/readme.md b/readme.md
diff --git a/stacked-generalization.whizzml b/stacked-generalization.whizzml
 ;; Stacking modeling library
 ;; Main exports:
 ;;  (make-stack dataset-id) - to create the stacked models
 ;;  (make-stack-prediction models metamodel input-data) -
 ;;     to use the above to make predictions
 ;;

 ;; Requests two samples of dataset `ds-id`, both taken at `rate` and
 ;; with the same randomly generated seed: the first with "out_of_bag"
 ;; false, the second with "out_of_bag" true.  Returns the two new
 ;; dataset ids as a list, in that order; neither is waited for here.
 (define (split-dataset ds-id rate)
   (let (seed (str (rand-int 1000) (rand-int 1000))
         ;; options common to both halves of the split
         common {"sample_rate" rate
                 "origin_dataset" ds-id
                 "seed" seed}
         sample (lambda (oob?)
                  (create-dataset (merge common {"out_of_bag" oob?}))))
     [(sample false) (sample true)]))

 ;; Same as split-dataset, but only returns once `wait*` has been
 ;; applied to the list holding both resulting dataset ids.
 (define (split-dataset-and-wait ds-id rate)
   (let (halves (split-dataset ds-id rate))
     (wait* halves)))

 ;; Given the id of a batchprediction, reads its
 ;; "output_dataset_resource" entry, waits on that dataset and returns
 ;; the result of the wait.
 (define (batch-dataset id)
   (let (batch (fetch id)
         output-id (get batch "output_dataset_resource"))
     (wait output-id)))

 ;; Creates a batchprediction of model `mod-id` over dataset `ds-id`
 ;; and returns the result of create-batchprediction.
 ;;
 ;; The request starts from defaults suited to model stacking
 ;; ("all_fields" and "output_dataset" enabled, "prediction_name" set
 ;; to the resource type of the model) and is then merged with `opts`,
 ;; a map of additional options.  Entries in `opts` are merged last, so
 ;; they are expected to take precedence over the defaults.  Pass {}
 ;; to use the defaults unchanged.
 ;;
 ;; The model is waited for before the request is issued.
 (define (make-batch ds-id mod-id opts)
   (let (defaults {"all_fields" true
                   "output_dataset" true
                   "dataset" ds-id
                   "prediction_name" (resource-type mod-id)
                   "model" (wait mod-id)})
     ;; `opts` used to be ignored here (the defaults were merged with
     ;; a literal empty map), so caller-supplied options had no effect.
     (create-batchprediction (merge defaults opts))))

 ;; Fetches the resource `mod-id` and returns the value stored under
 ;; its "input_fields" key.
 (define (model-inputs mod-id)
   (let (resource (fetch mod-id))
     (get resource "input_fields")))

 ;; Launches the base learners of the stack on dataset `train-id` and
 ;; returns their ids as a list, in this order: a single model, a
 ;; 20-model ensemble with "randomize" false, a 20-model ensemble with
 ;; "randomize" true, and a logistic regression.  Nothing is waited
 ;; for here.
 (define (create-stack-models train-id)
   (let (base {"dataset" train-id}
         ensemble (lambda (randomize?)
                    (create-ensemble (merge base
                                            {"number_of_models" 20
                                             "randomize" randomize?}))))
     [(create-model base)
      (ensemble false)
      (ensemble true)
      (create-logisticregression base)]))

 ;; Chains one batchprediction per entry of `models`, starting from
 ;; dataset `ds-id`: each step runs make-batch on the dataset produced
 ;; by the previous step and moves on to that batchprediction's output
 ;; dataset.  Returns the id of the final dataset.
 (define (create-stack-predictions models ds-id)
   (let (step (lambda (current-ds model-id)
                (let (batch-id (wait (make-batch current-ds model-id {})))
                  (batch-dataset batch-id))))
     (reduce step ds-id models)))

 ;; Builds a stack from `dataset-id`.  The dataset is split with a
 ;; 0.5 sample rate; the first part trains the heterogeneous
 ;; collection of base models and the second part, extended with the
 ;; batch predictions of those models, trains the metamodel.  The
 ;; input fields of the first base model are excluded from the
 ;; metamodel, and its objective field is the objective of the
 ;; training dataset.
 ;;
 ;; Returns a map with three keys:
 ;;   "models"    - the list of base model ids
 ;;   "metamodel" - the id of the metamodel
 ;;   "result"    - boolean flag, the value of resource-done? applied
 ;;                 to the fetched metamodel
 (define (make-stack dataset-id)
   (let (halves (split-dataset-and-wait dataset-id 0.5)
         train-id (head halves)
         hold-id (nth halves 1)
         models (create-stack-models train-id)
         stacked-id (create-stack-predictions models hold-id)
         excluded (model-inputs (head models))
         objective (dataset-get-objective-id train-id)
         meta-id (create-and-wait-model {"dataset" stacked-id
                                         "excluded_fields" excluded
                                         "objective_field" objective})
         done? (resource-done? (fetch meta-id)))
     {"models" models "metamodel" meta-id "result" done?}))

 ;; Predicts for the `input-data` map with a stack built by
 ;; make-stack.  A prediction is created with every entry of `models`;
 ;; the first value of each one's "prediction" map is then paired, in
 ;; order, with the input fields of `meta-model` to form the input of
 ;; the final prediction.  Returns the identifier of that final
 ;; prediction object.
 (define (make-stack-prediction models meta-model input-data)
   (let (predict (lambda (m)
                   (create-prediction {"model" m
                                       "input_data" input-data}))
         predicted-value (lambda (p)
                           (head (values (get (fetch p) "prediction"))))
         ;; all base predictions are created before any is fetched
         pred-ids (map predict models)
         outputs (map predicted-value pred-ids)
         meta-fields (model-inputs meta-model)
         meta-input (make-map meta-fields outputs))
     (create-prediction {"model" meta-model "input_data" meta-input})))
	;; Stacking modeling library
	;; Main exports:
	;; (make-stack dataset-id) - to create the stacked models
	;; (make-stack-prediction models metamodel input-data) -
	;; to use the above to make predictions
	;;

	;; Requests two samples of dataset `ds-id`, both taken at `rate` and
	;; with the same randomly generated seed: the first with "out_of_bag"
	;; false, the second with "out_of_bag" true.  Returns the two new
	;; dataset ids as a list, in that order; neither is waited for here.
	(define (split-dataset ds-id rate)
	  (let (seed (str (rand-int 1000) (rand-int 1000))
	        ;; options common to both halves of the split
	        common {"sample_rate" rate
	                "origin_dataset" ds-id
	                "seed" seed}
	        sample (lambda (oob?)
	                 (create-dataset (merge common {"out_of_bag" oob?}))))
	    [(sample false) (sample true)]))

	;; Same as split-dataset, but only returns once `wait*` has been
	;; applied to the list holding both resulting dataset ids.
	(define (split-dataset-and-wait ds-id rate)
	  (let (halves (split-dataset ds-id rate))
	    (wait* halves)))

	;; Given the id of a batchprediction, reads its
	;; "output_dataset_resource" entry, waits on that dataset and returns
	;; the result of the wait.
	(define (batch-dataset id)
	  (let (batch (fetch id)
	        output-id (get batch "output_dataset_resource"))
	    (wait output-id)))

	;; Creates a batchprediction of model `mod-id` over dataset `ds-id`
	;; and returns the result of create-batchprediction.
	;;
	;; The request starts from defaults suited to model stacking
	;; ("all_fields" and "output_dataset" enabled, "prediction_name" set
	;; to the resource type of the model) and is then merged with `opts`,
	;; a map of additional options.  Entries in `opts` are merged last, so
	;; they are expected to take precedence over the defaults.  Pass {}
	;; to use the defaults unchanged.
	;;
	;; The model is waited for before the request is issued.
	(define (make-batch ds-id mod-id opts)
	  (let (defaults {"all_fields" true
	                  "output_dataset" true
	                  "dataset" ds-id
	                  "prediction_name" (resource-type mod-id)
	                  "model" (wait mod-id)})
	    ;; `opts` used to be ignored here (the defaults were merged with
	    ;; a literal empty map), so caller-supplied options had no effect.
	    (create-batchprediction (merge defaults opts))))

	;; Fetches the resource `mod-id` and returns the value stored under
	;; its "input_fields" key.
	(define (model-inputs mod-id)
	  (let (resource (fetch mod-id))
	    (get resource "input_fields")))

	;; Launches the base learners of the stack on dataset `train-id` and
	;; returns their ids as a list, in this order: a single model, a
	;; 20-model ensemble with "randomize" false, a 20-model ensemble with
	;; "randomize" true, and a logistic regression.  Nothing is waited
	;; for here.
	(define (create-stack-models train-id)
	  (let (base {"dataset" train-id}
	        ensemble (lambda (randomize?)
	                   (create-ensemble (merge base
	                                           {"number_of_models" 20
	                                            "randomize" randomize?}))))
	    [(create-model base)
	     (ensemble false)
	     (ensemble true)
	     (create-logisticregression base)]))

	;; Chains one batchprediction per entry of `models`, starting from
	;; dataset `ds-id`: each step runs make-batch on the dataset produced
	;; by the previous step and moves on to that batchprediction's output
	;; dataset.  Returns the id of the final dataset.
	(define (create-stack-predictions models ds-id)
	  (let (step (lambda (current-ds model-id)
	               (let (batch-id (wait (make-batch current-ds model-id {})))
	                 (batch-dataset batch-id))))
	    (reduce step ds-id models)))

	;; Builds a stack from `dataset-id`.  The dataset is split with a
	;; 0.5 sample rate; the first part trains the heterogeneous
	;; collection of base models and the second part, extended with the
	;; batch predictions of those models, trains the metamodel.  The
	;; input fields of the first base model are excluded from the
	;; metamodel, and its objective field is the objective of the
	;; training dataset.
	;;
	;; Returns a map with three keys:
	;;   "models"    - the list of base model ids
	;;   "metamodel" - the id of the metamodel
	;;   "result"    - boolean flag, the value of resource-done? applied
	;;                 to the fetched metamodel
	(define (make-stack dataset-id)
	  (let (halves (split-dataset-and-wait dataset-id 0.5)
	        train-id (head halves)
	        hold-id (nth halves 1)
	        models (create-stack-models train-id)
	        stacked-id (create-stack-predictions models hold-id)
	        excluded (model-inputs (head models))
	        objective (dataset-get-objective-id train-id)
	        meta-id (create-and-wait-model {"dataset" stacked-id
	                                        "excluded_fields" excluded
	                                        "objective_field" objective})
	        done? (resource-done? (fetch meta-id)))
	    {"models" models "metamodel" meta-id "result" done?}))

	;; Predicts for the `input-data` map with a stack built by
	;; make-stack.  A prediction is created with every entry of `models`;
	;; the first value of each one's "prediction" map is then paired, in
	;; order, with the input fields of `meta-model` to form the input of
	;; the final prediction.  Returns the identifier of that final
	;; prediction object.
	(define (make-stack-prediction models meta-model input-data)
	  (let (predict (lambda (m)
	                  (create-prediction {"model" m
	                                      "input_data" input-data}))
	        predicted-value (lambda (p)
	                          (head (values (get (fetch p) "prediction"))))
	        ;; all base predictions are created before any is fetched
	        pred-ids (map predict models)
	        outputs (map predicted-value pred-ids)
	        meta-fields (model-inputs meta-model)
	        meta-input (make-map meta-fields outputs))
	    (create-prediction {"model" meta-model "input_data" meta-input})))