-
-
Save whizzmler/dce8ca7b1b1a35baa78c3cea780729a3 to your computer and use it in GitHub Desktop.
Best-first feature selection via WhizzML
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
;; Return the largest element of the list xs.  The list is folded with a
;; pairwise comparison seeded with its first element, so xs must not be
;; empty.
(define (get-max xs)
  (let (keep-larger (lambda (best candidate)
                      (if (> best candidate) best candidate)))
    (reduce keep-larger (head xs) xs)))
;; Translate a list of field ids into field names, looked up under
;; [id "name"] in the "fields" map of the fetched dataset.
(define (feature-names dataset-id ids)
  (let (dataset (fetch dataset-id)
        fields (get dataset "fields")
        name-of (lambda (id) (get-in fields [id "name"])))
    (map name-of ids)))
;; Create (and wait for) a dataset sampled from ds-id, using the given
;; sample rate and out-of-bag flag.  The seed is fixed so that repeated
;; calls use the same seed value.
(define (sample-dataset ds-id rate oob)
  (let (request {"origin_dataset" ds-id
                 "sample_rate" rate
                 "out_of_bag" oob
                 "seed" "whizzml-example"})
    (create-and-wait-dataset request)))
;; Split a dataset into two samples and return their ids as a list:
;; first the sample taken with out_of_bag false (training), then the one
;; taken with out_of_bag true (test).  Both use the same rate and seed.
(define (split-dataset ds-id rate)
  (let (in-bag (sample-dataset ds-id rate false)
        out-of-bag (sample-dataset ds-id rate true))
    (list in-bag out-of-bag)))
;; Compute the default input field ids for a dataset: every field whose
;; "preferred" attribute is truthy, excluding the objective field.
(define (default-inputs dataset-id obj-id)
  (let (fields (get (fetch dataset-id) "fields")
        usable-input? (lambda (fid)
                        (and (get-in fields [fid "preferred"])
                             (not (= obj-id fid)))))
    (filter usable-input? (keys fields))))
;; Build one model per candidate feature in potentials.  Every model uses
;; the given dataset and objective field, and its input fields are the
;; already selected features plus that single candidate.  Returns the
;; result of creating (and waiting for) all the models.
(define (make-models dataset-id obj-field selected potentials)
  (let (base-request {"dataset" dataset-id "objective_field" obj-field}
        request-for (lambda (candidate)
                      (assoc base-request
                             "input_fields"
                             (cons candidate selected))))
    (create-and-wait* "model" (map request-for potentials))))
;; Pick the best candidate feature.  model-ids must be aligned with
;; potentials (one model per candidate, same order).  Each model is
;; evaluated against the test dataset, the "average_phi" of every
;; evaluation is read, and the first candidate whose score equals the
;; maximum is returned.
(define (select-feature test-dataset-id potentials model-ids)
  (let (requests (map (lambda (mid) {"dataset" test-dataset-id "model" mid})
                      model-ids)
        evaluation-ids (create-and-wait* "evaluation" requests)
        phi-of (lambda (ev-id)
                 (get-in (fetch ev-id) ["result" "model" "average_phi"]))
        phis (map phi-of evaluation-ids)
        phi-by-feature (make-map potentials phis)
        best-phi (get-max phis)
        winner? (lambda (fid)
                  (if (= best-phi (get phi-by-feature fid)) fid false)))
    (some winner? potentials)))
;; Do best-first feature selection.  Given a dataset and a target number
;; of features, iteratively build one model per remaining candidate
;; feature, evaluate the models, and add the feature with the best
;; evaluation to the running set of selected features.  Stops when the
;; target number is reached or no candidates remain.
;;
;; Returns the names of the selected features, most recently selected
;; first (each new feature is consed onto the front of the list).
(define (select-features dataset-id nfeatures)
  (let (obj-id (dataset-get-objective-id dataset-id)
        input-ids (default-inputs dataset-id obj-id)
        splits (split-dataset dataset-id 0.5)
        train-id (nth splits 0)
        test-id (nth splits 1))
    (loop (selected []
           potentials input-ids)
      (if (or (>= (count selected) nfeatures) (empty? potentials))
        (feature-names dataset-id selected)
        (let (_ (log-info "Making models...")
              ;; Train on the training split only.  Training on the full
              ;; dataset would let the models see the rows in test-id and
              ;; make the evaluations used for ranking over-optimistic.
              model-ids (make-models train-id obj-id selected potentials)
              _ (log-info "Selecting feature...")
              next-feat (select-feature test-id potentials model-ids)
              _ (log-info "Selected feature is " next-feat))
          (recur (cons next-feat selected)
                 (filter (lambda (id) (not (= id next-feat)))
                         potentials)))))))
;; Script result: names of the features chosen by best-first selection.
;; NOTE(review): dataset-id and nfeatures are not defined in this file;
;; presumably they are declared as script inputs -- confirm.
(define output-features
  (select-features dataset-id nfeatures))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment