Last active
June 2, 2016 16:12
-
-
Save charleslparker/211cf12b588e2aa1b9e50593536b999d to your computer and use it in GitHub Desktop.
Custom Feature Analyzer
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{
  "name": "Custom feature analyzer",
  "description": "Find the best features for modeling using a greedy algorithm",
  "kind": "script",
  "source_code": "analyze-features.whizzml",
  "inputs": [
    {
      "name": "dataset-id",
      "type": "dataset-id",
      "description": "The data to select features from"
    },
    {
      "name": "balance-objective",
      "type": "boolean",
      "default": true,
      "description": "Set `balance_objective` during model building"
    },
    {
      "name": "number-of-models",
      "type": "number",
      "default": 10,
      "description": "Set `number_of_models` during model building"
    },
    {
      "name": "node-threshold",
      "type": "number",
      "default": 512,
      "description": "Set `node_threshold` during model building"
    },
    {
      "name": "randomize",
      "type": "boolean",
      "default": true,
      "description": "Set `randomize` during model building"
    },
    {
      "name": "staleness",
      "type": "number",
      "default": 5,
      "description": "Stop the algorithm after this many iterations without improvement"
    },
    {
      "name": "pos-class",
      "type": "string",
      "default": "1",
      "description": "Category name for the class considered positive"
    },
    {
      "name": "neg-class",
      "type": "string",
      "default": "0",
      "description": "Category name for the class considered negative"
    },
    {
      "name": "recall-threshold",
      "type": "number",
      "default": 0.5,
      "description": "Threshold of acceptable recall for the positive class"
    }
  ],
  "outputs": [
    {
      "name": "output-features",
      "type": "list",
      "description": "The list of selected fields"
    }
  ]
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
;; Translate a list of field ids into field names.
;;
;; dataset-id: id of the dataset whose "fields" map is consulted
;; ids:        list of field ids to look up
;;
;; Returns a list with, for each id, the value stored under
;; [id "name"] in the fetched dataset's "fields" map.
(define (feature-names dataset-id ids)
  (let (dataset (fetch dataset-id)
        field-map (get dataset "fields")
        name-of (lambda (fid) (get-in field-map [fid "name"])))
    (map name-of ids)))
;; Split a dataset into "folds".
;;
;; dataset-id: id of the dataset to split
;; k-folds:    number of folds to create
;;
;; For every offset in [0, k-folds) a new dataset is requested from
;; the origin dataset with "row_offset" set to the offset and
;; "row_step" set to k-folds (presumably every k-folds-th row starting
;; at the offset -- confirm against the dataset API). Each fold also
;; gets a new field named "k_fold" whose expression is the offset
;; rendered as a string.
;;
;; All creation requests are issued before waiting; the result is
;; whatever wait* returns for the list of new dataset ids.
(define (create-k-folds dataset-id k-folds)
  (let (fold-request (lambda (offset)
                       {"origin_dataset" dataset-id
                        "row_offset" offset
                        "row_step" k-folds
                        "new_fields" [{"name" "k_fold"
                                       "field" (str offset)}]})
        make-fold (lambda (offset) (create-dataset (fold-request offset)))
        offsets (range 0 k-folds)
        fold-ids (map make-fold offsets))
    (wait* fold-ids)))
;; Build one testing/training pair per dataset id by "holding out"
;; each id in turn.
;;
;; dataset-ids: list of fold dataset ids
;;
;; Returns a list with one two-element list per position i:
;;   element 0 - the held-out (testing) dataset id at position i
;;   element 1 - the list of all remaining ids, in their original order
(define (pair-k-folds dataset-ids)
  (let (hold-out (lambda (i)
                   (let (test-id (nth dataset-ids i)
                         before (take i dataset-ids)
                         after (drop (+ i 1) dataset-ids))
                     (list test-id (concat before after))))
        positions (range 0 (count dataset-ids)))
    (map hold-out positions)))
;; Score a list of candidate ensemble parameter maps by cross
;; validation.
;;
;; param-maps:  list of ensemble request maps, one per candidate
;; k-folds:     list of folds; element 0 of each fold is used as the
;;              test dataset and element 1 as the training datasets
;; eval-parser: function applied to each combined evaluation (as
;;              returned by wait*) to produce the candidate's quality
;;
;; Steps, in order:
;;   1. for every candidate, request one ensemble per fold (the
;;      candidate's parameters merged with {"datasets" <training>}),
;;      then wait for every row of ensembles;
;;   2. for every candidate, request one evaluation per fold pairing
;;      the i-th ensemble with the i-th test dataset, then wait;
;;   3. for every candidate, request a single evaluation built from
;;      its per-fold evaluations, wait, and apply eval-parser.
;;
;; Returns the list of eval-parser results, one per candidate, in the
;; order of param-maps.
(define (evaluate-candidates param-maps k-folds eval-parser)
  (let (test-sets (map (lambda (fold) (nth fold 0)) k-folds)
        train-sets (map (lambda (fold) (nth fold 1)) k-folds)
        ;; One ensemble request per training set for a candidate.
        ensemble-reqs (lambda (params)
                        (map (lambda (train) (merge params {"datasets" train}))
                             train-sets))
        build-models (lambda (params)
                       (create* "ensemble" (ensemble-reqs params)))
        ;; One evaluation request per fold: i-th model vs. i-th test set.
        evaluation-reqs (lambda (model-row)
                          (map (lambda (i) {"model" (nth model-row i)
                                            "dataset" (nth test-sets i)})
                               (range (count model-row))))
        build-evals (lambda (model-row)
                      (create* "evaluation" (evaluation-reqs model-row)))
        combine (lambda (eval-row)
                  (create "evaluation" {"evaluations" eval-row}))
        _ (log-info "Creating models...")
        ;; All creation requests are issued before any wait starts.
        pending-models (map build-models param-maps)
        model-rows (map wait* pending-models)
        _ (log-info "Modeling complete.")
        _ (log-info "Creating evaluations...")
        pending-evals (map build-evals model-rows)
        eval-rows (map wait* pending-evals)
        _ (log-info "Evaluation complete.")
        _ (log-info "Combining evaluations...")
        combined (map combine eval-rows))
    (map eval-parser (wait* combined))))
;; Compute the default input fields of a dataset: every field whose
;; "preferred" attribute is truthy, except the objective field.
;;
;; dataset-id: id of the dataset whose "fields" map is inspected
;; obj-id:     id of the objective field to leave out
;;
;; Returns the list of matching field ids.
(define (default-inputs dataset-id obj-id)
  (let (field-map (get (fetch dataset-id) "fields")
        usable? (lambda (fid)
                  (and (get-in field-map [fid "preferred"])
                       (not (= obj-id fid)))))
    (filter usable? (keys field-map))))
;; Look up one measure for one class in an evaluation.
;;
;; eval:       evaluation map; its per-class entries are read from
;;             ["result" "model" "per_class_statistics"]
;; class-name: value matched against each entry's "class_name"
;; measure:    key of the statistic to read from the matching entry
;;
;; Returns the measure of the first matching entry.
;; Raises an error (code -200) when no entry matches class-name.
(define (get-measure eval class-name measure)
  (let (stats (get-in eval ["result" "model" "per_class_statistics"])
        matches (filter (lambda (entry)
                          (= (get entry "class_name") class-name))
                        stats))
    (if (empty? matches)
      (raise {"message" (str "Class " class-name " not found!") "code" -200})
      (get (head matches) measure))))
;; Build the scoring function used to judge candidate solutions.
;;
;; pos-class:        name of the class considered positive
;; neg-class:        name of the class considered negative
;; recall-threshold: the positive class's "average_recall" must be
;;                   strictly greater than this for a non-zero score
;;
;; Returns a function of one evaluation id. It fetches the evaluation
;; and yields 0 when the positive class's "average_recall" does not
;; exceed recall-threshold. Otherwise, with p the "average_precision"
;; of the positive class and m = 1 - "average_precision" of the
;; negative class, it yields (p - m) / m.
;;
;; NOTE(review): m is zero when the negative class's precision is
;; exactly 1, so the division has a zero denominator -- confirm the
;; intended behavior for that case.
(define (make-evaluator pos-class neg-class recall-threshold)
  (lambda (eval-id)
    (let (evaluation (fetch eval-id)
          recall (get-measure evaluation pos-class "average_recall"))
      (if (<= recall recall-threshold)
        0
        (let (pos-precision (get-measure evaluation pos-class "average_precision")
              neg-precision (get-measure evaluation neg-class "average_precision")
              neg-miss (- 1 neg-precision))
          (/ (- pos-precision neg-miss) neg-miss))))))
;; Expand a set of selected features by one feature in every possible
;; way and score each expansion.
;;
;; selected:   list of field ids already chosen
;; potentials: list of all candidate field ids
;; model-req:  base ensemble request; "input_fields" is set per candidate
;; k-folds:    cross-validation folds passed on to evaluate-candidates
;; evaluator:  scoring function passed on to evaluate-candidates
;;
;; For every id in potentials that is not in selected, a candidate
;; field list is formed by prepending that id to selected.
;;
;; Returns a list of two-element lists (field-list score), one per
;; candidate.
(define (make-and-eval selected potentials model-req k-folds evaluator)
  (let (fresh? (lambda (fid) (not (member? fid selected)))
        candidates (filter fresh? potentials)
        extended (map (lambda (fid) (cons fid selected)) candidates)
        requests (map (lambda (field-list)
                        (assoc model-req "input_fields" field-list))
                      extended)
        scores (evaluate-candidates requests k-folds evaluator)
        pair-up (lambda (i) (list (nth extended i) (nth scores i))))
    (map pair-up (range (count scores)))))
;; Sort a list by a key function, largest key first.
;;
;; fn: function computing the sort key of an element
;; xs: list to sort
;;
;; Every element is turned into a (key index) pair; the pairs are
;; sorted and then reversed, so the result is in descending key order
;; and the position in xs is the second component of each pair.
;;
;; Returns the elements of xs in that order.
(define (sort-by-fn fn xs)
  (let (sort-keys (map fn xs)
        indexed (map (lambda (i) (list (nth sort-keys i) i))
                     (range (count sort-keys)))
        descending (reverse (sort indexed))
        pick (lambda (pair) (nth xs (nth pair 1))))
    (map pick descending)))
;; Do best-first feature selection.
;;
;; dataset-id:        dataset to select features from
;; balance-objective: value for "balance_objective" in every ensemble request
;; number-of-models:  value for "number_of_models" in every ensemble request
;; node-threshold:    value for "node_threshold" in every ensemble request
;; randomize:         value for "randomize" in every ensemble request
;; staleness:         stop after this many consecutive iterations
;;                    without improvement of the best score
;; pos-class:         name of the class considered positive
;; neg-class:         name of the class considered negative
;; recall-threshold:  recall threshold handed to make-evaluator
;;
;; The dataset is split into 5 folds. The search keeps a frontier
;; (to-evaluate) of (field-list score) pairs, starting with the empty
;; field list at score 0. On each iteration the highest-scoring
;; frontier entry is removed, expanded by one feature in every
;; possible way with make-and-eval, and the scored expansions are
;; added to the frontier. An iteration counts as an improvement when
;; the best new score beats the previous best by more than 0.00001.
;;
;; The search stops when `staleness` iterations in a row bring no
;; improvement, or when the frontier is empty because every reachable
;; feature set has been expanded.
;;
;; Returns the names of the fields in the best-scoring feature set
;; seen. A warning is logged when that score is not positive.
(define (select-features dataset-id
                         balance-objective
                         number-of-models
                         node-threshold
                         randomize
                         staleness
                         pos-class
                         neg-class
                         recall-threshold)
  (let (k-folds (pair-k-folds (create-k-folds dataset-id 5))
        ;; Each fold is (test-id training-ids); use the first test set
        ;; to read field metadata.
        first-test (head (head k-folds))
        obj-id (dataset-get-objective-id first-test)
        _ (log-info "Objective: " obj-id)
        potentials (default-inputs first-test obj-id)
        _ (log-info "Features: " potentials)
        evaluator (make-evaluator pos-class neg-class recall-threshold)
        mod-req {"seed" "features"
                 "balance_objective" balance-objective
                 "number_of_models" number-of-models
                 "node_threshold" node-threshold
                 "randomize" randomize})
    (loop (to-evaluate [[[] 0]]
           evaluated []
           stale 0
           last-best 0)
      ;; Also stop on an empty frontier: with few candidate features
      ;; every feature set can be expanded before `staleness` is
      ;; reached, and taking the head of an empty frontier would fail.
      (if (and (< stale staleness) (not (empty? to-evaluate)))
        (let (sorted-evals (sort-by-fn (lambda (x) (nth x 1)) to-evaluate)
              next-eval (head sorted-evals)
              _ (log-info "Evaluating: " (head next-eval))
              rest-evals (tail sorted-evals)
              new (make-and-eval (head next-eval)
                                 potentials
                                 mod-req
                                 k-folds
                                 evaluator)
              next-to-evaluate (concat rest-evals new)
              next-evaluated (cons next-eval evaluated)
              all-evals (concat next-to-evaluate next-evaluated)
              best-eval (head (sort-by-fn (lambda (x) (nth x 1)) all-evals))
              ;; last-best is included so the list is never empty, even
              ;; when the expansion produced no new candidates.
              best (max (cons last-best (map (lambda (x) (nth x 1)) new)))
              _ (log-info "Current best score: " best)
              _ (log-info "Current best features: "
                          (feature-names first-test (head best-eval)))
              _ (log-info "Iterations without improvement: " stale))
          (recur next-to-evaluate
                 next-evaluated
                 (if (> best (+ last-best 0.00001)) 0 (+ stale 1))
                 best))
        (let (_ (log-info "Getting best...")
              all-evals (concat to-evaluate evaluated)
              sorted-evals (sort-by-fn (lambda (x) (nth x 1)) all-evals)
              best-eval (head sorted-evals)
              best-score (head (tail best-eval))
              _ (log-info "Features: " (head best-eval) " Score: " best-score))
          (when (<= best-score 0)
            (log-warn "Best solution has poor recall!"))
          (feature-names first-test (head best-eval)))))))
;; Script output: the names of the features chosen by the best-first
;; search, computed from the script's declared inputs.
(define output-features
  (select-features dataset-id
                   balance-objective number-of-models node-threshold randomize
                   staleness
                   pos-class neg-class recall-threshold))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment