-
-
Save whizzmler/9b74355ea3e1f688378a9c379eabedb9 to your computer and use it in GitHub Desktop.
Remove anomalies from dataset
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"name": "Normalize Dataset", | |
"description": "Remove the top n anomalies from a dataset", | |
"inputs": [ | |
{"name": "dataset-id", "type": "dataset-id", "description": "Dataset Id"}, | |
{"name": "top-n", "type": "number", "description": "Top N Anomalies to Remove"} | |
], | |
"outputs": [ | |
{"name": "normalized-dataset", "type": "dataset-id", "description": "Normalized Dataset"} | |
], | |
"total_resources_created": 1, | |
"resource_to_apply": "dataset" | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
;; Remove the top n anomalies from a dataset | |
;; Extract the row numbers of an anomaly detector's top anomalies.
;;   a: a fetched anomaly resource (a map); its top anomalies are read
;;      from the path ["model" "top_anomalies"].
;; Returns a list holding the "row_number" entry of each top anomaly,
;; in the order the anomalies are listed in the resource.
(define (anomalous-rows a)
  (let (top-anomalies (get-in a ["model" "top_anomalies"])
        row-of (lambda (entry) (get entry "row_number")))
    (map row-of top-anomalies)))
;; Given a list of row numbers, build the Flatline predicate that is
;; false exactly for those rows, so that using it as a dataset filter
;; discards them.
;;   rows: list of row numbers to drop.
;; Returns the Flatline expression as produced by the `flatline` form.
;; An empty list yields the constant predicate "true" (keep every row)
;; instead of "(not (or))": the operand-less `or` is not a meaningful
;; filter and may be rejected by the backend.
;; NOTE(review): confirm how Flatline treats `(or)` with no operands.
(define (row-filter rows)
  (if (empty? rows)
    "true"
    (let (eqs (map (lambda (n) (flatline "(= (row-number) {n})")) rows))
      (flatline "(not (or @{eqs}))"))))
;; Build a copy of a dataset without its most anomalous rows.
;;   dataset-id: the dataset to clean.
;;   n: passed as "top_n" when creating the anomaly detector.
;; Steps: create an anomaly detector on the dataset, fetch it (without
;; its trees and fields), turn its top anomalies into a row filter,
;; delete the detector, and create a new dataset from the original
;; using that filter as "lisp_filter".
;; Returns whatever create-and-wait-dataset returns for the new dataset.
(define (normalize-dataset dataset-id n)
  (let (anomaly-id (create-and-wait-anomaly {"dataset" dataset-id "top_n" n})
        detector (fetch anomaly-id {"exclude" "trees,fields"})
        doomed-rows (anomalous-rows detector)
        keep-expr (row-filter doomed-rows)
        request {"origin_dataset" dataset-id
                 "lisp_filter" keep-expr})
    (log-info "Deleting rows " doomed-rows)
    (log-info "Using filter " keep-expr)
    ;; The detector is only needed to find the rows; it could be kept instead.
    (delete anomaly-id)
    (create-and-wait-dataset request)))
;; Script entry point: `dataset-id` and `top-n` are the script's declared
;; inputs; the dataset built from them is bound to the script's
;; `normalized-dataset` output.
(define normalized-dataset (normalize-dataset dataset-id top-n)) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment