osroca · May 8, 2016 10:00
diff --git a/README.md b/README.md
diff --git a/normalize-dataset.json b/normalize-dataset.json
 {
  "name": "Normalize Dataset",
  "description": "Remove the top n anomalies from a dataset",
  "inputs": [
    {"name": "dataset-id", "type": "dataset-id", "description": "Dataset Id"},
    {"name": "top-n", "type": "number", "description": "Top N Anomalies to Remove", "default": 3}
  ],
  "outputs": [
    {"name": "normalized-dataset", "type": "dataset-id", "description": "Normalized Dataset"}
  ],
  "total_resources_created": 1,
  "resource_to_apply": "dataset"
 }
diff --git a/normalize-dataset.whizzml b/normalize-dataset.whizzml
 ;; Remove the top n anomalies from a dataset

 ;; Extract the row numbers of the top anomalies recorded in an anomaly
 ;; resource.
 ;;   a: anomaly resource map; the entries are read from the
 ;;      ["model" "top_anomalies"] path.
 ;; Returns the list made of each entry's "row_number" value, in order.
 (define (anomalous-rows a)
   (let (top-anomalies (get-in a ["model" "top_anomalies"]))
     (map (lambda (entry) (get entry "row_number")) top-anomalies)))

 ;; Build the Flatline filter expression that rejects the given rows.
 ;;   rows: list of row numbers to discard.
 ;; One "(= (row-number) <n>)" test is generated per row number; the tests
 ;; are spliced into a single "or" and the whole disjunction is negated, so
 ;; the expression holds only for rows that are not in the list.
 ;; NOTE(review): for an empty list the template expands to "(not (or))";
 ;; confirm that Flatline accepts an argument-less "or".
 (define (row-filter rows)
   (let (row-eq (lambda (n) (flatline "(= (row-number) {n})"))
         eqs (map row-eq rows))
     (flatline "(not (or @{eqs}))")))

 ;; Create a copy of a dataset without its most anomalous rows.
 ;;   dataset-id: id of the dataset to clean.
 ;;   n: value passed as "top_n" when creating the anomaly detector.
 ;; Steps: build an anomaly detector on the dataset, fetch it (excluding
 ;; its "trees" and "fields"), derive the row filter from its top
 ;; anomalies, log both the rows and the filter, delete the detector, and
 ;; finally create a new dataset from the original using the filter.
 ;; Returns the result of create-and-wait-dataset.
 (define (normalize-dataset dataset-id n)
   (let (anomaly-id (create-and-wait-anomaly {"dataset" dataset-id "top_n" n})
         detector (fetch anomaly-id {"exclude" "trees,fields"})
         discarded (anomalous-rows detector)
         keep-expr (row-filter discarded))
     (log-info "Deleting rows " discarded)
     (log-info "Using filter " keep-expr)
     ;; the detector is only a means to find the rows; it could be kept instead
     (delete anomaly-id)
     (create-and-wait-dataset {"origin_dataset" dataset-id
                               "lisp_filter" keep-expr})))

 ;; Script entry point: dataset-id and top-n are the script's input
 ;; parameters. The value returned by normalize-dataset is bound to
 ;; normalized-dataset, the name declared as the script's output in the
 ;; accompanying JSON metadata.
 (define normalized-dataset (normalize-dataset dataset-id top-n))
	{
	"name": "Normalize Dataset",
	"description": "Remove the top n anomalies from a dataset",
	"inputs": [
	{"name": "dataset-id", "type": "dataset-id", "description": "Dataset Id"},
	{"name": "top-n", "type": "number", "description": "Top N Anomalies to Remove", "default": 3}
	],
	"outputs": [
	{"name": "normalized-dataset", "type": "dataset-id", "description": "Normalized Dataset"}
	],
	"total_resources_created": 1,
	"resource_to_apply": "dataset"
	}
	;; Remove the top n anomalies from a dataset

	;; Extract the row numbers of the top anomalies recorded in an anomaly
	;; resource.
	;;   a: anomaly resource map; the entries are read from the
	;;      ["model" "top_anomalies"] path.
	;; Returns the list made of each entry's "row_number" value, in order.
	(define (anomalous-rows a)
	  (let (top-anomalies (get-in a ["model" "top_anomalies"]))
	    (map (lambda (entry) (get entry "row_number")) top-anomalies)))

	;; Build the Flatline filter expression that rejects the given rows.
	;;   rows: list of row numbers to discard.
	;; One "(= (row-number) <n>)" test is generated per row number; the tests
	;; are spliced into a single "or" and the whole disjunction is negated, so
	;; the expression holds only for rows that are not in the list.
	;; NOTE(review): for an empty list the template expands to "(not (or))";
	;; confirm that Flatline accepts an argument-less "or".
	(define (row-filter rows)
	  (let (row-eq (lambda (n) (flatline "(= (row-number) {n})"))
	        eqs (map row-eq rows))
	    (flatline "(not (or @{eqs}))")))

	;; Create a copy of a dataset without its most anomalous rows.
	;;   dataset-id: id of the dataset to clean.
	;;   n: value passed as "top_n" when creating the anomaly detector.
	;; Steps: build an anomaly detector on the dataset, fetch it (excluding
	;; its "trees" and "fields"), derive the row filter from its top
	;; anomalies, log both the rows and the filter, delete the detector, and
	;; finally create a new dataset from the original using the filter.
	;; Returns the result of create-and-wait-dataset.
	(define (normalize-dataset dataset-id n)
	  (let (anomaly-id (create-and-wait-anomaly {"dataset" dataset-id "top_n" n})
	        detector (fetch anomaly-id {"exclude" "trees,fields"})
	        discarded (anomalous-rows detector)
	        keep-expr (row-filter discarded))
	    (log-info "Deleting rows " discarded)
	    (log-info "Using filter " keep-expr)
	    ;; the detector is only a means to find the rows; it could be kept instead
	    (delete anomaly-id)
	    (create-and-wait-dataset {"origin_dataset" dataset-id
	                              "lisp_filter" keep-expr})))

	;; Script entry point: dataset-id and top-n are the script's input
	;; parameters. The value returned by normalize-dataset is bound to
	;; normalized-dataset, the name declared as the script's output in the
	;; accompanying JSON metadata.
	(define normalized-dataset (normalize-dataset dataset-id top-n))