|
;; Given a cluster and a point, find the closest n |
|
;; rows in the cluster's dataset(s). |
|
|
|
;; Given a cluster and an instance with the same fields as its dataset,
;; order the cluster's centroids by increasing distance to the instance.
;;
;; Arguments:
;;   cluster  - cluster resource map. Reads its "scales" entry and, under
;;              "clusters", the "fields" descriptors and the "clusters"
;;              list of centroids (each with an "id" and a "center").
;;   instance - the point to measure against, passed as the second row to
;;              `row-distance-squared`.
;;
;; Returns the list of centroid centers, each extended with a "distance"
;; key (squared distance to the instance) and the centroid's "id", sorted
;; by "distance". Centroids for which `row-distance-squared` yields no
;; number are dropped. A centroid at distance 0 -- one that coincides with
;; the instance -- is kept: it is the closest possible centroid.
(define (ordered-centroids cluster instance)
  (let (descs (get-in cluster ["clusters" "fields"])
        weights (get cluster "scales")
        ;; Sentinel marking centroids whose distance could not be computed.
        unknown -1
        with-distance (lambda (cn)
                        (let (center (get cn "center")
                              d (row-distance-squared descs
                                                      weights
                                                      center
                                                      instance))
                          ;; Test for a number rather than truthiness so a
                          ;; legitimate distance of 0 is never replaced by
                          ;; the sentinel.
                          (assoc center
                                 "distance" (if (number? d) d unknown)
                                 "id" (get cn "id"))))
        ;; Only the negative sentinel is rejected; 0 is a valid distance.
        measurable? (lambda (c) (>= (get c "distance") 0))
        cs (map with-distance (get-in cluster ["clusters" "clusters"]))
        cs (filter measurable? cs))
    (sort-by-key "distance" cs)))
|
|
|
;; Error-signalling helper: raises an error map whose "message" names the
;; input field `id` that is missing and whose "code" is -1.
(define (raise-missing id)
  (let (message (str "Missing input field: " id))
    (raise {"message" message "code" -1})))
|
|
|
;; Builds the flatline expression that generates a new field holding the
;; squared distance from each dataset row to the given instance.
;;
;; For every id in the cluster's "input_fields" it takes the instance's
;; value for that field (raising via `raise-missing` when the instance has
;; none) and the field's weight from the cluster's "scales" (defaulting
;; to 1), then splices both lists into the flatline template.
(define (distance-flatline cluster instance)
  (let (field-ids (get cluster "input_fields")
        scales (get cluster "scales")
        value-of (lambda (fid)
                   (let (v (get instance fid))
                     (if v v (raise-missing fid))))
        weight-of (lambda (fid) (get scales fid 1))
        ;; `ps` and `ws` are referenced by name inside the template below,
        ;; so these two bindings must keep their names.
        ps (map value-of field-ids)
        ws (map weight-of field-ids))
    (flatline "(row-distance-squared (list @{{ps}}) (all) (list @{{ws}}))")))
|
|
|
;; Given a cluster, one of its centroids and the flatline expression
;; produced by `distance-flatline`, creates a new dataset that extends
;; the centroid's dataset with a "distance" column.
;;
;; The centroid's dataset is looked up under the cluster's
;; "cluster_datasets" by centroid id; when no dataset is registered there
;; (missing or empty entry) one is created from the cluster and centroid.
;; Raises when the centroid has no "id". Returns the result of
;; `create-and-wait-dataset` for the extended dataset.
(define (generate-distance-dataset cluster cent fl)
  (let (cluster-id (get cluster "resource")
        found-id (get cent "id")
        cent-id (if found-id found-id (raise (str "No id in " cent)))
        known (get-in cluster ["cluster_datasets" cent-id])
        origin (if (and known (not (empty? known)))
                 ;; The cluster stores the bare id; prefix it to get the
                 ;; full resource id.
                 (str "dataset/" known)
                 (create-and-wait-dataset {"cluster" cluster-id
                                           "centroid" cent-id}))
        distance-field {"name" "distance" "field" fl})
    (create-and-wait-dataset {"origin_dataset" origin
                              "new_fields" [distance-field]})))
|
|
|
;; Given an extended centroid dataset (created by
;; `generate-distance-dataset`), returns up to `n` of its rows, ordered by
;; the dataset's objective field.
;; NOTE(review): this relies on the objective field being the added
;; "distance" column -- confirm against the dataset defaults.
;;
;; A temporary sample is created to read the rows and is deleted before
;; returning. Yields an empty list when the sample carries no rows.
(define (fetch-dataset-instances ds-id n)
  (let (sample-id (create-and-wait-sample {"dataset" ds-id})
        query {"row_order_by" (dataset-get-objective-id ds-id)
               "rows" n
               "mode" "linear"}
        rows (get-in (fetch sample-id query) ["sample" "rows"] []))
    ;; The sample was only needed to read the rows.
    (delete sample-id)
    rows))
|
|
|
;; Final workflow: collects up to `n` rows near `instance` from the
;; datasets of the cluster identified by `cluster-id`.
;;
;; Centroids are visited in the order given by `ordered-centroids`. For
;; each one, a distance-extended dataset is generated and as many rows as
;; are still missing are fetched from it. The walk stops when the
;; centroids run out or no more rows are needed, and the accumulated rows
;; are returned.
(define (find-neighbors cluster-id instance n)
  (let (cluster (fetch cluster-id {"limit" -1})
        fl (distance-flatline cluster instance)
        centroids (ordered-centroids cluster instance))
    (loop (pending centroids found [] missing n)
      (if (or (empty? pending) (< missing 1))
        found
        (let (ds-id (generate-distance-dataset cluster (head pending) fl)
              batch (fetch-dataset-instances ds-id missing))
          (recur (tail pending)
                 (concat found batch)
                 (- missing (count batch))))))))
|
|
|
;; Inputs and outputs |
|
;; Script result: the rows returned by `find-neighbors`.
;; NOTE(review): `cluster-id`, `instance` and `n` are not defined in this
;; file; presumably they are the script's declared inputs -- confirm.
(define rows (find-neighbors cluster-id instance n)) |