Created
February 27, 2011 01:21
-
-
Save sritchie/845813 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
;; ## Example of timeseries aggregation in Cascalog.
;;
;; (Copy and paste all of this in at the REPL!)
(use 'cascalog.api) | |
;; Sample source tuples, one per row, laid out as
;;   [dataset tile-id time-period chunk]
;; where `chunk` is a vector holding one value per pixel. The first group
;; is deliberately listed out of time-period order so that the queries
;; below have to sort.
(def tseries
  [;; ndvi, tile 1 (time periods out of order, varying chunks)
   ["ndvi" 1 0 [1 2 3 4]]
   ["ndvi" 1 2 [2 3 4 5]]
   ["ndvi" 1 1 [4 3 2 1]]
   ["ndvi" 1 4 [1 2 3 4]]
   ["ndvi" 1 3 [1 2 3 4]]
   ;; evi, tile 2
   ["evi"  2 0 [1 2 3 4]]
   ["evi"  2 1 [1 2 3 4]]
   ["evi"  2 2 [1 2 3 4]]
   ["evi"  2 3 [1 2 3 4]]
   ["evi"  2 4 [1 2 3 4]]
   ;; ndvi, tile 2
   ["ndvi" 2 0 [1 2 3 4]]
   ["ndvi" 2 1 [1 2 3 4]]
   ["ndvi" 2 2 [1 2 3 4]]
   ["ndvi" 2 3 [1 2 3 4]]
   ["ndvi" 2 4 [1 2 3 4]]
   ;; evi, tile 3
   ["evi"  3 0 [1 2 3 4]]
   ["evi"  3 1 [1 2 3 4]]
   ["evi"  3 2 [1 2 3 4]]
   ["evi"  3 3 [1 2 3 4]]
   ["evi"  3 4 [1 2 3 4]]])
(defbufferop
  ^{:doc "Concatenates the `str` form of every tuple in the group, in
  the order received, and emits the result as a single 1-tuple."}
  tuples->string [tuples]
  ;; `str` already stringifies each of its arguments before joining them,
  ;; so no separate per-tuple `str` pass is needed.
  [(apply str tuples)])
(defn tester-strings
  "Runs a query over `tseries` that, for every dataset/tile-id pair,
  sends one string to the `(stdout)` tap. The string is built by
  `tuples->string` from that group's <t-period, chunk> tuples, sorted
  by t-period."
  []
  (?- (stdout)
      (<- [?dataset ?tileid ?tuples]
          (tseries ?dataset ?tileid ?tperiod ?chunk)
          (:sort ?tperiod)
          (tuples->string ?tperiod ?chunk :> ?tuples))))
;; RESULTS
;; -----------------------
;; ndvi 1 (0 [1 2 3 4])(1 [4 3 2 1])(2 [2 3 4 5])(3 [1 2 3 4])(4 [1 2 3 4])
;; evi 2 (0 [1 2 3 4])(1 [1 2 3 4])(2 [1 2 3 4])(3 [1 2 3 4])(4 [1 2 3 4])
;; ndvi 2 (0 [1 2 3 4])(1 [1 2 3 4])(2 [1 2 3 4])(3 [1 2 3 4])(4 [1 2 3 4])
;; evi 3 (0 [1 2 3 4])(1 [1 2 3 4])(2 [1 2 3 4])(3 [1 2 3 4])(4 [1 2 3 4])
;; -----------------------
(defbufferop
  ^{:doc "Receives a group of <t-period, modis-chunk> tuples, already
  sorted by ascending time period, and transposes them: for each of the
  n positions in a chunk it emits one 4-tuple
  <pixel-idx, t-start, t-end, t-series>. `t-start` and `t-end` are the
  first and last periods of the group, and `t-series` is a vector of
  that pixel's value in every chunk, in period order."}
  timeseries [tuples]
  (let [periods  (map first tuples)
        chunks   (map second tuples)
        t-start  (first periods)
        t-end    (last periods)
        ;; Transpose: one vector per pixel position across all chunks.
        per-pixel (apply map vector chunks)]
    (map-indexed (fn [pix-idx t-series]
                   (cons pix-idx [t-start t-end t-series]))
                 per-pixel)))
(defn tester-tseries
  "Runs a query over `tseries` that groups by dataset and tile-id, sorts
  each group by t-period, feeds it through the `timeseries` buffer and
  sends one <pix-idx, t-start, t-end, tseries> row per pixel to the
  `(stdout)` tap."
  []
  (?- (stdout)
      (<- [?dataset ?tileid ?pix-idx ?t-start ?t-end ?tseries]
          (tseries ?dataset ?tileid ?tperiod ?chunk)
          (:sort ?tperiod)
          (timeseries ?tperiod ?chunk :> ?pix-idx ?t-start ?t-end ?tseries))))
;; RESULTS
;; -----------------------
;; ndvi 1 0 0 4 [1 4 2 1 1]
;; ndvi 1 1 0 4 [2 3 3 2 2]
;; ndvi 1 2 0 4 [3 2 4 3 3]
;; ndvi 1 3 0 4 [4 1 5 4 4]
;; evi 2 0 0 4 [1 1 1 1 1]
;; evi 2 1 0 4 [2 2 2 2 2]
;; evi 2 2 0 4 [3 3 3 3 3]
;; evi 2 3 0 4 [4 4 4 4 4]
;; ndvi 2 0 0 4 [1 1 1 1 1]
;; ndvi 2 1 0 4 [2 2 2 2 2]
;; ndvi 2 2 0 4 [3 3 3 3 3]
;; ndvi 2 3 0 4 [4 4 4 4 4]
;; evi 3 0 0 4 [1 1 1 1 1]
;; evi 3 1 0 4 [2 2 2 2 2]
;; evi 3 2 0 4 [3 3 3 3 3]
;; evi 3 3 0 4 [4 4 4 4 4]
;; -----------------------
;; The following version produces int-arrays instead of vectors, which
;; serialize to a much smaller size. This is the version we'll use.
(defbufferop
  ^{:doc "Receives a group of <t-period, modis-chunk> tuples, already
  sorted by ascending time period, and transposes them: for each of the
  n positions in a chunk it emits one 4-tuple
  <pixel-idx, t-start, t-end, t-series>. `t-start` and `t-end` are the
  first and last periods of the group, and `t-series` is an int-array
  of that pixel's value in every chunk, in period order."}
  timeseries [tuples]
  (let [[periods chunks] (apply map vector tuples)
        t-start (first periods)
        ;; `periods` is a vector here, so `peek` returns its last element.
        t-end   (peek periods)
        pixel-row (fn [& pixel-vals]
                    [t-start t-end (int-array pixel-vals)])]
    (map-indexed cons (apply map pixel-row chunks))))
;; More complex version, using `sparse-expander` to fill in missing time periods.
;; You'll need the following in your thrift file: https://gist.github.com/1058480
;; And these functions (this is the `io` namespace referenced below): https://gist.github.com/1058476
(defn sparse-expander
  "Takes in a sequence of 2-tuples of the form `<idx, val>`, sorted by
  ascending `idx`, and generates a sparse expansion (a vector) with each
  `val` inserted at its corresponding `idx`. Missing values will be set
  to the supplied `placeholder`.

  Options:

    :start  - index that the first slot of the result corresponds to.
              Defaults to the `idx` of the first tuple (or 0 when
              `tuples` is empty).
    :length - exact number of slots to produce. When omitted, expansion
              stops as soon as the tuples are exhausted.

  Tuples whose `idx` lies before the current position (an `idx` below
  `:start`, or a repeated `idx`) are dropped; they neither produce a
  slot nor shift later values."
  [placeholder tuples & {:keys [start length]}]
  (let [start (or start (ffirst tuples) 0)
        halt? (fn [idx tup-seq]
                (if length
                  (>= idx (+ start length))
                  (empty? tup-seq)))]
    (loop [idx     start
           tup-seq tuples
           acc     (transient [])]
      (let [[[pos value] & more] tup-seq]
        (cond
          (halt? idx tup-seq)
          (persistent! acc)

          ;; The tuple belongs in the current slot.
          (and pos (= idx pos))
          (recur (inc idx) more (conj! acc value))

          ;; Stale tuple: its slot has already been passed. Discard it
          ;; without emitting anything or moving the cursor, so the
          ;; tuples that follow still land on their own indices.
          (and pos (> idx pos))
          (recur idx more acc)

          ;; Gap before the next tuple (or padding up to :length).
          :else
          (recur (inc idx) tup-seq (conj! acc placeholder)))))))
(defbufferop [timeseries [missing-val]]
  "Receives a group of `<t-period, modis-chunk>` tuples, sorted by
  `t-period` in ascending order, and transposes them into
  (n = chunk-size) 4-tuples of the form
  <pixel-idx, t-start, t-end, t-series>. Each `t-series` is whatever
  `io/to-struct` builds from a vector of that pixel's values (an
  instance of `forma.schema.DoubleArray`).

  Time periods absent from the group are filled in, via
  `sparse-expander`, with a chunk of the same size as the first chunk
  in which every value is `missing-val`.

  `modis-chunk` tuple fields must be vectors or instances of
  `forma.schema.DoubleArray` or `forma.schema.IntArray`, as dictated by
  the Thriftable interface in `forma.hadoop.io`."
  [tuples]
  (let [[periods [sample-chunk]] (apply map vector tuples)
        t-start (first periods)
        ;; `periods` is a vector here, so `peek` returns its last element.
        t-end   (peek periods)
        filler  (io/to-struct (repeat (io/count-vals sample-chunk)
                                      missing-val))
        expanded  (sparse-expander filler tuples :start t-start)
        pixel-row (fn [& pixel-vals]
                    [t-start t-end (io/to-struct (vec pixel-vals))])]
    (map-indexed cons
                 (apply map pixel-row (map io/get-vals expanded)))))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment