sritchie · February 27, 2011 01:21
diff --git a/tseries.clj b/tseries.clj
 ;; ## Example of timeseries aggregation in cascalog.
 ;; 
 ;; (copy paste all of this in at the REPL!)

 (use 'cascalog.api)

 ;; Sample in-memory input for the example queries in this file. The
 ;; queries read each row as [?dataset ?tileid ?tperiod ?chunk]:
 ;; a dataset name, a tile id, a time period and a 4-element chunk of
 ;; values. The rows for "ndvi" tile 1 are deliberately listed out of
 ;; time-period order (0 2 1 4 3) and carry differing chunk values; every
 ;; other group is already in period order with the chunk [1 2 3 4].
 (def tseries [["ndvi" 1 0 [1 2 3 4]]
              ["ndvi" 1 2 [2 3 4 5]]
              ["ndvi" 1 1 [4 3 2 1]]
              ["ndvi" 1 4 [1 2 3 4]]
              ["ndvi" 1 3 [1 2 3 4]]
              ["evi" 2 0 [1 2 3 4]]
              ["evi" 2 1 [1 2 3 4]]
              ["evi" 2 2 [1 2 3 4]]
              ["evi" 2 3 [1 2 3 4]]
              ["evi" 2 4 [1 2 3 4]]
              ["ndvi" 2 0 [1 2 3 4]]
              ["ndvi" 2 1 [1 2 3 4]]
              ["ndvi" 2 2 [1 2 3 4]]
              ["ndvi" 2 3 [1 2 3 4]]
              ["ndvi" 2 4 [1 2 3 4]]
              ["evi" 3 0 [1 2 3 4]]
              ["evi" 3 1 [1 2 3 4]]
              ["evi" 3 2 [1 2 3 4]]
              ["evi" 3 3 [1 2 3 4]]
              ["evi" 3 4 [1 2 3 4]]])

 ;; Buffer operation: renders every incoming tuple with `str`,
 ;; concatenates the pieces in arrival order, and returns a one-element
 ;; vector holding the resulting string.
 (defbufferop tuples->string
  [tuples]
  (->> tuples
       (map str)
       (apply str)
       vector))

 (defn tester-strings
  "Runs a query over `tseries` that, per ?dataset / ?tileid pair, feeds
  the <?tperiod, ?chunk> tuples (sorted by ?tperiod) through
  `tuples->string` and prints the resulting rows to stdout."
  []
  (let [query (<- [?dataset ?tileid ?tuples]
                  (tseries ?dataset ?tileid ?tperiod ?chunk)
                  (:sort ?tperiod)
                  (tuples->string ?tperiod ?chunk :> ?tuples))]
    ;; `?<-` is shorthand for defining a query with `<-` and running it
    ;; with `?-`; the two steps are spelled out here.
    (?- (stdout) query)))

 ;; RESULTS
 ;; -----------------------
 ;; ndvi	1	(0 [1 2 3 4])(1 [4 3 2 1])(2 [2 3 4 5])(3 [1 2 3 4])(4 [1 2 3 4])
 ;; evi	2	(0 [1 2 3 4])(1 [1 2 3 4])(2 [1 2 3 4])(3 [1 2 3 4])(4 [1 2 3 4])
 ;; ndvi	2	(0 [1 2 3 4])(1 [1 2 3 4])(2 [1 2 3 4])(3 [1 2 3 4])(4 [1 2 3 4])
 ;; evi	3	(0 [1 2 3 4])(1 [1 2 3 4])(2 [1 2 3 4])(3 [1 2 3 4])(4 [1 2 3 4])
 ;; -----------------------

 (defbufferop
  ^{:doc "Buffer that pivots a group of <t-period, modis-chunk> tuples
  into one output tuple per chunk position. Each output tuple has the
  form <pixel-idx, t-start, t-end, t-series>: `pixel-idx` is the
  position within the chunk, `t-start` and `t-end` are the first and
  last incoming periods, and `t-series` is a vector holding that
  position's value from every chunk, in arrival order. Incoming tuples
  are expected to be sorted by ascending time period."}
  timeseries [tuples]
  (let [;; transpose the <period, chunk> pairs into two parallel vectors
        [periods chunks] (apply map vector tuples)
        t-start (first periods)
        t-end (last periods)
        pixel-row (fn [pix-idx series]
                    (cons pix-idx [t-start t-end series]))]
    (->> chunks
         ;; transpose again: one vector per pixel position
         (apply map vector)
         (map-indexed pixel-row))))

 (defn tester-tseries
  "Runs a query over `tseries` that, per ?dataset / ?tileid pair, feeds
  the <?tperiod, ?chunk> tuples (sorted by ?tperiod) through the
  `timeseries` buffer and prints one row per pixel index to stdout."
  []
  (let [query (<- [?dataset ?tileid ?pix-idx ?t-start ?t-end ?tseries]
                  (tseries ?dataset ?tileid ?tperiod ?chunk)
                  (:sort ?tperiod)
                  (timeseries ?tperiod ?chunk :> ?pix-idx ?t-start ?t-end ?tseries))]
    ;; `?<-` is shorthand for defining a query with `<-` and running it
    ;; with `?-`; the two steps are spelled out here.
    (?- (stdout) query)))

 ;; RESULTS
 ;; -----------------------
 ;; ndvi	1	0	0	4	[1 4 2 1 1]
 ;; ndvi	1	1	0	4	[2 3 3 2 2]
 ;; ndvi	1	2	0	4	[3 2 4 3 3]
 ;; ndvi	1	3	0	4	[4 1 5 4 4]
 ;; evi	2	0	0	4	[1 1 1 1 1]
 ;; evi	2	1	0	4	[2 2 2 2 2]
 ;; evi	2	2	0	4	[3 3 3 3 3]
 ;; evi	2	3	0	4	[4 4 4 4 4]
 ;; ndvi	2	0	0	4	[1 1 1 1 1]
 ;; ndvi	2	1	0	4	[2 2 2 2 2]
 ;; ndvi	2	2	0	4	[3 3 3 3 3]
 ;; ndvi	2	3	0	4	[4 4 4 4 4]
 ;; evi	3	0	0	4	[1 1 1 1 1]
 ;; evi	3	1	0	4	[2 2 2 2 2]
 ;; evi	3	2	0	4	[3 3 3 3 3]
 ;; evi	3	3	0	4	[4 4 4 4 4]
 ;; -----------------------

 ;; The following version emits int-arrays instead of vectors; these
 ;; serialize to a much smaller size. This is the version we'll use.

 (defbufferop
  ^{:doc "Buffer that pivots a group of <t-period, modis-chunk> tuples
  into one output tuple per chunk position. Each output tuple has the
  form <pixel-idx, t-start, t-end, t-series>: `pixel-idx` is the
  position within the chunk, `t-start` and `t-end` are the first and
  last incoming periods, and `t-series` is an int-array holding that
  position's value from every chunk, in arrival order. Incoming tuples
  are expected to be sorted by ascending time period."}
  timeseries [tuples]
  (let [;; transpose the <period, chunk> pairs into two parallel vectors
        [periods chunks] (apply map vector tuples)
        t-start (first periods)
        t-end (peek periods)
        pixel-row (fn [pix-idx series]
                    (cons pix-idx [t-start t-end (int-array series)]))]
    ;; second transpose yields one vector of values per pixel position
    (map-indexed pixel-row (apply map vector chunks))))

 ;; More complex version, with sparse-expander.
 ;; You'll need the following in your thrift file: https://gist.github.com/1058480
 ;; And these functions (this is the io namespace referenced below): https://gist.github.com/1058476

 (defn sparse-expander
  "Takes in a sequence of 2-tuples of the form `<idx, val>` and
  generates a sparse expansion (a vector) with each `val` inserted at
  its corresponding `idx`. Missing values will be set to the supplied
  placeholder.

  `tuples` is consumed in a single forward pass, so it should be sorted
  by ascending `idx`.

  Options:
    :start  - index of the first output slot. If no starting index is
              supplied, counting begins with the `idx` of the first
              `<idx, val>` pair.
    :length - number of output slots. When supplied, the result holds
              exactly `length` entries; otherwise expansion stops once
              every tuple has been consumed.

  Tuples whose `idx` is smaller than the slot currently being filled
  (an `idx` before `start`, or a repeated `idx`) are skipped without
  consuming an output slot, so later tuples still land at their own
  index."
  [placeholder tuples & {:keys [start length]}]
  (let [start (or start (ffirst tuples))
        halt? (fn [idx tup-seq]
                (if length
                  (>= idx (+ start length))
                  (empty? tup-seq)))]
    (loop [idx start
           tup-seq tuples
           acc (transient [])]
      (let [[[pos value] & more] tup-seq]
        (cond
         (halt? idx tup-seq) (persistent! acc)
         ;; no tuple left (only reachable with :length): pad the slot
         (nil? pos) (recur (inc idx) tup-seq (conj! acc placeholder))
         (= idx pos) (recur (inc idx) more (conj! acc value))
         ;; stale tuple: drop it, but keep filling the same slot so the
         ;; tuples that follow are not pushed out of position
         (> idx pos) (recur idx more acc)
         ;; next tuple lies further ahead: pad the slot and keep the tuple
         :else (recur (inc idx) tup-seq (conj! acc placeholder)))))))

 (defbufferop [timeseries [missing-val]]
  "Parameterized buffer that pivots a group of `<t-period, modis-chunk>`
  tuples, sorted by ascending `t-period`, into one output tuple per
  chunk position, formatted as <pixel-idx, t-start, t-end, t-series>.
  `t-start` and `t-end` are the first and last incoming periods, and
  `t-series` is whatever `io/to-struct` builds from that position's
  values (per the original notes, a `forma.schema.DoubleArray`).

  Periods missing between `t-start` and `t-end` are filled in by
  `sparse-expander` with a chunk made of `missing-val` repeated once per
  value of the first incoming chunk.

  `modis-chunk` fields must be vectors or instances of
  `forma.schema.DoubleArray` or `forma.schema.IntArray`, as dictated by
  the Thriftable interface in `forma.hadoop.io`."
  [tuples]
  (let [;; periods of every tuple, plus the first chunk as a size sample
        [periods [sample-chunk]] (apply map vector tuples)
        t-start (first periods)
        t-end (peek periods)
        filler (io/to-struct (repeat (io/count-vals sample-chunk) missing-val))
        expanded (sparse-expander filler tuples :start t-start)
        pixel-row (fn [pix-idx series]
                    (cons pix-idx [t-start t-end (io/to-struct series)]))]
    ;; unpack each chunk, then transpose to one vector per pixel position
    (map-indexed pixel-row
                 (apply map vector (map io/get-vals expanded)))))
	;; ## Example of timeseries aggregation in cascalog.
	;;
	;; (copy paste all of this in at the REPL!)

	(use 'cascalog.api)

	;; Sample in-memory input for the example queries in this file. The
	;; queries read each row as [?dataset ?tileid ?tperiod ?chunk]:
	;; a dataset name, a tile id, a time period and a 4-element chunk of
	;; values. The rows for "ndvi" tile 1 are deliberately listed out of
	;; time-period order (0 2 1 4 3) and carry differing chunk values; every
	;; other group is already in period order with the chunk [1 2 3 4].
	(def tseries [["ndvi" 1 0 [1 2 3 4]]
	["ndvi" 1 2 [2 3 4 5]]
	["ndvi" 1 1 [4 3 2 1]]
	["ndvi" 1 4 [1 2 3 4]]
	["ndvi" 1 3 [1 2 3 4]]
	["evi" 2 0 [1 2 3 4]]
	["evi" 2 1 [1 2 3 4]]
	["evi" 2 2 [1 2 3 4]]
	["evi" 2 3 [1 2 3 4]]
	["evi" 2 4 [1 2 3 4]]
	["ndvi" 2 0 [1 2 3 4]]
	["ndvi" 2 1 [1 2 3 4]]
	["ndvi" 2 2 [1 2 3 4]]
	["ndvi" 2 3 [1 2 3 4]]
	["ndvi" 2 4 [1 2 3 4]]
	["evi" 3 0 [1 2 3 4]]
	["evi" 3 1 [1 2 3 4]]
	["evi" 3 2 [1 2 3 4]]
	["evi" 3 3 [1 2 3 4]]
	["evi" 3 4 [1 2 3 4]]])

	;; Buffer operation: renders every incoming tuple with `str`,
	;; concatenates the pieces in arrival order, and returns a one-element
	;; vector holding the resulting string.
	(defbufferop tuples->string
	  [tuples]
	  (->> tuples
	       (map str)
	       (apply str)
	       vector))

	(defn tester-strings
	  "Runs a query over `tseries` that, per ?dataset / ?tileid pair, feeds
	  the <?tperiod, ?chunk> tuples (sorted by ?tperiod) through
	  `tuples->string` and prints the resulting rows to stdout."
	  []
	  (let [query (<- [?dataset ?tileid ?tuples]
	                  (tseries ?dataset ?tileid ?tperiod ?chunk)
	                  (:sort ?tperiod)
	                  (tuples->string ?tperiod ?chunk :> ?tuples))]
	    ;; `?<-` is shorthand for defining a query with `<-` and running it
	    ;; with `?-`; the two steps are spelled out here.
	    (?- (stdout) query)))

	;; RESULTS
	;; -----------------------
	;; ndvi 1 (0 [1 2 3 4])(1 [4 3 2 1])(2 [2 3 4 5])(3 [1 2 3 4])(4 [1 2 3 4])
	;; evi 2 (0 [1 2 3 4])(1 [1 2 3 4])(2 [1 2 3 4])(3 [1 2 3 4])(4 [1 2 3 4])
	;; ndvi 2 (0 [1 2 3 4])(1 [1 2 3 4])(2 [1 2 3 4])(3 [1 2 3 4])(4 [1 2 3 4])
	;; evi 3 (0 [1 2 3 4])(1 [1 2 3 4])(2 [1 2 3 4])(3 [1 2 3 4])(4 [1 2 3 4])
	;; -----------------------

	(defbufferop
	  ^{:doc "Buffer that pivots a group of <t-period, modis-chunk> tuples
	  into one output tuple per chunk position. Each output tuple has the
	  form <pixel-idx, t-start, t-end, t-series>: `pixel-idx` is the
	  position within the chunk, `t-start` and `t-end` are the first and
	  last incoming periods, and `t-series` is a vector holding that
	  position's value from every chunk, in arrival order. Incoming tuples
	  are expected to be sorted by ascending time period."}
	  timeseries [tuples]
	  (let [;; transpose the <period, chunk> pairs into two parallel vectors
	        [periods chunks] (apply map vector tuples)
	        t-start (first periods)
	        t-end (last periods)
	        pixel-row (fn [pix-idx series]
	                    (cons pix-idx [t-start t-end series]))]
	    (->> chunks
	         ;; transpose again: one vector per pixel position
	         (apply map vector)
	         (map-indexed pixel-row))))

	(defn tester-tseries
	  "Runs a query over `tseries` that, per ?dataset / ?tileid pair, feeds
	  the <?tperiod, ?chunk> tuples (sorted by ?tperiod) through the
	  `timeseries` buffer and prints one row per pixel index to stdout."
	  []
	  (let [query (<- [?dataset ?tileid ?pix-idx ?t-start ?t-end ?tseries]
	                  (tseries ?dataset ?tileid ?tperiod ?chunk)
	                  (:sort ?tperiod)
	                  (timeseries ?tperiod ?chunk :> ?pix-idx ?t-start ?t-end ?tseries))]
	    ;; `?<-` is shorthand for defining a query with `<-` and running it
	    ;; with `?-`; the two steps are spelled out here.
	    (?- (stdout) query)))

	;; RESULTS
	;; -----------------------
	;; ndvi 1 0 0 4 [1 4 2 1 1]
	;; ndvi 1 1 0 4 [2 3 3 2 2]
	;; ndvi 1 2 0 4 [3 2 4 3 3]
	;; ndvi 1 3 0 4 [4 1 5 4 4]
	;; evi 2 0 0 4 [1 1 1 1 1]
	;; evi 2 1 0 4 [2 2 2 2 2]
	;; evi 2 2 0 4 [3 3 3 3 3]
	;; evi 2 3 0 4 [4 4 4 4 4]
	;; ndvi 2 0 0 4 [1 1 1 1 1]
	;; ndvi 2 1 0 4 [2 2 2 2 2]
	;; ndvi 2 2 0 4 [3 3 3 3 3]
	;; ndvi 2 3 0 4 [4 4 4 4 4]
	;; evi 3 0 0 4 [1 1 1 1 1]
	;; evi 3 1 0 4 [2 2 2 2 2]
	;; evi 3 2 0 4 [3 3 3 3 3]
	;; evi 3 3 0 4 [4 4 4 4 4]
	;; -----------------------

	;; The following version emits int-arrays instead of vectors; these
	;; serialize to a much smaller size. This is the version we'll use.

	(defbufferop
	  ^{:doc "Buffer that pivots a group of <t-period, modis-chunk> tuples
	  into one output tuple per chunk position. Each output tuple has the
	  form <pixel-idx, t-start, t-end, t-series>: `pixel-idx` is the
	  position within the chunk, `t-start` and `t-end` are the first and
	  last incoming periods, and `t-series` is an int-array holding that
	  position's value from every chunk, in arrival order. Incoming tuples
	  are expected to be sorted by ascending time period."}
	  timeseries [tuples]
	  (let [;; transpose the <period, chunk> pairs into two parallel vectors
	        [periods chunks] (apply map vector tuples)
	        t-start (first periods)
	        t-end (peek periods)
	        pixel-row (fn [pix-idx series]
	                    (cons pix-idx [t-start t-end (int-array series)]))]
	    ;; second transpose yields one vector of values per pixel position
	    (map-indexed pixel-row (apply map vector chunks))))

	;; More complex version, with sparse-expander.
	;; You'll need the following in your thrift file: https://gist.github.com/1058480
	;; And these functions (this is the io namespace referenced below): https://gist.github.com/1058476

	(defn sparse-expander
	  "Takes in a sequence of 2-tuples of the form `<idx, val>` and
	  generates a sparse expansion (a vector) with each `val` inserted at
	  its corresponding `idx`. Missing values will be set to the supplied
	  placeholder.

	  `tuples` is consumed in a single forward pass, so it should be sorted
	  by ascending `idx`.

	  Options:
	    :start  - index of the first output slot. If no starting index is
	              supplied, counting begins with the `idx` of the first
	              `<idx, val>` pair.
	    :length - number of output slots. When supplied, the result holds
	              exactly `length` entries; otherwise expansion stops once
	              every tuple has been consumed.

	  Tuples whose `idx` is smaller than the slot currently being filled
	  (an `idx` before `start`, or a repeated `idx`) are skipped without
	  consuming an output slot, so later tuples still land at their own
	  index."
	  [placeholder tuples & {:keys [start length]}]
	  (let [start (or start (ffirst tuples))
	        halt? (fn [idx tup-seq]
	                (if length
	                  (>= idx (+ start length))
	                  (empty? tup-seq)))]
	    (loop [idx start
	           tup-seq tuples
	           acc (transient [])]
	      (let [[[pos value] & more] tup-seq]
	        (cond
	         (halt? idx tup-seq) (persistent! acc)
	         ;; no tuple left (only reachable with :length): pad the slot
	         (nil? pos) (recur (inc idx) tup-seq (conj! acc placeholder))
	         (= idx pos) (recur (inc idx) more (conj! acc value))
	         ;; stale tuple: drop it, but keep filling the same slot so the
	         ;; tuples that follow are not pushed out of position
	         (> idx pos) (recur idx more acc)
	         ;; next tuple lies further ahead: pad the slot and keep the tuple
	         :else (recur (inc idx) tup-seq (conj! acc placeholder)))))))

	(defbufferop [timeseries [missing-val]]
	  "Parameterized buffer that pivots a group of `<t-period, modis-chunk>`
	  tuples, sorted by ascending `t-period`, into one output tuple per
	  chunk position, formatted as <pixel-idx, t-start, t-end, t-series>.
	  `t-start` and `t-end` are the first and last incoming periods, and
	  `t-series` is whatever `io/to-struct` builds from that position's
	  values (per the original notes, a `forma.schema.DoubleArray`).

	  Periods missing between `t-start` and `t-end` are filled in by
	  `sparse-expander` with a chunk made of `missing-val` repeated once per
	  value of the first incoming chunk.

	  `modis-chunk` fields must be vectors or instances of
	  `forma.schema.DoubleArray` or `forma.schema.IntArray`, as dictated by
	  the Thriftable interface in `forma.hadoop.io`."
	  [tuples]
	  (let [;; periods of every tuple, plus the first chunk as a size sample
	        [periods [sample-chunk]] (apply map vector tuples)
	        t-start (first periods)
	        t-end (peek periods)
	        filler (io/to-struct (repeat (io/count-vals sample-chunk) missing-val))
	        expanded (sparse-expander filler tuples :start t-start)
	        pixel-row (fn [pix-idx series]
	                    (cons pix-idx [t-start t-end (io/to-struct series)]))]
	    ;; unpack each chunk, then transpose to one vector per pixel position
	    (map-indexed pixel-row
	                 (apply map vector (map io/get-vals expanded)))))