welch · August 29, 2015 14:19
diff --git a/trend.juttle b/trend.juttle
 // model trends in a series via linear regression.
 //
 // Exported subs:
 // fit: do a least-squares fit of a line to successive batches of points,
 //      and calculate the trend's value at each point. A fit is computed for each batch.
 //     ... | fit -in 'cpu' -every :2h: -over :8h: -t0 :2014-01-01: -out 'trend';
 // fit_initial: do a least-squares fit of a line to an initial window of points,
 //      and calculate this trend's value at subsequent points. A fit is computed once.
 //     ... | fit_initial -in 'cpu' -over :2h: -t0 :2014-01-01: -out 'trend';
 // fit_trailing: do a least-squares fit of a line to a moving window of points
 //      and calculate the trend value at each point. A fit is computed from each point.
 //     ... | fit_trailing -in 'cpu' -over :2h: -t0 :2014-01-01: -out 'trend';
 // rate: robust rate of change estimation by fitting a trend to trailing data.
 //     ... | rate -in 'cpu' -dt :5m: -t0 :2014-01-01: -out 'rate' ;
 // change: trend-based change over an interval. This is rate * dt and is in the same units as the input.
 //     ... | change -in 'cpu' -dt :5m: -t0 :2014-01-01: -out 'change' ;
 // subtract: de-trend a series of points or successive batches of points.
 //     ... | subtract -in 'cpu' -every :2h: -over :8h: -t0 :2014-01-01: -out 'detrend_2h' ;
 // subtract_trailing: de-trend a series of points by re-fitting at each point.
 //     ... | subtract_trailing -in 'cpu' -over :2h: -t0 :2014-01-01: -out 'detrend_2h' ;
 //
 // Exported reducer:
 // fit_coeffs: linear regression on a timeseries
 //     return coefficients of least-squares fit of a line to a specified field on a batch of points.
 //     ... | reduce line = coeffs(y, t0)
 //         | put y0 = value(t0, coeffs), y1 = value(t1, coeffs)
 //         | ...
 //////////////////////////////////////////////////////////////////////////////////////
 //
 // math utilities
 //
 // Exported subs:
 // 
 // Exported functions:
 // inv2x2(a,b,c,d): invert 2x2 matrix
 // isFinite(x): is x a good number?
 //
 // Exported reducers:
 // 
 /////////////////////////////////////////////////////////////////////////
 //
 // inv2x2:
 // invert a 2x2 matrix | a, b ; c, d | (row major order)
 // 
 export function inv2x2(a, b, c, d) {
    var det = (a * d - b * c) ;
    return [d / det, -b / det, -c / det, a / det] ; 
 }
 //
 // isFinite:
 // verify x is a good number.
 // 
 export function isFinite(x) {
    return (x == x && x != null && x != x + 1);
 }
 //
 // fit_coeffs: Linear regression vs time, for a batch or window of points.
 // parameters:
 //    y -- field to fit
 //   t0 -- time origin for first fitted line
 // returns vector [b, m, t0, n]
 //    b -- yintercept, m: slope (per second), n: number of points fit.
 //
 export reducer fit_coeffs(y, t0) {
    var initial = true ;
    var T, Y, T2, TY; // accumulate t, y, ... etc
    var n;
    
    function update() {
        if (initial) {
            T = Y = T2 = TY = n = 0;
            initial = false;
        }
        var t = Duration.seconds(*"time" - t0) ;
        T = T + t;
        Y = Y + *y;
        T2 = T2 + t * t;
        TY = TY + t * *y;
        n = n + 1;
    }
    function expire() {
        var t = Duration.seconds(*"time" - t0) ;
        T = T - t;
        Y = Y - *y;
        T2 = T2 - t * t;
        TY = TY - t * *y;
        n = n - 1;
    }
    function result() {
        var inv = inv2x2(T2, T, T, n) ;
        var m = inv[0] * TY + inv[1] * Y;
        var b = inv[2] * TY + inv[3] * Y;
        return [b, m, t0, n];
    }
    function reset() {
        initial = true;
    }
 }
 //
 // value:
 // evaluate the trendline represented by coeffs at the given time
 //
 export function value(t, coeffs, default) {
    if (isFinite(coeffs[0]) && isFinite(coeffs[1])) {
        return coeffs[0] + Duration.seconds(t - coeffs[2]) * coeffs[1];
    } else {
        return default;
    }
 }
 //
 // fit:
 // construct a sequence of linear fits over epochs specified by -every@t0.
 // output each point's projection onto the fitted line covering its epoch.
 // parameters:
 //   in -- input field
 //   every -- re-fitting interval
 //   over -- data window for fitting (>= every)
 //   t0 -- time origin for first fitted line
 //   out -- output field for projected line's value on each point
 //
 export sub fit(in, every, over, t0, out) {
    (
        reduce -every every -over over -on t0 __fit_tmp=fit_coeffs(in, t0)
        // XXX: we don't have a lag operator, so use pace to send the result back in
        // time to beginning of its batch so it can be joined with the data it was fitted to.
            | pace -from t0 -every :0s: ;
        merge ;
    )| join
        | put *out = value(time, __fit_tmp, *in)
        | remove __fit_tmp
 }
 //
 // fit_initial:
 // fit coefficients only once against the initial window,
 // thereafter projecting against this same trend as time advances.
 // For periodic data, this is a little dangerous if not run against an even
 // multiple of the periodicity, because the extra partial-period data
 // gives rise to a nonzero trend (phase-shift). But really, if you know you
 // have periodic data, you shouldn't be fitting a trend!
 //
 // parameters:
 //   in -- input field
 //   initial -- duration of initial batch to fit
 //   t0 -- time origin for initial batch
 //   out -- output field for projected line's value on each point
 //
 export sub fit_initial(in, initial, t0, out) {
    (
        reduce -every initial -over initial -on t0 -from t0 -to t0+initial __fit_tmp=fit_coeffs(in, t0)
        // XXX: we don't have a lag operator, so use pace to send the result back in
        // time to beginning of its batch so it can be joined with the data it was fitted to.
            | pace -from t0 -every :0s: ;
        merge ;
    )| join
        | put *out = value(time, __fit_tmp, *in)
        | remove __fit_tmp
 }
 //
 // fit_trailing:
 // construct a linear fit over a trailing window of points
 // and put its value on the incoming point. A new fit is computed at each point.
 // parameters:
 //   in -- input field
 //   over -- trailing window to fit trend over
 //   t0 -- time origin for first fitted line
 //   out -- output field for projected line's value
 //
 export sub fit_trailing(in, over, t0, out) {
    put -over over __fit_tmp = fit_coeffs(in, t0)
        | put *out = value(time, __fit_tmp, *in)
        | remove __fit_tmp
 }
 //
 // subtract:
 // de-trend intervals of points by subtracting their MSE linear fit.
 // There will be a trend discontinuity at each refitting interval, so if
 // this is a concern you should use subtract_trailing
 // parameters:
 //   in -- field to de-trend
 //   every -- re-fitting interval
 //   over -- data window for fitting (>= every)
 //   t0 -- earliest time
 //   out -- output field for detrended value on each point
 //
 export sub subtract(in, every, over, t0, out) {
    fit -in in -every every -over over -t0 t0 -out '__subtract_tmp'
        | put *out = *in - __subtract_tmp
        | remove __subtract_tmp
 }
 //
 // subtract_trailing:
 // de-trend intervals of points by subtracting the MSE linear fit 
 // as each point arrives. 
 // parameters:
 //   in -- field to de-trend
 //   over -- data window for fitting
 //   t0 -- earliest time
 //   out -- output field for detrended value on each point
 //
 export sub subtract_trailing(in, over, t0, out) {
    fit_trailing -in in -over over -t0 t0 -out '__subtract_tmp'
        | put *out = *in - __subtract_tmp
        | remove __subtract_tmp
 }
 //
 // rate:
 // robustly estimate rate of change by fitting trendlines to a short moving
 // window of points anchored at each successive point, and returning their slopes.
 // parameters:
 //   in -- field to rate-estimate
 //   dt -- time window for rate estimator (longer values give smoother derivatives)
 //   t0 -- earliest time
 //   out -- output field for rate (in/second) on each point
 //
 export sub rate(in, dt, t0, out) {
    (
        reduce -every dt -over dt -on t0 __dt_tmp=fit_coeffs(in, t0)
        // XXX: we don't have a lag operator, so use pace to send the result back in
        // time to beginning of its batch so it can be joined with the data it was fitted to.
            | pace -from t0 -every :0s: ;
        merge ;
    )| join
        | put *out = isFinite(__dt_tmp[1]) ? __dt_tmp[1] : null
        | remove __dt_tmp
 }
 //
 // change:
 // estimate the amount of change over an interval due to a linear trend over the interval.
 // parameters:
 //   in -- field to rate-estimate
 //   dt -- time window for rate estimator (longer values give smoother derivatives)
 //   t0 -- earliest time
 //   out -- output field for amount change in this interval due to linear trend.
 //
 export sub change (in, dt, t0, out) {
    rate -in  in -dt dt -t0 t0 -out out 
        | put *out = *out * Duration.seconds(dt)
 }
	// model trends in a series via linear regression.
	//
	// Exported subs:
	// fit: do a least-squares fit of a line to successive batches of points,
	// and calculate the trend's value at each point. A fit is computed for each batch.
	// ... \| fit -in 'cpu' -every :2h: -over :8h: -t0 :2014-01-01: -out 'trend';
	// fit_initial: do a least-squares fit of a line to an initial window of points,
	// and calculate this trend's value at subsequent points. A fit is computed once.
	// ... \| fit_initial -in 'cpu' -over :2h: -t0 :2014-01-01: -out 'trend';
	// fit_trailing: do a least-squares fit of a line to a moving window of points
	// and calculate the trend value at each point. A fit is computed from each point.
	// ... \| fit_trailing -in 'cpu' -over :2h: -t0 :2014-01-01: -out 'trend';
	// rate: robust rate of change estimation by fitting a trend to trailing data.
	// ... \| rate -in 'cpu' -dt :5m: -t0 :2014-01-01: -out 'rate' ;
	// change: trend-based change over an interval. This is rate * dt and is in the same units as the input.
	// ... \| change -in 'cpu' -dt :5m: -t0 :2014-01-01: -out 'change' ;
	// subtract: de-trend a series of points or successive batches of points.
	// ... \| subtract -in 'cpu' -every :2h: -over :8h: -t0 :2014-01-01: -out 'detrend_2h' ;
	// subtract_trailing: de-trend a series of points by re-fitting at each point.
	// ... \| subtract_trailing -in 'cpu' -over :2h: -t0 :2014-01-01: -out 'detrend_2h' ;
	//
	// Exported reducer:
	// fit_coeffs: linear regression on a timeseries
	// return coefficients of least-squares fit of a line to a specified field on a batch of points.
	// ... \| reduce line = coeffs(y, t0)
	// \| put y0 = value(t0, coeffs), y1 = value(t1, coeffs)
	// \| ...
	//////////////////////////////////////////////////////////////////////////////////////
	//
	// math utilities
	//
	// Exported subs:
	//
	// Exported functions:
	// inv2x2(a,b,c,d): invert 2x2 matrix
	// isFinite(x): is x a good number?
	//
	// Exported reducers:
	//
	/////////////////////////////////////////////////////////////////////////
	//
	// inv2x2:
	// invert a 2x2 matrix \| a, b ; c, d \| (row major order)
	//
	export function inv2x2(a, b, c, d) {
	var det = (a * d - b * c) ;
	return [d / det, -b / det, -c / det, a / det] ;
	}
	//
	// isFinite:
	// verify x is a good number.
	//
	export function isFinite(x) {
	return (x == x && x != null && x != x + 1);
	}
	//
	// fit_coeffs: Linear regression vs time, for a batch or window of points.
	// parameters:
	// y -- field to fit
	// t0 -- time origin for first fitted line
	// returns vector [b, m, t0, n]
	// b -- yintercept, m: slope (per second), n: number of points fit.
	//
	export reducer fit_coeffs(y, t0) {
	var initial = true ;
	var T, Y, T2, TY; // accumulate t, y, ... etc
	var n;

	function update() {
	if (initial) {
	T = Y = T2 = TY = n = 0;
	initial = false;
	}
	var t = Duration.seconds(*"time" - t0) ;
	T = T + t;
	Y = Y + *y;
	T2 = T2 + t * t;
	TY = TY + t * *y;
	n = n + 1;
	}
	function expire() {
	var t = Duration.seconds(*"time" - t0) ;
	T = T - t;
	Y = Y - *y;
	T2 = T2 - t * t;
	TY = TY - t * *y;
	n = n - 1;
	}
	function result() {
	var inv = inv2x2(T2, T, T, n) ;
	var m = inv[0] * TY + inv[1] * Y;
	var b = inv[2] * TY + inv[3] * Y;
	return [b, m, t0, n];
	}
	function reset() {
	initial = true;
	}
	}
	//
	// value:
	// evaluate the trendline represented by coeffs at the given time
	//
	export function value(t, coeffs, default) {
	if (isFinite(coeffs[0]) && isFinite(coeffs[1])) {
	return coeffs[0] + Duration.seconds(t - coeffs[2]) * coeffs[1];
	} else {
	return default;
	}
	}
	//
	// fit:
	// construct a sequence of linear fits over epochs specified by -every@t0.
	// output each point's projection onto the fitted line covering its epoch.
	// parameters:
	// in -- input field
	// every -- re-fitting interval
	// over -- data window for fitting (>= every)
	// t0 -- time origin for first fitted line
	// out -- output field for projected line's value on each point
	//
	export sub fit(in, every, over, t0, out) {
	(
	reduce -every every -over over -on t0 __fit_tmp=fit_coeffs(in, t0)
	// XXX: we don't have a lag operator, so use pace to send the result back in
	// time to beginning of its batch so it can be joined with the data it was fitted to.
	\| pace -from t0 -every :0s: ;
	merge ;
	)\| join
	\| put out = value(time, __fit_tmp, in)
	\| remove __fit_tmp
	}
	//
	// fit_initial:
	// fit coefficients only once against the initial window,
	// thereafter projecting against this same trend as time advances.
	// For periodic data, this is a little dangerous if not run against an even
	// multiple of the periodicity, because the extra partial-period data
	// gives rise to a nonzero trend (phase-shift). But really, if you know you
	// have periodic data, you shouldn't be fitting a trend!
	//
	// parameters:
	// in -- input field
	// initial -- duration of initial batch to fit
	// t0 -- time origin for initial batch
	// out -- output field for projected line's value on each point
	//
	export sub fit_initial(in, initial, t0, out) {
	(
	reduce -every initial -over initial -on t0 -from t0 -to t0+initial __fit_tmp=fit_coeffs(in, t0)
	// XXX: we don't have a lag operator, so use pace to send the result back in
	// time to beginning of its batch so it can be joined with the data it was fitted to.
	\| pace -from t0 -every :0s: ;
	merge ;
	)\| join
	\| put out = value(time, __fit_tmp, in)
	\| remove __fit_tmp
	}
	//
	// fit_trailing:
	// construct a linear fit over a trailing window of points
	// and put its value on the incoming point. A new fit is computed at each point.
	// parameters:
	// in -- input field
	// over -- trailing window to fit trend over
	// t0 -- time origin for first fitted line
	// out -- output field for projected line's value
	//
	export sub fit_trailing(in, over, t0, out) {
	put -over over __fit_tmp = fit_coeffs(in, t0)
	\| put out = value(time, __fit_tmp, in)
	\| remove __fit_tmp
	}
	//
	// subtract:
	// de-trend intervals of points by subtracting their MSE linear fit.
	// There will be a trend discontinuity at each refitting interval, so if
	// this is a concern you should use subtract_trailing
	// parameters:
	// in -- field to de-trend
	// every -- re-fitting interval
	// over -- data window for fitting (>= every)
	// t0 -- earliest time
	// out -- output field for detrended value on each point
	//
	export sub subtract(in, every, over, t0, out) {
	fit -in in -every every -over over -t0 t0 -out '__subtract_tmp'
	\| put out = in - __subtract_tmp
	\| remove __subtract_tmp
	}
	//
	// subtract_trailing:
	// de-trend intervals of points by subtracting the MSE linear fit
	// as each point arrives.
	// parameters:
	// in -- field to de-trend
	// over -- data window for fitting
	// t0 -- earliest time
	// out -- output field for detrended value on each point
	//
	export sub subtract_trailing(in, over, t0, out) {
	fit_trailing -in in -over over -t0 t0 -out '__subtract_tmp'
	\| put out = in - __subtract_tmp
	\| remove __subtract_tmp
	}
	//
	// rate:
	// robustly estimate rate of change by fitting trendlines to a short moving
	// window of points anchored at each successive point, and returning their slopes.
	// parameters:
	// in -- field to rate-estimate
	// dt -- time window for rate estimator (longer values give smoother derivatives)
	// t0 -- earliest time
	// out -- output field for rate (in/second) on each point
	//
	export sub rate(in, dt, t0, out) {
	(
	reduce -every dt -over dt -on t0 __dt_tmp=fit_coeffs(in, t0)
	// XXX: we don't have a lag operator, so use pace to send the result back in
	// time to beginning of its batch so it can be joined with the data it was fitted to.
	\| pace -from t0 -every :0s: ;
	merge ;
	)\| join
	\| put *out = isFinite(__dt_tmp[1]) ? __dt_tmp[1] : null
	\| remove __dt_tmp
	}
	//
	// change:
	// estimate the amount of change over an interval due to a linear trend over the interval.
	// parameters:
	// in -- field to rate-estimate
	// dt -- time window for rate estimator (longer values give smoother derivatives)
	// t0 -- earliest time
	// out -- output field for amount change in this interval due to linear trend.
	//
	export sub change (in, dt, t0, out) {
	rate -in in -dt dt -t0 t0 -out out
	\| put out = out * Duration.seconds(dt)
	}