Skip to content

Instantly share code, notes, and snippets.

@chenryn
Last active August 29, 2015 14:02
Show Gist options
  • Save chenryn/43315b6c7ddaf9c39aab to your computer and use it in GitHub Desktop.
Save chenryn/43315b6c7ddaf9c39aab to your computer and use it in GitHub Desktop.
port etsy skyline/algorithms.py to Perl5, and use s-w test instead of k-s test
use warnings;
use strict;
use 5.010;
#use Data::Dumper;
use PDL;
use PDL::Fit::Polynomial;
use PDL::Finance::Talib;
use Statistics::Distributions qw/tdistr/;
use Statistics::Normality qw/shapiro_wilk_test/;
use JSON;
use File::Slurp;
use constant FULL_DURATION => 86400;
use constant MAX_RESOLUTION => 3600;
#$PDL::toolongtoprint = 20000;
sub tail_avg {
my $timeseries = shift;
return $timeseries->[-1][1] if $#$timeseries < 3;
my $t =
( $timeseries->[-1][1] + $timeseries->[-2][1] + $timeseries->[-3][1] ) /
3;
return $t;
}
sub median_absolute_deviation {
my $timeseries = shift;
my $series = pdl( map { $_->[1] } @$timeseries );
my $median = $series->median();
my $demedianed = abs( $series - $median );
my $median_deviation = $demedianed->median();
return undef if $median_deviation == 0;
my $test_statistic = $demedianed->at(-1) / $median_deviation;
return 1 if $test_statistic > 6;
}
sub first_hour_average {
my $timeseries = shift;
my $last_hour_threshold = time() - ( FULL_DURATION - 3600 );
my $series = pdl( map { $_->[1] }
grep { $_->[0] < $last_hour_threshold } @$timeseries );
my $mean = ( $series->stats )[0];
my $stdDev = ( $series->stats )[6];
my $t = tail_avg($timeseries);
return abs( $t - $mean ) > 3 * $stdDev;
}
sub stddev_from_average {
my $timeseries = shift;
my $series = pdl( map { $_->[1] } @$timeseries );
my $mean = ( $series->stats )[0];
my $stdDev = ( $series->stats )[6];
my $t = tail_avg($timeseries);
return abs( $t - $mean ) > 3 * $stdDev;
}
sub least_squares {
my $timeseries = shift;
my $x = pdl( map { $_->[0] } @$timeseries );
my $y = pdl( map { $_->[1] } @$timeseries );
#my $A = $x->dummy(0)->append(1);
my ( $yfit, $coeffs ) = fitpoly1d( $x, $y, 2 );
my @errors;
for my $i ( 0 .. $y->nelem - 1 ) {
my $projected = $coeffs->index(1) * $x->index($i) + $coeffs->index(0);
my $error = $y->index($i) - $projected;
push @errors, $error;
}
return undef if $#errors < 3;
my $std_dev = ( pdl(@errors)->stats )[6];
my $t = ( $errors[-1] + $errors[-2] + $errors[-3] ) / 3;
return abs($t) > $std_dev * 3 and int($std_dev) != 0 and int($t) != 0;
}
sub histogram_bins {
my $timeseries = shift;
my $t = tail_avg($timeseries);
my $series = pdl( map { $_->[1] } @$timeseries );
# Attention: there is some different between pdl hist and numpy histogram!
my ( $bins, $h ) =
hist( $series, $series->min, $series->max, $series->nelem / 15 );
for my $index ( 0 .. $h->nelem - 1 ) {
if ( $h->index($index) <= 20 ) {
if ( $index == 0 ) {
if ( $t <= $bins->index(0) ) {
return 1;
}
}
elsif ( $t >= $bins->index($index)
and $t < $bins->index( $index + 1 ) )
{
return 1;
}
}
}
return undef;
}
sub mean_subtraction_cumulation {
my $timeseries = shift;
my $series = pdl( map { $_->[1] || 0 } @$timeseries );
my $mean = ( $series->stats )[0];
$series = $series - $mean;
my $stdDev = ( $series->stats )[6];
return abs( $series->at(-1) ) > 3 * $stdDev;
}
sub stddev_from_moving_average {
my $timeseries = shift;
my $series = pdl( map { $_->[1] } @$timeseries );
my $expAverage = ta_ema( $series, 101 );
my $stdDev = ta_stddev( $series, 101, 1 );
return abs( $series->at(-1) - $expAverage->at(-1) ) > 3 * $stdDev->at(-1);
}
sub grubbs {
my $timeseries = shift;
my $series = pdl( map { $_->[1] } @$timeseries );
my $mean = ( $series->stats )[0];
my $stdDev = ( $series->stats )[6];
my $tail_average = tail_avg($timeseries);
my $z_score = ( $tail_average - $mean ) / $stdDev;
my $len_series = $series->nelem;
my $threshold = tdistr( $len_series-2, 0.05/(2*$len_series) );
my $threshold_squared = $threshold * $threshold;
my $grubbs_score = ( ( $len_series - 1 ) / sqrt($len_series) ) *
sqrt( $threshold_squared / ( $len_series - 2 + $threshold_squared ) );
return $z_score > $grubbs_score;
}
sub sw_test {
my $timeseries = shift;
my $hour_ago = time() - 3600;
my $reference =
[ map { $_->[1] } grep { $_->[0] >= $hour_ago } @$timeseries ];
return undef unless $#$reference > 6 and $#$reference < 5000;
my ( $pval, $w_statistic ) = shapiro_wilk_test($reference);
return $pval < 0.05 ? 1 : undef;
}
sub main {
my $data = from_json( read_file('data.json') );
my $initial = time - MAX_RESOLUTION;
my $min_pass = 7;
for my $datapoint ( @{ $data->{'results'} } ) {
$datapoint->[0] = $initial;
$initial += 1;
}
my $ret = grep { $_->( $data->{'results'} ) } (
\&median_absolute_deviation, \&first_hour_average,
\&stddev_from_average, \&least_squares,
\&histogram_bins, \&mean_subtraction_cumulation,
\&stddev_from_moving_average, \&grubbs,
\&sw_test
);
say $ret > $min_pass ? "Your data don't pass test." : "OK";
}
main();
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment