Last active
August 29, 2015 14:02
-
-
Save chenryn/43315b6c7ddaf9c39aab to your computer and use it in GitHub Desktop.
port etsy skyline/algorithms.py to Perl5, and use s-w test instead of k-s test
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
use warnings; | |
use strict; | |
use 5.010; | |
#use Data::Dumper; | |
use PDL; | |
use PDL::Fit::Polynomial; | |
use PDL::Finance::Talib; | |
use Statistics::Distributions qw/tdistr/; | |
use Statistics::Normality qw/shapiro_wilk_test/; | |
use JSON; | |
use File::Slurp; | |
use constant FULL_DURATION => 86400; | |
use constant MAX_RESOLUTION => 3600; | |
#$PDL::toolongtoprint = 20000; | |
sub tail_avg { | |
my $timeseries = shift; | |
return $timeseries->[-1][1] if $#$timeseries < 3; | |
my $t = | |
( $timeseries->[-1][1] + $timeseries->[-2][1] + $timeseries->[-3][1] ) / | |
3; | |
return $t; | |
} | |
sub median_absolute_deviation { | |
my $timeseries = shift; | |
my $series = pdl( map { $_->[1] } @$timeseries ); | |
my $median = $series->median(); | |
my $demedianed = abs( $series - $median ); | |
my $median_deviation = $demedianed->median(); | |
return undef if $median_deviation == 0; | |
my $test_statistic = $demedianed->at(-1) / $median_deviation; | |
return 1 if $test_statistic > 6; | |
} | |
sub first_hour_average { | |
my $timeseries = shift; | |
my $last_hour_threshold = time() - ( FULL_DURATION - 3600 ); | |
my $series = pdl( map { $_->[1] } | |
grep { $_->[0] < $last_hour_threshold } @$timeseries ); | |
my $mean = ( $series->stats )[0]; | |
my $stdDev = ( $series->stats )[6]; | |
my $t = tail_avg($timeseries); | |
return abs( $t - $mean ) > 3 * $stdDev; | |
} | |
sub stddev_from_average { | |
my $timeseries = shift; | |
my $series = pdl( map { $_->[1] } @$timeseries ); | |
my $mean = ( $series->stats )[0]; | |
my $stdDev = ( $series->stats )[6]; | |
my $t = tail_avg($timeseries); | |
return abs( $t - $mean ) > 3 * $stdDev; | |
} | |
sub least_squares { | |
my $timeseries = shift; | |
my $x = pdl( map { $_->[0] } @$timeseries ); | |
my $y = pdl( map { $_->[1] } @$timeseries ); | |
#my $A = $x->dummy(0)->append(1); | |
my ( $yfit, $coeffs ) = fitpoly1d( $x, $y, 2 ); | |
my @errors; | |
for my $i ( 0 .. $y->nelem - 1 ) { | |
my $projected = $coeffs->index(1) * $x->index($i) + $coeffs->index(0); | |
my $error = $y->index($i) - $projected; | |
push @errors, $error; | |
} | |
return undef if $#errors < 3; | |
my $std_dev = ( pdl(@errors)->stats )[6]; | |
my $t = ( $errors[-1] + $errors[-2] + $errors[-3] ) / 3; | |
return abs($t) > $std_dev * 3 and int($std_dev) != 0 and int($t) != 0; | |
} | |
sub histogram_bins { | |
my $timeseries = shift; | |
my $t = tail_avg($timeseries); | |
my $series = pdl( map { $_->[1] } @$timeseries ); | |
# Attention: there is some different between pdl hist and numpy histogram! | |
my ( $bins, $h ) = | |
hist( $series, $series->min, $series->max, $series->nelem / 15 ); | |
for my $index ( 0 .. $h->nelem - 1 ) { | |
if ( $h->index($index) <= 20 ) { | |
if ( $index == 0 ) { | |
if ( $t <= $bins->index(0) ) { | |
return 1; | |
} | |
} | |
elsif ( $t >= $bins->index($index) | |
and $t < $bins->index( $index + 1 ) ) | |
{ | |
return 1; | |
} | |
} | |
} | |
return undef; | |
} | |
sub mean_subtraction_cumulation { | |
my $timeseries = shift; | |
my $series = pdl( map { $_->[1] || 0 } @$timeseries ); | |
my $mean = ( $series->stats )[0]; | |
$series = $series - $mean; | |
my $stdDev = ( $series->stats )[6]; | |
return abs( $series->at(-1) ) > 3 * $stdDev; | |
} | |
sub stddev_from_moving_average { | |
my $timeseries = shift; | |
my $series = pdl( map { $_->[1] } @$timeseries ); | |
my $expAverage = ta_ema( $series, 101 ); | |
my $stdDev = ta_stddev( $series, 101, 1 ); | |
return abs( $series->at(-1) - $expAverage->at(-1) ) > 3 * $stdDev->at(-1); | |
} | |
sub grubbs { | |
my $timeseries = shift; | |
my $series = pdl( map { $_->[1] } @$timeseries ); | |
my $mean = ( $series->stats )[0]; | |
my $stdDev = ( $series->stats )[6]; | |
my $tail_average = tail_avg($timeseries); | |
my $z_score = ( $tail_average - $mean ) / $stdDev; | |
my $len_series = $series->nelem; | |
my $threshold = tdistr( $len_series-2, 0.05/(2*$len_series) ); | |
my $threshold_squared = $threshold * $threshold; | |
my $grubbs_score = ( ( $len_series - 1 ) / sqrt($len_series) ) * | |
sqrt( $threshold_squared / ( $len_series - 2 + $threshold_squared ) ); | |
return $z_score > $grubbs_score; | |
} | |
sub sw_test { | |
my $timeseries = shift; | |
my $hour_ago = time() - 3600; | |
my $reference = | |
[ map { $_->[1] } grep { $_->[0] >= $hour_ago } @$timeseries ]; | |
return undef unless $#$reference > 6 and $#$reference < 5000; | |
my ( $pval, $w_statistic ) = shapiro_wilk_test($reference); | |
return $pval < 0.05 ? 1 : undef; | |
} | |
sub main { | |
my $data = from_json( read_file('data.json') ); | |
my $initial = time - MAX_RESOLUTION; | |
my $min_pass = 7; | |
for my $datapoint ( @{ $data->{'results'} } ) { | |
$datapoint->[0] = $initial; | |
$initial += 1; | |
} | |
my $ret = grep { $_->( $data->{'results'} ) } ( | |
\&median_absolute_deviation, \&first_hour_average, | |
\&stddev_from_average, \&least_squares, | |
\&histogram_bins, \&mean_subtraction_cumulation, | |
\&stddev_from_moving_average, \&grubbs, | |
\&sw_test | |
); | |
say $ret > $min_pass ? "Your data don't pass test." : "OK"; | |
} | |
main(); |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment