Last active
August 29, 2015 14:08
-
-
Save nlitsme/ad898d46b0f24111e0be to your computer and use it in GitHub Desktop.
calculate least squares approximations of input data for several expressions
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!perl -w | |
# vim: sw=4 ts=4 et : | |
# (C) 2003-2007 Willem Jan Hengeveld <[email protected]> | |
# Web: http://www.xs4all.nl/~itsme/ | |
# http://wiki.xda-developers.com/ | |
# | |
# $Id: $ | |
# | |
# http://en.wikipedia.org/wiki/Linear_regression | |
# http://mathworld.wolfram.com/LeastSquaresFitting.html | |
# http://mathworld.wolfram.com/LeastSquaresFittingPolynomial.html | |
# http://mathworld.wolfram.com/LeastSquaresFittingExponential.html | |
# http://mathworld.wolfram.com/LeastSquaresFittingPerpendicularOffsets.html | |
# http://mathworld.wolfram.com/NonlinearLeastSquaresFitting.html | |
# | |
# tests: | |
# perl -e 'printf("%5.2f %5.2f\n", $_, 0.4*exp(0.2*$_)) for 1..100' | lsq | sort -n | |
# perl -e 'printf("%5.2f %5.2f\n", $_, 0.4+0.2*log($_)) for 1..100' | lsq | sort -n | |
# perl -e 'printf("%5.2f %5.2f\n", $_, 0.4*$_**0.2) for 1..100' | lsq | sort -n | |
# perl -e 'printf("%5.2f %5.2f\n", $_, 0.4+0.2*($_**2)) for 1..100' | lsq | sort -n | |
# perl -e 'printf("%5.2f %5.2f\n", $_, 0.4+0.2*($_**5)) for 1..100' | lsq | sort -n | |
# perl -e 'printf("%5.2f %5.2f\n", $_, 0.4+0.2*($_**7)) for 1..100' | lsq | sort -n | |
# | |
# todo: add fft for periodic signals | |
# done: add option to output data+functions with gnuplot | |
# todo: read about non-linear least squares Marquardt-Levenberg algorithm | |
# ........ how to get proper timestamps: | |
# .... also added date+time as first 2 colums | |
# set xdata time | |
# set timefmt "%Y-%m-%d %H:%M:%S" | |
# set format x "%H:%M" | |
# plot '/tmp/test.dat' using 1:4, '' using 1:(328677.832933346+$3*(-0.000841587261071837)) title '1st order poly' smooth unique | |
use strict; | |
use warnings; | |
use Getopt::Long; | |
use Math::MatrixReal; | |
#use File::Temp; | |
use IO::File; | |
use POSIX; | |
$|=1; | |
# simple case: a*x+b=y | |
# given many pairs (x,y) find a and b that minimize the error. | |
# | |
# -> a*x0+b=y0 | |
# -> a*x1+b=y1 | |
# -> a*x2+b=y2 | |
# | |
# -> equation | |
# ( x0 1 ) ( a ) ( y0 ) | |
# ( x1 1 ) *( b ) = ( y1 ) | |
# ( x2 1 ) ( y2 ) | |
# | |
# A * x = y | |
# | |
# minimize distance to line: | |
# d(line,[xy]) = |A*x-y| | |
# | |
# error : |A*x-y| = (A*x-y)' * (A*x-y) | |
# = x'*A'*A*x - 2*x*A'*y + y'*y | |
# differentiate by x | |
# -> 2*A'*A*x-2*A'*y = 0 -> A'*A*x=A'*y | |
# mini | |
# 'm' measurements | |
# 'p' variables | |
# | |
# y_0=a0*x_0^0+a1*x_0^1+a2*x_0^2+..an*x_0^n | |
# y_1=a0*x_1^0+a1*x_1^1+a2*x_1^2+..an*x_1^n | |
# .. | |
# y_m=a0*x_m^0+am*x_m^1+a2*x_m^2+..an*x_m^n | |
# -> | |
# | |
# (y_0 .. y_m) = Matrix_rows( [x_0^0, x_0^1, .. x_0^n], [x_1^0 .. x_1^n] ... ) * (a0..an) | |
# y=sum(a_i*x^i) find_lsq_poly | |
# y=a*exp(b*x) find_lsq_exponential | |
# y=a+b*ln(x) find_lsq_logarithmic | |
# y=a*x^b find_lsq_powerlaw | |
# done: add support for timestamps | |
# .. use parsedate to identify timestamps. | |
# timestamps can be with, or without date. | |
# done: add support for calc yresult for xval | |
# done: add support for calc xval for targety | |
# .. if list of values starting at col0 follows list, assume that 'y' needs to be calculated | |
# .. if list of values starting with TAB follows list, assum that 'x' needs to be calculated. | |
my ($xcol, $ycol)=(0,1); | |
my $calclog=1; | |
my $maxorder=10; | |
my $minorder=0; | |
my $xinside; | |
my $gnuplot; | |
GetOptions( | |
"x=s" => \$xcol, | |
"y=s" => \$ycol, | |
"o=i" => sub { $maxorder= $_[1]+1; $minorder=$_[1]; $calclog=0; }, | |
"f" => \$xinside, | |
"g" => \$gnuplot, | |
) or die usage(); | |
sub usage { | |
return <<__EOF__ | |
Usage: lsq [-x COL] [-y COL] [-o N] | |
-x COL : specify 'x' column ( first col = 0 ) | |
-y COL : specify 'y' column | |
-o N : specify order of polynomial | |
-f : try to move factor inside exponent (a*x)^n | |
-g : output gnuplot data | |
input data can contain date's + time's + 0xhexdump + a+b*c expressions | |
for lines starting with TAB, or ? the corresponding 'x' will be calculated | |
for lines ending with ? or which have only one value the corresponding 'y' | |
will be calculated | |
__EOF__ | |
} | |
my @values; | |
my @xvalues; | |
my @yvalues; | |
my @valueformat; # -3:date+time, -2:date, -1:time, >=0: decimals | |
while (<>) { | |
next if /^#/; | |
my @line= split /[^0-9x.e:-]+/, $_; | |
next unless @line; | |
shift @line if $line[0] eq ""; | |
for (my $i=0 ; $i<@line ; $i++) { | |
if ($i<$#line && isdate($line[$i]) && istime($line[$i+1])) { | |
$line[$i]= maketimestamp($line[$i], $line[$i+1]); | |
$valueformat[$i]= -3; | |
splice @line, $i+1,1; | |
} | |
elsif (isdate($line[$i])) { | |
$line[$i]= maketimestamp($line[$i], 0); | |
$valueformat[$i]= -2; | |
} | |
elsif (istime($line[$i])) { | |
$line[$i]= maketimestamp(0, $line[$i]); | |
$valueformat[$i]= -1; | |
} | |
elsif (isexpr($line[$i])) { | |
$line[$i]= eval($line[$i]); | |
} | |
elsif ($line[$i] =~ /\.(\d+)/ && ( !defined $valueformat[$i] || length($1)>$valueformat[$i])) { | |
$valueformat[$i]= length($1); | |
} | |
} | |
if (defined $line[$xcol] && defined $line[$ycol]) { | |
if ($line[$xcol] ne "?" && $line[$ycol] ne "?") { | |
push @values, [$line[$xcol], $line[$ycol] ]; | |
} | |
elsif ($line[$xcol] ne "?") { | |
push @xvalues, $line[$xcol]; | |
} | |
elsif ($line[$ycol] ne "?") { | |
push @yvalues, $line[$ycol]; | |
} | |
} | |
elsif (@line) { | |
if (/^\t/) { | |
push @yvalues, $line[0]; | |
} | |
else { | |
push @xvalues, $line[0]; | |
} | |
} | |
} | |
sub isexpr { | |
my $number= "(?:0x[0-9a-f]+|0[0-7]+|[1-9][0-9]*)"; | |
return $_[0] =~ /^$number(?:[+*\/&|^-]$number)*$/; | |
} | |
sub isdate { | |
return ($_[0] =~ /\d-\d+-\d/); | |
} | |
sub istime { | |
return ($_[0] =~ /\d:\d/); | |
} | |
sub maketimestamp { | |
my ($date, $time) = @_; | |
my @ymd= $date ? split(/-/, $date) : (0,0,0); | |
if ($ymd[0]<100) { | |
# y2k fix! | |
if ($ymd[0]<50) { | |
$ymd[0]+=2000; | |
} | |
else { | |
$ymd[0]+=1900; | |
} | |
} | |
my @hms= $time ? split(/:/, $time) : (0,0,0); | |
push @hms,0 if (@hms==2); | |
$ymd[0]-=1900; | |
$ymd[1]-=1; | |
return POSIX::mktime(reverse @ymd,@hms)-($gnuplot ? 946684800 : 0); | |
} | |
my @errs; # error for expr | |
my @exprs; # printable expressions | |
my @funcs; # perl functions | |
my @gplot; # gnuplot expressions | |
my @title; # titles for gnuplot | |
# first try polynomials | |
for (my $order=$minorder ; $order < $maxorder ; $order++) { | |
my ($poly, $error)= find_lsq_poly($order, \@values); | |
next if (!defined $poly); | |
if ($gnuplot) { | |
} | |
if ($xinside) { | |
push @errs, $error; | |
push @exprs, join("", map { $_==0 ? sprintf("%5g", $poly->[$_]) : | |
$_==1 ? sprintf("%+5g*x", $poly->[$_]) : | |
sprintf("%s(%5g*x)^%d", $poly->[$_]<0?"-":"+", | |
abs($poly->[$_])**(1/$_), $_) } (0..$#$poly)); | |
} | |
else { | |
push @errs, $error; | |
push @exprs, join("", map { $_==0 ? sprintf("%5g", $poly->[$_]) : | |
$_==1 ? sprintf("%+5g*x", $poly->[$_]) : | |
sprintf("%+5g*x^%d", $poly->[$_], $_) } (0..$#$poly)); | |
} | |
push @funcs, sub { my $res=0; for (my $i=$#$poly ; $i>=0 ; $i--) { $res *= $_[0]; $res += $poly->[$i]; } return $res; }; | |
#p0+x*(p1+x*(p2+x*p3)) | |
push @gplot, join("+\$1*(", @$poly).(')'x(@$poly-1)); | |
push @title, countword($order)." order poly"; | |
} | |
sub countword { | |
return "1st" if ($_[0]==1); | |
return "2nd" if ($_[0]==2); | |
return "3rd" if ($_[0]==3); | |
return "$_[0]th"; | |
} | |
if ($calclog) { | |
{ | |
my ($ab, $err)= find_lsq_exponential(\@values); | |
if ($ab) { | |
push @errs, $err; | |
push @exprs, sprintf("%5g * exp( %5g * x )", $ab->[0], $ab->[1]); | |
push @funcs, sub { return $ab->[0]*exp($_[0]*$ab->[1]); }; | |
push @gplot, sprintf("%.10g*exp(\$1*%.10g)", $ab->[0], $ab->[1]); | |
push @title, "a+exp(b*x)"; | |
} | |
} | |
{ | |
my ($ab, $err)= find_lsq_logarithmic(\@values); | |
if ($ab) { | |
# a + b * l(x) == a + l(x^b) == l(x^b*e^a) | |
push @errs, $err; | |
push @exprs, sprintf("%5g + %5g * log( x )", $ab->[0], $ab->[1]); | |
push @funcs, sub { if ($_[0]<=0) { return undef; } return $ab->[0]+log($_[0])*$ab->[1]; }; | |
push @gplot, sprintf("%.10g + %.10g * log( \$1 )", $ab->[0], $ab->[1]); | |
push @title, "a+b*log(x)"; | |
} | |
} | |
{ | |
my ($ab, $err)= find_lsq_powerlaw(\@values); | |
if ($ab) { | |
if ($xinside) { | |
push @errs, $err; | |
push @exprs, sprintf("%s( %5g * x ) ^ %5g", | |
$ab->[0]<0?"-":"+", | |
abs($ab->[0])**(1/$ab->[1]), $ab->[1]); | |
} | |
else { | |
push @errs, $err; | |
push @exprs, sprintf("%5g * x ^ %5g", $ab->[0], $ab->[1]); | |
} | |
push @funcs, sub { return $ab->[0]*$_[0]**$ab->[1]; }; | |
push @gplot, sprintf("%.10g * exp(log(\$1)*%.10g)", $ab->[0], $ab->[1]); | |
push @title, "a*x^b"; | |
} | |
} | |
} | |
sub makedatafile { | |
my ($values)= @_; | |
#my $fh= File::Temp->new(); | |
my $fh= IO::File->new("/tmp/test.dat", "w"); | |
for (@$values) { | |
$fh->printf("%.10g %.10g\n", @$_); | |
} | |
#return $fh->filename; | |
return "/tmp/test.dat"; | |
} | |
if ($gnuplot) { | |
my $tmpdat= makedatafile(\@values); | |
printf("plot '%s' using 1:2, %s\n", $tmpdat, join(", ", map { sprintf("'' using 1:(%s) title '%s' smooth unique", $gplot[$_], $title[$_]) } 0..$#gplot)); | |
} | |
else { | |
print map { sprintf("%10f %s\n", $errs[$_], $exprs[$_]) } 0..$#exprs; | |
} | |
for my $xval (@xvalues) { | |
printf(" %s", formatvalue($xval, $valueformat[$xcol])); | |
for my $f (@funcs) { | |
printf(" %s", formatvalue($f->($xval), $valueformat[$ycol])); | |
} | |
print "\n"; | |
} | |
for my $yval (@yvalues) { | |
printf(" %s", formatvalue($yval, $valueformat[$ycol]));; | |
for my $f (@funcs) { | |
my $xval= findinverse($f, $yval); | |
printf(" %s", formatvalue($xval, $valueformat[$xcol]));; | |
} | |
print "\n"; | |
} | |
sub formatvalue { | |
my ($val, $ts)= @_; | |
$ts ||= 0; | |
if (!defined $val) { | |
return "undef"; | |
} | |
if ($ts>=0) { | |
return sprintf("%.*f", $ts, $val); | |
} | |
else { | |
my @tv= localtime $val; | |
$tv[4]+=1; | |
$tv[5]+=1900; | |
if ($ts==-1) { | |
return sprintf("%02d:%02d:%02d", reverse @tv[0..2]); | |
} | |
elsif ($ts==-2) { | |
return sprintf("%04d-%02d-%02d", reverse @tv[3..5]); | |
} | |
elsif ($ts==-3) { | |
return sprintf("%04d-%02d-%02d %02d:%02d:%02d", reverse @tv[0..5]); | |
} | |
} | |
} | |
sub findslope { | |
my ($f, $yval, $x)= @_; | |
my $x0= $x; | |
my $y0= $f->($x0)-$yval; | |
my $x0d= $x0+0.0001; | |
my $y0d= eval { $f->($x0d)-$yval; }; | |
if (!defined $y0d || $@) { | |
$x0d= $x0-0.0001; | |
$y0d= eval { $f->($x0d)-$yval; }; | |
if (!defined $y0d || $@) { | |
return undef; | |
} | |
} | |
if ($x0==$x0d) { | |
return 999999999999; | |
} | |
return ($y0d-$y0)/($x0d-$x0); | |
} | |
sub findinverse { | |
my ($f, $yval)= @_; | |
# todo: calc epsilon based on min/max x-value. | |
my $x0= $values[0][0]; | |
my $iter=0; | |
while (1) { | |
my $s= findslope($f, $yval, $x0); | |
if (!defined $s || $s==0) { | |
return undef; | |
} | |
my $y0= $f->($x0)-$yval; | |
# y0+(x-x0)*s = 0 -> y0+x*s-x0*s -> x=x0-y0/s | |
my $x1= $x0-$y0/$s; | |
last if abs($x1-$x0)<0.0001; | |
last if ($iter++>100); | |
$x0= $x1; | |
} | |
return $x0; | |
} | |
sub find_lsq_poly { | |
my ($n, $values)= @_; | |
# (y_0 .. y_m) = Matrix_rows( [x_0^0, x_0^1, .. x_0^n], [x_1^0 .. x_1^n] ... ) * (a0..an) | |
if ( $#$values-$n-1 <= 0 ) { | |
return undef; | |
} | |
my @a; | |
my @b; | |
for (my $row=0 ; $row<=$#$values ; $row++) { | |
push @b, $values->[$row][1]; | |
for (my $col=0 ; $col<=$n ; $col++) { | |
$a[$row][$col]= $values->[$row][0]**$col; | |
} | |
} | |
return solve_lsq(\@a, \@b); | |
} | |
# y=a*exp(b*x) -> solve ln(y) = ln(a) + b*x | |
# ( 1 x0 ) ( ln(a) ) ( ln(y0) ) | |
# ( 1 x1 ) * ( b ) = ( ln(y1) ) | |
# ( 1 x2 ) ( ln(y2) ) | |
# | |
# todo: how to calculate err = y-a*exp(b*x) | |
# given |ln(y)-a-b*x| | |
sub find_lsq_exponential { | |
my ($values)= @_; | |
my @a; | |
my @b; | |
for (my $row=0 ; $row<=$#$values ; $row++) { | |
return undef unless $values->[$row][1] > 0; | |
push @b, log($values->[$row][1]); | |
$a[$row][0]= 1; | |
$a[$row][1]= $values->[$row][0]; | |
} | |
my ($ab, $err)= solve_lsq(\@a, \@b); | |
return undef if (!defined $ab); | |
# my $err2=0; | |
# $err2 += ($_->[1]-exp($ab->[0])*exp($ab->[1]*$_->[0]))**2 for @$values; | |
# printf("err1 : %f err2 : %f\n", $err, sqrt($err2/($#$values-3))); | |
return [ exp($ab->[0]), $ab->[1] ], $err; | |
} | |
# y=a+b*ln(x) | |
sub find_lsq_logarithmic { | |
my ($values)= @_; | |
my @a; | |
my @b; | |
for (my $row=0 ; $row<=$#$values ; $row++) { | |
return undef unless $values->[$row][0] > 0; | |
push @b, $values->[$row][1]; | |
$a[$row][0]= 1; | |
$a[$row][1]= log($values->[$row][0]); | |
} | |
return solve_lsq(\@a, \@b); | |
} | |
# y=a*x^b -> ln(y) = ln(a) + b*ln(x) | |
sub find_lsq_powerlaw { | |
my ($values)= @_; | |
my @a; | |
my @b; | |
for (my $row=0 ; $row<=$#$values ; $row++) { | |
return undef unless $values->[$row][1] > 0; | |
push @b, log($values->[$row][1]); | |
$a[$row][0]= 1; | |
$a[$row][1]= log($values->[$row][0]); | |
} | |
my ($ab, $err)= solve_lsq(\@a, \@b); | |
return undef if (!defined $ab); | |
return [ exp($ab->[0]), $ab->[1] ], $err; | |
} | |
sub solve_lsq { | |
my ($m_a, $v_b)= @_; | |
return undef if (@{$m_a}-@{$m_a->[0]}-1<=0); | |
my $A= Math::MatrixReal->new_from_rows($m_a); | |
my $b= Math::MatrixReal->new_from_columns([$v_b]); | |
my $Ab= (~$A) * $b; | |
my $AA = (~$A) * $A; | |
my $LR= $AA->decompose_LR(); | |
my ($dim,$x,$B) = $LR->solve_LR($Ab); | |
if (!defined $x) { | |
printf("NO SOLUTION!\n"); | |
return undef; | |
} | |
my $Axb= $A*$x-$b; | |
my $error= (~$Axb)*$Axb; | |
#printf("a : %d x %d - b : %d err:%f\n", scalar @$m_a, scalar @{$m_a->[0]}, scalar @$v_b, sqrt($error->element(1,1)/(@{$m_a}-@{$m_a->[0]}-1))); | |
return [ map { $x->element($_,1) } (1..@{$m_a->[0]}) ], sqrt($error->element(1,1)/(@{$m_a}-@{$m_a->[0]}-1)); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment