Skip to content

Instantly share code, notes, and snippets.

@nlitsme
Last active August 29, 2015 14:08
Show Gist options
  • Save nlitsme/ad898d46b0f24111e0be to your computer and use it in GitHub Desktop.
Save nlitsme/ad898d46b0f24111e0be to your computer and use it in GitHub Desktop.
calculate least squares approximations of input data for several expressions
#!perl -w
# vim: sw=4 ts=4 et :
# (C) 2003-2007 Willem Jan Hengeveld <[email protected]>
# Web: http://www.xs4all.nl/~itsme/
# http://wiki.xda-developers.com/
#
# $Id: $
#
# http://en.wikipedia.org/wiki/Linear_regression
# http://mathworld.wolfram.com/LeastSquaresFitting.html
# http://mathworld.wolfram.com/LeastSquaresFittingPolynomial.html
# http://mathworld.wolfram.com/LeastSquaresFittingExponential.html
# http://mathworld.wolfram.com/LeastSquaresFittingPerpendicularOffsets.html
# http://mathworld.wolfram.com/NonlinearLeastSquaresFitting.html
#
# tests:
# perl -e 'printf("%5.2f %5.2f\n", $_, 0.4*exp(0.2*$_)) for 1..100' | lsq | sort -n
# perl -e 'printf("%5.2f %5.2f\n", $_, 0.4+0.2*log($_)) for 1..100' | lsq | sort -n
# perl -e 'printf("%5.2f %5.2f\n", $_, 0.4*$_**0.2) for 1..100' | lsq | sort -n
# perl -e 'printf("%5.2f %5.2f\n", $_, 0.4+0.2*($_**2)) for 1..100' | lsq | sort -n
# perl -e 'printf("%5.2f %5.2f\n", $_, 0.4+0.2*($_**5)) for 1..100' | lsq | sort -n
# perl -e 'printf("%5.2f %5.2f\n", $_, 0.4+0.2*($_**7)) for 1..100' | lsq | sort -n
#
# todo: add fft for periodic signals
# done: add option to output data+functions with gnuplot
# todo: read about non-linear least squares Marquardt-Levenberg algorithm
# ........ how to get proper timestamps:
# .... also added date+time as first 2 colums
# set xdata time
# set timefmt "%Y-%m-%d %H:%M:%S"
# set format x "%H:%M"
# plot '/tmp/test.dat' using 1:4, '' using 1:(328677.832933346+$3*(-0.000841587261071837)) title '1st order poly' smooth unique
use strict;
use warnings;
use Getopt::Long;
use Math::MatrixReal;
#use File::Temp;
use IO::File;
use POSIX;
$|=1;
# simple case: a*x+b=y
# given many pairs (x,y) find a and b that minimize the error.
#
# -> a*x0+b=y0
# -> a*x1+b=y1
# -> a*x2+b=y2
#
# -> equation
# ( x0 1 ) ( a ) ( y0 )
# ( x1 1 ) *( b ) = ( y1 )
# ( x2 1 ) ( y2 )
#
# A * x = y
#
# minimize distance to line:
# d(line,[xy]) = |A*x-y|
#
# error : |A*x-y| = (A*x-y)' * (A*x-y)
# = x'*A'*A*x - 2*x*A'*y + y'*y
# differentiate by x
# -> 2*A'*A*x-2*A'*y = 0 -> A'*A*x=A'*y
# mini
# 'm' measurements
# 'p' variables
#
# y_0=a0*x_0^0+a1*x_0^1+a2*x_0^2+..an*x_0^n
# y_1=a0*x_1^0+a1*x_1^1+a2*x_1^2+..an*x_1^n
# ..
# y_m=a0*x_m^0+am*x_m^1+a2*x_m^2+..an*x_m^n
# ->
#
# (y_0 .. y_m) = Matrix_rows( [x_0^0, x_0^1, .. x_0^n], [x_1^0 .. x_1^n] ... ) * (a0..an)
# y=sum(a_i*x^i) find_lsq_poly
# y=a*exp(b*x) find_lsq_exponential
# y=a+b*ln(x) find_lsq_logarithmic
# y=a*x^b find_lsq_powerlaw
# done: add support for timestamps
# .. use parsedate to identify timestamps.
# timestamps can be with, or without date.
# done: add support for calc yresult for xval
# done: add support for calc xval for targety
# .. if list of values starting at col0 follows list, assume that 'y' needs to be calculated
# .. if list of values starting with TAB follows list, assum that 'x' needs to be calculated.
my ($xcol, $ycol)=(0,1);
my $calclog=1;
my $maxorder=10;
my $minorder=0;
my $xinside;
my $gnuplot;
GetOptions(
"x=s" => \$xcol,
"y=s" => \$ycol,
"o=i" => sub { $maxorder= $_[1]+1; $minorder=$_[1]; $calclog=0; },
"f" => \$xinside,
"g" => \$gnuplot,
) or die usage();
sub usage {
return <<__EOF__
Usage: lsq [-x COL] [-y COL] [-o N]
-x COL : specify 'x' column ( first col = 0 )
-y COL : specify 'y' column
-o N : specify order of polynomial
-f : try to move factor inside exponent (a*x)^n
-g : output gnuplot data
input data can contain date's + time's + 0xhexdump + a+b*c expressions
for lines starting with TAB, or ? the corresponding 'x' will be calculated
for lines ending with ? or which have only one value the corresponding 'y'
will be calculated
__EOF__
}
my @values;
my @xvalues;
my @yvalues;
my @valueformat; # -3:date+time, -2:date, -1:time, >=0: decimals
while (<>) {
next if /^#/;
my @line= split /[^0-9x.e:-]+/, $_;
next unless @line;
shift @line if $line[0] eq "";
for (my $i=0 ; $i<@line ; $i++) {
if ($i<$#line && isdate($line[$i]) && istime($line[$i+1])) {
$line[$i]= maketimestamp($line[$i], $line[$i+1]);
$valueformat[$i]= -3;
splice @line, $i+1,1;
}
elsif (isdate($line[$i])) {
$line[$i]= maketimestamp($line[$i], 0);
$valueformat[$i]= -2;
}
elsif (istime($line[$i])) {
$line[$i]= maketimestamp(0, $line[$i]);
$valueformat[$i]= -1;
}
elsif (isexpr($line[$i])) {
$line[$i]= eval($line[$i]);
}
elsif ($line[$i] =~ /\.(\d+)/ && ( !defined $valueformat[$i] || length($1)>$valueformat[$i])) {
$valueformat[$i]= length($1);
}
}
if (defined $line[$xcol] && defined $line[$ycol]) {
if ($line[$xcol] ne "?" && $line[$ycol] ne "?") {
push @values, [$line[$xcol], $line[$ycol] ];
}
elsif ($line[$xcol] ne "?") {
push @xvalues, $line[$xcol];
}
elsif ($line[$ycol] ne "?") {
push @yvalues, $line[$ycol];
}
}
elsif (@line) {
if (/^\t/) {
push @yvalues, $line[0];
}
else {
push @xvalues, $line[0];
}
}
}
sub isexpr {
my $number= "(?:0x[0-9a-f]+|0[0-7]+|[1-9][0-9]*)";
return $_[0] =~ /^$number(?:[+*\/&|^-]$number)*$/;
}
sub isdate {
return ($_[0] =~ /\d-\d+-\d/);
}
sub istime {
return ($_[0] =~ /\d:\d/);
}
sub maketimestamp {
my ($date, $time) = @_;
my @ymd= $date ? split(/-/, $date) : (0,0,0);
if ($ymd[0]<100) {
# y2k fix!
if ($ymd[0]<50) {
$ymd[0]+=2000;
}
else {
$ymd[0]+=1900;
}
}
my @hms= $time ? split(/:/, $time) : (0,0,0);
push @hms,0 if (@hms==2);
$ymd[0]-=1900;
$ymd[1]-=1;
return POSIX::mktime(reverse @ymd,@hms)-($gnuplot ? 946684800 : 0);
}
my @errs; # error for expr
my @exprs; # printable expressions
my @funcs; # perl functions
my @gplot; # gnuplot expressions
my @title; # titles for gnuplot
# first try polynomials
for (my $order=$minorder ; $order < $maxorder ; $order++) {
my ($poly, $error)= find_lsq_poly($order, \@values);
next if (!defined $poly);
if ($gnuplot) {
}
if ($xinside) {
push @errs, $error;
push @exprs, join("", map { $_==0 ? sprintf("%5g", $poly->[$_]) :
$_==1 ? sprintf("%+5g*x", $poly->[$_]) :
sprintf("%s(%5g*x)^%d", $poly->[$_]<0?"-":"+",
abs($poly->[$_])**(1/$_), $_) } (0..$#$poly));
}
else {
push @errs, $error;
push @exprs, join("", map { $_==0 ? sprintf("%5g", $poly->[$_]) :
$_==1 ? sprintf("%+5g*x", $poly->[$_]) :
sprintf("%+5g*x^%d", $poly->[$_], $_) } (0..$#$poly));
}
push @funcs, sub { my $res=0; for (my $i=$#$poly ; $i>=0 ; $i--) { $res *= $_[0]; $res += $poly->[$i]; } return $res; };
#p0+x*(p1+x*(p2+x*p3))
push @gplot, join("+\$1*(", @$poly).(')'x(@$poly-1));
push @title, countword($order)." order poly";
}
sub countword {
return "1st" if ($_[0]==1);
return "2nd" if ($_[0]==2);
return "3rd" if ($_[0]==3);
return "$_[0]th";
}
if ($calclog) {
{
my ($ab, $err)= find_lsq_exponential(\@values);
if ($ab) {
push @errs, $err;
push @exprs, sprintf("%5g * exp( %5g * x )", $ab->[0], $ab->[1]);
push @funcs, sub { return $ab->[0]*exp($_[0]*$ab->[1]); };
push @gplot, sprintf("%.10g*exp(\$1*%.10g)", $ab->[0], $ab->[1]);
push @title, "a+exp(b*x)";
}
}
{
my ($ab, $err)= find_lsq_logarithmic(\@values);
if ($ab) {
# a + b * l(x) == a + l(x^b) == l(x^b*e^a)
push @errs, $err;
push @exprs, sprintf("%5g + %5g * log( x )", $ab->[0], $ab->[1]);
push @funcs, sub { if ($_[0]<=0) { return undef; } return $ab->[0]+log($_[0])*$ab->[1]; };
push @gplot, sprintf("%.10g + %.10g * log( \$1 )", $ab->[0], $ab->[1]);
push @title, "a+b*log(x)";
}
}
{
my ($ab, $err)= find_lsq_powerlaw(\@values);
if ($ab) {
if ($xinside) {
push @errs, $err;
push @exprs, sprintf("%s( %5g * x ) ^ %5g",
$ab->[0]<0?"-":"+",
abs($ab->[0])**(1/$ab->[1]), $ab->[1]);
}
else {
push @errs, $err;
push @exprs, sprintf("%5g * x ^ %5g", $ab->[0], $ab->[1]);
}
push @funcs, sub { return $ab->[0]*$_[0]**$ab->[1]; };
push @gplot, sprintf("%.10g * exp(log(\$1)*%.10g)", $ab->[0], $ab->[1]);
push @title, "a*x^b";
}
}
}
sub makedatafile {
my ($values)= @_;
#my $fh= File::Temp->new();
my $fh= IO::File->new("/tmp/test.dat", "w");
for (@$values) {
$fh->printf("%.10g %.10g\n", @$_);
}
#return $fh->filename;
return "/tmp/test.dat";
}
if ($gnuplot) {
my $tmpdat= makedatafile(\@values);
printf("plot '%s' using 1:2, %s\n", $tmpdat, join(", ", map { sprintf("'' using 1:(%s) title '%s' smooth unique", $gplot[$_], $title[$_]) } 0..$#gplot));
}
else {
print map { sprintf("%10f %s\n", $errs[$_], $exprs[$_]) } 0..$#exprs;
}
for my $xval (@xvalues) {
printf(" %s", formatvalue($xval, $valueformat[$xcol]));
for my $f (@funcs) {
printf(" %s", formatvalue($f->($xval), $valueformat[$ycol]));
}
print "\n";
}
for my $yval (@yvalues) {
printf(" %s", formatvalue($yval, $valueformat[$ycol]));;
for my $f (@funcs) {
my $xval= findinverse($f, $yval);
printf(" %s", formatvalue($xval, $valueformat[$xcol]));;
}
print "\n";
}
sub formatvalue {
my ($val, $ts)= @_;
$ts ||= 0;
if (!defined $val) {
return "undef";
}
if ($ts>=0) {
return sprintf("%.*f", $ts, $val);
}
else {
my @tv= localtime $val;
$tv[4]+=1;
$tv[5]+=1900;
if ($ts==-1) {
return sprintf("%02d:%02d:%02d", reverse @tv[0..2]);
}
elsif ($ts==-2) {
return sprintf("%04d-%02d-%02d", reverse @tv[3..5]);
}
elsif ($ts==-3) {
return sprintf("%04d-%02d-%02d %02d:%02d:%02d", reverse @tv[0..5]);
}
}
}
sub findslope {
my ($f, $yval, $x)= @_;
my $x0= $x;
my $y0= $f->($x0)-$yval;
my $x0d= $x0+0.0001;
my $y0d= eval { $f->($x0d)-$yval; };
if (!defined $y0d || $@) {
$x0d= $x0-0.0001;
$y0d= eval { $f->($x0d)-$yval; };
if (!defined $y0d || $@) {
return undef;
}
}
if ($x0==$x0d) {
return 999999999999;
}
return ($y0d-$y0)/($x0d-$x0);
}
sub findinverse {
my ($f, $yval)= @_;
# todo: calc epsilon based on min/max x-value.
my $x0= $values[0][0];
my $iter=0;
while (1) {
my $s= findslope($f, $yval, $x0);
if (!defined $s || $s==0) {
return undef;
}
my $y0= $f->($x0)-$yval;
# y0+(x-x0)*s = 0 -> y0+x*s-x0*s -> x=x0-y0/s
my $x1= $x0-$y0/$s;
last if abs($x1-$x0)<0.0001;
last if ($iter++>100);
$x0= $x1;
}
return $x0;
}
sub find_lsq_poly {
my ($n, $values)= @_;
# (y_0 .. y_m) = Matrix_rows( [x_0^0, x_0^1, .. x_0^n], [x_1^0 .. x_1^n] ... ) * (a0..an)
if ( $#$values-$n-1 <= 0 ) {
return undef;
}
my @a;
my @b;
for (my $row=0 ; $row<=$#$values ; $row++) {
push @b, $values->[$row][1];
for (my $col=0 ; $col<=$n ; $col++) {
$a[$row][$col]= $values->[$row][0]**$col;
}
}
return solve_lsq(\@a, \@b);
}
# y=a*exp(b*x) -> solve ln(y) = ln(a) + b*x
# ( 1 x0 ) ( ln(a) ) ( ln(y0) )
# ( 1 x1 ) * ( b ) = ( ln(y1) )
# ( 1 x2 ) ( ln(y2) )
#
# todo: how to calculate err = y-a*exp(b*x)
# given |ln(y)-a-b*x|
sub find_lsq_exponential {
my ($values)= @_;
my @a;
my @b;
for (my $row=0 ; $row<=$#$values ; $row++) {
return undef unless $values->[$row][1] > 0;
push @b, log($values->[$row][1]);
$a[$row][0]= 1;
$a[$row][1]= $values->[$row][0];
}
my ($ab, $err)= solve_lsq(\@a, \@b);
return undef if (!defined $ab);
# my $err2=0;
# $err2 += ($_->[1]-exp($ab->[0])*exp($ab->[1]*$_->[0]))**2 for @$values;
# printf("err1 : %f err2 : %f\n", $err, sqrt($err2/($#$values-3)));
return [ exp($ab->[0]), $ab->[1] ], $err;
}
# y=a+b*ln(x)
sub find_lsq_logarithmic {
my ($values)= @_;
my @a;
my @b;
for (my $row=0 ; $row<=$#$values ; $row++) {
return undef unless $values->[$row][0] > 0;
push @b, $values->[$row][1];
$a[$row][0]= 1;
$a[$row][1]= log($values->[$row][0]);
}
return solve_lsq(\@a, \@b);
}
# y=a*x^b -> ln(y) = ln(a) + b*ln(x)
sub find_lsq_powerlaw {
my ($values)= @_;
my @a;
my @b;
for (my $row=0 ; $row<=$#$values ; $row++) {
return undef unless $values->[$row][1] > 0;
push @b, log($values->[$row][1]);
$a[$row][0]= 1;
$a[$row][1]= log($values->[$row][0]);
}
my ($ab, $err)= solve_lsq(\@a, \@b);
return undef if (!defined $ab);
return [ exp($ab->[0]), $ab->[1] ], $err;
}
sub solve_lsq {
my ($m_a, $v_b)= @_;
return undef if (@{$m_a}-@{$m_a->[0]}-1<=0);
my $A= Math::MatrixReal->new_from_rows($m_a);
my $b= Math::MatrixReal->new_from_columns([$v_b]);
my $Ab= (~$A) * $b;
my $AA = (~$A) * $A;
my $LR= $AA->decompose_LR();
my ($dim,$x,$B) = $LR->solve_LR($Ab);
if (!defined $x) {
printf("NO SOLUTION!\n");
return undef;
}
my $Axb= $A*$x-$b;
my $error= (~$Axb)*$Axb;
#printf("a : %d x %d - b : %d err:%f\n", scalar @$m_a, scalar @{$m_a->[0]}, scalar @$v_b, sqrt($error->element(1,1)/(@{$m_a}-@{$m_a->[0]}-1)));
return [ map { $x->element($_,1) } (1..@{$m_a->[0]}) ], sqrt($error->element(1,1)/(@{$m_a}-@{$m_a->[0]}-1));
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment