Created
November 27, 2018 23:15
-
-
Save arrieta/1f453781ef0bcec864998bd10ac0bbc1 to your computer and use it in GitHub Desktop.
Basic benchmark polynomial evaluation using Horner's Method
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Simple re-arrangement can yield significant performance improvements in | |
naive, low-order polynomial evaluation. The interested reader may wish to read | |
about "Horner's method." | |
(C) 2018 Nabla Zero Labs | |
MIT License | |
""" | |
import time | |
def p0(a, x): | |
"""First strategy: naively calculate the polynomial. | |
Three additions, three multiplications, one pow(2), one pow(3).""" | |
for xi in x: | |
y = a[0] + a[1] * xi + a[2] * xi**2 + a[3] * xi**3 | |
return y | |
def p1(a, x): | |
"""Second strategy: save multiplications at the cost of memory. | |
Three additions, five multiplications, two extra variables""" | |
for xi in x: | |
xi2 = xi * xi | |
xi3 = xi2 * xi | |
y = a[0] + a[1] * xi + a[2] * xi2 + a[3] * xi3 | |
return y | |
def p2(a, x): | |
"""Third strategy: save multiplication with no cost. | |
Three additions, three multiplications. | |
""" | |
for xi in x: | |
y = a[0] + xi * (a[1] + xi * ( a[2] + a[3] * xi)) | |
return y | |
def cpu(fun, *args): | |
a, x = args | |
tbeg = time.perf_counter() | |
y = fun(a, x) | |
tend = time.perf_counter() | |
elap = tend - tbeg | |
thro = len(x) / elap | |
print(f"{fun.__name__}: {elap:.6f} sec ({thro:.0f} eval/sec)", | |
f"last value: {y:23.16e}", sep=" ") | |
if __name__ == "__main__": | |
import random | |
a = [random.uniform(-10, 10) for _ in range(4)] | |
x = [random.uniform(-1, 1) for _ in range(1000000)] | |
for _ in range(10): | |
cpu(p0, a, x) | |
cpu(p1, a, x) | |
cpu(p2, a, x) |
#include <chrono>
#include <iomanip>
#include <iostream>
#include <limits>
#include <random>
#include <vector>
template < typename T >
T p0( const T a[ 4 ], const std::vector< T >& x ) {
using std::pow;
T y = 0;
for ( auto&& xi : x ) {
y += a[ 0 ] + a[ 1 ] * xi + a[ 2 ] * pow( xi, 2 ) + a[ 3 ] * pow( xi, 3 );
}
return y;
}
template < typename T >
T p1( const T a[ 4 ], const std::vector< T >& x ) {
T y = 0;
for ( auto&& xi : x ) {
auto xi2 = xi * xi;
auto xi3 = xi2 * xi;
y += a[ 0 ] + a[ 1 ] * xi + a[ 2 ] * xi2 + a[ 3 ] * xi3;
}
return y;
}
template < typename T >
T p2( const T a[ 4 ], const std::vector< T >& x ) {
T y = 0;
for ( auto&& xi : x ) {
y += a[ 0 ] + xi * ( a[ 1 ] + xi * ( a[ 2 ] + a[ 3 ] * xi ) );
}
return y;
}
template < typename T >
T p3( const T a[ 4 ], const std::vector< T >& x ) {
T y = 0;
for ( auto&& xi : x ) {
y += std::fma(
std::fma( std::fma( a[ 3 ], xi, a[ 2 ] ), xi, a[ 1 ] ), xi, a[ 0 ] );
}
return y;
}
template < typename F, typename T >
void cpu( F&& fun, const T a[ 4 ], const std::vector< T >& x ) {
using namespace std::chrono;
auto tbeg = system_clock::now();
auto y = fun( a, x );
auto tend = system_clock::now();
auto elap = nanoseconds( tend - tbeg ).count();
auto thro = double( x.size() ) / elap;
std::cout << std::setprecision( std::numeric_limits< T >::max_digits10 )
<< std::scientific;
std::cout << std::setw( 12 ) << elap << " nanosec (" << std::setw( 12 )
<< thro << " eval / nanosec) last: " << y << "\n";
}
template < typename T >
void run() {
std::mt19937 gen( 0 );
std::uniform_real_distribution< T > dis( -1, 1 );
T a[] = {dis( gen ), dis( gen ), dis( gen ), dis( gen )};
std::vector< T > x;
for ( int k = 0; k < 1000000; ++k ) {
x.emplace_back( dis( gen ) );
}
std::cout << "new run\n";
for ( int k = 0; k < 5; ++k ) {
cpu( p0< T >, a, x );
cpu( p1< T >, a, x );
cpu( p2< T >, a, x );
cpu( p3< T >, a, x );
std::cout << "-------------------------------------------------------------"
"------------------\n";
}
}
int main() {
run< float >();
run< double >();
run< long double >();
}
Result:
new run
25824000 nanosec (3.872366791e-02 eval / nanosec) last: 2.412012031e+05
1398000 nanosec (7.153075823e-01 eval / nanosec) last: 2.412012031e+05
1064000 nanosec (9.398496241e-01 eval / nanosec) last: 2.412012031e+05
795000 nanosec (1.257861635e+00 eval / nanosec) last: 2.412012031e+05
-------------------------------------------------------------------------------
25858000 nanosec (3.867275118e-02 eval / nanosec) last: 2.412012031e+05
1262000 nanosec (7.923930269e-01 eval / nanosec) last: 2.412012031e+05
990000 nanosec (1.010101010e+00 eval / nanosec) last: 2.412012031e+05
729000 nanosec (1.371742112e+00 eval / nanosec) last: 2.412012031e+05
-------------------------------------------------------------------------------
25654000 nanosec (3.898027598e-02 eval / nanosec) last: 2.412012031e+05
1072000 nanosec (9.328358209e-01 eval / nanosec) last: 2.412012031e+05
1019000 nanosec (9.813542689e-01 eval / nanosec) last: 2.412012031e+05
775000 nanosec (1.290322581e+00 eval / nanosec) last: 2.412012031e+05
-------------------------------------------------------------------------------
25649000 nanosec (3.898787477e-02 eval / nanosec) last: 2.412012031e+05
1107000 nanosec (9.033423668e-01 eval / nanosec) last: 2.412012031e+05
965000 nanosec (1.036269430e+00 eval / nanosec) last: 2.412012031e+05
717000 nanosec (1.394700139e+00 eval / nanosec) last: 2.412012031e+05
-------------------------------------------------------------------------------
26686000 nanosec (3.747283220e-02 eval / nanosec) last: 2.412012031e+05
1296000 nanosec (7.716049383e-01 eval / nanosec) last: 2.412012031e+05
986000 nanosec (1.014198783e+00 eval / nanosec) last: 2.412012031e+05
731000 nanosec (1.367989056e+00 eval / nanosec) last: 2.412012031e+05
-------------------------------------------------------------------------------
new run
24278000 nanosec (4.11895543290221600e-02 eval / nanosec) last: 4.24933177245050552e+05
1523000 nanosec (6.56598818122127392e-01 eval / nanosec) last: 4.24933177245050552e+05
1244000 nanosec (8.03858520900321505e-01 eval / nanosec) last: 4.24933177245050552e+05
918000 nanosec (1.08932461873638342e+00 eval / nanosec) last: 4.24933177245050552e+05
-------------------------------------------------------------------------------
24012000 nanosec (4.16458437447942698e-02 eval / nanosec) last: 4.24933177245050552e+05
1142000 nanosec (8.75656742556917722e-01 eval / nanosec) last: 4.24933177245050552e+05
1090000 nanosec (9.17431192660550510e-01 eval / nanosec) last: 4.24933177245050552e+05
865000 nanosec (1.15606936416184980e+00 eval / nanosec) last: 4.24933177245050552e+05
-------------------------------------------------------------------------------
24334000 nanosec (4.10947645269992626e-02 eval / nanosec) last: 4.24933177245050552e+05
1091000 nanosec (9.16590284142988043e-01 eval / nanosec) last: 4.24933177245050552e+05
1043000 nanosec (9.58772770853307810e-01 eval / nanosec) last: 4.24933177245050552e+05
874000 nanosec (1.14416475972540055e+00 eval / nanosec) last: 4.24933177245050552e+05
-------------------------------------------------------------------------------
23830000 nanosec (4.19639110365086013e-02 eval / nanosec) last: 4.24933177245050552e+05
1436000 nanosec (6.96378830083565492e-01 eval / nanosec) last: 4.24933177245050552e+05
1280000 nanosec (7.81250000000000000e-01 eval / nanosec) last: 4.24933177245050552e+05
902000 nanosec (1.10864745011086474e+00 eval / nanosec) last: 4.24933177245050552e+05
-------------------------------------------------------------------------------
24681000 nanosec (4.05169968801912389e-02 eval / nanosec) last: 4.24933177245050552e+05
1237000 nanosec (8.08407437348423574e-01 eval / nanosec) last: 4.24933177245050552e+05
1123000 nanosec (8.90471950133570833e-01 eval / nanosec) last: 4.24933177245050552e+05
860000 nanosec (1.16279069767441867e+00 eval / nanosec) last: 4.24933177245050552e+05
-------------------------------------------------------------------------------
new run
53355000 nanosec (1.874238590572580021032e-02 eval / nanosec) last: 4.249331772450602300921e+05
2667000 nanosec (3.749531308586426803231e-01 eval / nanosec) last: 4.249331772450602300921e+05
1965000 nanosec (5.089058524173027953097e-01 eval / nanosec) last: 4.249331772450602300921e+05
106937000 nanosec (9.351300298306480102140e-03 eval / nanosec) last: 4.249331772450602300921e+05
-------------------------------------------------------------------------------
53469000 nanosec (1.870242570461388975644e-02 eval / nanosec) last: 4.249331772450602300921e+05
2969000 nanosec (3.368137420006736548750e-01 eval / nanosec) last: 4.249331772450602300921e+05
2194000 nanosec (4.557885141294439335091e-01 eval / nanosec) last: 4.249331772450602300921e+05
106082000 nanosec (9.426669934578911155820e-03 eval / nanosec) last: 4.249331772450602300921e+05
-------------------------------------------------------------------------------
53432000 nanosec (1.871537655337625338792e-02 eval / nanosec) last: 4.249331772450602300921e+05
2746000 nanosec (3.641660597232337925888e-01 eval / nanosec) last: 4.249331772450602300921e+05
2366000 nanosec (4.226542688081149634627e-01 eval / nanosec) last: 4.249331772450602300921e+05
107089000 nanosec (9.338027248363510793294e-03 eval / nanosec) last: 4.249331772450602300921e+05
-------------------------------------------------------------------------------
53097000 nanosec (1.883345575079571274091e-02 eval / nanosec) last: 4.249331772450602300921e+05
2692000 nanosec (3.714710252600297302195e-01 eval / nanosec) last: 4.249331772450602300921e+05
2104000 nanosec (4.752851711026616077227e-01 eval / nanosec) last: 4.249331772450602300921e+05
107119000 nanosec (9.335412018409433229649e-03 eval / nanosec) last: 4.249331772450602300921e+05
-------------------------------------------------------------------------------
53603000 nanosec (1.865567225714978571993e-02 eval / nanosec) last: 4.249331772450602300921e+05
2692000 nanosec (3.714710252600297302195e-01 eval / nanosec) last: 4.249331772450602300921e+05
1982000 nanosec (5.045408678102926147702e-01 eval / nanosec) last: 4.249331772450602300921e+05
106767000 nanosec (9.366189927599351955356e-03 eval / nanosec) last: 4.249331772450602300921e+05
-------------------------------------------------------------------------------
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Sample run:
Horner's is ~85% improvement over naive and ~19% improvement over pre-computed powers. In addition, Horner's has less exposure to overflow and round-off error. Additional improvements are possible if the underlying platform offers a fused-multiply-add operation.