Last active
December 12, 2015 06:29
-
-
Save JosephLaurino/4729783 to your computer and use it in GitHub Desktop.
Profiling experiment comparing CPU (non-threaded), PPL, and AMP based implementations. The cost of moving data to the GPU can only be recouped if the computation time is much
larger than the data transfer time. In this experiment, the GPU (via AMP) only started winning once the loop count increased past 200 iterations. Profile first! PPL might have …
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// ---------------------------------------------------------------------------- | |
#include <ppl.h> | |
#include <amp.h> | |
#include <amp_math.h> | |
#include <iostream> | |
#include <boost/chrono/chrono.hpp> | |
#include <vector> | |
using namespace concurrency; | |
using namespace concurrency::precise_math; | |
// Number of elements processed by each test_* function (array length and outer loop bound).
const int size = 80000; | |
// Number of additional pow() passes applied to every element after the first one.
// Raising it increases compute time relative to the fixed cost of moving data to the GPU.
const int loopCount = 1; | |
/* | |
Setup: Visual Studio 2012, Win7, i7, GeForce 650M | |
loopCount = 1 | |
cpu took 0.00463677 seconds | |
ppl took 0.00497504 seconds | |
gpu took 0.16018 seconds | |
loopCount = 20 | |
cpu took 0.0483519 seconds | |
ppl took 0.0113667 seconds | |
gpu took 0.166711 seconds | |
loopCount = 200 | |
cpu took 0.244843 seconds | |
ppl took 0.061472 seconds | |
gpu took 0.150505 seconds | |
loopCount = 2000 | |
cpu took 2.26371 seconds | |
ppl took 0.533806 seconds | |
gpu took 0.202005 seconds | |
loopCount = 20000 | |
cpu took 22.3698 seconds | |
ppl took 5.25856 seconds | |
gpu took 0.66479 seconds | |
*/ | |
void test_PPLMethod(std::vector<float>& result) { | |
boost::chrono::steady_clock::time_point start = boost::chrono::steady_clock::now(); | |
float aCPP[size]; | |
float bCPP[size]; | |
float sumCPP[size]; | |
for( int i = 0; i < size; i++ ) { | |
aCPP[i] = i; | |
bCPP[i] = i*i; | |
} | |
parallel_for( 0, size, [&](int idx) { | |
sumCPP[idx] = pow(aCPP[idx], bCPP[idx]); | |
for( int i = 0; i < loopCount; i++ ) { | |
sumCPP[idx] = pow(sumCPP[idx], bCPP[idx]); | |
} | |
}); | |
boost::chrono::duration<double> sec = boost::chrono::steady_clock::now() - start; | |
std::cout << "ppl took " << sec.count() << " seconds\n"; | |
result.clear(); | |
for( int i = 0; i < size; i++ ) { | |
result.push_back(sumCPP[i]); | |
} | |
} | |
void test_AmpMethod(std::vector<float>& result) { | |
boost::chrono::steady_clock::time_point start = boost::chrono::steady_clock::now(); | |
float aCPP[size]; | |
float bCPP[size]; | |
float sumCPP[size]; | |
for( int i = 0; i < size; i++ ) { | |
aCPP[i] = i; | |
bCPP[i] = i*i; | |
} | |
// Create C++ AMP objects. | |
array_view<const float, 1> a(size, aCPP); | |
array_view<const float, 1> b(size, bCPP); | |
array_view<float, 1> sum(size, sumCPP); | |
sum.discard_data(); | |
parallel_for_each( sum.extent, [=](index<1> idx) restrict(amp) { | |
sum[idx] = pow(a[idx], b[idx]); | |
for( int i = 0; i < loopCount; i++ ) { | |
sum[idx] = pow(sum[idx], b[idx]); | |
} | |
} | |
); | |
sum.synchronize(); // needed this to copy data from GPU back to CPU | |
boost::chrono::duration<double> sec = boost::chrono::steady_clock::now() - start; | |
std::cout << "gpu took " << sec.count() << " seconds\n"; | |
result.clear(); | |
for( int i = 0; i < size; i++ ) { | |
result.push_back(sum[i]); | |
} | |
} | |
void test_CPUMethod(std::vector<float>& result) { | |
boost::chrono::steady_clock::time_point start = boost::chrono::steady_clock::now(); | |
float aCPP[size]; | |
float bCPP[size]; | |
float sumCPP[size]; | |
for( int i = 0; i < size; i++ ) { | |
aCPP[i] = i; | |
bCPP[i] = i*i; | |
} | |
for( int idx = 0; idx < size; idx++) { | |
sumCPP[idx] = pow(aCPP[idx], bCPP[idx]); | |
for( int i = 0; i < loopCount; i++ ) { | |
sumCPP[idx] = pow(sumCPP[idx], bCPP[idx]); | |
} | |
} | |
boost::chrono::duration<double> sec = boost::chrono::steady_clock::now() - start; | |
std::cout << "cpu took " << sec.count() << " seconds\n"; | |
result.clear(); | |
for( int i = 0; i < size; i++ ) { | |
result.push_back(sumCPP[i]); | |
} | |
} | |
// ---------------------------------------------------------------------------- | |
// Runs the CPU, PPL and AMP variants in turn (each prints its own timing),
// then cross-checks their outputs element by element and reports the first
// index at which either the PPL or the GPU value differs from the CPU value.
//
// Returns 0 after the comparison regardless of whether a mismatch was
// reported; returns 1 only if the result vectors differ in length and
// therefore cannot be compared.
int main(int argc, char* argv[]) {
    std::vector<float> cpuResult;
    std::vector<float> pplResult;
    std::vector<float> gpuResult;

    test_CPUMethod(cpuResult);
    test_PPLMethod(pplResult);
    test_AmpMethod(gpuResult);

    // Guard the indexed comparison below: it reads all three vectors using
    // cpuResult's length as the only bound.
    if( (cpuResult.size() != pplResult.size()) ||
        (cpuResult.size() != gpuResult.size()) ) {
        std::cout << "result size mismatch\n";
        return 1;
    }

    // Unsigned index type matches vector::size() and avoids a signed/unsigned
    // comparison. Values are compared for exact equality.
    for( std::vector<float>::size_type i = 0; i < cpuResult.size(); i++ )
    {
        if( (cpuResult[i] != pplResult[i]) ||
            (cpuResult[i] != gpuResult[i]) ) {
            std::cout << "bad calc at " << i << "\n";
            std::cout << "cpuResult[i] " << cpuResult[i] << "\n";
            std::cout << "pplResult[i] " << pplResult[i] << "\n";
            std::cout << "gpuResult[i] " << gpuResult[i] << "\n";
            break;  // only the first mismatch is reported
        }
    }
    return 0;
}
Author
JosephLaurino
commented
Feb 7, 2013
- Added data verification and the call to sum.synchronize() in test_AmpMethod. Without the call to synchronize, the cost of copying data from the GPU back to the CPU was not included in the profile timings.
I also discovered that the first run of the C++ AMP code triggers the compilation of the code for the GPU. To profile properly, one needs to exclude that first run and do multiple runs.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment