Created
June 24, 2016 07:34
-
-
Save vfdev-5/5383b0db4777ce8bad9759c9009bbf01 to your computer and use it in GitHub Desktop.
Eliminate False Sharing (http://www.drdobbs.com/parallel/eliminate-false-sharing/217500206?pgno=1)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
project(FalseParallelSharing) | |
cmake_minimum_required(VERSION 2.8) | |
aux_source_directory(. SRC_LIST) | |
add_executable(${PROJECT_NAME} ${SRC_LIST}) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// STD | |
#include <iostream> | |
#include <algorithm> | |
#include <vector> | |
#include <thread> | |
#include <functional> | |
#include <chrono> | |
const unsigned int MAX_NB_THREADS = 8; | |
//*********************************************************************************************** | |
void false_parallel_sharing_compute(int nbThreads, const std::vector<int> & data, | |
std::vector<unsigned int> & results, int p) | |
{ | |
results[p] = 0; | |
unsigned int chunkSize = data.size() / nbThreads + 1; | |
unsigned int start = p * chunkSize; | |
unsigned int end = start + chunkSize > data.size() ? data.size() : start + chunkSize; | |
for (unsigned int i=start; i<end; i++) | |
{ | |
if (data[i] % 2) | |
{ | |
// Multiple access to closely placed memory place causes performance slow down | |
++results[p]; | |
} | |
} | |
} | |
//*********************************************************************************************** | |
void better_parallel_sharing_compute(int nbThreads, const std::vector<int> & data, | |
std::vector<unsigned int> & results, int p) | |
{ | |
// Here local variable avoids multiple access to results external variable | |
unsigned int counter = 0; | |
unsigned int chunkSize = data.size() / nbThreads + 1; | |
unsigned int start = p * chunkSize; | |
unsigned int end = start + chunkSize > data.size() ? data.size() : start + chunkSize; | |
for (unsigned int i=start; i<end; i++) | |
{ | |
if (data[i] % 2) | |
{ | |
++counter; | |
} | |
} | |
results[p] = counter; | |
} | |
//*********************************************************************************************** | |
int main() | |
{ | |
// Setup data: | |
std::vector<int> data(15000000); | |
int counter = 0; | |
std::for_each(std::begin(data), std::end(data), [&counter](int & value){ value = counter++; }); | |
// Print data | |
//std::cout << "data = [" << std::endl; | |
//std::for_each(std::begin(data), std::end(data), [](double value){ std::cout << value << " "; }); | |
//std::cout << "]" << std::endl; | |
// Example 1: Simple parallel version (flawed) | |
int c; | |
std::cout << std::endl << "Example 1: Simple parallel version (flawed). Press any key" << std::endl; | |
c = getchar(); | |
for (unsigned int nbThreads=1; nbThreads<=MAX_NB_THREADS; nbThreads++) | |
{ | |
std::cout << "Run example with " << nbThreads << " thread(s)" << std::endl; | |
// setup the resulting memory container | |
std::vector<unsigned int> results(nbThreads, 0); | |
// Start timer: | |
auto t0 = std::chrono::high_resolution_clock::now(); | |
std::vector<std::thread> pool; | |
for (unsigned int p=0; p<nbThreads; p++) | |
{ | |
pool.push_back(std::thread(false_parallel_sharing_compute, nbThreads, std::cref(data), std::ref(results), p)); | |
} | |
// pool join: | |
for (unsigned int p=0; p<nbThreads; p++) | |
{ | |
pool[p].join(); | |
} | |
// Stop timer: | |
auto elapsed = std::chrono::high_resolution_clock::now() - t0; | |
std::cout << "Elapsed seconds : " << std::chrono::duration<float>(elapsed).count() << std::endl; | |
int odds = 0; | |
std::for_each(std::begin(results), std::end(results), [&odds](unsigned int value) { | |
odds += value; | |
}); | |
std::cout << "Number of odd values is " << odds << std::endl; | |
} | |
// Example 2: From Zero to Hero | |
std::cout << std::endl << "Example 2: From Zero to Hero. Press any key" << std::endl; | |
c = getchar(); | |
for (unsigned int nbThreads=1; nbThreads<=MAX_NB_THREADS; nbThreads++) | |
{ | |
std::cout << "Run example with " << nbThreads << " thread(s)" << std::endl; | |
// setup the resulting memory container | |
std::vector<unsigned int> results(nbThreads, 0); | |
// Start timer: | |
auto t0 = std::chrono::high_resolution_clock::now(); | |
std::vector<std::thread> pool; | |
for (unsigned int p=0; p<nbThreads; p++) | |
{ | |
pool.push_back(std::thread(better_parallel_sharing_compute, nbThreads, std::cref(data), std::ref(results), p)); | |
} | |
// pool join: | |
for (unsigned int p=0; p<nbThreads; p++) | |
{ | |
pool[p].join(); | |
} | |
// Stop timer: | |
auto elapsed = std::chrono::high_resolution_clock::now() - t0; | |
std::cout << "Elapsed seconds : " << std::chrono::duration<float>(elapsed).count() << std::endl; | |
int odds = 0; | |
std::for_each(std::begin(results), std::end(results), [&odds](unsigned int value) { | |
odds += value; | |
}); | |
std::cout << "Number of odd values is " << odds << std::endl; | |
} | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment