Written by Jämes Ménétry for the course GPGPU of the Master of Science in Engineering, University of Applied Sciences Western Switzerland.
Created
November 29, 2018 16:34
-
-
Save JamesMenetrey/2346792f1d76f850e10df258976064a6 to your computer and use it in GitHub Desktop.
Compute π on the GPU with CUDA by numerical integration (summing the areas of thin slices under a curve)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include "Slice.h" | |
#include <iostream> | |
#include <assert.h> | |
#include "Device.h" | |
using std::cout; | |
using std::endl; | |
/*----------------------------------------------------------------------*\ | |
|* Declaration *| | |
\*---------------------------------------------------------------------*/ | |
/*--------------------------------------*\ | |
|* Imported *| | |
\*-------------------------------------*/ | |
extern __global__ void slice(float* tabGM, int nbSlice); | |
/*----------------------------------------------------------------------*\ | |
|* Implementation *| | |
\*---------------------------------------------------------------------*/ | |
/*--------------------------------------*\ | |
|* Constructeur *| | |
\*-------------------------------------*/ | |
/**
 * Builds a Slice job.
 *
 * Copies the launch dimensions out of grid, remembers the number of slices and
 * the location run() writes its result to, then obtains one float through
 * Device::malloc and clears it through Device::memclear.
 *
 * @param grid     provides the dg / db launch dimensions
 * @param nbSlice  number of slices; run() divides by it, so it must be > 0
 * @param ptrPiHat location written by run(); must not be null
 */
Slice::Slice(const Grid& grid, int nbSlice, float* ptrPiHat) :
        // Same order as the member declarations in the class, i.e. the order
        // in which the members are really initialised.
        dg(grid.dg), db(grid.db), nbSlice(nbSlice), ptrPiHat(ptrPiHat), tabGM(0), sizeTabGM(sizeof(float))
{
    // run() divides by nbSlice and dereferences ptrPiHat.
    assert(nbSlice > 0);
    assert(ptrPiHat != 0);

    // MM: a single float, cleared before first use
    Device::malloc(&tabGM, sizeTabGM);
    Device::memclear(tabGM, sizeTabGM);
}
/**
 * Hands the buffer obtained in the constructor back through Device::free.
 */
Slice::~Slice(void)
{
    // MM (device free)
    Device::free(tabGM);
}
/*--------------------------------------*\ | |
|* Methode *| | |
\*-------------------------------------*/ | |
/**
 * Launches the slice kernel once and publishes the resulting estimate.
 *
 * Steps: size the dynamic shared memory, clear the accumulator, launch the
 * kernel with the stored dg / db, copy one float back with
 * Device::memcpyDToH, divide it by nbSlice and store the quotient in
 * *ptrPiHat. The shared-memory size and the estimate are also printed on
 * stdout.
 */
void Slice::run()
{
    // As every thread must have a cell in the SM, the number of cells in the
    // SM must be equal to the number of threads of a block.
    const size_t sizeSM = this->db.x * this->db.y * sizeof(float);

    // size_t must not be printed with %d; the cast keeps the specifier portable.
    printf("sizeSM: %lu\n", (unsigned long) sizeSM);

    // NOTE(review): the kernel presumably adds into tabGM (the constructor
    // clears it before first use). Clearing again here keeps a second call of
    // run() from starting from the previous result.
    Device::memclear(tabGM, sizeof(float));

    slice<<<dg, db, sizeSM>>>(tabGM, nbSlice);

    // A launch with an invalid configuration fails silently unless asked for.
    const cudaError_t launchStatus = cudaGetLastError();
    if (launchStatus != cudaSuccess)
    {
        fprintf(stderr, "slice kernel launch failed: %s\n", cudaGetErrorString(launchStatus));
    }

    float sum = 0.0f;
    Device::memcpyDToH(&sum, tabGM, sizeof(float));

    const float piHat = sum / nbSlice;
    printf("=> PI = %f\n", piHat);
    *(this->ptrPiHat) = piHat;
}
/*--------------------------------------*\ | |
|* Private *| | |
\*-------------------------------------*/ | |
/*----------------------------------------------------------------------*\ | |
|* End *| | |
\*---------------------------------------------------------------------*/ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#pragma once | |
#include "cudaTools.h" | |
#include "Grid.h" | |
/*----------------------------------------------------------------------*\ | |
|* Declaration *| | |
\*---------------------------------------------------------------------*/ | |
/*--------------------------------------*\ | |
|* Public *| | |
\*-------------------------------------*/ | |
/**
 * Host-side object that holds the launch dimensions, the number of slices and
 * the buffers needed to run the slice computation.
 *
 * The grid is passed to Slice so that it can easily be varied from the outside,
 * to look for the optimum or to run tests with different grids.
 *
 * Instances own the buffer behind tabGM (released in the destructor) and are
 * therefore not copyable.
 */
class Slice
{
    /*--------------------------------------*\
    |* Constructor *|
    \*-------------------------------------*/
public:
    /**
     * update piHat
     * Assumption: the number of threads is a power of 2
     */
    Slice(const Grid& grid, int nbSlice, float* ptrPiHat);
    virtual ~Slice(void);

private:
    // Copying would make two objects free the same tabGM. Both operations are
    // declared here and intentionally never defined.
    Slice(const Slice&);
    Slice& operator=(const Slice&);

    /*--------------------------------------*\
    |* Methods *|
    \*-------------------------------------*/
public:
    void run();

    /*--------------------------------------*\
    |* Attributes *|
    \*-------------------------------------*/
private:
    // Inputs
    dim3 dg;
    dim3 db;
    int nbSlice;

    // Inputs/Outputs
    float* ptrPiHat;

    // Tools
    float* tabGM; // array promoted to GM
    size_t sizeTabGM; // [bytes]
};
/*----------------------------------------------------------------------*\ | |
|* End *| | |
\*---------------------------------------------------------------------*/ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include "Indice2D.h" | |
#include "Indice1D.h" | |
#include "cudaTools.h" | |
#include "reductionAddTools.h" | |
#include <stdio.h> | |
/*----------------------------------------------------------------------*\ | |
|* Declaration *| | |
\*---------------------------------------------------------------------*/ | |
/*--------------------------------------*\ | |
|* Imported *| | |
\*-------------------------------------*/ | |
/*--------------------------------------*\ | |
|* Public *| | |
\*-------------------------------------*/ | |
__device__ void reductionIntraThreadSlice(float* tabSM, int nbSlice); | |
/*--------------------------------------*\ | |
|* Private *| | |
\*-------------------------------------*/ | |
//static __device__ float aireRectangle(float xs, int nbSlice); | |
static __device__ float fSlice(float x); | |
/*----------------------------------------------------------------------*\ | |
|* Implementation *| | |
\*---------------------------------------------------------------------*/ | |
/*--------------------------------------*\ | |
|* Public *| | |
\*-------------------------------------*/ | |
/**
 * Kernel entry point.
 *
 * Each thread first writes its partial sum into its own cell of tabSM
 * (reductionIntraThreadSlice), then ReductionAddTools::reductionADD is called
 * with tabSM and tabGM.
 *
 * Launch requirement: tabSM is dynamic shared memory, so the third launch
 * parameter must provide one float per cell that the threads of a block index.
 *
 * @param tabGM   output buffer handed to reductionADD
 * @param nbSlice number of slices to evaluate
 */
__global__ void slice(float* tabGM, int nbSlice)
{
    extern __shared__ float tabSM[];

    reductionIntraThreadSlice(tabSM, nbSlice);

    // Every thread has only written its own cell so far. Make those writes
    // visible to the whole block before the reduction reads other threads'
    // cells. All threads reach this barrier: there is no branch around it.
    __syncthreads();

    ReductionAddTools tools;
    tools.reductionADD(tabSM, tabGM);
}
/**
 * Each thread performs a reduction with the interleaved pattern, then stores
 * its result in ITS OWN cell of tabSM.
 *
 * The thread starts at slice index Indice2D::tid() and advances by
 * Indice2D::nbThread() until nbSlice is reached, adding fSlice(s / nbSlice)
 * for every visited index s.
 *
 * @param tabSM   array written at index Indice2D::tidLocal(); it needs one
 *                cell per value that index can take
 * @param nbSlice number of slices
 */
__device__ void reductionIntraThreadSlice(float* tabSM, int nbSlice)
{
    const int TID = Indice2D::tid();
    const int TID_LOCALE = Indice2D::tidLocal();
    const int NB_THREAD = Indice2D::nbThread();

    float sum = 0.0f;
    for (int s = TID; s < nbSlice; s += NB_THREAD)
    {
        const float xs = s / (float) nbSlice;
        sum += fSlice(xs);
    }

    // Only save the sum in the tab once for performance reasons.
    // Indeed, using a temporary var can be optimized by the compiler to
    // use a processor register.
    tabSM[TID_LOCALE] = sum;
}
/*--------------------------------------*\ | |
|* Private *| | |
\*-------------------------------------*/ | |
/*-----------------------*\ | |
| Reduction Intrathread *| | |
\*-----------------------*/ | |
/*__device__ float aireRectangle(float xs, int nbSlice) | |
{ | |
return (1 / nbSlice) * f(xs); | |
}*/ | |
/**
 * Integrand evaluated for each slice: 4 / (1 + x^2).
 * Its integral over [0, 1] equals pi.
 *
 * @param x abscissa of the slice
 * @return  value of the integrand at x
 */
__device__ float fSlice(float x)
{
    const float denominator = 1.0f + x * x;
    return 4.0f / denominator;
}
/*----------------------------------------------------------------------*\ | |
|* End *| | |
\*---------------------------------------------------------------------*/ | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <iostream> | |
#include "Grid.h" | |
#include "Device.h" | |
#include "Slice.h" | |
#include "LimitsTools.h" | |
#include "MathTools.h" | |
using std::cout; | |
using std::endl; | |
/*----------------------------------------------------------------------*\ | |
|* Declaration *| | |
\*---------------------------------------------------------------------*/ | |
/*--------------------------------------*\ | |
|* Imported *| | |
\*-------------------------------------*/ | |
/*--------------------------------------*\ | |
|* Public *| | |
\*-------------------------------------*/ | |
bool isSliceOK(); | |
bool isSliceOK(const Grid& grid); | |
/*--------------------------------------*\ | |
|* Private *| | |
\*-------------------------------------*/ | |
/*----------------------------------------------------------------------*\ | |
|* Implementation *| | |
\*---------------------------------------------------------------------*/ | |
/*--------------------------------------*\ | |
|* Public *| | |
\*-------------------------------------*/ | |
/**
 * Runs the slice computation on the given grid and compares the estimate
 * with PI.
 *
 * The grid is passed in so that it can easily be varied from the outside, to
 * look for the optimum or to run tests with different grids.
 * Constraint: grid: power of 2, to stay compatible with the naivePlus version.
 *
 * @param grid launch configuration forwarded to Slice
 * @return     result of MathTools::isEquals on the estimate and PI with 1e-4
 */
bool isSliceOK(const Grid& grid)
{
    const int NB_SLICE = 2000000;

    // Written by Slice::run() through the pointer handed to the constructor.
    float piHat;

    Slice job(grid, NB_SLICE, &piHat);
    job.run();

    return MathTools::isEquals((double) piHat, PI, 1e-4);
}
/**
 * Runs the slice check with a fixed 2 x 4 launch configuration.
 *
 * @return result of isSliceOK(const Grid&) for that grid
 */
bool isSliceOK()
{
    // Say, to be optimised for the GPU at hand: these two values can
    // drastically improve or degrade performance.
    dim3 dg(2, 1, 1);
    dim3 db(4, 1, 1);

    Grid grid(dg, db); // power of 2, to stay compatible with the naivePlus version

    return isSliceOK(grid);
}
/*--------------------------------------*\ | |
|* Private *| | |
\*-------------------------------------*/ | |
/*----------------------------------------------------------------------*\ | |
|* End *| | |
\*---------------------------------------------------------------------*/ | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment