Skip to content

Instantly share code, notes, and snippets.

@JamesMenetrey
Created November 29, 2018 16:34
Show Gist options
  • Save JamesMenetrey/2346792f1d76f850e10df258976064a6 to your computer and use it in GitHub Desktop.
Save JamesMenetrey/2346792f1d76f850e10df258976064a6 to your computer and use it in GitHub Desktop.
Compute π on the GPU by numerical integration (slicing the area under a curve) in CUDA

Written by Jämes Ménétry for the GPGPU course of the Master of Science in Engineering, University of Applied Sciences Western Switzerland.

#include "Slice.h"
#include <iostream>
#include <assert.h>
#include "Device.h"
using std::cout;
using std::endl;
/*----------------------------------------------------------------------*\
|* Declaration *|
\*---------------------------------------------------------------------*/
/*--------------------------------------*\
|* Imported *|
\*-------------------------------------*/
extern __global__ void slice(float* tabGM, int nbSlice);
/*----------------------------------------------------------------------*\
|* Implementation *|
\*---------------------------------------------------------------------*/
/*--------------------------------------*\
|* Constructeur *|
\*-------------------------------------*/
/**
 * Builds a slice-based estimator of pi.
 *
 * grid     : launch configuration; its dg and db are copied into this object
 * nbSlice  : number of slices handed to the kernel by run()
 * ptrPiHat : location that run() writes the estimate to; it is dereferenced
 *            by run(), so it must stay valid for as long as run() is called
 *
 * Allocates one float through Device::malloc and clears it through
 * Device::memclear; run() reads the kernel result back from that float.
 */
Slice::Slice(const Grid& grid, int nbSlice, float* ptrPiHat) :
        // Listed in the declaration order of the class, which is the order
        // in which members are actually initialized.
        dg(grid.dg), //
        db(grid.db), //
        nbSlice(nbSlice), //
        ptrPiHat(ptrPiHat), //
        tabGM(NULL), //
        sizeTabGM(sizeof(float)) // a single float receives the reduced sum
{
    // MM
    Device::malloc(&tabGM, sizeTabGM);
    Device::memclear(tabGM, sizeTabGM);
}
/**
 * Releases the buffer obtained with Device::malloc in the constructor.
 */
Slice::~Slice(void)
{
    // MM (device free)
    Device::free(tabGM);
}
/*--------------------------------------*\
|* Methode *|
\*-------------------------------------*/
/**
 * Launches the slice kernel, reads the reduced sum back from tabGM and
 * stores sum / nbSlice in *ptrPiHat (the value is also printed).
 *
 * The kernel is launched with one float of dynamic shared memory per thread
 * of a block (db.x * db.y threads).
 */
void Slice::run()
{
    // As every thread must have a cell in the SM, the number of cells in the
    // SM must be equal to the number of threads of a block.
    const size_t sizeSM = this->db.x * this->db.y * sizeof(float);
    printf("sizeSM: %zu\n", sizeSM); // %zu is the conversion for size_t; %d is undefined behaviour here

    // The accumulator is otherwise cleared only in the constructor. Reset it
    // so that repeated calls do not start from the previous result.
    // NOTE(review): assumes reductionADD accumulates into tabGM; harmless if it overwrites.
    Device::memclear(tabGM, sizeof(float));

    slice<<<dg, db, sizeSM>>>(tabGM, nbSlice);

    // A kernel launch returns no status: a bad configuration (grid, block or
    // shared-memory size) is only reported through cudaGetLastError().
    const cudaError_t launchStatus = cudaGetLastError();
    if (launchStatus != cudaSuccess)
    {
        std::cerr << "slice kernel launch failed: " << cudaGetErrorString(launchStatus) << endl;
    }

    float sum = 0.0f;
    Device::memcpyDToH(&sum, tabGM, sizeof(float));

    const float piHat = sum / nbSlice;
    printf("=> PI = %f\n", piHat);
    *(this->ptrPiHat) = piHat;
}
/*--------------------------------------*\
|* Private *|
\*-------------------------------------*/
/*----------------------------------------------------------------------*\
|* End *|
\*---------------------------------------------------------------------*/
#pragma once
#include "cudaTools.h"
#include "Grid.h"
/*----------------------------------------------------------------------*\
|* Declaration *|
\*---------------------------------------------------------------------*/
/*--------------------------------------*\
|* Public *|
\*-------------------------------------*/
/**
 * Host-side driver that estimates pi with the slice kernel.
 *
 * The grid is passed to Slice so that it can easily be varied from the
 * outside, to look for the optimum or to run tests with different grids.
 */
class Slice
{
    /*--------------------------------------*\
    |* Constructor *|
     \*-------------------------------------*/
public:
    /**
     * ptrPiHat is updated by run().
     * Assumption: the number of threads is a power of 2.
     */
    Slice(const Grid& grid, int nbSlice, float* ptrPiHat);
    virtual ~Slice(void);

    /*--------------------------------------*\
    |* Methods *|
     \*-------------------------------------*/
public:
    void run();

private:
    // Not copyable: the object owns tabGM and frees it in its destructor,
    // so a member-wise copy would free the same buffer twice.
    // Declared but intentionally left undefined.
    Slice(const Slice&);
    Slice& operator=(const Slice&);

    /*--------------------------------------*\
    |* Attributes *|
     \*-------------------------------------*/
private:
    // Inputs
    dim3 dg;
    dim3 db;
    int nbSlice;
    // Inputs/Outputs
    float* ptrPiHat;
    // Tools
    float* tabGM; // array promoted to global memory (GM)
    size_t sizeTabGM; // [bytes]
};
/*----------------------------------------------------------------------*\
|* End *|
\*---------------------------------------------------------------------*/
#include "Indice2D.h"
#include "Indice1D.h"
#include "cudaTools.h"
#include "reductionAddTools.h"
#include <stdio.h>
/*----------------------------------------------------------------------*\
|* Declaration *|
\*---------------------------------------------------------------------*/
/*--------------------------------------*\
|* Imported *|
\*-------------------------------------*/
/*--------------------------------------*\
|* Public *|
\*-------------------------------------*/
__device__ void reductionIntraThreadSlice(float* tabSM, int nbSlice);
/*--------------------------------------*\
|* Private *|
\*-------------------------------------*/
//static __device__ float aireRectangle(float xs, int nbSlice);
static __device__ float fSlice(float x);
/*----------------------------------------------------------------------*\
|* Implementation *|
\*---------------------------------------------------------------------*/
/*--------------------------------------*\
|* Public *|
\*-------------------------------------*/
/**
 * Kernel entry point: every thread first stores its partial sum in tabSM,
 * then tabSM is reduced into tabGM by ReductionAddTools::reductionADD.
 *
 * tabSM is dynamic shared memory whose size is given by the third launch
 * parameter; each thread writes the cell indexed by its local thread id.
 */
__global__ void slice(float* tabGM, int nbSlice)
{
    extern __shared__ float tabSM[];

    // Step 1: per-thread partial sums into shared memory.
    reductionIntraThreadSlice(tabSM, nbSlice);

    // Step 2: shared memory -> global memory.
    ReductionAddTools reducer;
    reducer.reductionADD(tabSM, tabGM);
}
/**
 * Each thread sums fSlice over its own subset of the slices: it starts at
 * Indice2D::tid() and advances by Indice2D::nbThread() until nbSlice is
 * reached (presumably the global thread id and the total thread count).
 * The thread then stores its result in ITS OWN cell of tabSM, indexed by
 * Indice2D::tidLocal().
 */
__device__ void reductionIntraThreadSlice(float* tabSM, int nbSlice)
{
    const int firstSlice = Indice2D::tid();
    const int localId = Indice2D::tidLocal();
    const int stride = Indice2D::nbThread();

    printf("I'm thread TID_LOCALE %d\n", localId);

    // Accumulate in a local variable (which the compiler can keep in a
    // register) and write to the array a single time at the end.
    float partialSum = 0.0f;
    for (int sliceIndex = firstSlice; sliceIndex < nbSlice; sliceIndex += stride)
    {
        const float xs = sliceIndex / (float) nbSlice;
        partialSum += fSlice(xs);
    }

    tabSM[localId] = partialSum;
}
/*--------------------------------------*\
|* Private *|
\*-------------------------------------*/
/*-----------------------*\
| Reduction Intrathread *|
\*-----------------------*/
/*__device__ float aireRectangle(float xs, int nbSlice)
{
return (1 / nbSlice) * f(xs);
}*/
/**
 * Integrand evaluated for each slice: 4 / (1 + x^2).
 */
__device__ float fSlice(float x)
{
    const float denominator = 1.0f + x * x;
    return 4.0f / denominator;
}
/*----------------------------------------------------------------------*\
|* End *|
\*---------------------------------------------------------------------*/
#include <iostream>
#include "Grid.h"
#include "Device.h"
#include "Slice.h"
#include "LimitsTools.h"
#include "MathTools.h"
using std::cout;
using std::endl;
/*----------------------------------------------------------------------*\
|* Declaration *|
\*---------------------------------------------------------------------*/
/*--------------------------------------*\
|* Imported *|
\*-------------------------------------*/
/*--------------------------------------*\
|* Public *|
\*-------------------------------------*/
bool isSliceOK();
bool isSliceOK(const Grid& grid);
/*--------------------------------------*\
|* Private *|
\*-------------------------------------*/
/*----------------------------------------------------------------------*\
|* Implementation *|
\*---------------------------------------------------------------------*/
/*--------------------------------------*\
|* Public *|
\*-------------------------------------*/
/**
* On passse la grille \ufffd\ufffd useAddVecteur pour pouvoir facilement la faire varier de l'ext\ufffd\ufffdrieur pour trouver l'optimum, ou faire des tests avec des grilles diff\ufffd\ufffdrentes
* Contrainte : grid : puissance de 2 pour etre compatible avec la version naivePlus
*/
bool isSliceOK(const Grid& grid)
{
int nbSlice = 2000000;
float piHat;
Slice slice(grid, nbSlice, &piHat);
slice.run();
bool isOk = MathTools::isEquals((double) piHat, PI, 1e-4);
return isOk;
}
bool isSliceOK()
{
//int mp = Device::getMPCount();
int coreMP = Device::getCoreCountMP();
dim3 dg = dim3(2, 1, 1); // disons, a optimiser selon le gpu, peut drastiqument ameliorer ou baisser les performances
dim3 db = dim3(4, 1, 1); // disons, a optimiser selon le gpu, peut drastiqument ameliorer ou baisser les performances
Grid grid(dg, db); // puissance de 2 pour etre compatible avec la version naivePlus
return isSliceOK(grid);
}
/*--------------------------------------*\
|* Private *|
\*-------------------------------------*/
/*----------------------------------------------------------------------*\
|* End *|
\*---------------------------------------------------------------------*/
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment