JamesMenetrey · November 29, 2018 16:34
diff --git a/README.md b/README.md
diff --git a/Slice.cu b/Slice.cu
 #include "Slice.h"

 #include <iostream>
 #include <assert.h>

 #include "Device.h"

 using std::cout;
 using std::endl;

 /*----------------------------------------------------------------------*\
 |*			Declaration 					*|
 \*---------------------------------------------------------------------*/

 /*--------------------------------------*\
 |*		Imported	 	*|
 \*-------------------------------------*/

 extern __global__ void slice(float* tabGM, int nbSlice);

 /*----------------------------------------------------------------------*\
 |*			Implementation 					*|
 \*---------------------------------------------------------------------*/

 /*--------------------------------------*\
 |*		Constructeur			*|
 \*-------------------------------------*/

 /**
  * Stores the launch configuration and allocates the one-float accumulator on the device.
  *
  * grid     : launch configuration; its dg and db fields are copied into this object.
  * nbSlice  : number of slices; forwarded to the kernel and used as divisor in run().
  * ptrPiHat : location that run() writes its result to; only the pointer is stored here.
  */
 Slice::Slice(const Grid& grid, int nbSlice, float* ptrPiHat) :
 		// Listed in declaration order (nbSlice, ptrPiHat, tabGM, sizeTabGM) so that the
 		// written order is the order in which the members are really initialised.
 		nbSlice(nbSlice), ptrPiHat(ptrPiHat), tabGM(NULL), sizeTabGM(sizeof(float))
 {
 	// Grid
 	{
 		this->dg = grid.dg;
 		this->db = grid.db;
 	}

 	// MM: a single float, cleared so that the accumulation starts from zero.
 	{
 		Device::malloc(&tabGM, sizeTabGM);
 		Device::memclear(tabGM, sizeTabGM);
 	}
 }

 /**
  * Releases the device buffer that the constructor allocated for tabGM.
  */
 Slice::~Slice(void)
 {
 	Device::free(this->tabGM);
 }

 /*--------------------------------------*\
 |*		Methode			*|
 \*-------------------------------------*/

 /**
  * Launches the slice kernel with the stored grid, copies the accumulated sum back
  * from tabGM and writes sum / nbSlice to *ptrPiHat.
  * Prints the shared-memory size and the resulting value on stdout.
  */
 void Slice::run()
 {
 	// As every thread must have a cell in the SM, the number of cells in the SM
 	// must be equal to the number of threads of a block.
 	const size_t sizeSM = this->db.x * this->db.y * sizeof(float);
 	// %zu is the conversion for size_t; passing a size_t to %d is undefined behaviour.
 	printf("sizeSM: %zu\n", sizeSM);

 	slice<<<dg, db, sizeSM>>>(tabGM, nbSlice);

 	// A kernel launch returns nothing: an invalid configuration is only reported
 	// through cudaGetLastError(), so query it right after the launch.
 	const cudaError_t launchStatus = cudaGetLastError();
 	if (launchStatus != cudaSuccess)
 	{
 		std::cerr << "slice kernel launch failed: " << cudaGetErrorString(launchStatus) << endl;
 	}

 	// Initialised so that a failed copy never leaves an indeterminate value behind.
 	float sum = 0.0f;
 	Device::memcpyDToH(&sum, tabGM, sizeof(float));

 	const float piHat = sum / nbSlice;
 	printf("=> PI = %f\n", piHat);

 	*(this->ptrPiHat) = piHat;
 }

 /*--------------------------------------*\
 |*		Private			*|
 \*-------------------------------------*/

 /*----------------------------------------------------------------------*\
 |*			End	 					*|
 \*---------------------------------------------------------------------*/
diff --git a/Slice.h b/Slice.h
 #pragma once

 #include "cudaTools.h"
 #include "Grid.h"


 /*----------------------------------------------------------------------*\
 |*			Declaration 					*|
 \*---------------------------------------------------------------------*/

 /*--------------------------------------*\
 |*		Public			*|
 \*-------------------------------------*/

 /**
  * Host-side driver of the slice kernel.
  * The grid is passed to Slice so that it can easily be varied from the outside, either
  * to look for the optimum or to run tests with different grids.
  *
  * An instance holds a raw device pointer that its destructor frees. Copying is
  * therefore disabled: two copies would free the same device buffer twice.
  */
 class Slice
    {
 	/*--------------------------------------*\
 	|*		Constructor		*|
 	 \*-------------------------------------*/

    public:

 	/**
 	 * update piHat
 	 * Hyp : the number of threads is a power of 2
 	 */
 	Slice(const Grid& grid,int nbSlice, float* ptrPiHat);

 	virtual ~Slice(void);

    private:

 	// Non-copyable: declared but intentionally never defined.
 	Slice(const Slice&);
 	Slice& operator=(const Slice&);

 	/*--------------------------------------*\
 	|*		Methodes		*|
 	 \*-------------------------------------*/

    public:

 	void run();

 	/*--------------------------------------*\
 	|*		Attributs		*|
 	 \*-------------------------------------*/

    private:

 	// Inputs
 	dim3 dg;
 	dim3 db;
 	int nbSlice;

 	// Inputs/Outputs
 	float* ptrPiHat;

 	// Tools
 	float* tabGM; // array promoted to GM (device global memory)
 	size_t sizeTabGM;// [bytes]

    };

 /*----------------------------------------------------------------------*\
 |*			End	 					*|
 \*---------------------------------------------------------------------*/
diff --git a/slice_device.cu b/slice_device.cu
 #include "Indice2D.h"
 #include "Indice1D.h"
 #include "cudaTools.h"
 #include "reductionAddTools.h"

 #include <stdio.h>

 /*----------------------------------------------------------------------*\
 |*			Declaration 					*|
 \*---------------------------------------------------------------------*/

 /*--------------------------------------*\
 |*		Imported	 	*|
 \*-------------------------------------*/

 /*--------------------------------------*\
 |*		Public			*|
 \*-------------------------------------*/

 __device__ void reductionIntraThreadSlice(float* tabSM, int nbSlice);

 /*--------------------------------------*\
 |*		Private			*|
 \*-------------------------------------*/

 //static __device__ float aireRectangle(float xs, int nbSlice);
 static __device__ float fSlice(float x);

 /*----------------------------------------------------------------------*\
 |*			Implementation 					*|
 \*---------------------------------------------------------------------*/

 /*--------------------------------------*\
 |*		Public			*|
 \*-------------------------------------*/

 /**
  * Entry kernel: every thread stores its partial sum in its own cell of the dynamic
  * shared array, then ReductionAddTools::reductionADD is called on that array and tabGM.
  *
  * Launch requirement: the dynamic shared memory must provide one float per thread of
  * a block (cells are indexed with Indice2D::tidLocal()).
  *
  * tabGM   : device pointer handed to reductionADD
  * nbSlice : number of slices to accumulate over
  */
 __global__ void slice(float* tabGM, int nbSlice)
 {
 	extern __shared__ float tabSM[];

 	reductionIntraThreadSlice(tabSM, nbSlice);

 	// Each thread has just written its own cell. Publish those writes to the whole
 	// block before any thread reads a cell written by another one. Every thread of
 	// the block reaches this line (there is no early exit above), so the barrier
 	// cannot deadlock. NOTE(review): reductionADD may already synchronise on entry;
 	// that is not visible from this file, and a redundant barrier is harmless.
 	__syncthreads();

 	ReductionAddTools tools;
 	tools.reductionADD(tabSM, tabGM);
 }

 /**
  * Each thread performs its own reduction with the interleaving pattern: it adds
  * fSlice(s / nbSlice) for s = TID, TID + NB_THREAD, ... while s < nbSlice, then stores
  * the result in ITS cell of tabSM.
  *
  * tabSM   : shared array with at least one cell per thread of the block
  * nbSlice : number of slices
  *
  * Define SLICE_DEBUG_TRACE at compile time to print one line per thread.
  */
 __device__ void reductionIntraThreadSlice(float* tabSM, int nbSlice)
 {
 	// NOTE(review): presumably tid() is the thread id within the grid, tidLocal() the id
 	// within the block and nbThread() the thread count of the grid - confirm in Indice2D.
 	const int TID = Indice2D::tid();
 	const int TID_LOCALE = Indice2D::tidLocal();
 	const int NB_THREAD = Indice2D::nbThread();

 	// Device-side printf runs once per thread, so it is kept out of normal builds.
 #ifdef SLICE_DEBUG_TRACE
 	printf("I'm thread TID_LOCALE %d\n", TID_LOCALE);
 #endif

 	// 0.0f, not 0.0: keeps the accumulator initialisation in single precision.
 	float sum = 0.0f;
 	for (int s = TID; s < nbSlice; s += NB_THREAD)
 	{
 		const float xs = s / (float) nbSlice;
 		sum += fSlice(xs);
 	}

 	// Only save the sum in the tab once for performance reasons.
 	// Indeed, using a temporary var can be optimized by the compiler to
 	// use a processor register.
 	tabSM[TID_LOCALE] = sum;
 }

 /*--------------------------------------*\
 |*		Private			*|
 \*-------------------------------------*/

 /*-----------------------*\
 | Reduction Intrathread  *|
 \*-----------------------*/

 /*__device__ float aireRectangle(float xs, int nbSlice)
 {
 return (1 / nbSlice) * f(xs);
 }*/

 /**
  * Function being accumulated: 4 / (1 + x^2), evaluated in single precision.
  */
 __device__ float fSlice(float x)
 {
 	const float denominator = 1.0f + x * x;
 	return 4.0f / denominator;
 }

 /*----------------------------------------------------------------------*\
 |*			End	 					*|
 \*---------------------------------------------------------------------*/

diff --git a/useSlice.cpp b/useSlice.cpp
 #include <iostream>
 #include "Grid.h"
 #include "Device.h"
 #include "Slice.h"
 #include "LimitsTools.h"
 #include "MathTools.h"

 using std::cout;
 using std::endl;

 /*----------------------------------------------------------------------*\
 |*			Declaration 					*|
 \*---------------------------------------------------------------------*/

 /*--------------------------------------*\
 |*		Imported	 	*|
 \*-------------------------------------*/

 /*--------------------------------------*\
 |*		Public			*|
 \*-------------------------------------*/

 bool isSliceOK();
 bool isSliceOK(const Grid& grid);

 /*--------------------------------------*\
 |*		Private			*|
 \*-------------------------------------*/

 /*----------------------------------------------------------------------*\
 |*			Implementation 					*|
 \*---------------------------------------------------------------------*/

 /*--------------------------------------*\
 |*		Public			*|
 \*-------------------------------------*/

 /**
 * On passse la grille \ufffd\ufffd useAddVecteur pour pouvoir facilement la faire varier de l'ext\ufffd\ufffdrieur pour trouver l'optimum, ou faire des tests avec des grilles diff\ufffd\ufffdrentes
 * Contrainte : grid :  puissance de 2 pour etre compatible avec la version naivePlus
 */
 bool isSliceOK(const Grid& grid)
    {
    int nbSlice = 2000000;
    float piHat;

    Slice slice(grid, nbSlice, &piHat);
    slice.run();

    bool isOk = MathTools::isEquals((double) piHat, PI, 1e-4);

    return isOk;
    }

 /**
  * Runs the Slice check with a default grid of 2 blocks of 4 threads.
  */
 bool isSliceOK()
    {
    // Placeholder values, to be tuned for the GPU: they can drastically improve or
    // degrade performance.
    dim3 dg(2, 1, 1);
    dim3 db(4, 1, 1);
    Grid grid(dg, db); // power of 2, to stay compatible with the naivePlus version

    return isSliceOK(grid);
    }

 /*--------------------------------------*\
 |*		Private			*|
 \*-------------------------------------*/

 /*----------------------------------------------------------------------*\
 |*			End	 					*|
 \*---------------------------------------------------------------------*/
	#include "Slice.h"

	#include <iostream>
	#include <assert.h>

	#include "Device.h"

	using std::cout;
	using std::endl;

	/----------------------------------------------------------------------\
	\|* Declaration *\|
	\---------------------------------------------------------------------/

	/--------------------------------------\
	\|* Imported *\|
	\-------------------------------------/

	extern __global__ void slice(float* tabGM, int nbSlice);

	/----------------------------------------------------------------------\
	\|* Implementation *\|
	\---------------------------------------------------------------------/

	/--------------------------------------\
	\|* Constructeur *\|
	\-------------------------------------/

	/**
	 * Stores the launch configuration and allocates the one-float accumulator on the device.
	 *
	 * grid     : launch configuration; its dg and db fields are copied into this object.
	 * nbSlice  : number of slices; forwarded to the kernel and used as divisor in run().
	 * ptrPiHat : location that run() writes its result to; only the pointer is stored here.
	 */
	Slice::Slice(const Grid& grid, int nbSlice, float* ptrPiHat) :
			// Listed in declaration order (nbSlice, ptrPiHat, tabGM, sizeTabGM) so that the
			// written order is the order in which the members are really initialised.
			nbSlice(nbSlice), ptrPiHat(ptrPiHat), tabGM(NULL), sizeTabGM(sizeof(float))
	{
		// Grid
		{
			this->dg = grid.dg;
			this->db = grid.db;
		}

		// MM: a single float, cleared so that the accumulation starts from zero.
		{
			Device::malloc(&tabGM, sizeTabGM);
			Device::memclear(tabGM, sizeTabGM);
		}
	}

	/**
	 * Releases the device buffer that the constructor allocated for tabGM.
	 */
	Slice::~Slice(void)
	{
		Device::free(this->tabGM);
	}

	/--------------------------------------\
	\|* Methode *\|
	\-------------------------------------/

	/**
	 * Launches the slice kernel with the stored grid, copies the accumulated sum back
	 * from tabGM and writes sum / nbSlice to *ptrPiHat.
	 * Prints the shared-memory size and the resulting value on stdout.
	 */
	void Slice::run()
	{
		// As every thread must have a cell in the SM, the number of cells in the SM
		// must be equal to the number of threads of a block.
		const size_t sizeSM = this->db.x * this->db.y * sizeof(float);
		// %zu is the conversion for size_t; passing a size_t to %d is undefined behaviour.
		printf("sizeSM: %zu\n", sizeSM);

		slice<<<dg, db, sizeSM>>>(tabGM, nbSlice);

		// A kernel launch returns nothing: an invalid configuration is only reported
		// through cudaGetLastError(), so query it right after the launch.
		const cudaError_t launchStatus = cudaGetLastError();
		if (launchStatus != cudaSuccess)
		{
			std::cerr << "slice kernel launch failed: " << cudaGetErrorString(launchStatus) << endl;
		}

		// Initialised so that a failed copy never leaves an indeterminate value behind.
		float sum = 0.0f;
		Device::memcpyDToH(&sum, tabGM, sizeof(float));

		const float piHat = sum / nbSlice;
		printf("=> PI = %f\n", piHat);

		*(this->ptrPiHat) = piHat;
	}

	/--------------------------------------\
	\|* Private *\|
	\-------------------------------------/

	/----------------------------------------------------------------------\
	\|* End *\|
	\---------------------------------------------------------------------/
	#pragma once

	#include "cudaTools.h"
	#include "Grid.h"


	/----------------------------------------------------------------------\
	\|* Declaration *\|
	\---------------------------------------------------------------------/

	/--------------------------------------\
	\|* Public *\|
	\-------------------------------------/

	/**
	 * Host-side driver of the slice kernel.
	 * The grid is passed to Slice so that it can easily be varied from the outside, either
	 * to look for the optimum or to run tests with different grids.
	 *
	 * An instance holds a raw device pointer that its destructor frees. Copying is
	 * therefore disabled: two copies would free the same device buffer twice.
	 */
	class Slice
	{
		/*--------------------------------------*\
		|*		Constructor		*|
		 \*-------------------------------------*/

	public:

		/**
		 * update piHat
		 * Hyp : the number of threads is a power of 2
		 */
		Slice(const Grid& grid,int nbSlice, float* ptrPiHat);

		virtual ~Slice(void);

	private:

		// Non-copyable: declared but intentionally never defined.
		Slice(const Slice&);
		Slice& operator=(const Slice&);

		/*--------------------------------------*\
		|*		Methodes		*|
		 \*-------------------------------------*/

	public:

		void run();

		/*--------------------------------------*\
		|*		Attributs		*|
		 \*-------------------------------------*/

	private:

		// Inputs
		dim3 dg;
		dim3 db;
		int nbSlice;

		// Inputs/Outputs
		float* ptrPiHat;

		// Tools
		float* tabGM; // array promoted to GM (device global memory)
		size_t sizeTabGM;// [bytes]

	};

	/----------------------------------------------------------------------\
	\|* End *\|
	\---------------------------------------------------------------------/
	#include "Indice2D.h"
	#include "Indice1D.h"
	#include "cudaTools.h"
	#include "reductionAddTools.h"

	#include <stdio.h>

	/----------------------------------------------------------------------\
	\|* Declaration *\|
	\---------------------------------------------------------------------/

	/--------------------------------------\
	\|* Imported *\|
	\-------------------------------------/

	/--------------------------------------\
	\|* Public *\|
	\-------------------------------------/

	__device__ void reductionIntraThreadSlice(float* tabSM, int nbSlice);

	/--------------------------------------\
	\|* Private *\|
	\-------------------------------------/

	//static __device__ float aireRectangle(float xs, int nbSlice);
	static __device__ float fSlice(float x);

	/----------------------------------------------------------------------\
	\|* Implementation *\|
	\---------------------------------------------------------------------/

	/--------------------------------------\
	\|* Public *\|
	\-------------------------------------/

	/**
	 * Entry kernel: every thread stores its partial sum in its own cell of the dynamic
	 * shared array, then ReductionAddTools::reductionADD is called on that array and tabGM.
	 *
	 * Launch requirement: the dynamic shared memory must provide one float per thread of
	 * a block (cells are indexed with Indice2D::tidLocal()).
	 *
	 * tabGM   : device pointer handed to reductionADD
	 * nbSlice : number of slices to accumulate over
	 */
	__global__ void slice(float* tabGM, int nbSlice)
	{
		extern __shared__ float tabSM[];

		reductionIntraThreadSlice(tabSM, nbSlice);

		// Each thread has just written its own cell. Publish those writes to the whole
		// block before any thread reads a cell written by another one. Every thread of
		// the block reaches this line (there is no early exit above), so the barrier
		// cannot deadlock. NOTE(review): reductionADD may already synchronise on entry;
		// that is not visible from this file, and a redundant barrier is harmless.
		__syncthreads();

		ReductionAddTools tools;
		tools.reductionADD(tabSM, tabGM);
	}

	/**
	 * Each thread performs its own reduction with the interleaving pattern: it adds
	 * fSlice(s / nbSlice) for s = TID, TID + NB_THREAD, ... while s < nbSlice, then stores
	 * the result in ITS cell of tabSM.
	 *
	 * tabSM   : shared array with at least one cell per thread of the block
	 * nbSlice : number of slices
	 *
	 * Define SLICE_DEBUG_TRACE at compile time to print one line per thread.
	 */
	__device__ void reductionIntraThreadSlice(float* tabSM, int nbSlice)
	{
		// NOTE(review): presumably tid() is the thread id within the grid, tidLocal() the id
		// within the block and nbThread() the thread count of the grid - confirm in Indice2D.
		const int TID = Indice2D::tid();
		const int TID_LOCALE = Indice2D::tidLocal();
		const int NB_THREAD = Indice2D::nbThread();

		// Device-side printf runs once per thread, so it is kept out of normal builds.
	#ifdef SLICE_DEBUG_TRACE
		printf("I'm thread TID_LOCALE %d\n", TID_LOCALE);
	#endif

		// 0.0f, not 0.0: keeps the accumulator initialisation in single precision.
		float sum = 0.0f;
		for (int s = TID; s < nbSlice; s += NB_THREAD)
		{
			const float xs = s / (float) nbSlice;
			sum += fSlice(xs);
		}

		// Only save the sum in the tab once for performance reasons.
		// Indeed, using a temporary var can be optimized by the compiler to
		// use a processor register.
		tabSM[TID_LOCALE] = sum;
	}

	/--------------------------------------\
	\|* Private *\|
	\-------------------------------------/

	/-----------------------\
	\| Reduction Intrathread *\|
	\-----------------------/

	/*__device__ float aireRectangle(float xs, int nbSlice)
	{
	return (1 / nbSlice) * f(xs);
	}*/

	/**
	 * Function being accumulated: 4 / (1 + x^2), evaluated in single precision.
	 */
	__device__ float fSlice(float x)
	{
		const float denominator = 1.0f + x * x;
		return 4.0f / denominator;
	}

	/----------------------------------------------------------------------\
	\|* End *\|
	\---------------------------------------------------------------------/