Created
October 21, 2015 16:25
-
-
Save gigablah/1c72acbe718844310b10 to your computer and use it in GitHub Desktop.
Parallel implementation of the Gaussian elimination algorithm for the STI Cell
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdio.h>
#include <string.h>

#include <spu_intrinsics.h>
#include <spu_mfcio.h>

#include "le_spe.h"
// Local store structures and buffers.
// All of these are DMA targets/sources in main(), hence the 16-byte alignment.
volatile parm_context ctx __attribute__ ((aligned(16)));   // context fetched from the address passed as `parm`
volatile float a[BLOCKSIZE] __attribute__ ((aligned(16))); // active (pivot) row block
volatile float x[BLOCKSIZE] __attribute__ ((aligned(16))); // target row block, updated in place and written back
volatile vector float b __attribute__ ((aligned(16)));     // active right side (only element 0 is used)
volatile vector float y __attribute__ ((aligned(16)));     // target right side (only element 0 is updated)
int main(unsigned long long spu_id __attribute__ ((unused)), unsigned long long parm) { | |
unsigned int tag_id; | |
unsigned int part, offset, moffset, i, j, k, l; | |
unsigned long long time1, time2, nb_ticks; | |
float ratio __attribute__ ((aligned(16))); | |
float elapsed; | |
//vector float *va = (vector float *) &a[0]; | |
//vector float *vx = (vector float *) &x[0]; | |
//vector float vr; | |
// Reserve a tag for application usage | |
if ((tag_id = mfc_tag_reserve()) == MFC_TAG_INVALID) { | |
printf("ERROR: unable to reserve a tag\n"); | |
return 1; | |
} | |
spu_writech(MFC_WrTagMask, -1); | |
// Fetch the context. Wait for the DMA to complete | |
spu_mfcdma32((void *)(&ctx), (unsigned int)parm, sizeof(parm_context), tag_id, MFC_GET_CMD); | |
(void)spu_mfcstat(MFC_TAG_UPDATE_ALL); | |
part = ctx.partition; | |
i = spu_read_in_mbox(); | |
// printf("%d\t", cur); fflush(stdout); | |
time2 = TIMEBASE*5; | |
spu_write_decrementer(time2); | |
offset = 0; | |
while (i < N) { | |
moffset = i - i%4; | |
//for(j=0;j<N;j++) { | |
// printf("%d:%.2f\t", i, b[0]); | |
//} | |
// Right side of active row only needs to be fetched once | |
spu_mfcdma32(&b, (unsigned int)(ctx.ptr_b+i), sizeof(vector float), tag_id, MFC_GET_CMD); | |
(void)spu_mfcstat(MFC_TAG_UPDATE_ALL); | |
// forward elimination | |
for (j=i+1+part; j<N; j+=MAX_SPE_THREADS) { | |
// Fetch initial data blocks for active and target row. Wait for the DMA to complete | |
spu_mfcdma32((void *)a, (unsigned int)(ctx.ptr_a+N*i+offset), BLOCKSIZE*sizeof(float), tag_id, MFC_GET_CMD); | |
spu_mfcdma32((void *)x, (unsigned int)(ctx.ptr_a+N*j+offset), BLOCKSIZE*sizeof(float), tag_id, MFC_GET_CMD); | |
// Right side of target row is fetched once | |
spu_mfcdma32(&y, (unsigned int)(ctx.ptr_b+j), sizeof(vector float), tag_id, MFC_GET_CMD); | |
(void)spu_mfcstat(MFC_TAG_UPDATE_ALL); | |
ratio = -x[i%BLOCKSIZE] / a[i%BLOCKSIZE]; // initialize ratio | |
//vr = spu_splats(ratio); // initialize ratio vector | |
y[0] += ratio * b[0]; // right side | |
// Do initial block | |
// row manipulation - scalar | |
for (k=i; k<BLOCKSIZE; k++) { | |
x[k] = ratio * a[k] + x[k]; | |
} | |
//for (k=moffset/4; k<BLOCKSIZE/4; k++) { | |
// vx[k] = spu_madd(va[k], vr, vx[k]); | |
//} | |
// Put initial block back into system memory | |
spu_mfcdma32((void *)x, (unsigned int)(ctx.ptr_a+N*j+offset), BLOCKSIZE*sizeof(float), tag_id, MFC_PUT_CMD); | |
spu_mfcdma32(&y, (unsigned int)(ctx.ptr_b+j), sizeof(vector float), tag_id, MFC_PUT_CMD); | |
// Fetch and manipulate the remaining blocks | |
/* for (l=offset+BLOCKSIZE; l<N; l+=BLOCKSIZE) { | |
// Fetch next data blocks for current and target row. Wait for the DMA to complete | |
spu_mfcdma32((void *)a, (unsigned int)(ctx.ptr_a+N*i+l), BLOCKSIZE*sizeof(float), tag_id, MFC_GET_CMD); | |
spu_mfcdma32((void *)x, (unsigned int)(ctx.ptr_a+N*j+l), BLOCKSIZE*sizeof(float), tag_id, MFC_GET_CMD); | |
(void)spu_mfcstat(MFC_TAG_UPDATE_ALL); | |
// row manipulation - scalar | |
//for (k=i; k<N; k++) { | |
// x[k] = ratio * a[k] + x[k]; | |
//} | |
// row manipulation - SIMDized | |
for (k=0; k<BLOCKSIZE/4; k++) { | |
vx[k] = spu_madd(va[k], vr, vx[k]); | |
} | |
// Put the data back into system memory | |
spu_mfcdma32((void *)x, (unsigned int)(ctx.ptr_a+N*j+l), BLOCKSIZE*sizeof(float), tag_id, MFC_PUT_CMD); | |
}*/ | |
} | |
// Current task finished, notify PPE | |
spu_write_out_mbox(1); | |
// Grab next row to work on | |
i = spu_read_in_mbox(); | |
} | |
time1 = spu_read_decrementer(); | |
nb_ticks = time2 - time1; | |
elapsed = nb_ticks*1.0 / (TIMEBASE*1.0); | |
printf("%d:%f\n", part, elapsed); | |
return (0); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment