Skip to content

Instantly share code, notes, and snippets.

@gigablah
Created October 21, 2015 16:25
Show Gist options
  • Save gigablah/1c72acbe718844310b10 to your computer and use it in GitHub Desktop.
Save gigablah/1c72acbe718844310b10 to your computer and use it in GitHub Desktop.
Parallel implementation of the Gaussian elimination algorithm for the STI Cell
#include <spu_intrinsics.h>
#include <spu_mfcio.h>
#include <stdio.h>
#include <string.h>
#include "le_spe.h"
// Local store structures and buffers.
// Every object below is 16-byte aligned and is used by main() as the
// local-store side of spu_mfcdma32() transfers.
volatile parm_context ctx __attribute__ ((aligned(16))); // context fetched once from the address passed in `parm`
volatile float a[BLOCKSIZE] __attribute__ ((aligned(16))); // active row: one BLOCKSIZE-float block of row i
volatile float x[BLOCKSIZE] __attribute__ ((aligned(16))); // target row: one BLOCKSIZE-float block of row j, updated and written back
volatile vector float b __attribute__ ((aligned(16))); // active right side; transferred as a whole vector, only element 0 is used
volatile vector float y __attribute__ ((aligned(16))); // target right side; element 0 is updated, whole vector is written back
/*
 * SPU entry point: forward-elimination worker.
 *
 * Parameters:
 *   spu_id - unused.
 *   parm   - address of the parm_context in main memory; it is DMA'd into
 *            `ctx` before any work starts.
 *
 * Work loop, as implemented below:
 *   1. Read the active row index i from the inbound mailbox.
 *   2. For every target row j = i+1+part, i+1+part+MAX_SPE_THREADS, ... < N:
 *      fetch one block of row i and row j plus their right-hand sides,
 *      compute ratio = -x[i] / a[i], apply x[k] += ratio * a[k] and
 *      y += ratio * b, and write row j and its right-hand side back.
 *   3. Write 1 to the outbound mailbox and read the next i.
 *   The loop ends as soon as an index >= N is received.
 *
 * On exit the partition number and the elapsed decrementer ticks divided
 * by TIMEBASE are printed as "<part>:<elapsed>".
 *
 * NOTE(review): `offset` is never advanced and the loop over the remaining
 * blocks of a row (plus its SIMD spu_madd variant) is not implemented, so
 * only the first BLOCKSIZE columns of each row are processed. Presumably
 * this is only correct while N <= BLOCKSIZE -- confirm against le_spe.h.
 * NOTE(review): there is no pivoting; a zero a[i] yields a division by zero.
 *
 * Returns 0 on success, 1 if no MFC tag could be reserved.
 */
int main(unsigned long long spu_id __attribute__ ((unused)), unsigned long long parm) {
    unsigned int tag_id;
    unsigned int part, offset, i, j, k;
    unsigned long long time1, time2, nb_ticks;
    float ratio __attribute__ ((aligned(16)));
    float elapsed;

    // Reserve a tag for application usage
    if ((tag_id = mfc_tag_reserve()) == MFC_TAG_INVALID) {
        printf("ERROR: unable to reserve a tag\n");
        return 1;
    }
    // Select every tag group so that spu_mfcstat() waits on all of them.
    spu_writech(MFC_WrTagMask, -1);

    // Fetch the context. Wait for the DMA to complete
    spu_mfcdma32((void *)(&ctx), (unsigned int)parm, sizeof(parm_context), tag_id, MFC_GET_CMD);
    (void)spu_mfcstat(MFC_TAG_UPDATE_ALL);
    part = ctx.partition;

    // First row to work on is handed over by the PPE.
    i = spu_read_in_mbox();

    // Load the decrementer with a known start value; the elapsed tick
    // count is the difference to the value read back after the loop.
    time2 = TIMEBASE*5;
    spu_write_decrementer(time2);

    offset = 0;
    while (i < N) {
        // Right side of active row only needs to be fetched once
        spu_mfcdma32(&b, (unsigned int)(ctx.ptr_b+i), sizeof(vector float), tag_id, MFC_GET_CMD);
        (void)spu_mfcstat(MFC_TAG_UPDATE_ALL);

        // Forward elimination: target rows are interleaved across the SPEs
        // by partition number.
        for (j=i+1+part; j<N; j+=MAX_SPE_THREADS) {
            // Fetch initial data blocks for active and target row,
            // plus the right side of the target row.
            spu_mfcdma32((void *)a, (unsigned int)(ctx.ptr_a+N*i+offset), BLOCKSIZE*sizeof(float), tag_id, MFC_GET_CMD);
            spu_mfcdma32((void *)x, (unsigned int)(ctx.ptr_a+N*j+offset), BLOCKSIZE*sizeof(float), tag_id, MFC_GET_CMD);
            spu_mfcdma32(&y, (unsigned int)(ctx.ptr_b+j), sizeof(vector float), tag_id, MFC_GET_CMD);
            // Wait for the DMA to complete
            (void)spu_mfcstat(MFC_TAG_UPDATE_ALL);

            // Multiplier that cancels column i of the target row.
            ratio = -x[i%BLOCKSIZE] / a[i%BLOCKSIZE];
            y[0] += ratio * b[0]; // right side

            // Row manipulation (scalar) on the initial block. Columns
            // below i are skipped.
            for (k=i; k<BLOCKSIZE; k++) {
                x[k] = ratio * a[k] + x[k];
            }

            // Put initial block back into system memory
            spu_mfcdma32((void *)x, (unsigned int)(ctx.ptr_a+N*j+offset), BLOCKSIZE*sizeof(float), tag_id, MFC_PUT_CMD);
            spu_mfcdma32(&y, (unsigned int)(ctx.ptr_b+j), sizeof(vector float), tag_id, MFC_PUT_CMD);
            // Wait for the PUTs to drain. Without this the next iteration's
            // unfenced GETs may overwrite x and y while they are still being
            // read, and the PPE could be told the row is finished before
            // the results have reached main memory.
            (void)spu_mfcstat(MFC_TAG_UPDATE_ALL);
        }

        // Current task finished, notify PPE
        spu_write_out_mbox(1);
        // Grab next row to work on
        i = spu_read_in_mbox();
    }

    // The start value minus the current decrementer value is the number
    // of ticks spent in the loop.
    time1 = spu_read_decrementer();
    nb_ticks = time2 - time1;
    elapsed = nb_ticks*1.0 / (TIMEBASE*1.0);
    // part is unsigned, so it is printed with %u.
    printf("%u:%f\n", part, elapsed);
    return (0);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment