pratikone · July 26, 2016 17:07
diff --git a/knapsack.c b/knapsack.c
 // The code is based on needle.c for Needleman-Wunsch algorithm at
 // https://github.com/vtsynergy/OpenDwarfs/blob/master/dynamic-programming/nw/needle.c

 #define LIMIT -999
 #include <stdlib.h>
 #include <stdio.h>
 #include <string.h>
 #include <math.h>
 #include "needle.h"
 #include <sys/time.h>
 #include "../../include/rdtsc.h"
 #include "../../include/common_args.h"
 #include <malloc.h>
 #define AOCL_ALIGNMENT 64


 //#define TRACE

 void runTest( int argc, char** argv);


 double gettime() {
 	struct timeval t;
 	gettimeofday(&t,NULL);
 	return t.tv_sec+t.tv_usec*1e-6;
 }

 int 
 maximum( int a,
 		int b,
 		int c){

 	int k;
 	if( a <= b )
 		k = b;
 	else 
 		k = a;

 	if( k <=c )
 		return(c);
 	else
 		return(k);

 }

 ////////////////////////////////////////////////////////////////////////////////
 // Program main
 ////////////////////////////////////////////////////////////////////////////////
 	int
 main( int argc, char** argv) 
 {
 	ocd_init(&argc, &argv, NULL);
 	ocd_initCL();
 	runTest( argc, argv);
 	ocd_finalize();
 	return EXIT_SUCCESS;
 }

 void usage(int argc, char **argv)
 {
 	fprintf(stderr, "Usage: %s <max_rows/max_cols> <penalty> \n", argv[0]);
 	fprintf(stderr, "\t<dimension>  - x and y dimensions\n");
 	fprintf(stderr, "\t<penalty> - penalty(positive integer)\n");
 	exit(1);
 }

 void runTest( int argc, char** argv) 
 {
 	int max_rows, max_cols, penalty;
 	int *input_itemsets, *weigths, *values;
 	cl_mem matrix_cuda,weigths_cuda, values_cuda;
 	int size;
 	int i, j;

 	cl_int errcode;

 	// the lengths of the two sequences should be able to divided by 16.
 	// And at current stage  max_rows needs to equal max_cols
 	if (argc == 3)
 	{
 		max_rows = atoi(argv[1]);    //no of items
 		max_cols = atoi(argv[2]);    // maximum capacity , column range will be W
 	}
 	else{
 		usage(argc, argv);
 	}

 	if(atoi(argv[1])%16!=0){
 		fprintf(stderr,"The dimension values must be a multiple of 16\n");
 		exit(1);
 	}

 	max_rows = max_rows + 1;
 	max_cols = max_cols + 1;
    if(_deviceType == 3) {
 	    weigths      = (int *)memalign(AOCL_ALIGNMENT, max_rows * sizeof(int) );
 	    values      = (int *)memalign(AOCL_ALIGNMENT, max_rows * sizeof(int) );
 	    input_itemsets  = (int *)memalign(AOCL_ALIGNMENT, max_rows * max_cols * sizeof(int) );

    } else { 
        weigths      = (int *)malloc( max_rows * sizeof(int) );
        values      = (int *)malloc( max_rows * sizeof(int) );
 	    input_itemsets  = (int *)malloc(max_rows * max_cols * sizeof(int) );
    }

 	if (!input_itemsets)
 		fprintf(stderr, "error: can not allocate memory");

 	srand ( 7 );   //TODO
 	//Initialize to zero 
 	for (i = 0 ; i < max_cols; i++){
 		for (j = 0 ; j < max_rows; j++){
 			input_itemsets[i*max_cols+j] = 0;
 		}
 	}

 	//Used for input itemsets, then after the weigths matrix is constructed it is used/rewritten to store scores.
 	for(i=0; i< max_rows ; i++){    //please define your own sequence. 
 		weigths[i] = rand() % 10;  
 		values[i] = rand() % 10;  
 	}


 	//cl_device_id device_id;
 	//cl_context context;
 	//cl_command_queue commands;
 	cl_program clProgram;
 	cl_kernel clKernel_nw1;
 	cl_kernel clKernel_nw2;


 	FILE *kernelFile;
 	char *kernelSource;
 	size_t kernelLength;
    char* kernel_files;
    int num_kernels = 1;
    kernel_files = (char*) malloc(sizeof(char*)*num_kernels);
 	strcpy(kernel_files,"knapsack_kernel");
    clProgram = ocdBuildProgramFromFile(context,device_id, kernel_files, NULL);

 	clKernel_nw1 = clCreateKernel(clProgram, "knapsack_opencl_shared_1", &errcode);
 	CHKERR(errcode, "Failed to create kernel!");
 	
 	size = max_cols * max_rows;
 	weigths_cuda = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(int)*max_rows, NULL, &errcode);
 	CHKERR(errcode, "Failed to create buffer!");
 	
 	values_cuda = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(int)*max_rows, NULL, &errcode);
 	CHKERR(errcode, "Failed to create buffer!");

 	matrix_cuda = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(int)*size, NULL, &errcode);
 	CHKERR(errcode, "Failed to create buffer!");

 	errcode = clEnqueueWriteBuffer(commands, weigths_cuda, CL_TRUE, 0, sizeof(int)*max_rows, (void *) weigths, 0, NULL, &ocdTempEvent);
 	clFinish(commands);
 	START_TIMER(ocdTempEvent, OCD_TIMER_H2D, "NW Reference Copy", ocdTempTimer)
 	END_TIMER(ocdTempTimer)
 	CHKERR(errcode, "Failed to enqueue write buffer!");

 	errcode = clEnqueueWriteBuffer(commands, values_cuda, CL_TRUE, 0, sizeof(int)*max_rows, (void *) values, 0, NULL, &ocdTempEvent);
 	clFinish(commands);
 	START_TIMER(ocdTempEvent, OCD_TIMER_H2D, "NW Reference Copy", ocdTempTimer)
 	END_TIMER(ocdTempTimer)
 	CHKERR(errcode, "Failed to enqueue write buffer!");




 	errcode = clEnqueueWriteBuffer(commands, matrix_cuda, CL_TRUE, 0, sizeof(int)*size, (void *) input_itemsets, 0, NULL, &ocdTempEvent);
 	clFinish(commands);
 	START_TIMER(ocdTempEvent, OCD_TIMER_H2D, "Knapsack table Set Copy", ocdTempTimer)
 	END_TIMER(ocdTempTimer)
 	CHKERR(errcode, "Failed to enqueue write buffer!");

 	size_t localWorkSize[2] = {BLOCK_SIZE, 1}; //BLOCK_SIZE work items per work-group in 1D only.
 	size_t globalWorkSize[2];
 	int block_width = ( max_cols - 1 )/BLOCK_SIZE;


 	
 	// Processing the whole Knapsack table row by row here
 	for(i = 1 ; i < max_rows ; i++){
 		globalWorkSize[0] = max_cols - 1; 
 		
 		globalWorkSize[1] = 1*localWorkSize[1];

 		//printf("%s %d %s %d\n", "running as wave front, work-group nos ", globalWorkSize[0]/localWorkSize[0] , "with globalWorkSize", globalWorkSize[0]);
 		errcode = clSetKernelArg(clKernel_nw1,  0, sizeof(cl_mem), (void *) &weigths_cuda);
 		errcode = clSetKernelArg(clKernel_nw1,  1, sizeof(cl_mem), (void *) &values_cuda);
 		errcode |= clSetKernelArg(clKernel_nw1, 2, sizeof(cl_mem), (void *) &matrix_cuda);
 		errcode |= clSetKernelArg(clKernel_nw1, 3, sizeof(int), (void *) &max_cols);
 		errcode |= clSetKernelArg(clKernel_nw1, 4, sizeof(int), (void *) &i);
 		CHKERR(errcode, "Failed to set kernel arguments!");
 		errcode = clEnqueueNDRangeKernel(commands, clKernel_nw1, 2, NULL, globalWorkSize, localWorkSize, 0, NULL, &ocdTempEvent);
 		clFinish(commands);
 		START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "NW Kernel nw1", ocdTempTimer)
 	 	END_TIMER(ocdTempTimer)
 		CHKERR(errcode, "Failed to enqueue kernel!");
 		
 	}


 	errcode = clEnqueueReadBuffer(commands, matrix_cuda, CL_TRUE, 0, sizeof(float)*size, (void *) input_itemsets, 0, NULL, &ocdTempEvent);
 	clFinish(commands);
 	START_TIMER(ocdTempEvent, OCD_TIMER_D2H, "knapsack table Set Copy", ocdTempTimer)
 	END_TIMER(ocdTempTimer)
 	CHKERR(errcode, "Failed to enqueue read buffer!");


 	for (i = 0; i < size; ++i)
 	{
 		printf("%d   ",  input_itemsets[ i ] );
 	}


 	clReleaseMemObject(weigths_cuda);
 	clReleaseMemObject(matrix_cuda);
 	clReleaseKernel(clKernel_nw1);
 	clReleaseProgram(clProgram);
 	clReleaseCommandQueue(commands);
 	clReleaseContext(context);

 }

diff --git a/knapsack_kernel.cl b/knapsack_kernel.cl
 #define BLOCK_SIZE 16

 int 
 maximum( int a,
 		int b){

 	if( a <= b )
 		return (b);
 	else 
 		return (a);
 }

 	__kernel void
 knapsack_opencl_shared_1(  __global int* weights,
 		__global int* values,
 		__global int* matrix_opencl,  
 		int cols,
 		int i)
 {
 	int bx = get_group_id(0);
 	int tx = get_local_id(0);


 	//printf("@@@ kernel group-id %d local-id %d\n", bx, tx);

 	int b_index_row = i;
 	int b_index_col = BLOCK_SIZE * bx + tx;  // j index in a W.G. 
 	int index   = b_index_row * cols + b_index_col;


 	int universal_row = ref_x/cols;
 	int universal_col = ref_x % cols;

 	if (weights[ b_index_row - 1 ] > universal_col )  //copy previous value if weight of item is more than current capacity
 		matrix_opencl[index] = matrix_opencl[index - cols] ;  	
 	
 	else{

 		matrix_opencl[index] = maximum( matrix_opencl[index - (cols + weights[b_index_row]  )] + values[b_index_row],
 											matrix_opencl[index - cols]);
 	}



 }
	// The code is based on needle.c for Needleman-Wunsch algorithm at
	// https://github.com/vtsynergy/OpenDwarfs/blob/master/dynamic-programming/nw/needle.c

	#define LIMIT -999
	#include <stdlib.h>
	#include <stdio.h>
	#include <string.h>
	#include <math.h>
	#include "needle.h"
	#include <sys/time.h>
	#include "../../include/rdtsc.h"
	#include "../../include/common_args.h"
	#include <malloc.h>
	#define AOCL_ALIGNMENT 64


	//#define TRACE

	void runTest( int argc, char** argv);


	double gettime() {
	struct timeval t;
	gettimeofday(&t,NULL);
	return t.tv_sec+t.tv_usec*1e-6;
	}

	int
	maximum( int a,
	int b,
	int c){

	int k;
	if( a <= b )
	k = b;
	else
	k = a;

	if( k <=c )
	return(c);
	else
	return(k);

	}

	////////////////////////////////////////////////////////////////////////////////
	// Program main
	////////////////////////////////////////////////////////////////////////////////
	int
	main( int argc, char** argv)
	{
	ocd_init(&argc, &argv, NULL);
	ocd_initCL();
	runTest( argc, argv);
	ocd_finalize();
	return EXIT_SUCCESS;
	}

	void usage(int argc, char **argv)
	{
	fprintf(stderr, "Usage: %s <max_rows/max_cols> <penalty> \n", argv[0]);
	fprintf(stderr, "\t<dimension> - x and y dimensions\n");
	fprintf(stderr, "\t<penalty> - penalty(positive integer)\n");
	exit(1);
	}

	void runTest( int argc, char** argv)
	{
	int max_rows, max_cols, penalty;
	int input_itemsets, weigths, *values;
	cl_mem matrix_cuda,weigths_cuda, values_cuda;
	int size;
	int i, j;

	cl_int errcode;

	// the lengths of the two sequences should be able to divided by 16.
	// And at current stage max_rows needs to equal max_cols
	if (argc == 3)
	{
	max_rows = atoi(argv[1]); //no of items
	max_cols = atoi(argv[2]); // maximum capacity , column range will be W
	}
	else{
	usage(argc, argv);
	}

	if(atoi(argv[1])%16!=0){
	fprintf(stderr,"The dimension values must be a multiple of 16\n");
	exit(1);
	}

	max_rows = max_rows + 1;
	max_cols = max_cols + 1;
	if(_deviceType == 3) {
	weigths = (int )memalign(AOCL_ALIGNMENT, max_rows sizeof(int) );
	values = (int )memalign(AOCL_ALIGNMENT, max_rows sizeof(int) );
	input_itemsets = (int )memalign(AOCL_ALIGNMENT, max_rows max_cols * sizeof(int) );

	} else {
	weigths = (int )malloc( max_rows sizeof(int) );
	values = (int )malloc( max_rows sizeof(int) );
	input_itemsets = (int )malloc(max_rows max_cols * sizeof(int) );
	}

	if (!input_itemsets)
	fprintf(stderr, "error: can not allocate memory");

	srand ( 7 ); //TODO
	//Initialize to zero
	for (i = 0 ; i < max_cols; i++){
	for (j = 0 ; j < max_rows; j++){
	input_itemsets[i*max_cols+j] = 0;
	}
	}

	//Used for input itemsets, then after the weigths matrix is constructed it is used/rewritten to store scores.
	for(i=0; i< max_rows ; i++){ //please define your own sequence.
	weigths[i] = rand() % 10;
	values[i] = rand() % 10;
	}


	//cl_device_id device_id;
	//cl_context context;
	//cl_command_queue commands;
	cl_program clProgram;
	cl_kernel clKernel_nw1;
	cl_kernel clKernel_nw2;


	FILE *kernelFile;
	char *kernelSource;
	size_t kernelLength;
	char* kernel_files;
	int num_kernels = 1;
	kernel_files = (char) malloc(sizeof(char)*num_kernels);
	strcpy(kernel_files,"knapsack_kernel");
	clProgram = ocdBuildProgramFromFile(context,device_id, kernel_files, NULL);

	clKernel_nw1 = clCreateKernel(clProgram, "knapsack_opencl_shared_1", &errcode);
	CHKERR(errcode, "Failed to create kernel!");

	size = max_cols * max_rows;
	weigths_cuda = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(int)*max_rows, NULL, &errcode);
	CHKERR(errcode, "Failed to create buffer!");

	values_cuda = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(int)*max_rows, NULL, &errcode);
	CHKERR(errcode, "Failed to create buffer!");

	matrix_cuda = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(int)*size, NULL, &errcode);
	CHKERR(errcode, "Failed to create buffer!");

	errcode = clEnqueueWriteBuffer(commands, weigths_cuda, CL_TRUE, 0, sizeof(int)max_rows, (void ) weigths, 0, NULL, &ocdTempEvent);
	clFinish(commands);
	START_TIMER(ocdTempEvent, OCD_TIMER_H2D, "NW Reference Copy", ocdTempTimer)
	END_TIMER(ocdTempTimer)
	CHKERR(errcode, "Failed to enqueue write buffer!");

	errcode = clEnqueueWriteBuffer(commands, values_cuda, CL_TRUE, 0, sizeof(int)max_rows, (void ) values, 0, NULL, &ocdTempEvent);
	clFinish(commands);
	START_TIMER(ocdTempEvent, OCD_TIMER_H2D, "NW Reference Copy", ocdTempTimer)
	END_TIMER(ocdTempTimer)
	CHKERR(errcode, "Failed to enqueue write buffer!");




	errcode = clEnqueueWriteBuffer(commands, matrix_cuda, CL_TRUE, 0, sizeof(int)size, (void ) input_itemsets, 0, NULL, &ocdTempEvent);
	clFinish(commands);
	START_TIMER(ocdTempEvent, OCD_TIMER_H2D, "Knapsack table Set Copy", ocdTempTimer)
	END_TIMER(ocdTempTimer)
	CHKERR(errcode, "Failed to enqueue write buffer!");

	size_t localWorkSize[2] = {BLOCK_SIZE, 1}; //BLOCK_SIZE work items per work-group in 1D only.
	size_t globalWorkSize[2];
	int block_width = ( max_cols - 1 )/BLOCK_SIZE;



	// Processing the whole Knapsack table row by row here
	for(i = 1 ; i < max_rows ; i++){
	globalWorkSize[0] = max_cols - 1;

	globalWorkSize[1] = 1*localWorkSize[1];

	//printf("%s %d %s %d\n", "running as wave front, work-group nos ", globalWorkSize[0]/localWorkSize[0] , "with globalWorkSize", globalWorkSize[0]);
	errcode = clSetKernelArg(clKernel_nw1, 0, sizeof(cl_mem), (void *) &weigths_cuda);
	errcode = clSetKernelArg(clKernel_nw1, 1, sizeof(cl_mem), (void *) &values_cuda);
	errcode \|= clSetKernelArg(clKernel_nw1, 2, sizeof(cl_mem), (void *) &matrix_cuda);
	errcode \|= clSetKernelArg(clKernel_nw1, 3, sizeof(int), (void *) &max_cols);
	errcode \|= clSetKernelArg(clKernel_nw1, 4, sizeof(int), (void *) &i);
	CHKERR(errcode, "Failed to set kernel arguments!");
	errcode = clEnqueueNDRangeKernel(commands, clKernel_nw1, 2, NULL, globalWorkSize, localWorkSize, 0, NULL, &ocdTempEvent);
	clFinish(commands);
	START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "NW Kernel nw1", ocdTempTimer)
	END_TIMER(ocdTempTimer)
	CHKERR(errcode, "Failed to enqueue kernel!");

	}


	errcode = clEnqueueReadBuffer(commands, matrix_cuda, CL_TRUE, 0, sizeof(float)size, (void ) input_itemsets, 0, NULL, &ocdTempEvent);
	clFinish(commands);
	START_TIMER(ocdTempEvent, OCD_TIMER_D2H, "knapsack table Set Copy", ocdTempTimer)
	END_TIMER(ocdTempTimer)
	CHKERR(errcode, "Failed to enqueue read buffer!");


	for (i = 0; i < size; ++i)
	{
	printf("%d ", input_itemsets[ i ] );
	}


	clReleaseMemObject(weigths_cuda);
	clReleaseMemObject(matrix_cuda);
	clReleaseKernel(clKernel_nw1);
	clReleaseProgram(clProgram);
	clReleaseCommandQueue(commands);
	clReleaseContext(context);

	}
	#define BLOCK_SIZE 16

	int
	maximum( int a,
	int b){

	if( a <= b )
	return (b);
	else
	return (a);
	}

	__kernel void
	knapsack_opencl_shared_1( __global int* weights,
	__global int* values,
	__global int* matrix_opencl,
	int cols,
	int i)
	{
	int bx = get_group_id(0);
	int tx = get_local_id(0);


	//printf("@@@ kernel group-id %d local-id %d\n", bx, tx);

	int b_index_row = i;
	int b_index_col = BLOCK_SIZE * bx + tx; // j index in a W.G.
	int index = b_index_row * cols + b_index_col;


	int universal_row = ref_x/cols;
	int universal_col = ref_x % cols;

	if (weights[ b_index_row - 1 ] > universal_col ) //copy previous value if weight of item is more than current capacity
	matrix_opencl[index] = matrix_opencl[index - cols] ;

	else{

	matrix_opencl[index] = maximum( matrix_opencl[index - (cols + weights[b_index_row] )] + values[b_index_row],
	matrix_opencl[index - cols]);
	}



	}