Skip to content

Instantly share code, notes, and snippets.

@davidberard98
Last active July 7, 2023 23:55
Show Gist options
  • Save davidberard98/4f72fa2489b6920a9d968fab0aebd037 to your computer and use it in GitHub Desktop.
Save davidberard98/4f72fa2489b6920a9d968fab0aebd037 to your computer and use it in GitHub Desktop.
# $CUDA_HOME/bin/nvcc binary_search_cuda.cu -std=c++17 -o binary_search_cuda -O3 # -Wl,-rpath $CUDA_HOME/lib64
$CUDA_HOME/bin/nvcc dense_to_jagged.cu -std=c++17 -o dense_to_jagged -O3 # -Wl,-rpath $CUDA_HOME/lib64
// Template copied from https://github.com/NVIDIA/cuda-samples
/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/**
 * Dense-to-jagged conversion benchmark.
 *
 * Converts a dense (B x S x M) tensor into a jagged (J x M) layout on both
 * CPU and GPU, comparing a kernel that re-runs a per-thread binary search
 * against one that precomputes the row-to-batch mapping once.
 * (File started from the NVIDIA cuda-samples vectorAdd template; the
 * license header above is retained from that template.)
 */
#include <stdlib.h>
#include <stdio.h>
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
#include "helpers.h"
#define THREADS_PER_BLOCK 256
// Reference CPU implementation: copies the dense tensor (B x S x M) into the
// jagged layout (J x M). Jagged row j belongs to batch b where
// offsets[b] <= j < offsets[b+1], at sequence slot s = j - offsets[b].
// `offsets` has B+1 nondecreasing entries; per-batch lengths are capped at S.
// J is accepted for signature symmetry with the GPU kernels.
void cpu_dense_to_jagged(float *values, int *offsets, float *output, const int B, const int S, const int M, const int J) {
    for (int b = 0; b < B; ++b) {
        // Number of valid sequence slots for this batch, capped at S.
        const int len = offsets[b + 1] - offsets[b];
        const int limit = len < S ? len : S;
        for (int s = 0; s < limit; ++s) {
            const int row = offsets[b] + s;
            float *dst = output + (size_t)M * row;
            const float *src = values + (size_t)M * (s + S * b);
            for (int m = 0; m < M; ++m) {
                dst[m] = src[m];
            }
        }
    }
}
// Kernel: copy the dense tensor (B x S x M) into jagged layout (J x M).
// One thread per output element: idx = j * M + m, so the launch must cover
// at least J * M threads; threads with j >= J do nothing (tail guard).
// Each thread binary-searches `offsets` (B+1 nondecreasing entries) to find
// the batch b owning jagged row j, i.e. the largest b with offsets[b] <= j.
// Assumes offsets[0] <= j for every valid row (offsets[0] == 0 in practice
// -- TODO confirm at call sites).
__global__ void cuda_dense_to_jagged(
float *values,
int *offsets,
float *output,
const int B,
const int S,
const int M,
const int J
) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
int j = idx / M;
int m = idx % M;
if (j < J) {
// Binary search with invariant offsets[lo] <= j. The upper midpoint
// (hi + lo + 1) / 2 guarantees progress when hi == lo + 1.
int lo = 0;
int hi = B;
while (lo < hi) {
int mid = (hi + lo + 1) / 2;
if (j >= offsets[mid]) {
lo = mid;
} else {
hi = mid-1;
}
}
int b = lo;
// Sequence position of jagged row j within batch b.
int s = j - offsets[b];
output[m + M * j] = values[m + M * (s + S * b)];
}
}
// Kernel: precompute the row-to-batch table. For each jagged row idx in
// [0, J), output[idx] = largest b with offsets[b] <= idx (the batch owning
// that row). One thread per row; threads with idx >= J exit. `offsets` has
// B+1 nondecreasing entries; assumes offsets[0] <= idx (offsets[0] == 0 in
// practice -- TODO confirm at call sites).
__global__ void cuda_binary_search(
int *offsets,
int *output,
const int B,
const int J
) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < J) {
// Binary search with invariant offsets[lo] <= idx. The upper midpoint
// (hi + lo + 1) / 2 guarantees progress when hi == lo + 1.
int lo = 0;
int hi = B;
while (lo < hi) {
int mid = (hi + lo + 1) / 2;
if (idx >= offsets[mid]) {
lo = mid;
} else {
hi = mid-1;
}
}
output[idx] = lo;
}
}
// Kernel: dense-to-jagged copy using a precomputed row-to-batch table
// (inv_offsets[j] = batch owning jagged row j) instead of re-running a
// per-thread binary search. One thread per output element: idx = j * M + m;
// the launch must cover at least J * M threads. B is unused but kept for
// signature symmetry with cuda_dense_to_jagged.
__global__ void cuda_dense_to_jagged_precomputed(
float *values,
int *offsets,
int *inv_offsets,
float *output,
const int B,
const int S,
const int M,
const int J
) {
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = tid / M;   // jagged row index j
    const int col = tid % M;   // feature index m
    if (row >= J) return;      // tail guard: grid may overshoot J * M
    const int batch = inv_offsets[row];     // batch owning this row
    const int slot = row - offsets[batch];  // sequence position within batch
    output[col + M * row] = values[col + M * (slot + S * batch)];
}
// Launch one conversion workload on the default stream: optionally
// precompute the row-to-batch table once with cuda_binary_search, then run
// the dense-to-jagged kernel `num_conversions` times back to back.
// All launches are asynchronous; the caller must synchronize before reading
// d_Output. blocksPerGrid is sized for J * M threads, which also covers the
// J threads cuda_binary_search needs.
void run_N_dense_to_jagged(
    float *d_Values,      // dense input (B x S x M), device memory
    int *d_Offsets,       // B+1 row offsets, device memory
    int *d_InvOffsets,    // row-to-batch table (J entries), device memory
    float *d_Output,      // jagged output (J x M), device memory
    const int B,
    const int S,
    const int M,
    const int J,
    const int blocksPerGrid,
    const int num_conversions,
    bool use_precomputed  // true: amortize one binary-search pass
) {
    if (use_precomputed) {
        // One binary-search pass amortized over all conversions below.
        cuda_binary_search<<<blocksPerGrid, THREADS_PER_BLOCK>>>(d_Offsets, d_InvOffsets, B, J);
        // Kernel launches do not return errors directly; surface
        // launch-configuration failures instead of dropping them silently.
        CHECK(cudaGetLastError());
    }
    for (int i = 0; i < num_conversions; ++i) {
        if (use_precomputed) {
            cuda_dense_to_jagged_precomputed<<<blocksPerGrid, THREADS_PER_BLOCK>>>(d_Values, d_Offsets, d_InvOffsets, d_Output, B, S, M, J);
        } else {
            cuda_dense_to_jagged<<<blocksPerGrid, THREADS_PER_BLOCK>>>(d_Values, d_Offsets, d_Output, B, S, M, J);
        }
        CHECK(cudaGetLastError());  // host-side check only; does not sync
    }
}
// Benchmark `num_conversions` back-to-back dense-to-jagged conversions:
// 4 warmup runs, then 20 timed runs (TIMER reports the per-run average),
// then a verification of the GPU result against the CPU reference.
// Exits the process on mismatch.
void benchmark_N(
    float *h_Output,      // CPU reference result (J x M), host memory
    float *h_OutputRef,   // host scratch buffer the GPU result is copied into
    float *d_Values,
    int *d_Offsets,
    int *d_InvOffsets,
    float *d_Output,
    const int B,
    const int S,
    const int M,
    const int J,
    const int blocksPerGrid,
    const int num_conversions,
    bool use_precomputed
) {
    // Warmup: let clocks/caches settle before timing.
    for (int i = 0; i < 4; ++i) {
        run_N_dense_to_jagged(
            d_Values, d_Offsets, d_InvOffsets, d_Output,
            B, S, M, J,
            blocksPerGrid, num_conversions,
            use_precomputed
        );
        CHECK(cudaDeviceSynchronize());
    }
    char buf[100];
    // snprintf guards against overflowing the label buffer.
    if (use_precomputed) {
        snprintf(buf, sizeof(buf), "dense-to-jagged PRECOMPUTED-BS on GPU, %dx", num_conversions);
    } else {
        snprintf(buf, sizeof(buf), "dense-to-jagged REPEATED-BS on GPU, %dx", num_conversions);
    }
    {
        // TIMER prints the average over 20 runs when it leaves scope; the
        // sync inside the scope ensures all GPU work is included.
        TIMER t(buf, 20);
        for (int i = 0; i < 20; ++i) {
            run_N_dense_to_jagged(
                d_Values, d_Offsets, d_InvOffsets, d_Output,
                B, S, M, J,
                blocksPerGrid, num_conversions,
                use_precomputed
            );
        }
        CHECK(cudaDeviceSynchronize());
    }
    CHECK(cudaMemcpy(h_OutputRef, d_Output, J * M * sizeof(float), cudaMemcpyDeviceToHost));
    // Verify the GPU result matches the CPU reference. Exact equality is
    // appropriate here: both paths only copy values, no arithmetic.
    for (int i = 0; i < J * M; ++i) {
        if (h_Output[i] != h_OutputRef[i]) {
            // BUG FIX: the original used %d for float arguments (undefined
            // behavior); %f prints the actual values.
            fprintf(stderr, "Result verification failed at element %d: expect %f, got %f!\n", i, h_Output[i], h_OutputRef[i]);
            exit(EXIT_FAILURE);
        }
    }
    printf("Test PASSED\n");
}
// Driver: builds a random dense tensor and random offsets, computes the CPU
// reference conversion, then benchmarks both GPU variants for increasing
// numbers of back-to-back conversions.
int main(void) {
    // Dense tensor dimensions: B batches x S max sequence length x M features.
    int B = 1024;
    int S = 512;
    int M = 96;
    int numOffsets = B + 1;
    int numElements = B * S * M;
    printf("[Dense->jagged with dense having %d elements (%d x %d x %d)]\n", numElements, B, S, M);
    float *h_Values = (float *)malloc(numElements * sizeof(float));
    int *h_Offsets = (int *)malloc(numOffsets * sizeof(int));
    // Verify that allocations succeeded
    if (h_Values == NULL || h_Offsets == NULL) {
        fprintf(stderr, "Failed to allocate host vectors!\n");
        exit(EXIT_FAILURE);
    }
    // Fill the dense tensor with uniform random values in [0, 1].
    for (int i = 0; i < numElements; ++i) {
        h_Values[i] = rand() / (float)RAND_MAX;
    }
    // Build nondecreasing offsets; each batch's length is in [2, S].
    // BUG FIX: h_Offsets[0] was previously read uninitialized (malloc does
    // not zero memory); the prefix sum must start at 0.
    h_Offsets[0] = 0;
    for (int i = 0; i < B; ++i) {
        h_Offsets[i+1] = h_Offsets[i] + (int) (rand() / (float)RAND_MAX * (S-2)) + 2;
    }
    int TOTAL_LENGTH = h_Offsets[B];  // total number of jagged rows
    int J = TOTAL_LENGTH;
    int numJaggedElements = M * TOTAL_LENGTH;
    printf("Jagged tensor is %d x %d\n", J, M);
    float *h_Output = (float *)malloc(numJaggedElements * sizeof(float));
    float *h_OutputRef = (float *)malloc(numJaggedElements * sizeof(float));
    if (h_Output == NULL || h_OutputRef == NULL) {
        fprintf(stderr, "Failed to allocate host vectors!\n");
        exit(EXIT_FAILURE);
    }
    // Device allocations.
    float *d_Values = NULL;
    int *d_Offsets = NULL;
    int *d_InvOffsets = NULL;
    float *d_Output = NULL;
    CHECK(cudaMalloc((void **) &d_Values, numElements * sizeof(float)));
    CHECK(cudaMalloc((void **) &d_Offsets, numOffsets * sizeof(int)));
    CHECK(cudaMalloc((void **) &d_InvOffsets, J * sizeof(int)));
    CHECK(cudaMalloc((void **) &d_Output, numJaggedElements * sizeof(float)));
    // Copy the dense values and the offsets to the device once, up front.
    printf("Copy input data from the host memory to the CUDA device\n");
    CHECK(cudaMemcpy(d_Values, h_Values, numElements * sizeof(float), cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(d_Offsets, h_Offsets, numOffsets * sizeof(int), cudaMemcpyHostToDevice));
    // One thread per jagged output element; ceil-div so the tail is covered.
    int blocksPerGrid = (numJaggedElements + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
    printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid,
           THREADS_PER_BLOCK);
    printf("Run CPU to get reference\n");
    {
        TIMER t("dense_to_jagged on CPU, 1x");
        cpu_dense_to_jagged(h_Values, h_Offsets, h_Output, B, S, M, J);
    }
    // Benchmark 1, 2, 4, ..., 32 conversions, without (j=0) and with (j=1)
    // the precomputed row-to-batch table.
    for (int i = 1; i <= 32; i *= 2) {
        for (int j = 0; j < 2; ++j) {
            benchmark_N(
                h_Output,
                h_OutputRef,
                d_Values,
                d_Offsets,
                d_InvOffsets,
                d_Output,
                B, S, M, J,
                blocksPerGrid, i,
                j
            );
        }
    }
    CHECK(cudaFree(d_Values));
    CHECK(cudaFree(d_Offsets));
    CHECK(cudaFree(d_InvOffsets));
    CHECK(cudaFree(d_Output));
    // Free host memory
    free(h_Values);
    free(h_Offsets);
    free(h_Output);
    free(h_OutputRef);
    printf("Done\n");
    return 0;
}
#include <stdio.h>
#include <string>
#include <sys/time.h>
// Wall-clock time in seconds (microsecond resolution) via gettimeofday.
double cpuSecond() {
    struct timeval now;
    gettimeofday(&now, NULL);
    return (double)now.tv_sec + 1.e-6 * (double)now.tv_usec;
}
// Scoped wall-clock timer: records the start time at construction and, on
// destruction, prints the elapsed time in milliseconds divided by `repeats`
// (i.e. the per-iteration average when the scope wraps a repeat loop).
struct TIMER {
    std::string name;   // label printed with the timing
    double start_time;  // wall-clock seconds at construction
    int repeats;        // iterations the measured span covers (default 1)
    TIMER(std::string name, int repeats = 1) : name(std::move(name)), repeats(repeats) {
        start_time = cpuSecond();
    }
    ~TIMER() {
        const double elapsed_ms = (cpuSecond() - start_time) * 1000;
        printf("Time for %s: %f ms\n", name.c_str(), elapsed_ms / repeats);
    }
};
// Error-check wrapper for CUDA runtime calls: evaluates `call`, and if the
// returned cudaError_t is not cudaSuccess, prints the file/line, the numeric
// code, and the error string, then exits the process. Wrap every CUDA API
// call (and cudaGetLastError() after kernel launches) with this macro.
#define CHECK(call) \
{ \
const cudaError_t error = call; \
if (error != cudaSuccess) \
{ \
printf("Error: %s:%d, ", __FILE__, __LINE__); \
printf("code:%d, reason: %s\n", error, cudaGetErrorString(error)); \
exit(1); \
} \
}
[Dense->jagged with dense having 50331648 elements (1024 x 512 x 96)]
Jagged tensor is 259493 x 96
Copy input data from the host memory to the CUDA device
CUDA kernel launch with 97310 blocks of 256 threads
Run CPU to get reference
Time for dense_to_jagged on CPU, 1x: 53.478956 ms
Time for dense-to-jagged REPEATED-BS on GPU, 1x: 0.320649 ms
Test PASSED
Time for dense-to-jagged PRECOMPUTED-BS on GPU, 1x: 0.286150 ms
Test PASSED
Time for dense-to-jagged REPEATED-BS on GPU, 2x: 0.640309 ms
Test PASSED
Time for dense-to-jagged PRECOMPUTED-BS on GPU, 2x: 0.496244 ms
Test PASSED
Time for dense-to-jagged REPEATED-BS on GPU, 4x: 1.279855 ms
Test PASSED
Time for dense-to-jagged PRECOMPUTED-BS on GPU, 4x: 0.915551 ms
Test PASSED
Time for dense-to-jagged REPEATED-BS on GPU, 8x: 2.558100 ms
Test PASSED
Time for dense-to-jagged PRECOMPUTED-BS on GPU, 8x: 1.755655 ms
Test PASSED
Time for dense-to-jagged REPEATED-BS on GPU, 16x: 5.114698 ms
Test PASSED
Time for dense-to-jagged PRECOMPUTED-BS on GPU, 16x: 3.434896 ms
Test PASSED
Time for dense-to-jagged REPEATED-BS on GPU, 32x: 10.237050 ms
Test PASSED
Time for dense-to-jagged PRECOMPUTED-BS on GPU, 32x: 6.793153 ms
Test PASSED
Done
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment