Skip to content

Instantly share code, notes, and snippets.

@davidberard98
Last active July 7, 2023 23:55
Show Gist options
  • Save davidberard98/4f72fa2489b6920a9d968fab0aebd037 to your computer and use it in GitHub Desktop.
Save davidberard98/4f72fa2489b6920a9d968fab0aebd037 to your computer and use it in GitHub Desktop.
# $CUDA_HOME/bin/nvcc binary_search_cuda.cu -std=c++17 -o binary_search_cuda -O3 # -Wl,-rpath $CUDA_HOME/lib64
$CUDA_HOME/bin/nvcc dense_to_jagged.cu -std=c++17 -o dense_to_jagged -O3 # -Wl,-rpath $CUDA_HOME/lib64
// Template copied from https://github.com/NVIDIA/cuda-samples
/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/**
 * Dense-to-jagged conversion benchmark.
 *
 * Converts a dense (B x S x M) tensor into a jagged (J x M) layout on both
 * CPU and GPU, comparing a kernel that re-runs a per-thread binary search
 * against one that precomputes the row-to-batch mapping once.
 * (File started from the NVIDIA cuda-samples vectorAdd template; the
 * license header above is retained from that template.)
 */
#include <stdlib.h>
#include <stdio.h>
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
#include "helpers.h"
#define THREADS_PER_BLOCK 256
// Reference CPU implementation: copies the dense tensor (B x S x M) into the
// jagged layout (J x M). Jagged row j belongs to batch b where
// offsets[b] <= j < offsets[b+1], at sequence slot s = j - offsets[b].
// `offsets` has B+1 nondecreasing entries; per-batch lengths are capped at S.
// J is accepted for signature symmetry with the GPU kernels.
void cpu_dense_to_jagged(float *values, int *offsets, float *output, const int B, const int S, const int M, const int J) {
    for (int b = 0; b < B; ++b) {
        // Number of valid sequence slots for this batch, capped at S.
        const int len = offsets[b + 1] - offsets[b];
        const int limit = len < S ? len : S;
        for (int s = 0; s < limit; ++s) {
            const int row = offsets[b] + s;
            float *dst = output + (size_t)M * row;
            const float *src = values + (size_t)M * (s + S * b);
            for (int m = 0; m < M; ++m) {
                dst[m] = src[m];
            }
        }
    }
}
// Kernel: copy the dense tensor (B x S x M) into jagged layout (J x M).
// One thread per output element: idx = j * M + m, so the launch must cover
// at least J * M threads; threads with j >= J do nothing (tail guard).
// Each thread binary-searches `offsets` (B+1 nondecreasing entries) to find
// the batch b owning jagged row j, i.e. the largest b with offsets[b] <= j.
// Assumes offsets[0] <= j for every valid row (offsets[0] == 0 in practice
// -- TODO confirm at call sites).
__global__ void cuda_dense_to_jagged(
float *values,
int *offsets,
float *output,
const int B,
const int S,
const int M,
const int J
) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
int j = idx / M;
int m = idx % M;
if (j < J) {
// Binary search with invariant offsets[lo] <= j. The upper midpoint
// (hi + lo + 1) / 2 guarantees progress when hi == lo + 1.
int lo = 0;
int hi = B;
while (lo < hi) {
int mid = (hi + lo + 1) / 2;
if (j >= offsets[mid]) {
lo = mid;
} else {
hi = mid-1;
}
}
int b = lo;
// Sequence position of jagged row j within batch b.
int s = j - offsets[b];
output[m + M * j] = values[m + M * (s + S * b)];
}
}
// Kernel: precompute the row-to-batch table. For each jagged row idx in
// [0, J), output[idx] = largest b with offsets[b] <= idx (the batch owning
// that row). One thread per row; threads with idx >= J exit. `offsets` has
// B+1 nondecreasing entries; assumes offsets[0] <= idx (offsets[0] == 0 in
// practice -- TODO confirm at call sites).
__global__ void cuda_binary_search(
int *offsets,
int *output,
const int B,
const int J
) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < J) {
// Binary search with invariant offsets[lo] <= idx. The upper midpoint
// (hi + lo + 1) / 2 guarantees progress when hi == lo + 1.
int lo = 0;
int hi = B;
while (lo < hi) {
int mid = (hi + lo + 1) / 2;
if (idx >= offsets[mid]) {
lo = mid;
} else {
hi = mid-1;
}
}
output[idx] = lo;
}
}
// Kernel: dense-to-jagged copy using a precomputed row-to-batch table
// (inv_offsets[j] = batch owning jagged row j) instead of re-running a
// per-thread binary search. One thread per output element: idx = j * M + m;
// the launch must cover at least J * M threads. B is unused but kept for
// signature symmetry with cuda_dense_to_jagged.
__global__ void cuda_dense_to_jagged_precomputed(
float *values,
int *offsets,
int *inv_offsets,
float *output,
const int B,
const int S,
const int M,
const int J
) {
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = tid / M;   // jagged row index j
    const int col = tid % M;   // feature index m
    if (row >= J) return;      // tail guard: grid may overshoot J * M
    const int batch = inv_offsets[row];     // batch owning this row
    const int slot = row - offsets[batch];  // sequence position within batch
    output[col + M * row] = values[col + M * (slot + S * batch)];
}
// Launch one conversion workload on the default stream: optionally
// precompute the row-to-batch table once with cuda_binary_search, then run
// the dense-to-jagged kernel `num_conversions` times back to back.
// All launches are asynchronous; the caller must synchronize before reading
// d_Output. blocksPerGrid is sized for J * M threads, which also covers the
// J threads cuda_binary_search needs.
void run_N_dense_to_jagged(
    float *d_Values,      // dense input (B x S x M), device memory
    int *d_Offsets,       // B+1 row offsets, device memory
    int *d_InvOffsets,    // row-to-batch table (J entries), device memory
    float *d_Output,      // jagged output (J x M), device memory
    const int B,
    const int S,
    const int M,
    const int J,
    const int blocksPerGrid,
    const int num_conversions,
    bool use_precomputed  // true: amortize one binary-search pass
) {
    if (use_precomputed) {
        // One binary-search pass amortized over all conversions below.
        cuda_binary_search<<<blocksPerGrid, THREADS_PER_BLOCK>>>(d_Offsets, d_InvOffsets, B, J);
        // Kernel launches do not return errors directly; surface
        // launch-configuration failures instead of dropping them silently.
        CHECK(cudaGetLastError());
    }
    for (int i = 0; i < num_conversions; ++i) {
        if (use_precomputed) {
            cuda_dense_to_jagged_precomputed<<<blocksPerGrid, THREADS_PER_BLOCK>>>(d_Values, d_Offsets, d_InvOffsets, d_Output, B, S, M, J);
        } else {
            cuda_dense_to_jagged<<<blocksPerGrid, THREADS_PER_BLOCK>>>(d_Values, d_Offsets, d_Output, B, S, M, J);
        }
        CHECK(cudaGetLastError());  // host-side check only; does not sync
    }
}
// Benchmark `num_conversions` back-to-back dense-to-jagged conversions:
// 4 warmup runs, then 20 timed runs (TIMER reports the per-run average),
// then a verification of the GPU result against the CPU reference.
// Exits the process on mismatch.
void benchmark_N(
    float *h_Output,      // CPU reference result (J x M), host memory
    float *h_OutputRef,   // host scratch buffer the GPU result is copied into
    float *d_Values,
    int *d_Offsets,
    int *d_InvOffsets,
    float *d_Output,
    const int B,
    const int S,
    const int M,
    const int J,
    const int blocksPerGrid,
    const int num_conversions,
    bool use_precomputed
) {
    // Warmup: let clocks/caches settle before timing.
    for (int i = 0; i < 4; ++i) {
        run_N_dense_to_jagged(
            d_Values, d_Offsets, d_InvOffsets, d_Output,
            B, S, M, J,
            blocksPerGrid, num_conversions,
            use_precomputed
        );
        CHECK(cudaDeviceSynchronize());
    }
    char buf[100];
    // snprintf guards against overflowing the label buffer.
    if (use_precomputed) {
        snprintf(buf, sizeof(buf), "dense-to-jagged PRECOMPUTED-BS on GPU, %dx", num_conversions);
    } else {
        snprintf(buf, sizeof(buf), "dense-to-jagged REPEATED-BS on GPU, %dx", num_conversions);
    }
    {
        // TIMER prints the average over 20 runs when it leaves scope; the
        // sync inside the scope ensures all GPU work is included.
        TIMER t(buf, 20);
        for (int i = 0; i < 20; ++i) {
            run_N_dense_to_jagged(
                d_Values, d_Offsets, d_InvOffsets, d_Output,
                B, S, M, J,
                blocksPerGrid, num_conversions,
                use_precomputed
            );
        }
        CHECK(cudaDeviceSynchronize());
    }
    CHECK(cudaMemcpy(h_OutputRef, d_Output, J * M * sizeof(float), cudaMemcpyDeviceToHost));
    // Verify the GPU result matches the CPU reference. Exact equality is
    // appropriate here: both paths only copy values, no arithmetic.
    for (int i = 0; i < J * M; ++i) {
        if (h_Output[i] != h_OutputRef[i]) {
            // BUG FIX: the original used %d for float arguments (undefined
            // behavior); %f prints the actual values.
            fprintf(stderr, "Result verification failed at element %d: expect %f, got %f!\n", i, h_Output[i], h_OutputRef[i]);
            exit(EXIT_FAILURE);
        }
    }
    printf("Test PASSED\n");
}
// Driver: builds a random dense tensor and random offsets, computes the CPU
// reference conversion, then benchmarks both GPU variants for increasing
// numbers of back-to-back conversions.
int main(void) {
    // Dense tensor dimensions: B batches x S max sequence length x M features.
    int B = 1024;
    int S = 512;
    int M = 96;
    int numOffsets = B + 1;
    int numElements = B * S * M;
    printf("[Dense->jagged with dense having %d elements (%d x %d x %d)]\n", numElements, B, S, M);
    float *h_Values = (float *)malloc(numElements * sizeof(float));
    int *h_Offsets = (int *)malloc(numOffsets * sizeof(int));
    // Verify that allocations succeeded
    if (h_Values == NULL || h_Offsets == NULL) {
        fprintf(stderr, "Failed to allocate host vectors!\n");
        exit(EXIT_FAILURE);
    }
    // Fill the dense tensor with uniform random values in [0, 1].
    for (int i = 0; i < numElements; ++i) {
        h_Values[i] = rand() / (float)RAND_MAX;
    }
    // Build nondecreasing offsets; each batch's length is in [2, S].
    // BUG FIX: h_Offsets[0] was previously read uninitialized (malloc does
    // not zero memory); the prefix sum must start at 0.
    h_Offsets[0] = 0;
    for (int i = 0; i < B; ++i) {
        h_Offsets[i+1] = h_Offsets[i] + (int) (rand() / (float)RAND_MAX * (S-2)) + 2;
    }
    int TOTAL_LENGTH = h_Offsets[B];  // total number of jagged rows
    int J = TOTAL_LENGTH;
    int numJaggedElements = M * TOTAL_LENGTH;
    printf("Jagged tensor is %d x %d\n", J, M);
    float *h_Output = (float *)malloc(numJaggedElements * sizeof(float));
    float *h_OutputRef = (float *)malloc(numJaggedElements * sizeof(float));
    if (h_Output == NULL || h_OutputRef == NULL) {
        fprintf(stderr, "Failed to allocate host vectors!\n");
        exit(EXIT_FAILURE);
    }
    // Device allocations.
    float *d_Values = NULL;
    int *d_Offsets = NULL;
    int *d_InvOffsets = NULL;
    float *d_Output = NULL;
    CHECK(cudaMalloc((void **) &d_Values, numElements * sizeof(float)));
    CHECK(cudaMalloc((void **) &d_Offsets, numOffsets * sizeof(int)));
    CHECK(cudaMalloc((void **) &d_InvOffsets, J * sizeof(int)));
    CHECK(cudaMalloc((void **) &d_Output, numJaggedElements * sizeof(float)));
    // Copy the dense values and the offsets to the device once, up front.
    printf("Copy input data from the host memory to the CUDA device\n");
    CHECK(cudaMemcpy(d_Values, h_Values, numElements * sizeof(float), cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(d_Offsets, h_Offsets, numOffsets * sizeof(int), cudaMemcpyHostToDevice));
    // One thread per jagged output element; ceil-div so the tail is covered.
    int blocksPerGrid = (numJaggedElements + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
    printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid,
           THREADS_PER_BLOCK);
    printf("Run CPU to get reference\n");
    {
        TIMER t("dense_to_jagged on CPU, 1x");
        cpu_dense_to_jagged(h_Values, h_Offsets, h_Output, B, S, M, J);
    }
    // Benchmark 1, 2, 4, ..., 32 conversions, without (j=0) and with (j=1)
    // the precomputed row-to-batch table.
    for (int i = 1; i <= 32; i *= 2) {
        for (int j = 0; j < 2; ++j) {
            benchmark_N(
                h_Output,
                h_OutputRef,
                d_Values,
                d_Offsets,
                d_InvOffsets,
                d_Output,
                B, S, M, J,
                blocksPerGrid, i,
                j
            );
        }
    }
    CHECK(cudaFree(d_Values));
    CHECK(cudaFree(d_Offsets));
    CHECK(cudaFree(d_InvOffsets));
    CHECK(cudaFree(d_Output));
    // Free host memory
    free(h_Values);
    free(h_Offsets);
    free(h_Output);
    free(h_OutputRef);
    printf("Done\n");
    return 0;
}
#include <stdio.h>
#include <string>
#include <sys/time.h>
// Wall-clock time in seconds (microsecond resolution) via gettimeofday.
double cpuSecond() {
    struct timeval now;
    gettimeofday(&now, NULL);
    return (double)now.tv_sec + 1.e-6 * (double)now.tv_usec;
}
// Scoped wall-clock timer: records the start time at construction and, on
// destruction, prints the elapsed time in milliseconds divided by `repeats`
// (i.e. the per-iteration average when the scope wraps a repeat loop).
struct TIMER {
    std::string name;   // label printed with the timing
    double start_time;  // wall-clock seconds at construction
    int repeats;        // iterations the measured span covers (default 1)
    TIMER(std::string name, int repeats = 1) : name(std::move(name)), repeats(repeats) {
        start_time = cpuSecond();
    }
    ~TIMER() {
        const double elapsed_ms = (cpuSecond() - start_time) * 1000;
        printf("Time for %s: %f ms\n", name.c_str(), elapsed_ms / repeats);
    }
};
// Error-check wrapper for CUDA runtime calls: evaluates `call`, and if the
// returned cudaError_t is not cudaSuccess, prints the file/line, the numeric
// code, and the error string, then exits the process. Wrap every CUDA API
// call (and cudaGetLastError() after kernel launches) with this macro.
#define CHECK(call) \
{ \
const cudaError_t error = call; \
if (error != cudaSuccess) \
{ \
printf("Error: %s:%d, ", __FILE__, __LINE__); \
printf("code:%d, reason: %s\n", error, cudaGetErrorString(error)); \
exit(1); \
} \
}
[Dense->jagged with dense having 50331648 elements (1024 x 512 x 96)]
Jagged tensor is 259493 x 96
Copy input data from the host memory to the CUDA device
CUDA kernel launch with 97310 blocks of 256 threads
Run CPU to get reference
Time for dense_to_jagged on CPU, 1x: 53.478956 ms
Time for dense-to-jagged REPEATED-BS on GPU, 1x: 0.320649 ms
Test PASSED
Time for dense-to-jagged PRECOMPUTED-BS on GPU, 1x: 0.286150 ms
Test PASSED
Time for dense-to-jagged REPEATED-BS on GPU, 2x: 0.640309 ms
Test PASSED
Time for dense-to-jagged PRECOMPUTED-BS on GPU, 2x: 0.496244 ms
Test PASSED
Time for dense-to-jagged REPEATED-BS on GPU, 4x: 1.279855 ms
Test PASSED
Time for dense-to-jagged PRECOMPUTED-BS on GPU, 4x: 0.915551 ms
Test PASSED
Time for dense-to-jagged REPEATED-BS on GPU, 8x: 2.558100 ms
Test PASSED
Time for dense-to-jagged PRECOMPUTED-BS on GPU, 8x: 1.755655 ms
Test PASSED
Time for dense-to-jagged REPEATED-BS on GPU, 16x: 5.114698 ms
Test PASSED
Time for dense-to-jagged PRECOMPUTED-BS on GPU, 16x: 3.434896 ms
Test PASSED
Time for dense-to-jagged REPEATED-BS on GPU, 32x: 10.237050 ms
Test PASSED
Time for dense-to-jagged PRECOMPUTED-BS on GPU, 32x: 6.793153 ms
Test PASSED
Done
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment