# $CUDA_HOME/bin/nvcc binary_search_cuda.cu -std=c++17 -o binary_search_cuda -O3 # -Wl,-rpath $CUDA_HOME/lib64
$CUDA_HOME/bin/nvcc dense_to_jagged.cu -std=c++17 -o dense_to_jagged -O3 # -Wl,-rpath $CUDA_HOME/lib64
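# Illustrative usage (added for clarity, not part of the original gist): after
# building with the command above, the benchmark is run directly, e.g.
#   ./dense_to_jagged
# which prints the timing and verification output shown at the end of this gist.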
dense_to_jagged.cu:
// Template copied from https://github.com/NVIDIA/cuda-samples
/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
/**
 * Dense-to-jagged conversion benchmark.
 *
 * Copies a dense [B, S, M] tensor into a jagged [J, M] tensor described by an
 * offsets array, and compares a kernel that binary-searches the offsets for
 * every element against one that precomputes the row-to-batch mapping once.
 */
#include <stdlib.h>
#include <stdio.h>
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
#include "helpers.h"
#define THREADS_PER_BLOCK 256
// CPU reference: copy row (b, s) of the dense [B, S, M] input into jagged row
// j = offsets[b] + s of the [J, M] output, skipping rows past each batch's length.
void cpu_dense_to_jagged(float *values, int *offsets, float *output, const int B, const int S, const int M, const int J) {
  for (int b=0; b<B; ++b) {
    for (int s=0; s<S; ++s) {
      int j = offsets[b] + s;
      if (j >= offsets[b+1]) {
        break;
      }
      for (int m=0; m<M; ++m) {
        output[m + M * j] = values[m + M * (s + S * b)];
      }
    }
  }
}
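// Illustrative note (added for clarity, not part of the original gist):
// the dense input is laid out as [B, S, M] and the jagged output as [J, M],
// where offsets has B + 1 entries with offsets[0] = 0 and offsets[B] = J,
// and batch b owns jagged rows offsets[b] .. offsets[b+1]-1. For example,
// with B = 3 and per-batch lengths {2, 1, 3}, offsets = {0, 2, 3, 6} and
// J = 6; jagged row j = 4 belongs to batch b = 2 at position s = j - offsets[2] = 1.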
// One thread per output element: thread idx handles element (j, m) of the
// jagged [J, M] output and binary-searches offsets to find the source batch.
__global__ void cuda_dense_to_jagged(
    float *values,
    int *offsets,
    float *output,
    const int B,
    const int S,
    const int M,
    const int J
) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  int j = idx / M;
  int m = idx % M;
  if (j < J) {
    // Binary search for the largest b with offsets[b] <= j.
    int lo = 0;
    int hi = B;
    while (lo < hi) {
      int mid = (hi + lo + 1) / 2;
      if (j >= offsets[mid]) {
        lo = mid;
      } else {
        hi = mid-1;
      }
    }
    int b = lo;
    int s = j - offsets[b];
    output[m + M * j] = values[m + M * (s + S * b)];
  }
}
// Precompute pass: for each jagged row idx, store the batch index it belongs
// to (the largest b with offsets[b] <= idx) into output (the inverse offsets).
__global__ void cuda_binary_search(
    int *offsets,
    int *output,
    const int B,
    const int J
) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < J) {
    int lo = 0;
    int hi = B;
    while (lo < hi) {
      int mid = (hi + lo + 1) / 2;
      if (idx >= offsets[mid]) {
        lo = mid;
      } else {
        hi = mid-1;
      }
    }
    output[idx] = lo;
  }
}
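// Illustrative host-side reference (added for clarity, not part of the original
// gist). The "precomputed" path runs cuda_binary_search once to fill
// inv_offsets (jagged row -> batch index), so each of the repeated conversions
// below needs only a single inv_offsets[j] load instead of ~log2(B) offset
// loads per element. The hypothetical helper below mirrors the device loop and
// could be used to sanity-check inv_offsets on the host.
static inline int host_find_batch(const int *offsets, int B, int j) {
  int lo = 0, hi = B;
  while (lo < hi) {
    int mid = (hi + lo + 1) / 2;  // round up so the loop makes progress when hi == lo + 1
    if (j >= offsets[mid]) {
      lo = mid;                   // offsets[mid] <= j: the batch index is at least mid
    } else {
      hi = mid - 1;               // offsets[mid] > j: the batch index is below mid
    }
  }
  return lo;                      // largest b with offsets[b] <= j
}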
// Same as cuda_dense_to_jagged, but looks the batch index up in the
// precomputed inv_offsets instead of binary-searching offsets per element.
__global__ void cuda_dense_to_jagged_precomputed(
    float *values,
    int *offsets,
    int *inv_offsets,
    float *output,
    const int B,
    const int S,
    const int M,
    const int J
) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  int j = idx / M;
  int m = idx % M;
  if (j < J) {
    int b = inv_offsets[j];
    int s = j - offsets[b];
    output[m + M * j] = values[m + M * (s + S * b)];
  }
}
// Launch num_conversions dense-to-jagged conversions. In the precomputed
// variant, the row-to-batch map is built once up front and reused by every conversion.
void run_N_dense_to_jagged(
    float *d_Values,
    int *d_Offsets,
    int *d_InvOffsets,
    float *d_Output,
    const int B,
    const int S,
    const int M,
    const int J,
    const int blocksPerGrid,
    const int num_conversions,
    bool use_precomputed
) {
  if (use_precomputed) {
    cuda_binary_search<<<blocksPerGrid, THREADS_PER_BLOCK>>>(d_Offsets, d_InvOffsets, B, J);
  }
  for (int i=0; i<num_conversions; ++i) {
    if (use_precomputed) {
      cuda_dense_to_jagged_precomputed<<<blocksPerGrid, THREADS_PER_BLOCK>>>(d_Values, d_Offsets, d_InvOffsets, d_Output, B, S, M, J);
    } else {
      cuda_dense_to_jagged<<<blocksPerGrid, THREADS_PER_BLOCK>>>(d_Values, d_Offsets, d_Output, B, S, M, J);
    }
  }
}
void benchmark_N(
    float *h_Output,
    float *h_OutputRef,
    float *d_Values,
    int *d_Offsets,
    int *d_InvOffsets,
    float *d_Output,
    const int B,
    const int S,
    const int M,
    const int J,
    const int blocksPerGrid,
    const int num_conversions,
    bool use_precomputed
) {
  // Warm-up runs (not timed)
  for (int i = 0; i < 4; ++i) {
    run_N_dense_to_jagged(
        d_Values,
        d_Offsets,
        d_InvOffsets,
        d_Output,
        B, S, M, J,
        blocksPerGrid, num_conversions,
        use_precomputed
    );
    CHECK(cudaDeviceSynchronize());
  }
  char buf[100];
  if (use_precomputed) {
    sprintf(buf, "dense-to-jagged PRECOMPUTED-BS on GPU, %dx", num_conversions);
  } else {
    sprintf(buf, "dense-to-jagged REPEATED-BS on GPU, %dx", num_conversions);
  }
  {
    TIMER t(buf, 20);
    for (int i = 0; i < 20; ++i) {
      run_N_dense_to_jagged(
          d_Values,
          d_Offsets,
          d_InvOffsets,
          d_Output,
          B, S, M, J,
          blocksPerGrid, num_conversions,
          use_precomputed
      );
    }
    CHECK(cudaDeviceSynchronize());
  }
  CHECK(cudaMemcpy(h_OutputRef, d_Output, J * M * sizeof(float), cudaMemcpyDeviceToHost));
  // Verify the GPU result against the CPU reference
  for (int i = 0; i < J * M; ++i) {
    if (h_Output[i] != h_OutputRef[i]) {
      fprintf(stderr, "Result verification failed at element %d: expected %f, got %f!\n", i, h_Output[i], h_OutputRef[i]);
      exit(EXIT_FAILURE);
    }
  }
  printf("Test PASSED\n");
}
int main(void) {
  // Problem sizes: the dense tensor is B x S x M
  int B = 1024;
  int S = 512;
  int M = 96;
  int numOffsets = B + 1;
  int numElements = B * S * M;
  printf("[Dense->jagged with dense having %d elements (%d x %d x %d)]\n", numElements, B, S, M);
  float *h_Values = (float *)malloc(B * S * M * sizeof(float));
  int *h_Offsets = (int *)malloc(numOffsets * sizeof(int));
  // Verify that allocations succeeded
  if (h_Values == NULL || h_Offsets == NULL) {
    fprintf(stderr, "Failed to allocate host vectors!\n");
    exit(EXIT_FAILURE);
  }
  for (int i = 0; i < B*S*M; ++i) {
    h_Values[i] = rand() / (float)RAND_MAX;
  }
  // Random per-batch lengths in [2, S]; offsets[0] must be 0 (malloc does not zero it)
  h_Offsets[0] = 0;
  for (int i = 0; i < B; ++i) {
    h_Offsets[i+1] = h_Offsets[i] + (int) (rand() / (float)RAND_MAX * (S-2)) + 2;
  }
  int TOTAL_LENGTH = h_Offsets[B];
  int J = TOTAL_LENGTH;
  int numJaggedElements = M * TOTAL_LENGTH;
  printf("Jagged tensor is %d x %d\n", J, M);
  float *h_Output = (float *)malloc(numJaggedElements * sizeof(float));
  float *h_OutputRef = (float *)malloc(numJaggedElements * sizeof(float));
  if (h_Output == NULL || h_OutputRef == NULL) {
    fprintf(stderr, "Failed to allocate host vectors!\n");
    exit(EXIT_FAILURE);
  }
  float *d_Values = NULL;
  int *d_Offsets = NULL;
  int *d_InvOffsets = NULL;
  float *d_Output = NULL;
  CHECK(cudaMalloc((void **) &d_Values, numElements * sizeof(float)));
  CHECK(cudaMalloc((void **) &d_Offsets, numOffsets * sizeof(int)));
  CHECK(cudaMalloc((void **) &d_InvOffsets, J * sizeof(int)));
  CHECK(cudaMalloc((void **) &d_Output, numJaggedElements * sizeof(float)));
  // Copy the host input data to device memory
  printf("Copy input data from the host memory to the CUDA device\n");
  CHECK(cudaMemcpy(d_Values, h_Values, numElements * sizeof(float), cudaMemcpyHostToDevice));
  CHECK(cudaMemcpy(d_Offsets, h_Offsets, numOffsets * sizeof(int), cudaMemcpyHostToDevice));
  // Size the grid for one thread per jagged output element
  int blocksPerGrid = (numJaggedElements + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
  printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid,
         THREADS_PER_BLOCK);
  printf("Run CPU to get reference\n");
  {
    TIMER t("dense_to_jagged on CPU, 1x");
    cpu_dense_to_jagged(h_Values, h_Offsets, h_Output, B, S, M, J);
  }
  // Benchmark 1x..32x conversions, without (j=0) and with (j=1) the precomputed row-to-batch map
  for (int i=1; i<=32; i *= 2) {
    for (int j=0; j<2; ++j) {
      benchmark_N(
          h_Output,
          h_OutputRef,
          d_Values,
          d_Offsets,
          d_InvOffsets,
          d_Output,
          B, S, M, J,
          blocksPerGrid, i,
          j
      );
    }
  }
  CHECK(cudaFree(d_Values));
  CHECK(cudaFree(d_Offsets));
  CHECK(cudaFree(d_InvOffsets));
  CHECK(cudaFree(d_Output));
  // Free host memory
  free(h_Values);
  free(h_Offsets);
  free(h_Output);
  free(h_OutputRef);
  printf("Done\n");
  return 0;
}
helpers.h:
#include <stdio.h>
#include <string>
#include <sys/time.h>
// Wall-clock time in seconds.
double cpuSecond() {
  struct timeval tp;
  gettimeofday(&tp,NULL);
  return ((double)tp.tv_sec + (double)tp.tv_usec*1.e-6);
}
// RAII timer: prints the elapsed time (divided by `repeats`) when it leaves scope.
struct TIMER {
  std::string name;
  double start_time;
  int repeats;
  TIMER(std::string name, int repeats = 1) : name(std::move(name)), repeats(repeats) {
    start_time = cpuSecond();
  }
  ~TIMER() {
    double end_time = cpuSecond();
    printf("Time for %s: %f ms\n", name.c_str(), (end_time - start_time) * 1000 / repeats);
  }
};
// Abort with file/line information if a CUDA runtime call returns an error.
#define CHECK(call) \
{ \
  const cudaError_t error = call; \
  if (error != cudaSuccess) \
  { \
    printf("Error: %s:%d, ", __FILE__, __LINE__); \
    printf("code:%d, reason: %s\n", error, cudaGetErrorString(error)); \
    exit(1); \
  } \
}
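// Illustrative usage of these helpers (added for clarity, not part of the
// original gist): TIMER times a scope via RAII and CHECK aborts on any CUDA
// error. `my_kernel` below is a hypothetical placeholder.
//
//   {
//     TIMER t("my_kernel on GPU, 10x", 10);
//     for (int i = 0; i < 10; ++i) {
//       my_kernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(/* args */);
//     }
//     CHECK(cudaDeviceSynchronize());  // keep the sync inside the timed scope
//   }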
Sample output:
[Dense->jagged with dense having 50331648 elements (1024 x 512 x 96)]
Jagged tensor is 259493 x 96
Copy input data from the host memory to the CUDA device
CUDA kernel launch with 97310 blocks of 256 threads
Run CPU to get reference
Time for dense_to_jagged on CPU, 1x: 53.478956 ms
Time for dense-to-jagged REPEATED-BS on GPU, 1x: 0.320649 ms
Test PASSED
Time for dense-to-jagged PRECOMPUTED-BS on GPU, 1x: 0.286150 ms
Test PASSED
Time for dense-to-jagged REPEATED-BS on GPU, 2x: 0.640309 ms
Test PASSED
Time for dense-to-jagged PRECOMPUTED-BS on GPU, 2x: 0.496244 ms
Test PASSED
Time for dense-to-jagged REPEATED-BS on GPU, 4x: 1.279855 ms
Test PASSED
Time for dense-to-jagged PRECOMPUTED-BS on GPU, 4x: 0.915551 ms
Test PASSED
Time for dense-to-jagged REPEATED-BS on GPU, 8x: 2.558100 ms
Test PASSED
Time for dense-to-jagged PRECOMPUTED-BS on GPU, 8x: 1.755655 ms
Test PASSED
Time for dense-to-jagged REPEATED-BS on GPU, 16x: 5.114698 ms
Test PASSED
Time for dense-to-jagged PRECOMPUTED-BS on GPU, 16x: 3.434896 ms
Test PASSED
Time for dense-to-jagged REPEATED-BS on GPU, 32x: 10.237050 ms
Test PASSED
Time for dense-to-jagged PRECOMPUTED-BS on GPU, 32x: 6.793153 ms
Test PASSED
Done