@jerry73204
Created October 22, 2025 12:48
#!/bin/bash
# CUDA test script to verify GPU functionality in container
set -e
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TEST_DIR="${SCRIPT_DIR}/cuda_test"
CUDA_SRC="${TEST_DIR}/cuda_test.cu"
CUDA_BIN="${TEST_DIR}/cuda_test"
# Create test directory
mkdir -p "${TEST_DIR}"
# Create CUDA test program
cat > "${CUDA_SRC}" << 'EOF'
#include <stdio.h>
#include <stdlib.h>   /* malloc, free, rand, exit */
#include <math.h>     /* fabs */
#include <cuda_runtime.h>
// CUDA kernel for simple vector addition
__global__ void vectorAdd(const float *A, const float *B, float *C, int numElements)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < numElements) {
        C[i] = A[i] + B[i];
    }
}

void checkCudaError(cudaError_t err, const char *msg)
{
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA Error: %s - %s\n", msg, cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
}
int main(void)
{
    printf("=== CUDA Device Test ===\n\n");

    // Check CUDA device count
    int deviceCount = 0;
    cudaError_t err = cudaGetDeviceCount(&deviceCount);
    checkCudaError(err, "Failed to get device count");
    printf("CUDA Devices Found: %d\n\n", deviceCount);
    if (deviceCount == 0) {
        printf("No CUDA-capable devices found!\n");
        return 1;
    }

    // Display device properties
    for (int dev = 0; dev < deviceCount; dev++) {
        cudaDeviceProp deviceProp;
        cudaGetDeviceProperties(&deviceProp, dev);
        printf("Device %d: %s\n", dev, deviceProp.name);
        printf(" Compute Capability: %d.%d\n", deviceProp.major, deviceProp.minor);
        printf(" Total Global Memory: %.2f GB\n",
               (float)deviceProp.totalGlobalMem / (1024*1024*1024));
        printf(" Multiprocessors: %d\n", deviceProp.multiProcessorCount);
        printf(" CUDA Cores: %d\n",
               deviceProp.multiProcessorCount * 128); // Approximate for Jetson
        printf(" Max Threads per Block: %d\n", deviceProp.maxThreadsPerBlock);
        printf(" Max Threads Dim: (%d, %d, %d)\n",
               deviceProp.maxThreadsDim[0],
               deviceProp.maxThreadsDim[1],
               deviceProp.maxThreadsDim[2]);
        printf(" Max Grid Size: (%d, %d, %d)\n",
               deviceProp.maxGridSize[0],
               deviceProp.maxGridSize[1],
               deviceProp.maxGridSize[2]);
        printf(" Warp Size: %d\n", deviceProp.warpSize);
        printf(" Memory Clock Rate: %.2f MHz\n", deviceProp.memoryClockRate / 1000.0);
        printf(" Memory Bus Width: %d-bit\n", deviceProp.memoryBusWidth);
        printf(" L2 Cache Size: %.2f MB\n", deviceProp.l2CacheSize / (1024.0*1024.0));
        printf("\n");
    }
    // Perform simple vector addition test
    printf("=== Vector Addition Test ===\n");
    int numElements = 50000;
    size_t size = numElements * sizeof(float);
    printf("Vector size: %d elements (%.2f KB)\n", numElements, size / 1024.0);

    // Allocate host memory
    float *h_A = (float *)malloc(size);
    float *h_B = (float *)malloc(size);
    float *h_C = (float *)malloc(size);

    // Initialize input vectors
    for (int i = 0; i < numElements; ++i) {
        h_A[i] = rand() / (float)RAND_MAX;
        h_B[i] = rand() / (float)RAND_MAX;
    }

    // Allocate device memory
    float *d_A = NULL, *d_B = NULL, *d_C = NULL;
    checkCudaError(cudaMalloc((void **)&d_A, size), "Failed to allocate d_A");
    checkCudaError(cudaMalloc((void **)&d_B, size), "Failed to allocate d_B");
    checkCudaError(cudaMalloc((void **)&d_C, size), "Failed to allocate d_C");

    // Copy data to device
    checkCudaError(cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice),
                   "Failed to copy h_A to device");
    checkCudaError(cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice),
                   "Failed to copy h_B to device");
    // Launch kernel: round the grid size up so every element is covered even
    // when numElements is not a multiple of threadsPerBlock
    // (e.g. 50000 elements with 256 threads/block -> 196 blocks).
    int threadsPerBlock = 256;
    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
    printf("Launching kernel: %d blocks x %d threads\n", blocksPerGrid, threadsPerBlock);
    vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, numElements);
    checkCudaError(cudaGetLastError(), "Failed to launch kernel");

    // Copy result back to host (cudaMemcpy on the default stream waits for the kernel)
    checkCudaError(cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost),
                   "Failed to copy result to host");

    // Verify result
    printf("Verifying results...\n");
    bool success = true;
    for (int i = 0; i < numElements; ++i) {
        float expected = h_A[i] + h_B[i];
        if (fabs(h_C[i] - expected) > 1e-5) {
            fprintf(stderr, "Result verification failed at element %d!\n", i);
            fprintf(stderr, "Expected: %f, Got: %f\n", expected, h_C[i]);
            success = false;
            break;
        }
    }
    if (success) {
        printf("✓ Test PASSED! GPU computation successful.\n");
    }
    // Cleanup
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    free(h_A);
    free(h_B);
    free(h_C);

    printf("\n=== CUDA Test Complete ===\n");
    return success ? 0 : 1;
}
EOF
echo "=== CUDA Test Script ==="
echo "Source file: ${CUDA_SRC}"
echo ""
# Check if nvcc is available
if ! command -v nvcc &> /dev/null; then
    echo "ERROR: nvcc (CUDA compiler) not found!"
    echo "Please ensure CUDA toolkit is installed in the container."
    exit 1
fi
# Display CUDA version
echo "CUDA Version Information:"
nvcc --version
echo ""
# Compile the CUDA program
echo "Compiling CUDA test program..."
# Run nvcc inside the if so a compile failure is not swallowed by `set -e`
# before the error message can be printed.
if nvcc -o "${CUDA_BIN}" "${CUDA_SRC}" -O2; then
    echo "✓ Compilation successful!"
    echo ""

    # Run the test (capture the exit code without tripping `set -e`)
    echo "Running CUDA test..."
    echo ""
    exit_code=0
    "${CUDA_BIN}" || exit_code=$?
    echo ""
    if [ $exit_code -eq 0 ]; then
        echo "✓ All tests passed!"
    else
        echo "✗ Tests failed with exit code: $exit_code"
    fi
    exit $exit_code
else
    echo "✗ Compilation failed!"
    exit 1
fi
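
A minimal sketch of how this script might be invoked inside a Jetson container, assuming the NVIDIA container runtime is configured on the host; the image tag and the script filename below are placeholders, not part of the original scripts:

    # Hypothetical invocation: mount the script directory into an L4T JetPack
    # container (adjust the image tag and script filename for your environment).
    docker run --rm --runtime nvidia \
        -v "$(pwd)":/work -w /work \
        nvcr.io/nvidia/l4t-jetpack:r36.3.0 \
        bash ./cuda_test.sh
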
#!/bin/bash
# TensorRT test script to verify inference optimization functionality
set -e
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TEST_DIR="${SCRIPT_DIR}/tensorrt_test"
TRT_SRC="${TEST_DIR}/tensorrt_test.cpp"
TRT_BIN="${TEST_DIR}/tensorrt_test"
# Create test directory
mkdir -p "${TEST_DIR}"
# Create TensorRT test program
cat > "${TRT_SRC}" << 'EOF'
#include <iostream>
#include <fstream>
#include <vector>
#include <algorithm>  // std::min
#include <cmath>      // std::abs
#include <cstdlib>    // exit, EXIT_FAILURE
#include <cuda_runtime_api.h>
#include <NvInfer.h>
#include <NvOnnxParser.h>
using namespace nvinfer1;

// Logger for TensorRT
class Logger : public ILogger {
    void log(Severity severity, const char* msg) noexcept override {
        if (severity <= Severity::kWARNING)
            std::cout << msg << std::endl;
    }
} gLogger;

void checkCudaError(cudaError_t err, const char* msg) {
    if (err != cudaSuccess) {
        std::cerr << "CUDA Error: " << msg << " - " << cudaGetErrorString(err) << std::endl;
        exit(EXIT_FAILURE);
    }
}
int main() {
    std::cout << "=== TensorRT Test ===" << std::endl << std::endl;

    // Check TensorRT version
    std::cout << "TensorRT Version: "
              << NV_TENSORRT_MAJOR << "."
              << NV_TENSORRT_MINOR << "."
              << NV_TENSORRT_PATCH << std::endl << std::endl;

    // Check CUDA device
    int deviceCount = 0;
    cudaError_t err = cudaGetDeviceCount(&deviceCount);
    checkCudaError(err, "Failed to get device count");
    if (deviceCount == 0) {
        std::cerr << "No CUDA-capable devices found!" << std::endl;
        return 1;
    }
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, 0);
    std::cout << "Using GPU: " << deviceProp.name << std::endl;
    std::cout << "Compute Capability: " << deviceProp.major << "." << deviceProp.minor << std::endl << std::endl;

    // Create builder
    std::cout << "Creating TensorRT builder..." << std::endl;
    IBuilder* builder = createInferBuilder(gLogger);
    if (!builder) {
        std::cerr << "Failed to create builder!" << std::endl;
        return 1;
    }

    // Create network
    const uint32_t explicitBatch = 1U << static_cast<uint32_t>(NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
    INetworkDefinition* network = builder->createNetworkV2(explicitBatch);
    if (!network) {
        std::cerr << "Failed to create network!" << std::endl;
        return 1;
    }
    std::cout << "Building simple test network..." << std::endl;

    // Create a simple network: Input -> Convolution -> Output
    // Input:  [1, 3, 4, 4] (batch, channels, height, width)
    // Output: [1, 2, 4, 4]
    const int batchSize = 1;
    const int inputChannels = 3;
    const int outputChannels = 2;
    const int height = 4;
    const int width = 4;
    ITensor* input = network->addInput("input", DataType::kFLOAT, Dims4{batchSize, inputChannels, height, width});
    if (!input) {
        std::cerr << "Failed to add input!" << std::endl;
        return 1;
    }

    // Create weights for convolution (1x1 conv)
    const int kernelSize = 1;
    const int weightsSize = outputChannels * inputChannels * kernelSize * kernelSize;
    std::vector<float> weights(weightsSize);
    std::vector<float> bias(outputChannels);

    // Initialize weights and bias (simple pattern for testing)
    for (int i = 0; i < weightsSize; i++) {
        weights[i] = 0.1f;
    }
    for (int i = 0; i < outputChannels; i++) {
        bias[i] = 0.5f;
    }
    Weights kernelWeights{DataType::kFLOAT, weights.data(), weightsSize};
    Weights biasWeights{DataType::kFLOAT, bias.data(), outputChannels};

    // Add 1x1 convolution layer
    IConvolutionLayer* conv = network->addConvolutionNd(*input, outputChannels, DimsHW{1, 1}, kernelWeights, biasWeights);
    if (!conv) {
        std::cerr << "Failed to add convolution layer!" << std::endl;
        return 1;
    }
    conv->getOutput(0)->setName("output");
    network->markOutput(*conv->getOutput(0));

    std::cout << "Network structure:" << std::endl;
    std::cout << " Input: [1, 3, 4, 4]" << std::endl;
    std::cout << " Conv 1x1: 3 channels -> 2 channels" << std::endl;
    std::cout << " Output: [1, 2, 4, 4]" << std::endl << std::endl;

    // Build engine
    std::cout << "Building TensorRT engine..." << std::endl;
    IBuilderConfig* config = builder->createBuilderConfig();
    if (!config) {
        std::cerr << "Failed to create builder config!" << std::endl;
        return 1;
    }

    // Set memory pool limit
    config->setMemoryPoolLimit(MemoryPoolType::kWORKSPACE, 1U << 28); // 256 MB

    // Enable FP16 if supported
    if (builder->platformHasFastFp16()) {
        std::cout << "FP16 mode: ENABLED" << std::endl;
        config->setFlag(BuilderFlag::kFP16);
    } else {
        std::cout << "FP16 mode: NOT SUPPORTED" << std::endl;
    }

    // Build serialized network
    IHostMemory* serializedModel = builder->buildSerializedNetwork(*network, *config);
    if (!serializedModel) {
        std::cerr << "Failed to build engine!" << std::endl;
        return 1;
    }
    std::cout << "✓ Engine built successfully!" << std::endl;
    std::cout << "Engine size: " << serializedModel->size() / 1024.0 << " KB" << std::endl << std::endl;

    // Create runtime and deserialize engine
    std::cout << "Creating runtime and deserializing engine..." << std::endl;
    IRuntime* runtime = createInferRuntime(gLogger);
    if (!runtime) {
        std::cerr << "Failed to create runtime!" << std::endl;
        return 1;
    }
    ICudaEngine* engine = runtime->deserializeCudaEngine(serializedModel->data(), serializedModel->size());
    if (!engine) {
        std::cerr << "Failed to deserialize engine!" << std::endl;
        return 1;
    }
    IExecutionContext* context = engine->createExecutionContext();
    if (!context) {
        std::cerr << "Failed to create execution context!" << std::endl;
        return 1;
    }
    std::cout << "✓ Runtime initialized successfully!" << std::endl << std::endl;
    // Prepare input and output buffers
    std::cout << "Running inference test..." << std::endl;
    const int inputSize = batchSize * inputChannels * height * width;
    const int outputSize = batchSize * outputChannels * height * width;
    std::vector<float> hostInput(inputSize, 1.0f); // Initialize with 1.0
    std::vector<float> hostOutput(outputSize, 0.0f);
    void* deviceInput = nullptr;
    void* deviceOutput = nullptr;
    checkCudaError(cudaMalloc(&deviceInput, inputSize * sizeof(float)), "Failed to allocate device input");
    checkCudaError(cudaMalloc(&deviceOutput, outputSize * sizeof(float)), "Failed to allocate device output");

    // Copy input to device
    checkCudaError(cudaMemcpy(deviceInput, hostInput.data(), inputSize * sizeof(float), cudaMemcpyHostToDevice),
                   "Failed to copy input to device");

    // Set up bindings
    void* bindings[] = {deviceInput, deviceOutput};

    // Execute inference
    bool status = context->executeV2(bindings);
    if (!status) {
        std::cerr << "Failed to execute inference!" << std::endl;
        return 1;
    }

    // Copy output back to host
    checkCudaError(cudaMemcpy(hostOutput.data(), deviceOutput, outputSize * sizeof(float), cudaMemcpyDeviceToHost),
                   "Failed to copy output to host");
    std::cout << "✓ Inference completed successfully!" << std::endl << std::endl;

    // Display results
    std::cout << "Input tensor: [" << batchSize << ", " << inputChannels << ", "
              << height << ", " << width << "] = " << inputSize << " elements" << std::endl;
    std::cout << "Output tensor: [" << batchSize << ", " << outputChannels << ", "
              << height << ", " << width << "] = " << outputSize << " elements" << std::endl << std::endl;
    std::cout << "Sample input values (first 5):" << std::endl;
    for (int i = 0; i < std::min(5, inputSize); i++) {
        std::cout << " [" << i << "] = " << hostInput[i] << std::endl;
    }
    std::cout << "\nSample output values (first 5):" << std::endl;
    for (int i = 0; i < std::min(5, outputSize); i++) {
        std::cout << " [" << i << "] = " << hostOutput[i] << std::endl;
    }

    // Verify output (with input=1.0 and weights=0.1 for 3 channels, bias=0.5)
    // Expected: 3 * 0.1 + 0.5 = 0.8
    std::cout << "\nVerifying results..." << std::endl;
    bool success = true;
    float expected = inputChannels * 0.1f + 0.5f;
    int errors = 0;
    const int maxErrors = 5;
    for (int i = 0; i < outputSize; i++) {
        if (std::abs(hostOutput[i] - expected) > 0.01f) {
            if (errors < maxErrors) {
                std::cerr << "Verification failed at output[" << i << "]!" << std::endl;
                std::cerr << "Expected: ~" << expected << ", Got: " << hostOutput[i] << std::endl;
            }
            errors++;
            success = false;
        }
    }
    if (errors > maxErrors) {
        std::cerr << "... and " << (errors - maxErrors) << " more errors" << std::endl;
    }
    if (success) {
        std::cout << "✓ Results verified successfully!" << std::endl;
    }

    // Cleanup
    cudaFree(deviceInput);
    cudaFree(deviceOutput);
    delete context;
    delete engine;
    delete runtime;
    delete serializedModel;
    delete config;
    delete network;
    delete builder;

    std::cout << "\n=== TensorRT Test Complete ===" << std::endl;
    return success ? 0 : 1;
}
EOF
echo "=== TensorRT Test Script ==="
echo "Source file: ${TRT_SRC}"
echo ""
# Check if nvcc is available
if ! command -v nvcc &> /dev/null; then
    echo "ERROR: nvcc (CUDA compiler) not found!"
    echo "Please ensure CUDA toolkit is installed in the container."
    exit 1
fi
# Check for TensorRT headers
TENSORRT_INCLUDE="/usr/include/aarch64-linux-gnu"
if [ ! -f "${TENSORRT_INCLUDE}/NvInfer.h" ] && [ ! -f "/usr/include/NvInfer.h" ]; then
    echo "ERROR: TensorRT headers not found!"
    echo "Please ensure TensorRT is installed in the container."
    echo "Checked paths:"
    echo " - /usr/include/NvInfer.h"
    echo " - ${TENSORRT_INCLUDE}/NvInfer.h"
    exit 1
fi
# Display versions
echo "CUDA Version:"
nvcc --version | grep "release"
echo ""
# Compile the TensorRT program
echo "Compiling TensorRT test program..."
# Try to find TensorRT libraries
TRT_LIB_PATH="/usr/lib/aarch64-linux-gnu"
if [ ! -d "${TRT_LIB_PATH}" ]; then
    TRT_LIB_PATH="/usr/lib"
fi
# Compile with appropriate flags
# Run nvcc inside the if so a compile failure is not swallowed by `set -e`
# before the debug information can be printed.
if nvcc -o "${TRT_BIN}" "${TRT_SRC}" \
        -I"${TENSORRT_INCLUDE}" \
        -L"${TRT_LIB_PATH}" \
        -lnvinfer \
        -lnvonnxparser \
        -lcudart \
        -std=c++11 \
        -O2; then
    echo "✓ Compilation successful!"
    echo ""

    # Run the test (capture the exit code without tripping `set -e`)
    echo "Running TensorRT test..."
    echo ""
    exit_code=0
    "${TRT_BIN}" || exit_code=$?
    echo ""
    if [ $exit_code -eq 0 ]; then
        echo "✓ All tests passed!"
    else
        echo "✗ Tests failed with exit code: $exit_code"
    fi
    exit $exit_code
else
    echo "✗ Compilation failed!"
    echo ""
    echo "Debug information:"
    echo "TensorRT include path: ${TENSORRT_INCLUDE}"
    echo "TensorRT library path: ${TRT_LIB_PATH}"
    echo ""
    echo "Available TensorRT libraries:"
    ls -l "${TRT_LIB_PATH}"/libnv* 2>/dev/null | grep -i infer || echo " No TensorRT libraries found"
    exit 1
fi
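
If the TensorRT compile step fails, it can help to confirm that the headers and libraries are actually visible inside the container before rerunning the script. A small sketch using standard tools (package names and paths vary between JetPack releases):

    # Hypothetical checks, not part of the original scripts:
    dpkg -l | grep -i nvinfer              # TensorRT Debian packages, if installed via apt
    ldconfig -p | grep libnvinfer          # libraries the dynamic linker can resolve
    find /usr -name NvInfer.h 2>/dev/null  # header location to pass to -I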