#!/bin/bash
# CUDA test script to verify GPU functionality in container
set -e
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TEST_DIR="${SCRIPT_DIR}/cuda_test"
CUDA_SRC="${TEST_DIR}/cuda_test.cu"
CUDA_BIN="${TEST_DIR}/cuda_test"
# Create test directory
mkdir -p "${TEST_DIR}"
# Create CUDA test program
cat > "${CUDA_SRC}" << 'EOF'
#include <stdio.h>
#include <stdlib.h>   /* malloc, free, rand, exit */
#include <math.h>     /* fabs */
#include <cuda_runtime.h>
// CUDA kernel for simple vector addition
__global__ void vectorAdd(const float *A, const float *B, float *C, int numElements)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < numElements) {
        C[i] = A[i] + B[i];
    }
}
void checkCudaError(cudaError_t err, const char *msg)
{
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA Error: %s - %s\n", msg, cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
}
int main(void)
{
    printf("=== CUDA Device Test ===\n\n");
    // Check CUDA device count
    int deviceCount = 0;
    cudaError_t err = cudaGetDeviceCount(&deviceCount);
    checkCudaError(err, "Failed to get device count");
    printf("CUDA Devices Found: %d\n\n", deviceCount);
    if (deviceCount == 0) {
        printf("No CUDA-capable devices found!\n");
        return 1;
    }
    // Display device properties
    for (int dev = 0; dev < deviceCount; dev++) {
        cudaDeviceProp deviceProp;
        cudaGetDeviceProperties(&deviceProp, dev);
        printf("Device %d: %s\n", dev, deviceProp.name);
        printf(" Compute Capability: %d.%d\n", deviceProp.major, deviceProp.minor);
        printf(" Total Global Memory: %.2f GB\n",
               (float)deviceProp.totalGlobalMem / (1024*1024*1024));
        printf(" Multiprocessors: %d\n", deviceProp.multiProcessorCount);
        printf(" CUDA Cores: %d\n",
               deviceProp.multiProcessorCount * 128); // Approximate for Jetson
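        // Note: FP32 cores per SM vary by architecture (e.g. 64 on Volta/Xavier,
        // 128 on Ampere/Orin), so the figure above is only a rough estimate.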
| printf(" Max Threads per Block: %d\n", deviceProp.maxThreadsPerBlock); | |
| printf(" Max Threads Dim: (%d, %d, %d)\n", | |
| deviceProp.maxThreadsDim[0], | |
| deviceProp.maxThreadsDim[1], | |
| deviceProp.maxThreadsDim[2]); | |
| printf(" Max Grid Size: (%d, %d, %d)\n", | |
| deviceProp.maxGridSize[0], | |
| deviceProp.maxGridSize[1], | |
| deviceProp.maxGridSize[2]); | |
| printf(" Warp Size: %d\n", deviceProp.warpSize); | |
| printf(" Memory Clock Rate: %.2f MHz\n", deviceProp.memoryClockRate / 1000.0); | |
| printf(" Memory Bus Width: %d-bit\n", deviceProp.memoryBusWidth); | |
| printf(" L2 Cache Size: %.2f MB\n", deviceProp.l2CacheSize / (1024.0*1024.0)); | |
| printf("\n"); | |
| } | |
| // Perform simple vector addition test | |
| printf("=== Vector Addition Test ===\n"); | |
| int numElements = 50000; | |
| size_t size = numElements * sizeof(float); | |
| printf("Vector size: %d elements (%.2f KB)\n", numElements, size / 1024.0); | |
| // Allocate host memory | |
| float *h_A = (float *)malloc(size); | |
| float *h_B = (float *)malloc(size); | |
| float *h_C = (float *)malloc(size); | |
| // Initialize input vectors | |
| for (int i = 0; i < numElements; ++i) { | |
| h_A[i] = rand() / (float)RAND_MAX; | |
| h_B[i] = rand() / (float)RAND_MAX; | |
| } | |
| // Allocate device memory | |
| float *d_A = NULL, *d_B = NULL, *d_C = NULL; | |
| checkCudaError(cudaMalloc((void **)&d_A, size), "Failed to allocate d_A"); | |
| checkCudaError(cudaMalloc((void **)&d_B, size), "Failed to allocate d_B"); | |
| checkCudaError(cudaMalloc((void **)&d_C, size), "Failed to allocate d_C"); | |
| // Copy data to device | |
| checkCudaError(cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice), | |
| "Failed to copy h_A to device"); | |
| checkCudaError(cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice), | |
| "Failed to copy h_B to device"); | |
| // Launch kernel | |
| int threadsPerBlock = 256; | |
| int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; | |
| printf("Launching kernel: %d blocks x %d threads\n", blocksPerGrid, threadsPerBlock); | |
| vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, numElements); | |
| checkCudaError(cudaGetLastError(), "Failed to launch kernel"); | |
| // Copy result back to host | |
| checkCudaError(cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost), | |
| "Failed to copy result to host"); | |
| // Verify result | |
| printf("Verifying results...\n"); | |
| bool success = true; | |
| for (int i = 0; i < numElements; ++i) { | |
| float expected = h_A[i] + h_B[i]; | |
| if (fabs(h_C[i] - expected) > 1e-5) { | |
| fprintf(stderr, "Result verification failed at element %d!\n", i); | |
| fprintf(stderr, "Expected: %f, Got: %f\n", expected, h_C[i]); | |
| success = false; | |
| break; | |
| } | |
| } | |
| if (success) { | |
| printf("✓ Test PASSED! GPU computation successful.\n"); | |
| } | |
| // Cleanup | |
| cudaFree(d_A); | |
| cudaFree(d_B); | |
| cudaFree(d_C); | |
| free(h_A); | |
| free(h_B); | |
| free(h_C); | |
| printf("\n=== CUDA Test Complete ===\n"); | |
| return success ? 0 : 1; | |
| } | |
EOF
echo "=== CUDA Test Script ==="
echo "Source file: ${CUDA_SRC}"
echo ""
# Check if nvcc is available
if ! command -v nvcc &> /dev/null; then
    echo "ERROR: nvcc (CUDA compiler) not found!"
    echo "Please ensure CUDA toolkit is installed in the container."
    exit 1
fi
# Display CUDA version
echo "CUDA Version Information:"
nvcc --version
echo ""
# Compile the CUDA program
echo "Compiling CUDA test program..."
| nvcc -o "${CUDA_BIN}" "${CUDA_SRC}" -O2 | |
| if [ $? -eq 0 ]; then | |
| echo "✓ Compilation successful!" | |
| echo "" | |
| # Run the test | |
| echo "Running CUDA test..." | |
| echo "" | |
| "${CUDA_BIN}" | |
| exit_code=$? | |
| echo "" | |
| if [ $exit_code -eq 0 ]; then | |
| echo "✓ All tests passed!" | |
| else | |
| echo "✗ Tests failed with exit code: $exit_code" | |
| fi | |
| exit $exit_code | |
| else | |
| echo "✗ Compilation failed!" | |
| exit 1 | |
| fi |
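A sketch of how this check might be launched on a Jetson host, assuming the script has been saved as cuda_test.sh (hypothetical name) and that an NVIDIA L4T/JetPack container image with the CUDA toolkit is available; adjust the image tag to whatever you actually use:

docker run --rm --runtime nvidia \
    -v "$(pwd)":/work -w /work \
    nvcr.io/nvidia/l4t-jetpack:r36.2.0 \
    bash ./cuda_test.sh   # hypothetical filename and image tag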
#!/bin/bash
# TensorRT test script to verify inference optimization functionality
set -e
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TEST_DIR="${SCRIPT_DIR}/tensorrt_test"
TRT_SRC="${TEST_DIR}/tensorrt_test.cpp"
TRT_BIN="${TEST_DIR}/tensorrt_test"
# Create test directory
mkdir -p "${TEST_DIR}"
# Create TensorRT test program
cat > "${TRT_SRC}" << 'EOF'
#include <iostream>
#include <fstream>
#include <vector>
#include <algorithm>  // std::min
#include <cmath>      // std::abs
#include <cstdlib>    // exit, EXIT_FAILURE
#include <cuda_runtime_api.h>
#include <NvInfer.h>
#include <NvOnnxParser.h>
using namespace nvinfer1;
// Logger for TensorRT
class Logger : public ILogger {
    void log(Severity severity, const char* msg) noexcept override {
        if (severity <= Severity::kWARNING)
            std::cout << msg << std::endl;
    }
} gLogger;
void checkCudaError(cudaError_t err, const char* msg) {
    if (err != cudaSuccess) {
        std::cerr << "CUDA Error: " << msg << " - " << cudaGetErrorString(err) << std::endl;
        exit(EXIT_FAILURE);
    }
}
int main() {
    std::cout << "=== TensorRT Test ===" << std::endl << std::endl;
    // Check TensorRT version
    std::cout << "TensorRT Version: "
              << NV_TENSORRT_MAJOR << "."
              << NV_TENSORRT_MINOR << "."
              << NV_TENSORRT_PATCH << std::endl << std::endl;
    // Check CUDA device
    int deviceCount = 0;
    cudaError_t err = cudaGetDeviceCount(&deviceCount);
    checkCudaError(err, "Failed to get device count");
    if (deviceCount == 0) {
        std::cerr << "No CUDA-capable devices found!" << std::endl;
        return 1;
    }
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, 0);
    std::cout << "Using GPU: " << deviceProp.name << std::endl;
    std::cout << "Compute Capability: " << deviceProp.major << "." << deviceProp.minor << std::endl << std::endl;
    // Create builder
    std::cout << "Creating TensorRT builder..." << std::endl;
    IBuilder* builder = createInferBuilder(gLogger);
    if (!builder) {
        std::cerr << "Failed to create builder!" << std::endl;
        return 1;
    }
    // Create network
    const uint32_t explicitBatch = 1U << static_cast<uint32_t>(NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
    INetworkDefinition* network = builder->createNetworkV2(explicitBatch);
    if (!network) {
        std::cerr << "Failed to create network!" << std::endl;
        return 1;
    }
| std::cout << "Building simple test network..." << std::endl; | |
| // Create a simple network: Input -> Convolution -> Output | |
| // Input: [1, 3, 4, 4] (batch, channels, height, width) | |
| // Output: [1, 2, 4, 4] | |
| const int batchSize = 1; | |
| const int inputChannels = 3; | |
| const int outputChannels = 2; | |
| const int height = 4; | |
| const int width = 4; | |
| ITensor* input = network->addInput("input", DataType::kFLOAT, Dims4{batchSize, inputChannels, height, width}); | |
| if (!input) { | |
| std::cerr << "Failed to add input!" << std::endl; | |
| return 1; | |
| } | |
| // Create weights for convolution (1x1 conv) | |
| const int kernelSize = 1; | |
| const int weightsSize = outputChannels * inputChannels * kernelSize * kernelSize; | |
| std::vector<float> weights(weightsSize); | |
| std::vector<float> bias(outputChannels); | |
| // Initialize weights and bias (simple pattern for testing) | |
| for (int i = 0; i < weightsSize; i++) { | |
| weights[i] = 0.1f; | |
| } | |
| for (int i = 0; i < outputChannels; i++) { | |
| bias[i] = 0.5f; | |
| } | |
| Weights kernelWeights{DataType::kFLOAT, weights.data(), weightsSize}; | |
| Weights biasWeights{DataType::kFLOAT, bias.data(), outputChannels}; | |
| // Add 1x1 convolution layer | |
| IConvolutionLayer* conv = network->addConvolutionNd(*input, outputChannels, DimsHW{1, 1}, kernelWeights, biasWeights); | |
| if (!conv) { | |
| std::cerr << "Failed to add convolution layer!" << std::endl; | |
| return 1; | |
| } | |
| conv->getOutput(0)->setName("output"); | |
| network->markOutput(*conv->getOutput(0)); | |
| std::cout << "Network structure:" << std::endl; | |
| std::cout << " Input: [1, 3, 4, 4]" << std::endl; | |
| std::cout << " Conv 1x1: 3 channels -> 2 channels" << std::endl; | |
| std::cout << " Output: [1, 2, 4, 4]" << std::endl << std::endl; | |
| // Build engine | |
| std::cout << "Building TensorRT engine..." << std::endl; | |
| IBuilderConfig* config = builder->createBuilderConfig(); | |
| if (!config) { | |
| std::cerr << "Failed to create builder config!" << std::endl; | |
| return 1; | |
| } | |
| // Set memory pool limit | |
| config->setMemoryPoolLimit(MemoryPoolType::kWORKSPACE, 1U << 28); // 256 MB | |
| // Enable FP16 if supported | |
| if (builder->platformHasFastFp16()) { | |
| std::cout << "FP16 mode: ENABLED" << std::endl; | |
| config->setFlag(BuilderFlag::kFP16); | |
| } else { | |
| std::cout << "FP16 mode: NOT SUPPORTED" << std::endl; | |
| } | |
| // Build serialized network | |
| IHostMemory* serializedModel = builder->buildSerializedNetwork(*network, *config); | |
| if (!serializedModel) { | |
| std::cerr << "Failed to build engine!" << std::endl; | |
| return 1; | |
| } | |
| std::cout << "✓ Engine built successfully!" << std::endl; | |
| std::cout << "Engine size: " << serializedModel->size() / 1024.0 << " KB" << std::endl << std::endl; | |
| // Create runtime and deserialize engine | |
| std::cout << "Creating runtime and deserializing engine..." << std::endl; | |
| IRuntime* runtime = createInferRuntime(gLogger); | |
| if (!runtime) { | |
| std::cerr << "Failed to create runtime!" << std::endl; | |
| return 1; | |
| } | |
| ICudaEngine* engine = runtime->deserializeCudaEngine(serializedModel->data(), serializedModel->size()); | |
| if (!engine) { | |
| std::cerr << "Failed to deserialize engine!" << std::endl; | |
| return 1; | |
| } | |
| IExecutionContext* context = engine->createExecutionContext(); | |
| if (!context) { | |
| std::cerr << "Failed to create execution context!" << std::endl; | |
| return 1; | |
| } | |
| std::cout << "✓ Runtime initialized successfully!" << std::endl << std::endl; | |
| // Prepare input and output buffers | |
| std::cout << "Running inference test..." << std::endl; | |
| const int inputSize = batchSize * inputChannels * height * width; | |
| const int outputSize = batchSize * outputChannels * height * width; | |
| std::vector<float> hostInput(inputSize, 1.0f); // Initialize with 1.0 | |
| std::vector<float> hostOutput(outputSize, 0.0f); | |
| void* deviceInput = nullptr; | |
| void* deviceOutput = nullptr; | |
| checkCudaError(cudaMalloc(&deviceInput, inputSize * sizeof(float)), "Failed to allocate device input"); | |
| checkCudaError(cudaMalloc(&deviceOutput, outputSize * sizeof(float)), "Failed to allocate device output"); | |
| // Copy input to device | |
| checkCudaError(cudaMemcpy(deviceInput, hostInput.data(), inputSize * sizeof(float), cudaMemcpyHostToDevice), | |
| "Failed to copy input to device"); | |
| // Set up bindings | |
| void* bindings[] = {deviceInput, deviceOutput}; | |
| // Execute inference | |
| bool status = context->executeV2(bindings); | |
| if (!status) { | |
| std::cerr << "Failed to execute inference!" << std::endl; | |
| return 1; | |
| } | |
| // Copy output back to host | |
| checkCudaError(cudaMemcpy(hostOutput.data(), deviceOutput, outputSize * sizeof(float), cudaMemcpyDeviceToHost), | |
| "Failed to copy output to host"); | |
| std::cout << "✓ Inference completed successfully!" << std::endl << std::endl; | |
| // Display results | |
| std::cout << "Input tensor: [" << batchSize << ", " << inputChannels << ", " | |
| << height << ", " << width << "] = " << inputSize << " elements" << std::endl; | |
| std::cout << "Output tensor: [" << batchSize << ", " << outputChannels << ", " | |
| << height << ", " << width << "] = " << outputSize << " elements" << std::endl << std::endl; | |
| std::cout << "Sample input values (first 5):" << std::endl; | |
| for (int i = 0; i < std::min(5, inputSize); i++) { | |
| std::cout << " [" << i << "] = " << hostInput[i] << std::endl; | |
| } | |
| std::cout << "\nSample output values (first 5):" << std::endl; | |
| for (int i = 0; i < std::min(5, outputSize); i++) { | |
| std::cout << " [" << i << "] = " << hostOutput[i] << std::endl; | |
| } | |
    // Verify output (with input=1.0 and weights=0.1 for 3 channels, bias=0.5)
    // Expected: 3 * 0.1 + 0.5 = 0.8
    std::cout << "\nVerifying results..." << std::endl;
    bool success = true;
    float expected = inputChannels * 0.1f + 0.5f;
    int errors = 0;
    const int maxErrors = 5;
    for (int i = 0; i < outputSize; i++) {
        if (std::abs(hostOutput[i] - expected) > 0.01f) {
            if (errors < maxErrors) {
                std::cerr << "Verification failed at output[" << i << "]!" << std::endl;
                std::cerr << "Expected: ~" << expected << ", Got: " << hostOutput[i] << std::endl;
            }
            errors++;
            success = false;
        }
    }
    if (errors > maxErrors) {
        std::cerr << "... and " << (errors - maxErrors) << " more errors" << std::endl;
    }
    if (success) {
        std::cout << "✓ Results verified successfully!" << std::endl;
    }
    // Cleanup
    cudaFree(deviceInput);
    cudaFree(deviceOutput);
    delete context;
    delete engine;
    delete runtime;
    delete serializedModel;
    delete config;
    delete network;
    delete builder;
    std::cout << "\n=== TensorRT Test Complete ===" << std::endl;
    return success ? 0 : 1;
}
EOF
| echo "=== TensorRT Test Script ===" | |
| echo "Source file: ${TRT_SRC}" | |
| echo "" | |
| # Check if nvcc is available | |
| if ! command -v nvcc &> /dev/null; then | |
| echo "ERROR: nvcc (CUDA compiler) not found!" | |
| echo "Please ensure CUDA toolkit is installed in the container." | |
| exit 1 | |
| fi | |
| # Check for TensorRT headers | |
| TENSORRT_INCLUDE="/usr/include/aarch64-linux-gnu" | |
| if [ ! -f "${TENSORRT_INCLUDE}/NvInfer.h" ] && [ ! -f "/usr/include/NvInfer.h" ]; then | |
| echo "ERROR: TensorRT headers not found!" | |
| echo "Please ensure TensorRT is installed in the container." | |
| echo "Checked paths:" | |
| echo " - /usr/include/NvInfer.h" | |
| echo " - ${TENSORRT_INCLUDE}/NvInfer.h" | |
| exit 1 | |
| fi | |
| # Display versions | |
| echo "CUDA Version:" | |
| nvcc --version | grep "release" | |
| echo "" | |
| # Compile the TensorRT program | |
| echo "Compiling TensorRT test program..." | |
| # Try to find TensorRT libraries | |
| TRT_LIB_PATH="/usr/lib/aarch64-linux-gnu" | |
| if [ ! -d "${TRT_LIB_PATH}" ]; then | |
| TRT_LIB_PATH="/usr/lib" | |
| fi | |
| # Compile with appropriate flags | |
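# The paths above assume a Jetson/aarch64 layout; on x86_64 hosts the Debian packages
# typically install under /usr/include/x86_64-linux-gnu and /usr/lib/x86_64-linux-gnu,
# so TENSORRT_INCLUDE and TRT_LIB_PATH would need to be adjusted accordingly.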
| nvcc -o "${TRT_BIN}" "${TRT_SRC}" \ | |
| -I/usr/include/aarch64-linux-gnu \ | |
| -L${TRT_LIB_PATH} \ | |
| -lnvinfer \ | |
| -lnvonnxparser \ | |
| -lcudart \ | |
| -std=c++11 \ | |
| -O2 | |
| if [ $? -eq 0 ]; then | |
| echo "✓ Compilation successful!" | |
| echo "" | |
| # Run the test | |
| echo "Running TensorRT test..." | |
| echo "" | |
| "${TRT_BIN}" | |
| exit_code=$? | |
| echo "" | |
| if [ $exit_code -eq 0 ]; then | |
| echo "✓ All tests passed!" | |
| else | |
| echo "✗ Tests failed with exit code: $exit_code" | |
| fi | |
| exit $exit_code | |
| else | |
| echo "✗ Compilation failed!" | |
| echo "" | |
| echo "Debug information:" | |
| echo "TensorRT include path: ${TENSORRT_INCLUDE}" | |
| echo "TensorRT library path: ${TRT_LIB_PATH}" | |
| echo "" | |
| echo "Available TensorRT libraries:" | |
| ls -l ${TRT_LIB_PATH}/libnv* 2>/dev/null | grep -i infer || echo " No TensorRT libraries found" | |
| exit 1 | |
| fi |
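Both scripts exit non-zero on failure, so they can be chained into a single GPU-stack health check for the container; a minimal sketch, assuming the files are saved as cuda_test.sh and tensorrt_test.sh (hypothetical names):

bash ./cuda_test.sh && bash ./tensorrt_test.sh && echo "GPU + TensorRT stack OK"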