@jerry73204
Created October 22, 2025 12:48
#!/bin/bash
# CUDA test script to verify GPU functionality in container
set -e
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TEST_DIR="${SCRIPT_DIR}/cuda_test"
CUDA_SRC="${TEST_DIR}/cuda_test.cu"
CUDA_BIN="${TEST_DIR}/cuda_test"
# Create test directory
mkdir -p "${TEST_DIR}"
# Create CUDA test program
cat > "${CUDA_SRC}" << 'EOF'
#include <stdio.h>
#include <stdlib.h>   /* malloc, free, rand, exit */
#include <math.h>     /* fabs */
#include <cuda_runtime.h>
// CUDA kernel for simple vector addition
__global__ void vectorAdd(const float *A, const float *B, float *C, int numElements)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < numElements) {
        C[i] = A[i] + B[i];
    }
}

void checkCudaError(cudaError_t err, const char *msg)
{
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA Error: %s - %s\n", msg, cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
}
int main(void)
{
    printf("=== CUDA Device Test ===\n\n");

    // Check CUDA device count
    int deviceCount = 0;
    cudaError_t err = cudaGetDeviceCount(&deviceCount);
    checkCudaError(err, "Failed to get device count");
    printf("CUDA Devices Found: %d\n\n", deviceCount);
    if (deviceCount == 0) {
        printf("No CUDA-capable devices found!\n");
        return 1;
    }

    // Display device properties
    for (int dev = 0; dev < deviceCount; dev++) {
        cudaDeviceProp deviceProp;
        cudaGetDeviceProperties(&deviceProp, dev);
        printf("Device %d: %s\n", dev, deviceProp.name);
        printf(" Compute Capability: %d.%d\n", deviceProp.major, deviceProp.minor);
        printf(" Total Global Memory: %.2f GB\n",
               (float)deviceProp.totalGlobalMem / (1024*1024*1024));
        printf(" Multiprocessors: %d\n", deviceProp.multiProcessorCount);
        printf(" CUDA Cores: %d\n",
               deviceProp.multiProcessorCount * 128); // Approximate for Jetson
        printf(" Max Threads per Block: %d\n", deviceProp.maxThreadsPerBlock);
        printf(" Max Threads Dim: (%d, %d, %d)\n",
               deviceProp.maxThreadsDim[0],
               deviceProp.maxThreadsDim[1],
               deviceProp.maxThreadsDim[2]);
        printf(" Max Grid Size: (%d, %d, %d)\n",
               deviceProp.maxGridSize[0],
               deviceProp.maxGridSize[1],
               deviceProp.maxGridSize[2]);
        printf(" Warp Size: %d\n", deviceProp.warpSize);
        printf(" Memory Clock Rate: %.2f MHz\n", deviceProp.memoryClockRate / 1000.0);
        printf(" Memory Bus Width: %d-bit\n", deviceProp.memoryBusWidth);
        printf(" L2 Cache Size: %.2f MB\n", deviceProp.l2CacheSize / (1024.0*1024.0));
        printf("\n");
    }
    // Perform simple vector addition test
    printf("=== Vector Addition Test ===\n");
    int numElements = 50000;
    size_t size = numElements * sizeof(float);
    printf("Vector size: %d elements (%.2f KB)\n", numElements, size / 1024.0);

    // Allocate host memory
    float *h_A = (float *)malloc(size);
    float *h_B = (float *)malloc(size);
    float *h_C = (float *)malloc(size);

    // Initialize input vectors
    for (int i = 0; i < numElements; ++i) {
        h_A[i] = rand() / (float)RAND_MAX;
        h_B[i] = rand() / (float)RAND_MAX;
    }

    // Allocate device memory
    float *d_A = NULL, *d_B = NULL, *d_C = NULL;
    checkCudaError(cudaMalloc((void **)&d_A, size), "Failed to allocate d_A");
    checkCudaError(cudaMalloc((void **)&d_B, size), "Failed to allocate d_B");
    checkCudaError(cudaMalloc((void **)&d_C, size), "Failed to allocate d_C");

    // Copy data to device
    checkCudaError(cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice),
                   "Failed to copy h_A to device");
    checkCudaError(cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice),
                   "Failed to copy h_B to device");
    // Launch kernel: round the grid size up so every element is covered even
    // when numElements is not a multiple of threadsPerBlock
    // (e.g. 50000 elements with 256 threads/block -> 196 blocks).
    int threadsPerBlock = 256;
    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
    printf("Launching kernel: %d blocks x %d threads\n", blocksPerGrid, threadsPerBlock);
    vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, numElements);
    checkCudaError(cudaGetLastError(), "Failed to launch kernel");

    // Copy result back to host (cudaMemcpy on the default stream waits for the kernel)
    checkCudaError(cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost),
                   "Failed to copy result to host");

    // Verify result
    printf("Verifying results...\n");
    bool success = true;
    for (int i = 0; i < numElements; ++i) {
        float expected = h_A[i] + h_B[i];
        if (fabs(h_C[i] - expected) > 1e-5) {
            fprintf(stderr, "Result verification failed at element %d!\n", i);
            fprintf(stderr, "Expected: %f, Got: %f\n", expected, h_C[i]);
            success = false;
            break;
        }
    }
    if (success) {
        printf("✓ Test PASSED! GPU computation successful.\n");
    }
    // Cleanup
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    free(h_A);
    free(h_B);
    free(h_C);

    printf("\n=== CUDA Test Complete ===\n");
    return success ? 0 : 1;
}
EOF
echo "=== CUDA Test Script ==="
echo "Source file: ${CUDA_SRC}"
echo ""
# Check if nvcc is available
if ! command -v nvcc &> /dev/null; then
    echo "ERROR: nvcc (CUDA compiler) not found!"
    echo "Please ensure CUDA toolkit is installed in the container."
    exit 1
fi
# Display CUDA version
echo "CUDA Version Information:"
nvcc --version
echo ""
# Compile the CUDA program
echo "Compiling CUDA test program..."
# Run nvcc inside the if so a compile failure is not swallowed by `set -e`
# before the error message can be printed.
if nvcc -o "${CUDA_BIN}" "${CUDA_SRC}" -O2; then
    echo "✓ Compilation successful!"
    echo ""

    # Run the test (capture the exit code without tripping `set -e`)
    echo "Running CUDA test..."
    echo ""
    exit_code=0
    "${CUDA_BIN}" || exit_code=$?
    echo ""
    if [ $exit_code -eq 0 ]; then
        echo "✓ All tests passed!"
    else
        echo "✗ Tests failed with exit code: $exit_code"
    fi
    exit $exit_code
else
    echo "✗ Compilation failed!"
    exit 1
fi
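
A minimal sketch of how this script might be invoked inside a Jetson container, assuming the NVIDIA container runtime is configured on the host; the image tag and the script filename below are placeholders, not part of the original scripts:

    # Hypothetical invocation: mount the script directory into an L4T JetPack
    # container (adjust the image tag and script filename for your environment).
    docker run --rm --runtime nvidia \
        -v "$(pwd)":/work -w /work \
        nvcr.io/nvidia/l4t-jetpack:r36.3.0 \
        bash ./cuda_test.sh
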
#!/bin/bash
# TensorRT test script to verify inference optimization functionality
set -e
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TEST_DIR="${SCRIPT_DIR}/tensorrt_test"
TRT_SRC="${TEST_DIR}/tensorrt_test.cpp"
TRT_BIN="${TEST_DIR}/tensorrt_test"
# Create test directory
mkdir -p "${TEST_DIR}"
# Create TensorRT test program
cat > "${TRT_SRC}" << 'EOF'
#include <iostream>
#include <fstream>
#include <vector>
#include <algorithm>  // std::min
#include <cmath>      // std::abs
#include <cstdlib>    // exit, EXIT_FAILURE
#include <cuda_runtime_api.h>
#include <NvInfer.h>
#include <NvOnnxParser.h>
using namespace nvinfer1;

// Logger for TensorRT
class Logger : public ILogger {
    void log(Severity severity, const char* msg) noexcept override {
        if (severity <= Severity::kWARNING)
            std::cout << msg << std::endl;
    }
} gLogger;

void checkCudaError(cudaError_t err, const char* msg) {
    if (err != cudaSuccess) {
        std::cerr << "CUDA Error: " << msg << " - " << cudaGetErrorString(err) << std::endl;
        exit(EXIT_FAILURE);
    }
}
int main() {
    std::cout << "=== TensorRT Test ===" << std::endl << std::endl;

    // Check TensorRT version
    std::cout << "TensorRT Version: "
              << NV_TENSORRT_MAJOR << "."
              << NV_TENSORRT_MINOR << "."
              << NV_TENSORRT_PATCH << std::endl << std::endl;

    // Check CUDA device
    int deviceCount = 0;
    cudaError_t err = cudaGetDeviceCount(&deviceCount);
    checkCudaError(err, "Failed to get device count");
    if (deviceCount == 0) {
        std::cerr << "No CUDA-capable devices found!" << std::endl;
        return 1;
    }
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, 0);
    std::cout << "Using GPU: " << deviceProp.name << std::endl;
    std::cout << "Compute Capability: " << deviceProp.major << "." << deviceProp.minor << std::endl << std::endl;

    // Create builder
    std::cout << "Creating TensorRT builder..." << std::endl;
    IBuilder* builder = createInferBuilder(gLogger);
    if (!builder) {
        std::cerr << "Failed to create builder!" << std::endl;
        return 1;
    }

    // Create network
    const uint32_t explicitBatch = 1U << static_cast<uint32_t>(NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
    INetworkDefinition* network = builder->createNetworkV2(explicitBatch);
    if (!network) {
        std::cerr << "Failed to create network!" << std::endl;
        return 1;
    }
    std::cout << "Building simple test network..." << std::endl;

    // Create a simple network: Input -> Convolution -> Output
    // Input:  [1, 3, 4, 4] (batch, channels, height, width)
    // Output: [1, 2, 4, 4]
    const int batchSize = 1;
    const int inputChannels = 3;
    const int outputChannels = 2;
    const int height = 4;
    const int width = 4;
    ITensor* input = network->addInput("input", DataType::kFLOAT, Dims4{batchSize, inputChannels, height, width});
    if (!input) {
        std::cerr << "Failed to add input!" << std::endl;
        return 1;
    }

    // Create weights for convolution (1x1 conv)
    const int kernelSize = 1;
    const int weightsSize = outputChannels * inputChannels * kernelSize * kernelSize;
    std::vector<float> weights(weightsSize);
    std::vector<float> bias(outputChannels);

    // Initialize weights and bias (simple pattern for testing)
    for (int i = 0; i < weightsSize; i++) {
        weights[i] = 0.1f;
    }
    for (int i = 0; i < outputChannels; i++) {
        bias[i] = 0.5f;
    }
    Weights kernelWeights{DataType::kFLOAT, weights.data(), weightsSize};
    Weights biasWeights{DataType::kFLOAT, bias.data(), outputChannels};

    // Add 1x1 convolution layer
    IConvolutionLayer* conv = network->addConvolutionNd(*input, outputChannels, DimsHW{1, 1}, kernelWeights, biasWeights);
    if (!conv) {
        std::cerr << "Failed to add convolution layer!" << std::endl;
        return 1;
    }
    conv->getOutput(0)->setName("output");
    network->markOutput(*conv->getOutput(0));

    std::cout << "Network structure:" << std::endl;
    std::cout << " Input: [1, 3, 4, 4]" << std::endl;
    std::cout << " Conv 1x1: 3 channels -> 2 channels" << std::endl;
    std::cout << " Output: [1, 2, 4, 4]" << std::endl << std::endl;

    // Build engine
    std::cout << "Building TensorRT engine..." << std::endl;
    IBuilderConfig* config = builder->createBuilderConfig();
    if (!config) {
        std::cerr << "Failed to create builder config!" << std::endl;
        return 1;
    }

    // Set memory pool limit
    config->setMemoryPoolLimit(MemoryPoolType::kWORKSPACE, 1U << 28); // 256 MB

    // Enable FP16 if supported
    if (builder->platformHasFastFp16()) {
        std::cout << "FP16 mode: ENABLED" << std::endl;
        config->setFlag(BuilderFlag::kFP16);
    } else {
        std::cout << "FP16 mode: NOT SUPPORTED" << std::endl;
    }

    // Build serialized network
    IHostMemory* serializedModel = builder->buildSerializedNetwork(*network, *config);
    if (!serializedModel) {
        std::cerr << "Failed to build engine!" << std::endl;
        return 1;
    }
    std::cout << "✓ Engine built successfully!" << std::endl;
    std::cout << "Engine size: " << serializedModel->size() / 1024.0 << " KB" << std::endl << std::endl;

    // Create runtime and deserialize engine
    std::cout << "Creating runtime and deserializing engine..." << std::endl;
    IRuntime* runtime = createInferRuntime(gLogger);
    if (!runtime) {
        std::cerr << "Failed to create runtime!" << std::endl;
        return 1;
    }
    ICudaEngine* engine = runtime->deserializeCudaEngine(serializedModel->data(), serializedModel->size());
    if (!engine) {
        std::cerr << "Failed to deserialize engine!" << std::endl;
        return 1;
    }
    IExecutionContext* context = engine->createExecutionContext();
    if (!context) {
        std::cerr << "Failed to create execution context!" << std::endl;
        return 1;
    }
    std::cout << "✓ Runtime initialized successfully!" << std::endl << std::endl;
    // Prepare input and output buffers
    std::cout << "Running inference test..." << std::endl;
    const int inputSize = batchSize * inputChannels * height * width;
    const int outputSize = batchSize * outputChannels * height * width;
    std::vector<float> hostInput(inputSize, 1.0f); // Initialize with 1.0
    std::vector<float> hostOutput(outputSize, 0.0f);
    void* deviceInput = nullptr;
    void* deviceOutput = nullptr;
    checkCudaError(cudaMalloc(&deviceInput, inputSize * sizeof(float)), "Failed to allocate device input");
    checkCudaError(cudaMalloc(&deviceOutput, outputSize * sizeof(float)), "Failed to allocate device output");

    // Copy input to device
    checkCudaError(cudaMemcpy(deviceInput, hostInput.data(), inputSize * sizeof(float), cudaMemcpyHostToDevice),
                   "Failed to copy input to device");

    // Set up bindings
    void* bindings[] = {deviceInput, deviceOutput};

    // Execute inference
    bool status = context->executeV2(bindings);
    if (!status) {
        std::cerr << "Failed to execute inference!" << std::endl;
        return 1;
    }

    // Copy output back to host
    checkCudaError(cudaMemcpy(hostOutput.data(), deviceOutput, outputSize * sizeof(float), cudaMemcpyDeviceToHost),
                   "Failed to copy output to host");
    std::cout << "✓ Inference completed successfully!" << std::endl << std::endl;

    // Display results
    std::cout << "Input tensor: [" << batchSize << ", " << inputChannels << ", "
              << height << ", " << width << "] = " << inputSize << " elements" << std::endl;
    std::cout << "Output tensor: [" << batchSize << ", " << outputChannels << ", "
              << height << ", " << width << "] = " << outputSize << " elements" << std::endl << std::endl;
    std::cout << "Sample input values (first 5):" << std::endl;
    for (int i = 0; i < std::min(5, inputSize); i++) {
        std::cout << " [" << i << "] = " << hostInput[i] << std::endl;
    }
    std::cout << "\nSample output values (first 5):" << std::endl;
    for (int i = 0; i < std::min(5, outputSize); i++) {
        std::cout << " [" << i << "] = " << hostOutput[i] << std::endl;
    }

    // Verify output (with input=1.0 and weights=0.1 for 3 channels, bias=0.5)
    // Expected: 3 * 0.1 + 0.5 = 0.8
    std::cout << "\nVerifying results..." << std::endl;
    bool success = true;
    float expected = inputChannels * 0.1f + 0.5f;
    int errors = 0;
    const int maxErrors = 5;
    for (int i = 0; i < outputSize; i++) {
        if (std::abs(hostOutput[i] - expected) > 0.01f) {
            if (errors < maxErrors) {
                std::cerr << "Verification failed at output[" << i << "]!" << std::endl;
                std::cerr << "Expected: ~" << expected << ", Got: " << hostOutput[i] << std::endl;
            }
            errors++;
            success = false;
        }
    }
    if (errors > maxErrors) {
        std::cerr << "... and " << (errors - maxErrors) << " more errors" << std::endl;
    }
    if (success) {
        std::cout << "✓ Results verified successfully!" << std::endl;
    }

    // Cleanup
    cudaFree(deviceInput);
    cudaFree(deviceOutput);
    delete context;
    delete engine;
    delete runtime;
    delete serializedModel;
    delete config;
    delete network;
    delete builder;

    std::cout << "\n=== TensorRT Test Complete ===" << std::endl;
    return success ? 0 : 1;
}
EOF
echo "=== TensorRT Test Script ==="
echo "Source file: ${TRT_SRC}"
echo ""
# Check if nvcc is available
if ! command -v nvcc &> /dev/null; then
    echo "ERROR: nvcc (CUDA compiler) not found!"
    echo "Please ensure CUDA toolkit is installed in the container."
    exit 1
fi
# Check for TensorRT headers
TENSORRT_INCLUDE="/usr/include/aarch64-linux-gnu"
if [ ! -f "${TENSORRT_INCLUDE}/NvInfer.h" ] && [ ! -f "/usr/include/NvInfer.h" ]; then
    echo "ERROR: TensorRT headers not found!"
    echo "Please ensure TensorRT is installed in the container."
    echo "Checked paths:"
    echo " - /usr/include/NvInfer.h"
    echo " - ${TENSORRT_INCLUDE}/NvInfer.h"
    exit 1
fi
# Display versions
echo "CUDA Version:"
nvcc --version | grep "release"
echo ""
# Compile the TensorRT program
echo "Compiling TensorRT test program..."
# Try to find TensorRT libraries
TRT_LIB_PATH="/usr/lib/aarch64-linux-gnu"
if [ ! -d "${TRT_LIB_PATH}" ]; then
    TRT_LIB_PATH="/usr/lib"
fi
# Compile with appropriate flags
# Run nvcc inside the if so a compile failure is not swallowed by `set -e`
# before the debug information can be printed.
if nvcc -o "${TRT_BIN}" "${TRT_SRC}" \
        -I"${TENSORRT_INCLUDE}" \
        -L"${TRT_LIB_PATH}" \
        -lnvinfer \
        -lnvonnxparser \
        -lcudart \
        -std=c++11 \
        -O2; then
    echo "✓ Compilation successful!"
    echo ""

    # Run the test (capture the exit code without tripping `set -e`)
    echo "Running TensorRT test..."
    echo ""
    exit_code=0
    "${TRT_BIN}" || exit_code=$?
    echo ""
    if [ $exit_code -eq 0 ]; then
        echo "✓ All tests passed!"
    else
        echo "✗ Tests failed with exit code: $exit_code"
    fi
    exit $exit_code
else
    echo "✗ Compilation failed!"
    echo ""
    echo "Debug information:"
    echo "TensorRT include path: ${TENSORRT_INCLUDE}"
    echo "TensorRT library path: ${TRT_LIB_PATH}"
    echo ""
    echo "Available TensorRT libraries:"
    ls -l "${TRT_LIB_PATH}"/libnv* 2>/dev/null | grep -i infer || echo " No TensorRT libraries found"
    exit 1
fi
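
If the TensorRT compile step fails, it can help to confirm that the headers and libraries are actually visible inside the container before rerunning the script. A small sketch using standard tools (package names and paths vary between JetPack releases):

    # Hypothetical checks, not part of the original scripts:
    dpkg -l | grep -i nvinfer              # TensorRT Debian packages, if installed via apt
    ldconfig -p | grep libnvinfer          # libraries the dynamic linker can resolve
    find /usr -name NvInfer.h 2>/dev/null  # header location to pass to -I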