#include "common.h" | |
#include "cudaUtility.h" | |
#include "mathFunctions.h" | |
#include "pluginImplement.h" | |
#include "tensorNet.h" | |
#include "loadImage.h" | |
#include "imageBuffer.h" | |
#include <chrono> | |
#include <thread> | |
#include <chrono> | |
const char* model = "model/pelee/pelee_deploy_iplugin.prototxt"; | |
const char* weight = "model/pelee/pelee_merged.caffemodel"; | |
const char* INPUT_BLOB_NAME = "data"; | |
const char* OUTPUT_BLOB_NAME = "detection_out"; | |
static const uint32_t BATCH_SIZE = 2; | |
//image buffer size = 10 | |
//dropFrame = false | |
ConsumerProducerQueue<cv::Mat> *imageBuffer = new ConsumerProducerQueue<cv::Mat>(10,false); | |
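// The queue above is the shared producer/consumer buffer between the capture
// thread and the inference loop. With a capacity of 10 and dropFrame = false
// the producer presumably blocks when the queue is full rather than dropping
// frames (the exact semantics depend on imageBuffer.h).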
class Timer {
public:
    void tic() {
        start_ticking_ = true;
        start_ = std::chrono::high_resolution_clock::now();
    }
    void toc() {
        if (!start_ticking_) return;
        end_ = std::chrono::high_resolution_clock::now();
        start_ticking_ = false;
        t = std::chrono::duration<double, std::milli>(end_ - start_).count();
        //std::cout << "Time: " << t << " ms" << std::endl;
    }
    double t;
private:
    bool start_ticking_ = false;
    std::chrono::time_point<std::chrono::high_resolution_clock> start_;
    std::chrono::time_point<std::chrono::high_resolution_clock> end_;
};
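// Usage (as in the inference loop below):
//   timer.tic();
//   ... work to measure ...
//   timer.toc();
//   double ms = timer.t;   // elapsed time in milliseconds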
/*
 * NOTE: unified memory (cudaMallocManaged) is used here, so the returned
 * buffer is addressable from both the host and the device.
 */
float* allocateMemory(DimsCHW dims, const char* info)
{
    float* ptr = nullptr;
    std::cout << "Allocate memory: " << info << std::endl;
    const size_t size = BATCH_SIZE * dims.c() * dims.h() * dims.w();
    // Do not wrap the allocation itself in assert(), or it would be compiled
    // out (together with the allocation) in release builds.
    cudaError_t err = cudaMallocManaged(&ptr, size * sizeof(float));
    assert(err == cudaSuccess);
    return ptr;
}
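// Because the blobs come from cudaMallocManaged, the output pointer can be
// handed to TensorRT as a device binding and then read directly on the CPU in
// the detection loop below, without an explicit cudaMemcpy back to the host.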
void loadImg(cv::Mat &input1, cv::Mat &input2, int re_width, int re_height,
             float* data_uniform, const float3 mean, const float scale)
{
    cv::Mat dst1;
    cv::Mat dst2;
    cv::resize(input1, dst1, cv::Size(re_width, re_height), 0.0, 0.0, cv::INTER_LINEAR);
    cv::resize(input2, dst2, cv::Size(re_width, re_height), 0.0, 0.0, cv::INTER_LINEAR);

    const int offset_g   = re_width * re_height;        // start of the G plane
    const int offset_r   = re_width * re_height * 2;    // start of the R plane
    const int offset_img = re_width * re_height * 3;    // start of the second image

    for (int i = 0; i < re_height; ++i)
    {
        const unsigned char* line1 = dst1.ptr<unsigned char>(i);
        const unsigned char* line2 = dst2.ptr<unsigned char>(i);
        const int line_offset = i * re_width;
        for (int j = 0; j < re_width; ++j)
        {
            // first image: B, G, R planes
            data_uniform[line_offset + j]            = (float)(line1[j * 3]     - mean.x) * scale;
            data_uniform[offset_g + line_offset + j] = (float)(line1[j * 3 + 1] - mean.y) * scale;
            data_uniform[offset_r + line_offset + j] = (float)(line1[j * 3 + 2] - mean.z) * scale;
            // second image: B, G, R planes
            data_uniform[offset_img + line_offset + j]            = (float)(line2[j * 3]     - mean.x) * scale;
            data_uniform[offset_img + offset_g + line_offset + j] = (float)(line2[j * 3 + 1] - mean.y) * scale;
            data_uniform[offset_img + offset_r + line_offset + j] = (float)(line2[j * 3 + 2] - mean.z) * scale;
        }
    }
}
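// loadImg() packs the two resized frames into a single planar NCHW batch:
//   index = image * (3 * H * W) + channel * (H * W) + row * W + col
// Channels stay in OpenCV's BGR order; the caller below passes the usual
// Caffe BGR means (103.94, 116.78, 123.68) and a scale of 0.017.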
// Capture thread: reads frames from the video and feeds the shared queue.
void readPicture()
{
    cv::VideoCapture cap("testVideo/test.avi");
    cv::Mat image;
    while (cap.isOpened())
    {
        cap >> image;
        if (image.empty())
        {
            // End of stream: push an empty frame as a sentinel so the
            // consumer loop in main() knows to stop.
            imageBuffer->add(image);
            break;
        }
        // Clone so the queued frame keeps its own pixel buffer.
        imageBuffer->add(image.clone());
    }
}
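// readPicture() runs detached (started in main below); decoding therefore
// overlaps with inference, and the bounded queue keeps the reader from
// running arbitrarily far ahead of the consumer.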
int main(int argc, char* argv[])
{
    std::vector<std::string> output_vector = {OUTPUT_BLOB_NAME};
    TensorNet tensorNet;
    tensorNet.LoadNetwork(model, weight, INPUT_BLOB_NAME, output_vector, BATCH_SIZE);
    DimsCHW dimsData = tensorNet.getTensorDims(INPUT_BLOB_NAME);
    DimsCHW dimsOut  = tensorNet.getTensorDims(OUTPUT_BLOB_NAME);

    // The managed input blob is allocated for completeness, but the raw
    // device buffer imgCUDA below is what is actually bound as the input.
    float* data = allocateMemory(dimsData, "input blob");
    std::cout << "allocate data" << std::endl;
    float* output = allocateMemory(dimsOut, "output blob");
    std::cout << "allocate output" << std::endl;

    const int height = 304;
    const int width  = 304;
    cv::Mat frame, srcImg, fl_frame;

    // Host staging buffer and device input buffer for the 2-image batch,
    // allocated once and reused for every frame.
    const size_t size = width * height * sizeof(float3);
    void* imgCUDA = nullptr;
    if (CUDA_FAILED(cudaMalloc(&imgCUDA, 2 * size)))
    {
        std::cout << "Cuda memory allocation error occurred." << std::endl;
        return -1;
    }
    void* imgData = malloc(2 * size);
    memset(imgData, 0, 2 * size);

    Timer timer;
    int count = 0;
    std::thread readTread(readPicture);
    readTread.detach();
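    // Inference loop: consume frames from the capture thread, build a
    // 2-image batch (original + horizontally flipped frame), run the network,
    // and draw/write the detections on the original frame.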
    cv::VideoWriter writer;
    while (1)
    {
        imageBuffer->consume(frame);
        if (frame.empty())
            break;                       // end-of-stream sentinel from readPicture()
        count++;

        auto start = std::chrono::system_clock::now();
        srcImg = frame.clone();
        cv::resize(frame, frame, cv::Size(width, height));
        cv::flip(frame, fl_frame, 1);

        // Preprocess both frames into the host staging buffer, then copy the
        // whole batch to the device input buffer.
        loadImg(frame, fl_frame, width, height, (float*)imgData,
                make_float3(103.94, 116.78, 123.68), 0.017);
        cudaMemcpy(imgCUDA, imgData, 2 * size, cudaMemcpyHostToDevice);

        void* buffers[] = { imgCUDA, output };
        timer.tic();
        tensorNet.imageInference(buffers, output_vector.size() + 1, BATCH_SIZE);
        timer.toc();
        double msTime = timer.t;
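        // detection_out follows the Caffe SSD DetectionOutput layout: 7 floats
        // per detection [image_id, label, confidence, xmin, ymin, xmax, ymax],
        // with box coordinates normalized to [0, 1]; the loop below stops at
        // the first entry whose label is -1. With BATCH_SIZE = 2 the results
        // for the original and the flipped frame share this buffer
        // (distinguished by image_id), and all of them are drawn on srcImg.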
        for (int k = 0; k < 100; k++)
        {
            if (output[7 * k + 1] == -1)
                break;
            float classIndex = output[7 * k + 1];
            float confidence = output[7 * k + 2];
            float xmin = output[7 * k + 3];
            float ymin = output[7 * k + 4];
            float xmax = output[7 * k + 5];
            float ymax = output[7 * k + 6];
            //std::cout << classIndex << " , " << confidence << " , " << xmin << " , " << ymin << " , " << xmax << " , " << ymax << std::endl;
            int x1 = static_cast<int>(xmin * srcImg.cols);
            int y1 = static_cast<int>(ymin * srcImg.rows);
            int x2 = static_cast<int>(xmax * srcImg.cols);
            int y2 = static_cast<int>(ymax * srcImg.rows);
            cv::rectangle(srcImg, cv::Rect(cv::Point(x1, y1), cv::Point(x2, y2)), cv::Scalar(255, 0, 255), 1);
        }

        cv::Size size1(srcImg.cols, srcImg.rows);

        auto end = std::chrono::system_clock::now();
        std::chrono::duration<double> elapsed_seconds = end - start;
        double duration = elapsed_seconds.count() * 1000.0;
        std::cout << "Inference time: " << msTime << " ms, total frame time: " << duration << " ms" << std::endl;

        if (count == 1)
        {
            // Open the writer once the output frame size is known.
            const char* fname = "result.wmv";
            printf("%s\n", fname);
            writer.open(fname, cv::VideoWriter::fourcc('M', 'P', '4', 'V'), 20, size1);
        }
        writer << srcImg;
    }

    writer.release();
    free(imgData);
    cudaFree(imgCUDA);
    cudaFree(data);
    cudaFree(output);
    tensorNet.destroy();
    return 0;
}
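/*
 * For reference, a minimal sketch of the bounded blocking queue interface used
 * above. This is an assumption based only on the calls made in this file
 * (constructor(capacity, dropFrame), add(), consume()); the real
 * ConsumerProducerQueue in imageBuffer.h may differ, so the sketch is kept in
 * a comment rather than as a second definition.
 *
 *   template <typename T>
 *   class ConsumerProducerQueue {
 *   public:
 *       ConsumerProducerQueue(size_t capacity, bool dropFrame)
 *           : capacity_(capacity), dropFrame_(dropFrame) {}
 *
 *       void add(const T& item) {
 *           std::unique_lock<std::mutex> lock(mutex_);
 *           if (dropFrame_ && queue_.size() >= capacity_)
 *               return;                                   // drop when full
 *           notFull_.wait(lock, [&] { return queue_.size() < capacity_; });
 *           queue_.push(item);
 *           notEmpty_.notify_one();
 *       }
 *
 *       void consume(T& item) {
 *           std::unique_lock<std::mutex> lock(mutex_);
 *           notEmpty_.wait(lock, [&] { return !queue_.empty(); });
 *           item = queue_.front();
 *           queue_.pop();
 *           notFull_.notify_one();
 *       }
 *
 *   private:
 *       std::queue<T> queue_;                          // requires <queue>
 *       std::mutex mutex_;                             // requires <mutex>
 *       std::condition_variable notFull_, notEmpty_;   // requires <condition_variable>
 *       size_t capacity_;
 *       bool dropFrame_;
 *   };
 */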