Skip to content

Instantly share code, notes, and snippets.

@lix19937
Created November 15, 2024 02:04
Show Gist options
  • Save lix19937/48c75aaf665839d6692a7c2f13c57066 to your computer and use it in GitHub Desktop.
Save lix19937/48c75aaf665839d6692a7c2f13c57066 to your computer and use it in GitHub Desktop.
C++ inference sample for the Horizon J6 (horizon_j6) platform: loads an .hbm model with the hbDNN/UCP API, feeds an image as NV12, and prints the top-k classification results.
#include <algorithm>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <iterator>
#include <map>
#include <queue>
#include <utility>
#include "gflags/gflags.h"
// hori
#include "hlog/logging.h"
#include "hobot/dnn/hb_dnn.h"
#include "hobot/hb_ucp.h"
#include "hobot/hb_ucp_sys.h"
// opencv
#include "opencv2/core/mat.hpp"
#include "opencv2/imgcodecs.hpp"
#include "opencv2/imgproc.hpp"
// Empty-string default for the gflags string flags below.
#define EMPTY ""
DEFINE_string(model_file, EMPTY, "model file path");
DEFINE_string(image_file, EMPTY, "Test image path");
DEFINE_int32(top_k, 5, "Top k classes, 5 by default");

// Log-module tag consumed by the hlog wrapper macros below.
// (Spelling of the macro name fixed from "MOUDULE"; the tag string the
// logger sees is unchanged.)
#define MODULE_NAME "DNN_BASIC_SAMPLE"
#define LOGD(err_msg, ...) HFLOGM_D(MODULE_NAME, err_msg, ##__VA_ARGS__)
#define LOGI(err_msg, ...) HFLOGM_I(MODULE_NAME, err_msg, ##__VA_ARGS__)
#define LOGE(err_msg, ...) HFLOGM_E(MODULE_NAME, err_msg, ##__VA_ARGS__)
#define LOGW(err_msg, ...) HFLOGM_W(MODULE_NAME, err_msg, ##__VA_ARGS__)

// Evaluate `value` (typically an SDK call) once; on a non-zero return code,
// log `errmsg` with the code and return it from the enclosing function.
// NOTE: the trailing semicolon after `while (0)` was removed — callers
// already supply their own `;`, and the doubled semicolon defeated the
// do/while(0) idiom (it breaks use inside an unbraced if/else).
#define HB_CHECK_SUCCESS(value, errmsg)             \
  do                                                \
  {                                                 \
    /* value can be a call of a function */         \
    auto ret_code = value;                          \
    if (ret_code != 0)                              \
    {                                               \
      LOGE("{}, error code: {}", errmsg, ret_code); \
      return ret_code;                              \
    }                                               \
  } while (0)
// One classification result: class index, confidence score, and a
// non-owning pointer to the class-name string (the struct never frees it).
struct Classification
{
  int id;
  float score;
  const char *class_name;

  // Initializers listed in declaration order (id, score, class_name); the
  // original listed class_name first, which triggers -Wreorder. `nullptr`
  // replaces the literal 0 for the pointer member.
  Classification() : id(0), score(0.0f), class_name(nullptr) {}
  Classification(int id, float score, const char *class_name)
      : id(id), score(score), class_name(class_name) {}

  // Ordered by score only, so the type can drive std::greater-based
  // min-heaps (see get_topk_result).
  friend bool operator>(const Classification &lhs, const Classification &rhs)
  {
    return (lhs.score > rhs.score);
  }
  // Rule of Zero: the compiler-generated destructor/copy/move are correct,
  // so the user-declared empty destructor was dropped. The C-style
  // `typedef struct ... X;` wrapper is redundant in C++ and was removed.
};
// Queries tensor properties from the model and allocates cached BPU memory
// for every input/output tensor. Returns 0 on success, else the non-zero
// hbDNN/hbUCP error code.
int prepare_tensor(hbDNNTensor *input_tensor, hbDNNTensor *output_tensor,
hbDNNHandle_t dnn_handle);
// Loads `image_file`, resizes it to the model input size and writes it as
// NV12 (Y plane into input_tensor[0], interleaved UV plane into
// input_tensor[1], stride-padded). Returns 0 on success, -1 on failure.
int32_t read_image_2_tensor_as_nv12(std::string &image_file,
hbDNNTensor *input_tensor);
// Extracts the `top_k` highest-scoring classes from the raw float output
// tensor into `top_k_cls`, sorted by descending score.
void get_topk_result(hbDNNTensor *tensor,
std::vector<Classification> &top_k_cls, int top_k);
/**
* Step1: get model handle
* Step2: prepare input and output tensor
* Step3: set input data to input tensor
* Step4: run inference
* Step5: do postprocess with output data
* Step6: release resources
*/
/*
./run_resnet \
--model_file=../model/resnet50_224x224_nv12.hbm \
--image_file=../data/cls_images/zebra_cls.jpg \
--top_k=5
*/
// Sample entry point: loads a .hbm model, feeds one image as NV12, runs a
// single BPU inference and prints the top-k class ids.
// Returns 0 on success; any failing SDK call makes HB_CHECK_SUCCESS return
// its non-zero error code early.
// NOTE(review): an early error return leaks whatever was allocated before
// the failure (task handle, tensor memory) — acceptable for a sample, since
// the process exits immediately afterwards.
int main(int argc, char **argv)
{
// Parse --model_file / --image_file / --top_k and echo the command line.
gflags::SetUsageMessage(argv[0]);
gflags::ParseCommandLineFlags(&argc, &argv, true);
std::cout << gflags::GetArgv() << std::endl;
// Init logging
hobot::hlog::HobotLog::Instance()->SetLogLevel("DNN_BASIC_SAMPLE", hobot::hlog::LogLevel::log_info);
//////////////////////----------------------------------------- // Step1: get model handle
hbDNNPackedHandle_t packed_dnn_handle;
hbDNNHandle_t dnn_handle;
const char **model_name_list;
auto modelFileName = FLAGS_model_file.c_str();
int model_count = 0;
{
// Load the packed model file and take a handle to its first model; the
// sample assumes the .hbm contains exactly one model.
HB_CHECK_SUCCESS(
hbDNNInitializeFromFiles(&packed_dnn_handle, &modelFileName, 1),
"hbDNNInitializeFromFiles failed");
HB_CHECK_SUCCESS(hbDNNGetModelNameList(&model_name_list, &model_count,
packed_dnn_handle),
"hbDNNGetModelNameList failed");
HB_CHECK_SUCCESS(
hbDNNGetModelHandle(&dnn_handle, packed_dnn_handle, model_name_list[0]),
"hbDNNGetModelHandle failed");
}
//////////////////////----------------------------------------- // Step2: prepare input and output tensor
std::vector<hbDNNTensor> input_tensors, output_tensors;
int input_count = 0, output_count = 0;
{
// Size the tensor vectors from the model and allocate their BPU memory.
HB_CHECK_SUCCESS(hbDNNGetInputCount(&input_count, dnn_handle),
"hbDNNGetInputCount failed");
HB_CHECK_SUCCESS(hbDNNGetOutputCount(&output_count, dnn_handle),
"hbDNNGetOutputCount failed");
input_tensors.resize(input_count);
output_tensors.resize(output_count);
prepare_tensor(input_tensors.data(), output_tensors.data(), dnn_handle);
}
//////////////////////----------------------------------------- // Step3: set input data to input tensor
{
// read a single picture for input_tensor[0], for multi_input model, you
// should set other input data according to model input properties. !!! will changed depends on models
HB_CHECK_SUCCESS(
read_image_2_tensor_as_nv12(FLAGS_image_file, input_tensors.data()),
"read_image_2_tensor_as_nv12 failed");
LOGI("read image to tensor as nv12 success");
}
//////////////////////----------------------------------------- // Step4: run inference
hbUCPTaskHandle_t task_handle{nullptr};
hbDNNTensor *output = output_tensors.data();
{
// make sure memory data is flushed to DDR before inference
for (int i = 0; i < input_count; i++)
{
hbUCPMemFlush(&input_tensors[i].sysMem[0], HB_SYS_MEM_CACHE_CLEAN);
}
// generate task handle
HB_CHECK_SUCCESS(
hbDNNInferV2(&task_handle, output, input_tensors.data(), dnn_handle),
"hbDNNInferV2 failed");
// submit task: let the scheduler pick any available BPU core
hbUCPSchedParam ctrl_param;
HB_UCP_INITIALIZE_SCHED_PARAM(&ctrl_param);
ctrl_param.backend = HB_UCP_BPU_CORE_ANY;
HB_CHECK_SUCCESS(hbUCPSubmitTask(task_handle, &ctrl_param),
"hbUCPSubmitTask failed");
// wait task done (timeout 0 — presumably "wait forever"; confirm against
// the hbUCPWaitTaskDone documentation)
HB_CHECK_SUCCESS(hbUCPWaitTaskDone(task_handle, 0),
"hbUCPWaitTaskDone failed");
}
// Step5: do postprocess with output data
std::vector<Classification> top_k_cls;
{
// make sure CPU read data from DDR before using output tensor data
for (int i = 0; i < output_count; i++)
{
hbUCPMemFlush(&output_tensors[i].sysMem[0], HB_SYS_MEM_CACHE_INVALIDATE);
}
get_topk_result(output, top_k_cls, FLAGS_top_k);
// NOTE(review): assumes get_topk_result produced FLAGS_top_k entries;
// a FLAGS_top_k larger than the class count would index out of range.
for (int i = 0; i < FLAGS_top_k; i++)
{
LOGI("TOP {} result id: {}", i, top_k_cls[i].id);
}
}
// Step6: release resources
{
// release task handle
HB_CHECK_SUCCESS(hbUCPReleaseTask(task_handle), "hbUCPReleaseTask failed");
// free input mem
for (int i = 0; i < input_count; i++)
{
HB_CHECK_SUCCESS(hbUCPFree(&(input_tensors[i].sysMem[0])),
"hbUCPFree failed");
}
// free output mem
for (int i = 0; i < output_count; i++)
{
HB_CHECK_SUCCESS(hbUCPFree(&(output_tensors[i].sysMem[0])),
"hbUCPFree failed");
}
// release model
HB_CHECK_SUCCESS(hbDNNRelease(packed_dnn_handle), "hbDNNRelease failed");
}
return 0;
}
// Round `value` up to the next multiple of `alignment` (alignment must be a
// power of two).
#define ALIGN(value, alignment) (((value) + ((alignment) - 1)) & ~((alignment) - 1))
#define ALIGN_32(value) ALIGN(value, 32)
// Queries input/output tensor properties from `dnn_handle`, resolves any
// dynamic (-1) input strides to 32-byte-aligned values, and allocates cached
// memory for each tensor. Returns 0 on success, else a non-zero SDK error
// code (via HB_CHECK_SUCCESS).
int prepare_tensor(hbDNNTensor *input_tensor, hbDNNTensor *output_tensor,
hbDNNHandle_t dnn_handle)
{
int input_count = 0;
int output_count = 0;
hbDNNGetInputCount(&input_count, dnn_handle);
hbDNNGetOutputCount(&output_count, dnn_handle);
/** Tips:
* For input memory size in most cases:
* * input_memSize = input[i].properties.alignedByteSize
* but here for dynamic stride of y and uv,alignedByteSize is not fixed
* For output memory size:
* * output_memSize = output[i].properties.alignedByteSize
*/
hbDNNTensor *input = input_tensor;
for (int i = 0; i < input_count; i++)
{
HB_CHECK_SUCCESS(
hbDNNGetInputTensorProperties(&input[i].properties, dnn_handle, i),
"hbDNNGetInputTensorProperties failed");
/** Tips:
* For input tensor, usually need to pad the input data according to stride obtained from properties.
* but here for dynamic stride of y and uv,user needs to specify a value which should be 32 bytes aligned for the -1 position in stride.
* */
// Walk dimensions innermost-to-outermost, replacing each -1 stride with
// the 32-byte-aligned product of the next dimension's stride and size.
// NOTE(review): reading stride[dim_i + 1] assumes the innermost dimension
// never has a -1 stride (otherwise this indexes past the last dimension)
// — confirm against the hbDNN tensor property documentation.
auto dim_len = input[i].properties.validShape.numDimensions;
for (int32_t dim_i = dim_len - 1; dim_i >= 0; --dim_i)
{
if (input[i].properties.stride[dim_i] == -1)
{
auto cur_stride =
input[i].properties.stride[dim_i + 1] *
input[i].properties.validShape.dimensionSize[dim_i + 1];
input[i].properties.stride[dim_i] = ALIGN_32(cur_stride);
}
}
// Total bytes = outermost stride * outermost dimension size.
int input_memSize = input[i].properties.stride[0] *
input[i].properties.validShape.dimensionSize[0];
HB_CHECK_SUCCESS(hbUCPMallocCached(&input[i].sysMem[0], input_memSize, 0),
"hbUCPMallocCached failed");
// Show how to get input name
const char *input_name;
HB_CHECK_SUCCESS(hbDNNGetInputName(&input_name, dnn_handle, i),
"hbDNNGetInputName failed");
LOGI("input[{}] name is {}", i, input_name);
}
hbDNNTensor *output = output_tensor;
for (int i = 0; i < output_count; i++)
{
HB_CHECK_SUCCESS(
hbDNNGetOutputTensorProperties(&output[i].properties, dnn_handle, i),
"hbDNNGetOutputTensorProperties failed");
// Output strides are fixed, so alignedByteSize is the full buffer size.
int output_memSize = output[i].properties.alignedByteSize;
HB_CHECK_SUCCESS(hbUCPMallocCached(&output[i].sysMem[0], output_memSize, 0),
"hbUCPMallocCached failed");
// Show how to get output name
const char *output_name;
HB_CHECK_SUCCESS(hbDNNGetOutputName(&output_name, dnn_handle, i),
"hbDNNGetOutputName failed");
LOGI("output[{}] name is {}", i, output_name);
}
return 0;
}
/** You can define read_image_2_tensor_as_other_type to prepare your data **/
// Loads `image_file` with OpenCV, resizes it to the model's input size, and
// writes it as NV12 into the first two input tensors: the Y plane into
// input_tensor[0] and the interleaved UV plane into input_tensor[1], padding
// each row to the tensor's stride. Returns 0 on success, -1 on failure.
int32_t read_image_2_tensor_as_nv12(std::string &image_file,
hbDNNTensor *input_tensor)
{
// the struct of input shape is NHWC
int input_h = input_tensor[0].properties.validShape.dimensionSize[1];
int input_w = input_tensor[0].properties.validShape.dimensionSize[2];
cv::Mat bgr_mat = cv::imread(image_file, cv::IMREAD_COLOR);
if (bgr_mat.empty())
{
LOGE("image file not exist!");
return -1;
}
// resize (default bilinear interpolation)
cv::Mat mat;
mat.create(input_h, input_w, bgr_mat.type());
cv::resize(bgr_mat, mat, mat.size(), 0, 0);
// convert to YUV420: I420 needs even width/height (2x2 chroma subsampling)
if (input_h % 2 || input_w % 2)
{
LOGE("input img height and width must aligned by 2!");
return -1;
}
cv::Mat yuv_mat;
cv::cvtColor(mat, yuv_mat, cv::COLOR_BGR2YUV_I420);
// I420 layout in yuv_mat: full Y plane, then U plane, then V plane.
uint8_t *yuv_data = yuv_mat.ptr<uint8_t>();
uint8_t *y_data_src = yuv_data;
// copy y data row by row, padding each destination row to stride[1]
uint8_t *y_data_dst =
reinterpret_cast<uint8_t *>(input_tensor[0].sysMem[0].virAddr);
for (int32_t h = 0; h < input_h; ++h)
{
memcpy(y_data_dst, y_data_src, input_w);
y_data_src += input_w;
// add padding
y_data_dst += input_tensor[0].properties.stride[1];
}
// copy uv data: interleave the planar U and V samples (NV12 = UVUV...)
// NOTE(review): assumes input_tensor[1] is the UV tensor in NHWC with
// uv_height == input_h / 2 and uv_width == input_w / 2 — confirm against
// the model's input properties.
int32_t uv_height = input_tensor[1].properties.validShape.dimensionSize[1];
int32_t uv_width = input_tensor[1].properties.validShape.dimensionSize[2];
uint8_t *uv_data_dst =
reinterpret_cast<uint8_t *>(input_tensor[1].sysMem[0].virAddr);
uint8_t *u_data_src = yuv_data + input_h * input_w;
uint8_t *v_data_src = u_data_src + uv_height * uv_width;
for (int32_t h = 0; h < uv_height; ++h)
{
auto *cur_data = uv_data_dst;
for (int32_t w = 0; w < uv_width; ++w)
{
*cur_data++ = *u_data_src++;
*cur_data++ = *v_data_src++;
}
// add padding
uv_data_dst += input_tensor[1].properties.stride[1];
}
return 0;
}
// Collects the `top_k` highest-scoring classes from the model's raw float
// output into `top_k_cls`, ordered by descending score.
// `tensor` must hold HB_DNN_TENSOR_TYPE_F32 data; other output types would
// need a different cast plus dequantization.
void get_topk_result(hbDNNTensor *tensor,
                     std::vector<Classification> &top_k_cls, int top_k)
{
  // Guard non-positive k: the original compared `queue.size() > top_k`
  // signed-vs-unsigned, so a negative top_k was promoted to a huge unsigned
  // value and every element was kept instead of none.
  if (top_k <= 0)
  {
    return;
  }
  // Make sure the CPU sees the data the BPU wrote to DDR.
  hbUCPMemFlush(&(tensor->sysMem[0]), HB_SYS_MEM_CACHE_INVALIDATE);
  // Min-heap bounded at top_k elements: the weakest of the current best
  // scores sits on top and is evicted when a better candidate arrives.
  std::priority_queue<Classification, std::vector<Classification>,
                      std::greater<Classification>>
      queue;
  // The type reinterpret_cast should be determined according to the output type
  // For example: HB_DNN_TENSOR_TYPE_F32 is float
  auto data = reinterpret_cast<float *>(tensor->sysMem[0].virAddr);
  auto quanti_type{tensor->properties.quantiType};
  // For example model, quantiType is NONE and no dequantize processing is required.
  if (quanti_type != hbDNNQuantiType::NONE)
  {
    LOGE("quanti_type is not NONE, and the output needs to be dequantized!");
  }
  // 1000 classification score values (ImageNet-style head).
  // TODO(review): derive the element count from tensor->properties instead
  // of hard-coding it, so other heads work too.
  int tensor_len = 1000;
  for (int i = 0; i < tensor_len; i++)
  {
    queue.push(Classification(i, data[i], ""));
    // top_k > 0 here, so the size_t cast is safe and silences the
    // signed/unsigned comparison warning.
    if (queue.size() > static_cast<size_t>(top_k))
    {
      queue.pop();
    }
  }
  // Drain the min-heap (ascending scores), then reverse for descending order.
  while (!queue.empty())
  {
    top_k_cls.emplace_back(queue.top());
    queue.pop();
  }
  std::reverse(top_k_cls.begin(), top_k_cls.end());
}
/*
root@9ed8cd874b9d:/open_explorer/J6_start/nni_ucp/build# ./run_resnet \
--model_file=../model/resnet50_224x224_nv12.hbm \
--image_file=../data/cls_images/zebra_cls.jpg \
--top_k=5
[UCP]: log level = 3
[UCP]: UCP version = 3.1.2
[VP]: log level = 3
[DNN]: log level = 3
[HPL]: log level = 3
[UCPT]: log level = 6
./run_resnet --model_file=../model/resnet50_224x224_nv12.hbm --image_file=../data/cls_images/zebra_cls.jpg --top_k=5
[I][26367][11-12][08:35:50:282][main.cc:267][run_resnet][DNN_BASIC_SAMPLE] input[0] name is input_y
[I][26367][11-12][08:35:50:282][main.cc:267][run_resnet][DNN_BASIC_SAMPLE] input[1] name is input_uv
[I][26367][11-12][08:35:50:282][main.cc:284][run_resnet][DNN_BASIC_SAMPLE] output[0] name is output
[I][26367][11-12][08:35:50:356][main.cc:148][run_resnet][DNN_BASIC_SAMPLE] read image to tensor as nv12 success
[BPU][[BPU_DEV]][INFO]bpu_core_get_est_time not implemented in simulator
[BPU][[BPU_DEV]][INFO]bpu_core_get_est_time not implemented in simulator
[BPU][[BPU_DEV]][INFO]bpu_core_get_est_time not implemented in simulator
[BPU][[BPU_DEV]][INFO]bpu_core_get_est_time not implemented in simulator
[BPU][[BPU_DEV]][INFO]bpu_core_get_est_time not implemented in simulator
[BPU][[BPU_DEV]][INFO]bpu_core_get_est_time not implemented in simulator
[I][26367][11-12][08:35:56:401][main.cc:190][run_resnet][DNN_BASIC_SAMPLE] TOP 0 result id: 340
[I][26367][11-12][08:35:56:402][main.cc:190][run_resnet][DNN_BASIC_SAMPLE] TOP 1 result id: 292
[I][26367][11-12][08:35:56:402][main.cc:190][run_resnet][DNN_BASIC_SAMPLE] TOP 2 result id: 9
[I][26367][11-12][08:35:56:402][main.cc:190][run_resnet][DNN_BASIC_SAMPLE] TOP 3 result id: 353
[I][26367][11-12][08:35:56:402][main.cc:190][run_resnet][DNN_BASIC_SAMPLE] TOP 4 result id: 343
*/
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment