Last active
December 31, 2024 08:38
-
-
Save StefanoLusardi/af2b3c11191d7b0c80431da202090dc1 to your computer and use it in GitHub Desktop.
ONNX Yolo v8
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from ultralytics import YOLO

# Load the pretrained YOLOv8-nano checkpoint (downloaded on first use).
model = YOLO("yolov8n.pt")

# Export to ONNX with a fixed 640x640 input (dynamic=False), opset 12,
# and graph simplification -- matches what the C++ demo below expects.
model.export(format="onnx", opset=12, simplify=True, dynamic=False, imgsz=640)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #include <onnxruntime/core/session/onnxruntime_cxx_api.h> | |
| #include <opencv2/opencv.hpp> | |
| #include <iostream> | |
| #include <vector> | |
| #include <string> | |
| #include <algorithm> | |
| #include <numeric> | |
| #include <variant> | |
| #include <fstream> | |
// A single detected object: bounding box in original-image coordinates,
// classification confidence, and class index.
struct Detection
{
    cv::Rect bbox;   // box in the source image's pixel coordinates
    float score;     // confidence (class score, possibly * objectness)
    int label;       // class index into the model's label set
};

// First, define the variant type (could be in the header file)
// One element of an ONNX Runtime output tensor; models may emit
// float, int32 or int64 data, so a variant covers all three.
using TensorElement = std::variant<float, int32_t, int64_t>;
| class Detector{ | |
| protected: | |
| float confidenceThreshold_; | |
| float nms_threshold_ = 0.4f; | |
| size_t network_width_; | |
| size_t network_height_; | |
| std::string backend_; | |
| int channels_{ -1 }; | |
| public: | |
| Detector( | |
| float confidenceThreshold = 0.5f, | |
| size_t network_width = -1, | |
| size_t network_height = -1 | |
| ) : confidenceThreshold_{confidenceThreshold}, | |
| network_width_ {network_width}, | |
| network_height_ {network_height} | |
| { | |
| } | |
| inline float getConfidenceThreshold(){ return confidenceThreshold_; } | |
| inline float getNetworkWidth() { return network_width_; } | |
| inline float getNetworkHeight() { return network_height_; } | |
| virtual std::vector<Detection> postprocess(const std::vector<std::vector<TensorElement>>& outputs, const std::vector<std::vector<int64_t>>& shapes, const cv::Size& frame_size) = 0; | |
| virtual cv::Mat preprocess_image(const cv::Mat& image) = 0; | |
| }; | |
// Detection results as three parallel arrays: the same index across all
// three vectors refers to the same detection.
struct Output
{
    std::vector<cv::Rect> boxes;   // boxes in original-image coordinates
    std::vector<float> confs;      // confidence per box
    std::vector<int> classIds;     // class index per box
};
// YOLO detector handling both the v5/v6/v7 output layout ([1, boxes, attrs])
// and the ultralytics v8 layout ([1, attrs, boxes]).
class YoloVn : public Detector
{
public:
    // Defaults match a 640x640 ONNX export -- presumably the one produced
    // by the export snippet above; verify against the actual model.
    YoloVn(
        float confidenceThreshold = 0.25,
        size_t network_width = 640,
        size_t network_height = 640);

    // Decode raw tensors into NMS-filtered detections.
    std::vector<Detection> postprocess(const std::vector<std::vector<TensorElement>> &outputs, const std::vector<std::vector<int64_t>> &shapes, const cv::Size &frame_size) override;

    // Letterbox-resize, BGR->RGB, and normalize to [0,1] float.
    cv::Mat preprocess_image(const cv::Mat &image) override;

    // Map a network-space (cx, cy, w, h) bbox back to image coordinates.
    cv::Rect get_rect(const cv::Size &imgSz, const std::vector<float> &bbox);

    // Layout-specific decoders; these return raw proposals (no NMS).
    std::tuple<std::vector<cv::Rect>, std::vector<float>, std::vector<int>> postprocess_v567(const TensorElement *output, const std::vector<int64_t> &shape, const cv::Size &frame_size);
    std::tuple<std::vector<cv::Rect>, std::vector<float>, std::vector<int>> postprocess_ultralytics(const TensorElement *output, const std::vector<int64_t> &shape, const cv::Size &frame_size);
    Output postprocess_ultralytics2(const float *output, const std::vector<int64_t> &shape, const cv::Size &frame_size);
};
| YoloVn::YoloVn( | |
| float confidenceThreshold, | |
| size_t network_width, | |
| size_t network_height) : Detector{confidenceThreshold, | |
| network_width, | |
| network_height} | |
| { | |
| } | |
| cv::Mat YoloVn::preprocess_image(const cv::Mat &img) | |
| { | |
| int w, h, x, y; | |
| float r_w = network_width_ / (img.cols * 1.0); | |
| float r_h = network_height_ / (img.rows * 1.0); | |
| if (r_h > r_w) | |
| { | |
| w = network_width_; | |
| h = r_w * img.rows; | |
| x = 0; | |
| y = (network_height_ - h) / 2; | |
| } | |
| else | |
| { | |
| w = r_h * img.cols; | |
| h = network_height_; | |
| x = (network_width_ - w) / 2; | |
| y = 0; | |
| } | |
| cv::Mat re(h, w, CV_8UC3); | |
| cv::resize(img, re, re.size(), 0, 0, cv::INTER_LINEAR); | |
| cv::Mat out(network_width_, network_height_, CV_8UC3, cv::Scalar(128, 128, 128)); | |
| re.copyTo(out(cv::Rect(x, y, re.cols, re.rows))); | |
| cv::cvtColor(out, out, cv::COLOR_BGR2RGB); | |
| out.convertTo(out, CV_32F, 1.0 / 255.0); | |
| return out; | |
| } | |
| cv::Rect YoloVn::get_rect(const cv::Size &imgSz, const std::vector<float> &bbox) | |
| { | |
| float r_w = network_width_ / static_cast<float>(imgSz.width); | |
| float r_h = network_height_ / static_cast<float>(imgSz.height); | |
| int l, r, t, b; | |
| if (r_h > r_w) | |
| { | |
| l = bbox[0] - bbox[2] / 2.f; | |
| r = bbox[0] + bbox[2] / 2.f; | |
| t = bbox[1] - bbox[3] / 2.f - (network_height_ - r_w * imgSz.height) / 2; | |
| b = bbox[1] + bbox[3] / 2.f - (network_height_ - r_w * imgSz.height) / 2; | |
| l /= r_w; | |
| r /= r_w; | |
| t /= r_w; | |
| b /= r_w; | |
| } | |
| else | |
| { | |
| l = bbox[0] - bbox[2] / 2.f - (network_width_ - r_h * imgSz.width) / 2; | |
| r = bbox[0] + bbox[2] / 2.f - (network_width_ - r_h * imgSz.width) / 2; | |
| t = bbox[1] - bbox[3] / 2.f; | |
| b = bbox[1] + bbox[3] / 2.f; | |
| l /= r_h; | |
| r /= r_h; | |
| t /= r_h; | |
| b /= r_h; | |
| } | |
| // Clamp the coordinates within the image bounds | |
| l = std::max(0, std::min(l, imgSz.width - 1)); | |
| r = std::max(0, std::min(r, imgSz.width - 1)); | |
| t = std::max(0, std::min(t, imgSz.height - 1)); | |
| b = std::max(0, std::min(b, imgSz.height - 1)); | |
| return cv::Rect(l, t, r - l, b - t); | |
| } | |
| std::tuple<std::vector<cv::Rect>, std::vector<float>, std::vector<int>> YoloVn::postprocess_v567(const TensorElement *output, const std::vector<int64_t> &shape, const cv::Size &frame_size) | |
| { | |
| std::vector<cv::Rect> boxes; | |
| std::vector<float> confs; | |
| std::vector<int> classIds; | |
| const auto offset = 5; | |
| const auto num_classes = shape[2] - offset; // 1 x 25200 x 85 | |
| for (int i = 0; i < shape[1]; ++i) | |
| { | |
| const auto obj_conf = std::get<float>(output[4]); | |
| auto maxSPtr = std::max_element(output + 5, output + 5 + num_classes, | |
| [](const TensorElement &a, const TensorElement &b) | |
| { | |
| return std::get<float>(a) < std::get<float>(b); | |
| }); | |
| float score = std::get<float>(*maxSPtr) * obj_conf; | |
| if (score > confidenceThreshold_) | |
| { | |
| std::vector<float> bbox; | |
| for (int j = 0; j < 4; ++j) | |
| { | |
| bbox.emplace_back(std::get<float>(output[j])); | |
| } | |
| boxes.emplace_back(get_rect(frame_size, bbox)); | |
| int label = maxSPtr - (output + 5); | |
| confs.emplace_back(score); | |
| classIds.emplace_back(label); | |
| } | |
| output += shape[2]; | |
| } | |
| return std::make_tuple(boxes, confs, classIds); | |
| } | |
| std::tuple<std::vector<cv::Rect>, std::vector<float>, std::vector<int>> YoloVn::postprocess_ultralytics(const TensorElement *output, const std::vector<int64_t> &shape, const cv::Size &frame_size) | |
| { | |
| std::vector<cv::Rect> boxes; | |
| std::vector<float> confs; | |
| std::vector<int> classIds; | |
| const auto offset = 4; | |
| const auto num_classes = shape[1] - offset; | |
| std::vector<std::vector<float>> output_matrix(shape[1], std::vector<float>(shape[2])); | |
| // Construct output matrix | |
| for (size_t i = 0; i < shape[1]; ++i) | |
| { | |
| for (size_t j = 0; j < shape[2]; ++j) | |
| { | |
| output_matrix[i][j] = std::get<float>(output[i * shape[2] + j]); | |
| } | |
| } | |
| std::vector<std::vector<float>> transposed_output(shape[2], std::vector<float>(shape[1])); | |
| // Transpose output matrix | |
| for (int i = 0; i < shape[1]; ++i) | |
| { | |
| for (int j = 0; j < shape[2]; ++j) | |
| { | |
| transposed_output[j][i] = output_matrix[i][j]; | |
| } | |
| } | |
| // Get all the YOLO proposals | |
| for (int i = 0; i < shape[2]; ++i) | |
| { | |
| const auto &row = transposed_output[i]; | |
| const float *bboxesPtr = row.data(); | |
| const float *scoresPtr = bboxesPtr + 4; | |
| auto maxSPtr = std::max_element(scoresPtr, scoresPtr + num_classes); | |
| float score = *maxSPtr; | |
| if (score > confidenceThreshold_) | |
| { | |
| boxes.emplace_back(get_rect(frame_size, std::vector<float>(bboxesPtr, bboxesPtr + 4))); | |
| int label = maxSPtr - scoresPtr; | |
| confs.emplace_back(score); | |
| classIds.emplace_back(label); | |
| } | |
| } | |
| return std::make_tuple(boxes, confs, classIds); | |
| } | |
| Output YoloVn::postprocess_ultralytics2(const float *output, const std::vector<int64_t> &shape, const cv::Size &frame_size) | |
| { | |
| std::vector<cv::Rect> boxes; | |
| std::vector<float> confs; | |
| std::vector<int> classIds; | |
| const auto offset = 4; | |
| const auto num_classes = shape[1] - offset; | |
| std::vector<std::vector<float>> output_matrix(shape[1], std::vector<float>(shape[2])); | |
| // Construct output matrix | |
| for (size_t i = 0; i < shape[1]; ++i) | |
| { | |
| for (size_t j = 0; j < shape[2]; ++j) | |
| { | |
| output_matrix[i][j] = output[i * shape[2] + j]; | |
| } | |
| } | |
| std::vector<std::vector<float>> transposed_output(shape[2], std::vector<float>(shape[1])); | |
| // Transpose output matrix | |
| for (int i = 0; i < shape[1]; ++i) | |
| { | |
| for (int j = 0; j < shape[2]; ++j) | |
| { | |
| transposed_output[j][i] = output_matrix[i][j]; | |
| } | |
| } | |
| // Get all the YOLO proposals | |
| for (int i = 0; i < shape[2]; ++i) | |
| { | |
| const auto &row = transposed_output[i]; | |
| const float *bboxesPtr = row.data(); | |
| const float *scoresPtr = bboxesPtr + 4; | |
| auto maxSPtr = std::max_element(scoresPtr, scoresPtr + num_classes); | |
| float score = *maxSPtr; | |
| if (score > confidenceThreshold_) | |
| { | |
| boxes.emplace_back(get_rect(frame_size, std::vector<float>(bboxesPtr, bboxesPtr + 4))); | |
| int label = maxSPtr - scoresPtr; | |
| confs.emplace_back(score); | |
| classIds.emplace_back(label); | |
| } | |
| } | |
| return Output{boxes, confs, classIds}; | |
| // return std::make_tuple(boxes, confs, classIds); | |
| } | |
| std::vector<Detection> YoloVn::postprocess(const std::vector<std::vector<TensorElement>> &outputs, const std::vector<std::vector<int64_t>> &shapes, const cv::Size &frame_size) | |
| { | |
| const TensorElement *output0 = outputs.front().data(); | |
| const std::vector<int64_t> shape0 = shapes.front(); | |
| const auto [boxes, confs, classIds] = (shape0[1] > shape0[2]) ? postprocess_v567(output0, shape0, frame_size) : postprocess_ultralytics(output0, shape0, frame_size); | |
| // Perform Non Maximum Suppression and draw predictions. | |
| std::vector<int> indices; | |
| cv::dnn::NMSBoxes(boxes, confs, confidenceThreshold_, nms_threshold_, indices); | |
| std::vector<Detection> detections; | |
| for (int i = 0; i < indices.size(); i++) | |
| { | |
| Detection det; | |
| int idx = indices[i]; | |
| det.label = classIds[idx]; | |
| det.bbox = boxes[idx]; | |
| det.score = confs[idx]; | |
| detections.emplace_back(det); | |
| } | |
| return detections; | |
| } | |
| std::vector<float> blob2vec(const cv::Mat& input_blob) | |
| { | |
| const auto channels = input_blob.size[1]; | |
| const auto network_width = input_blob.size[2]; | |
| const auto network_height = input_blob.size[3]; | |
| size_t img_byte_size = network_width * network_height * channels * sizeof(float); // Allocate a buffer to hold all image elements. | |
| std::vector<float> input_data = std::vector<float>(network_width * network_height * channels); | |
| std::memcpy(input_data.data(), input_blob.data, img_byte_size); | |
| std::vector<cv::Mat> chw; | |
| for (size_t i = 0; i < channels; ++i) | |
| { | |
| chw.emplace_back(cv::Mat(cv::Size(network_width, network_height), CV_32FC1, &(input_data[i * network_width * network_height]))); | |
| } | |
| cv::split(input_blob, chw); | |
| return input_data; | |
| } | |
| int main(int argc, char **argv) | |
| { | |
| const std::string model_path = "../../models/yolov8n.onnx"; | |
| const std::string image_path = "../../img/dog.png"; | |
| // const std::string image_path = "../../img/cat.jpeg"; | |
| YoloVn yolo; | |
| cv::Mat img = cv::imread(image_path, cv::IMREAD_COLOR); | |
| cv::Mat input_image = yolo.preprocess_image(img); | |
| cv::Mat blob; | |
| cv::dnn::blobFromImage(input_image, blob, 1.0, cv::Size(), cv::Scalar(), false, false); | |
| auto blob_vec = blob2vec(blob); | |
| Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "YOLOv8"); | |
| Ort::SessionOptions session_options; | |
| session_options.SetIntraOpNumThreads(1); | |
| Ort::Session session(env, model_path.c_str(), session_options); | |
| auto memory_info = Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeCPU); | |
| auto input_shape = session.GetInputTypeInfo(0).GetTensorTypeAndShapeInfo().GetShape(); | |
| auto input_tensor = Ort::Value::CreateTensor<float>( | |
| memory_info, | |
| blob_vec.data(), | |
| blob_vec.size(), | |
| input_shape.data(), | |
| input_shape.size() | |
| ); | |
| std::vector<const char*> input_names = {"images"}; | |
| std::vector<const char*> output_names = {"output0"}; | |
| std::vector<Ort::Value> infer_output = session.Run( | |
| Ort::RunOptions{nullptr}, | |
| input_names.data(), | |
| &input_tensor, | |
| 1, // input_count | |
| output_names.data(), | |
| 1 // output_count | |
| ); | |
| // Postprocess output | |
| float* outputs_raw = infer_output[0].GetTensorMutableData<float>(); | |
| std::vector<int64_t> output_shape = session.GetOutputTypeInfo(0).GetTensorTypeAndShapeInfo().GetShape(); | |
| const auto output_postprocess = yolo.postprocess_ultralytics2(outputs_raw, output_shape, img.size()); | |
| // Print detections | |
| std::cout << "Detection results:\n"; | |
| for (int i = 0; i < output_postprocess.classIds.size(); ++i) | |
| { | |
| std::cout | |
| << "Class: " << output_postprocess.classIds[i] | |
| << ", Confidence: " << output_postprocess.confs[i] | |
| << std::endl; | |
| } | |
| return EXIT_SUCCESS; | |
| } |
olibartfast
commented
Dec 30, 2024
Author
@olibartfast , thanks for the update!
The missing step was the blob2vec function.
I will update the original gist code with the latest modifications.
Thank you.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment