Skip to content

Instantly share code, notes, and snippets.

@StefanoLusardi
Last active December 31, 2024 08:38
Show Gist options
  • Select an option

  • Save StefanoLusardi/af2b3c11191d7b0c80431da202090dc1 to your computer and use it in GitHub Desktop.

Select an option

Save StefanoLusardi/af2b3c11191d7b0c80431da202090dc1 to your computer and use it in GitHub Desktop.
ONNX Yolo v8
# Export YOLOv8-nano weights to ONNX with a static 640x640 input
# (dynamic=False), opset 12, running the onnx-simplifier pass.
# The C++ program below consumes the resulting "yolov8n.onnx".
from ultralytics import YOLO
model = YOLO("yolov8n.pt")
model.export(format="onnx", opset=12, simplify=True, dynamic=False, imgsz=640)
#include <onnxruntime/core/session/onnxruntime_cxx_api.h>
#include <opencv2/opencv.hpp>
#include <iostream>
#include <vector>
#include <string>
#include <algorithm>
#include <numeric>
#include <variant>
#include <fstream>
// One detected object, expressed in the original frame's coordinate space.
struct Detection
{
cv::Rect bbox; // bounding box in source-image pixel coordinates
float score; // detection confidence used for thresholding/NMS
int label; // class index into the model's label set
};
// First, define the variant type (could be in the header file)
// A single output-tensor element; ONNX Runtime outputs may be float32,
// int32 or int64 depending on the model, so wrap them in one variant.
using TensorElement = std::variant<float, int32_t, int64_t>;
// Abstract base for object detectors: holds the common thresholds and the
// network input geometry; concrete detectors implement pre/post-processing.
class Detector
{
protected:
    float confidenceThreshold_;   // minimum score for a candidate detection
    float nms_threshold_ = 0.4f;  // IoU threshold for non-maximum suppression
    size_t network_width_;        // network input width in pixels
    size_t network_height_;       // network input height in pixels
    std::string backend_;
    int channels_{ -1 };

public:
    // confidenceThreshold: minimum detection score to keep a candidate.
    // network_width/network_height: model input size (size_t(-1) == unset).
    Detector(
        float confidenceThreshold = 0.5f,
        size_t network_width = -1,
        size_t network_height = -1
    ) : confidenceThreshold_{confidenceThreshold},
        network_width_{network_width},
        network_height_{network_height}
    {
    }

    // Polymorphic base class: a virtual destructor is required so that
    // deleting a derived detector through a Detector* is well-defined.
    virtual ~Detector() = default;

    inline float getConfidenceThreshold(){ return confidenceThreshold_; }
    inline float getNetworkWidth() { return network_width_; }
    inline float getNetworkHeight() { return network_height_; }

    // Convert the raw network outputs (one flat vector per output tensor,
    // with matching shapes) into detections in frame_size coordinates.
    virtual std::vector<Detection> postprocess(const std::vector<std::vector<TensorElement>>& outputs, const std::vector<std::vector<int64_t>>& shapes, const cv::Size& frame_size) = 0;

    // Resize/normalize an input image into the network's expected format.
    virtual cv::Mat preprocess_image(const cv::Mat& image) = 0;
};
// Raw post-processing result before NMS index selection: parallel arrays,
// one entry per kept candidate box.
struct Output
{
std::vector<cv::Rect> boxes; // candidate boxes in source-image coordinates
std::vector<float> confs; // confidence score per box
std::vector<int> classIds; // class index per box
};
// Detector for YOLO v5/v6/v7/v8 ONNX exports; picks the decoder based on
// the output tensor's layout.
class YoloVn : public Detector
{
public:
// Defaults match the standard ultralytics 640x640 export.
YoloVn(
float confidenceThreshold = 0.25,
size_t network_width = 640,
size_t network_height = 640);
// Dispatches to the layout-specific decoder, then applies NMS.
std::vector<Detection> postprocess(const std::vector<std::vector<TensorElement>> &outputs, const std::vector<std::vector<int64_t>> &shapes, const cv::Size &frame_size) override;
// Letterbox resize + BGR->RGB + float [0,1] normalization.
cv::Mat preprocess_image(const cv::Mat &image) override;
// Map a (cx, cy, w, h) box from network space back to image coordinates.
cv::Rect get_rect(const cv::Size &imgSz, const std::vector<float> &bbox);
// Decoder for v5/6/7-style outputs shaped (1, anchors, 5 + classes).
std::tuple<std::vector<cv::Rect>, std::vector<float>, std::vector<int>> postprocess_v567(const TensorElement *output, const std::vector<int64_t> &shape, const cv::Size &frame_size);
// Decoder for v8/ultralytics outputs shaped (1, 4 + classes, anchors).
std::tuple<std::vector<cv::Rect>, std::vector<float>, std::vector<int>> postprocess_ultralytics(const TensorElement *output, const std::vector<int64_t> &shape, const cv::Size &frame_size);
// Same decoding as postprocess_ultralytics, but on raw floats and
// returning an Output struct instead of a tuple.
Output postprocess_ultralytics2(const float *output, const std::vector<int64_t> &shape, const cv::Size &frame_size);
};
// Forwards the confidence threshold and the network input size straight
// to the Detector base; YoloVn adds no state of its own.
YoloVn::YoloVn(
    float confidenceThreshold,
    size_t network_width,
    size_t network_height)
    : Detector{confidenceThreshold, network_width, network_height}
{
}
// Letterbox-resize an image to (network_width_ x network_height_): scale
// preserving the aspect ratio, pad the remainder with gray (128), convert
// BGR->RGB and normalize to CV_32F in [0, 1].
cv::Mat YoloVn::preprocess_image(const cv::Mat &img)
{
    const int net_w = static_cast<int>(network_width_);
    const int net_h = static_cast<int>(network_height_);
    int w, h, x, y;
    const float r_w = net_w / (img.cols * 1.0f);
    const float r_h = net_h / (img.rows * 1.0f);
    if (r_h > r_w)
    {
        // Width is the limiting dimension: pad top and bottom.
        w = net_w;
        h = static_cast<int>(r_w * img.rows);
        x = 0;
        y = (net_h - h) / 2;
    }
    else
    {
        // Height is the limiting dimension: pad left and right.
        w = static_cast<int>(r_h * img.cols);
        h = net_h;
        x = (net_w - w) / 2;
        y = 0;
    }
    cv::Mat re(h, w, CV_8UC3);
    cv::resize(img, re, re.size(), 0, 0, cv::INTER_LINEAR);
    // cv::Mat's constructor takes (rows, cols) == (height, width). The
    // original passed (width, height), which only worked because the
    // network input is square (640x640).
    cv::Mat out(net_h, net_w, CV_8UC3, cv::Scalar(128, 128, 128));
    re.copyTo(out(cv::Rect(x, y, re.cols, re.rows)));
    cv::cvtColor(out, out, cv::COLOR_BGR2RGB);
    out.convertTo(out, CV_32F, 1.0 / 255.0);
    return out;
}
// Map a bbox given as (cx, cy, w, h) in network input coordinates back to
// the original image: undo the letterbox padding, divide out the scale, and
// clamp to the image bounds.
cv::Rect YoloVn::get_rect(const cv::Size &imgSz, const std::vector<float> &bbox)
{
    const float r_w = network_width_ / static_cast<float>(imgSz.width);
    const float r_h = network_height_ / static_cast<float>(imgSz.height);

    float l, r, t, b;
    if (r_h > r_w)
    {
        // Image was scaled by r_w and padded vertically: remove the vertical
        // pad before unscaling.
        const float pad = (network_height_ - r_w * imgSz.height) / 2.f;
        l = (bbox[0] - bbox[2] / 2.f) / r_w;
        r = (bbox[0] + bbox[2] / 2.f) / r_w;
        t = (bbox[1] - bbox[3] / 2.f - pad) / r_w;
        b = (bbox[1] + bbox[3] / 2.f - pad) / r_w;
    }
    else
    {
        // Image was scaled by r_h and padded horizontally.
        const float pad = (network_width_ - r_h * imgSz.width) / 2.f;
        l = (bbox[0] - bbox[2] / 2.f - pad) / r_h;
        r = (bbox[0] + bbox[2] / 2.f - pad) / r_h;
        t = (bbox[1] - bbox[3] / 2.f) / r_h;
        b = (bbox[1] + bbox[3] / 2.f) / r_h;
    }

    // Keep coordinates in float until the very end so truncation happens
    // once, after scaling (the original truncated to int before dividing,
    // losing sub-pixel precision). Then clamp within the image bounds.
    const int li = std::clamp(static_cast<int>(l), 0, imgSz.width - 1);
    const int ri = std::clamp(static_cast<int>(r), 0, imgSz.width - 1);
    const int ti = std::clamp(static_cast<int>(t), 0, imgSz.height - 1);
    const int bi = std::clamp(static_cast<int>(b), 0, imgSz.height - 1);
    return cv::Rect(li, ti, ri - li, bi - ti);
}
// Decode a YOLO v5/6/7-style output tensor shaped (1, anchors, 5 + classes):
// each row is [cx, cy, w, h, objectness, class scores...]. A row is kept
// when objectness * best-class-score exceeds the confidence threshold.
// The variants are expected to hold float (std::get<float> throws otherwise).
std::tuple<std::vector<cv::Rect>, std::vector<float>, std::vector<int>> YoloVn::postprocess_v567(const TensorElement *output, const std::vector<int64_t> &shape, const cv::Size &frame_size)
{
    std::vector<cv::Rect> boxes;
    std::vector<float> confs;
    std::vector<int> classIds;
    const auto offset = 5;                       // cx, cy, w, h, objectness
    const auto num_classes = shape[2] - offset;  // e.g. 1 x 25200 x 85 -> 80 classes
    for (int64_t i = 0; i < shape[1]; ++i)
    {
        const auto obj_conf = std::get<float>(output[4]);
        // Best class score for this anchor row (use offset consistently
        // instead of the hardcoded 5 the original mixed in).
        auto maxSPtr = std::max_element(output + offset, output + offset + num_classes,
            [](const TensorElement &a, const TensorElement &b)
            {
                return std::get<float>(a) < std::get<float>(b);
            });
        const float score = std::get<float>(*maxSPtr) * obj_conf;
        if (score > confidenceThreshold_)
        {
            std::vector<float> bbox;
            bbox.reserve(4);
            for (int j = 0; j < 4; ++j)
            {
                bbox.emplace_back(std::get<float>(output[j]));
            }
            boxes.emplace_back(get_rect(frame_size, bbox));
            const int label = static_cast<int>(maxSPtr - (output + offset));
            confs.emplace_back(score);
            classIds.emplace_back(label);
        }
        output += shape[2];  // advance to the next anchor row
    }
    return std::make_tuple(boxes, confs, classIds);
}
// Decode an ultralytics YOLOv8-style output tensor shaped
// (1, 4 + classes, anchors): the layout is feature-major, so each anchor is
// a COLUMN. Reads one column at a time instead of materializing the whole
// matrix plus a full transpose (the original made two complete copies of
// the output tensor before scanning it).
std::tuple<std::vector<cv::Rect>, std::vector<float>, std::vector<int>> YoloVn::postprocess_ultralytics(const TensorElement *output, const std::vector<int64_t> &shape, const cv::Size &frame_size)
{
    std::vector<cv::Rect> boxes;
    std::vector<float> confs;
    std::vector<int> classIds;
    const int64_t offset = 4;  // cx, cy, w, h precede the class scores
    const int64_t num_classes = shape[1] - offset;
    const int64_t num_anchors = shape[2];

    std::vector<float> column(shape[1]);  // one anchor: [bbox | class scores]
    for (int64_t i = 0; i < num_anchors; ++i)
    {
        // Gather the i-th column: element (f, i) lives at f * num_anchors + i.
        for (int64_t f = 0; f < shape[1]; ++f)
        {
            column[f] = std::get<float>(output[f * num_anchors + i]);
        }
        const float *bboxesPtr = column.data();
        const float *scoresPtr = bboxesPtr + offset;
        auto maxSPtr = std::max_element(scoresPtr, scoresPtr + num_classes);
        const float score = *maxSPtr;
        if (score > confidenceThreshold_)
        {
            boxes.emplace_back(get_rect(frame_size, std::vector<float>(bboxesPtr, bboxesPtr + offset)));
            confs.emplace_back(score);
            classIds.emplace_back(static_cast<int>(maxSPtr - scoresPtr));
        }
    }
    return std::make_tuple(boxes, confs, classIds);
}
// Decode an ultralytics YOLOv8-style output tensor shaped
// (1, 4 + classes, anchors) directly from raw floats. The layout is
// feature-major (each anchor is a column, stride num_anchors), so the
// scores can be scanned in place with zero intermediate copies — the
// original built the full matrix AND its transpose first.
Output YoloVn::postprocess_ultralytics2(const float *output, const std::vector<int64_t> &shape, const cv::Size &frame_size)
{
    std::vector<cv::Rect> boxes;
    std::vector<float> confs;
    std::vector<int> classIds;
    const int64_t offset = 4;  // cx, cy, w, h precede the class scores
    const int64_t num_classes = shape[1] - offset;
    const int64_t num_anchors = shape[2];

    for (int64_t i = 0; i < num_anchors; ++i)
    {
        // Best class score for anchor i; scores are strided by num_anchors.
        int label = 0;
        float score = output[offset * num_anchors + i];
        for (int64_t c = 1; c < num_classes; ++c)
        {
            const float s = output[(offset + c) * num_anchors + i];
            if (s > score)
            {
                score = s;
                label = static_cast<int>(c);
            }
        }
        if (score > confidenceThreshold_)
        {
            const std::vector<float> bbox = {
                output[0 * num_anchors + i],
                output[1 * num_anchors + i],
                output[2 * num_anchors + i],
                output[3 * num_anchors + i]
            };
            boxes.emplace_back(get_rect(frame_size, bbox));
            confs.emplace_back(score);
            classIds.emplace_back(label);
        }
    }
    return Output{boxes, confs, classIds};
}
// Decode raw tensor outputs into final detections: dispatch on the output
// layout, then apply Non-Maximum Suppression and collect the survivors.
std::vector<Detection> YoloVn::postprocess(const std::vector<std::vector<TensorElement>> &outputs, const std::vector<std::vector<int64_t>> &shapes, const cv::Size &frame_size)
{
    const TensorElement *output0 = outputs.front().data();
    const std::vector<int64_t> shape0 = shapes.front();
    // v5/6/7 exports are (1, anchors, 5+classes) with anchors >> features;
    // v8/ultralytics exports are (1, 4+classes, anchors).
    const auto [boxes, confs, classIds] = (shape0[1] > shape0[2])
        ? postprocess_v567(output0, shape0, frame_size)
        : postprocess_ultralytics(output0, shape0, frame_size);

    // Non-Maximum Suppression keeps the best box among overlapping candidates.
    std::vector<int> indices;
    cv::dnn::NMSBoxes(boxes, confs, confidenceThreshold_, nms_threshold_, indices);

    std::vector<Detection> detections;
    detections.reserve(indices.size());
    for (const int idx : indices)  // range-for avoids the signed/unsigned loop mix
    {
        Detection det;
        det.label = classIds[idx];
        det.bbox = boxes[idx];
        det.score = confs[idx];
        detections.emplace_back(det);
    }
    return detections;
}
// Copy a 4-D NCHW float blob (as produced by cv::dnn::blobFromImage) into a
// flat std::vector<float> suitable for Ort::Value::CreateTensor<float>.
std::vector<float> blob2vec(const cv::Mat& input_blob)
{
// Blob dims are [batch, channels, rows, cols]; batch is assumed to be 1.
// NOTE(review): size[2] is rows (height) and size[3] is cols (width) in a
// blobFromImage blob — the variable names below appear swapped, which is
// harmless only while the network input is square; confirm before using
// non-square sizes.
const auto channels = input_blob.size[1];
const auto network_width = input_blob.size[2];
const auto network_height = input_blob.size[3];
size_t img_byte_size = network_width * network_height * channels * sizeof(float); // Allocate a buffer to hold all image elements.
std::vector<float> input_data = std::vector<float>(network_width * network_height * channels);
std::memcpy(input_data.data(), input_blob.data, img_byte_size);
// NOTE(review): the memcpy above already copies the planar NCHW data; the
// per-channel Mats and cv::split below look redundant (and split on a 4-D
// single-channel Mat is questionable) — presumably leftover from an HWC
// variant of this helper. Verify whether this block can be removed.
std::vector<cv::Mat> chw;
for (size_t i = 0; i < channels; ++i)
{
chw.emplace_back(cv::Mat(cv::Size(network_width, network_height), CV_32FC1, &(input_data[i * network_width * network_height])));
}
cv::split(input_blob, chw);
return input_data;
}
// Load a YOLOv8 ONNX model, run it on a single image with ONNX Runtime,
// and print the decoded detections.
int main(int argc, char **argv)
{
    const std::string model_path = "../../models/yolov8n.onnx";
    const std::string image_path = "../../img/dog.png";
    // const std::string image_path = "../../img/cat.jpeg";

    YoloVn yolo;
    cv::Mat img = cv::imread(image_path, cv::IMREAD_COLOR);
    // imread returns an empty Mat on failure; bail out instead of crashing
    // inside preprocess_image.
    if (img.empty())
    {
        std::cerr << "Failed to read image: " << image_path << std::endl;
        return EXIT_FAILURE;
    }
    cv::Mat input_image = yolo.preprocess_image(img);

    // HWC float image -> NCHW blob -> flat vector for the input tensor.
    cv::Mat blob;
    cv::dnn::blobFromImage(input_image, blob, 1.0, cv::Size(), cv::Scalar(), false, false);
    auto blob_vec = blob2vec(blob);

    Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "YOLOv8");
    Ort::SessionOptions session_options;
    session_options.SetIntraOpNumThreads(1);
    Ort::Session session(env, model_path.c_str(), session_options);
    auto memory_info = Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeCPU);

    auto input_shape = session.GetInputTypeInfo(0).GetTensorTypeAndShapeInfo().GetShape();
    auto input_tensor = Ort::Value::CreateTensor<float>(
        memory_info,
        blob_vec.data(),
        blob_vec.size(),
        input_shape.data(),
        input_shape.size()
    );

    // Tensor names as baked into the ultralytics ONNX export.
    std::vector<const char*> input_names = {"images"};
    std::vector<const char*> output_names = {"output0"};

    std::vector<Ort::Value> infer_output = session.Run(
        Ort::RunOptions{nullptr},
        input_names.data(),
        &input_tensor,
        1, // input_count
        output_names.data(),
        1  // output_count
    );

    // Postprocess output. Take the shape from the actual output tensor
    // rather than the session metadata, so exports with dynamic axes
    // resolve to concrete dimensions.
    float* outputs_raw = infer_output[0].GetTensorMutableData<float>();
    std::vector<int64_t> output_shape = infer_output[0].GetTensorTypeAndShapeInfo().GetShape();
    const auto output_postprocess = yolo.postprocess_ultralytics2(outputs_raw, output_shape, img.size());

    // Print detections
    std::cout << "Detection results:\n";
    for (size_t i = 0; i < output_postprocess.classIds.size(); ++i)
    {
        std::cout
            << "Class: " << output_postprocess.classIds[i]
            << ", Confidence: " << output_postprocess.confs[i]
            << std::endl;
    }
    return EXIT_SUCCESS;
}
@olibartfast
Copy link

std::vector<float> blob2vec(const cv::Mat& input_blob)
{

    const auto channels = input_blob.size[1];
    const auto network_width = input_blob.size[2];
    const auto network_height = input_blob.size[3];
    size_t img_byte_size = network_width * network_height * channels * sizeof(float);  // Allocate a buffer to hold all image elements.
    std::vector<float> input_data = std::vector<float>(network_width * network_height * channels);
    std::memcpy(input_data.data(), input_blob.data, img_byte_size);

    std::vector<cv::Mat> chw;
    for (size_t i = 0; i < channels; ++i)
    {
        chw.emplace_back(cv::Mat(cv::Size(network_width, network_height), CV_32FC1, &(input_data[i * network_width * network_height])));
    }
    cv::split(input_blob, chw);

    return input_data;    
}

int main(int argc, char **argv)
{
    const std::string model_path = "/home/oli/yolov8n.onnx";
    const std::string image_path = "dog.jpg";

    YoloVn yolo;
    cv::Mat img = cv::imread(image_path, cv::IMREAD_COLOR);
    cv::Mat input_image = yolo.preprocess_image(img);

    cv::Mat blob;
    cv::dnn::blobFromImage(input_image, blob, 1.0, cv::Size(), cv::Scalar(), false, false);
    auto blob_vec = blob2vec(blob);

    Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "YOLOv8");
    Ort::SessionOptions session_options;
    session_options.SetIntraOpNumThreads(1);
    Ort::Session session(env, model_path.c_str(), session_options);
    auto memory_info = Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeCPU);

    auto input_shape = session.GetInputTypeInfo(0).GetTensorTypeAndShapeInfo().GetShape();
    auto input_tensor = Ort::Value::CreateTensor<float>(
        memory_info,
        blob_vec.data(),
        blob_vec.size(),
        input_shape.data(),
        input_shape.size()
    );

    std::vector<const char*> input_names = {"images"};
    std::vector<const char*> output_names = {"output0"};

    std::vector<Ort::Value> infer_output = session.Run(
        Ort::RunOptions{nullptr},
        input_names.data(),
        &input_tensor,
        1, // input_count
        output_names.data(),
        1 // output_count
    );
    // Postprocess output
    float* outputs_raw = infer_output[0].GetTensorMutableData<float>();
    std::vector<int64_t> output_shape = infer_output[0].GetTensorTypeAndShapeInfo().GetShape();

    // Wrap the raw output in the format expected by postprocess
    std::vector<std::vector<TensorElement>> wrapped_output(1);
    wrapped_output[0].reserve(output_shape[1] * output_shape[2]);
    for (int64_t i = 0; i < output_shape[1] * output_shape[2]; ++i) {
        wrapped_output[0].push_back(outputs_raw[i]);
    }

    std::vector<std::vector<int64_t>> wrapped_shape = {output_shape};

    // Call postprocess
    std::vector<Detection> detections = yolo.postprocess(wrapped_output, wrapped_shape, img.size());

    // Print detections
    std::cout << "Detection results:\n";
    for (const auto& det : detections)
    {
        std::cout
            << "Class: " << det.label
            << ", Confidence: " << det.score
            << ", Bounding Box: (" << det.bbox.x << ", " << det.bbox.y << ", " 
            << det.bbox.width << ", " << det.bbox.height << ")"
            << std::endl;
    }

    return EXIT_SUCCESS;
}

@StefanoLusardi
Copy link
Author

@olibartfast , thanks for the update!
The missing step was the blob2vec function.
I will update the original gist code with the latest modifications.
Thank you.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment