From 71cdbf8601059221757af0880caeab79e9cecd00 Mon Sep 17 00:00:00 2001 From: Nuzhny007 Date: Mon, 6 Apr 2026 22:09:53 +0300 Subject: [PATCH 1/5] Try to add fp8 Precision # Conflicts: # data/settings_yoloe_seg.ini --- data/settings_yolov26m.ini | 1 + src/Detector/ONNXTensorRTDetector.cpp | 1 + src/Detector/tensorrt_onnx/YoloONNX.cpp | 19 ++++++++++++----- src/Detector/tensorrt_onnx/YoloONNX.hpp | 1 + src/Detector/tensorrt_onnx/class_detector.cpp | 2 ++ src/Detector/tensorrt_onnx/class_detector.h | 3 ++- .../tensorrt_onnx/common/sampleInference.cpp | 21 +++++++++++++++++-- .../tensorrt_onnx/common/sampleUtils.cpp | 11 +++++++++- 8 files changed, 50 insertions(+), 9 deletions(-) diff --git a/data/settings_yolov26m.ini b/data/settings_yolov26m.ini index 625ce893..23dd24f4 100644 --- a/data/settings_yolov26m.ini +++ b/data/settings_yolov26m.ini @@ -47,6 +47,7 @@ net_type = YOLOV26 # INT8 # FP16 # FP32 +# FP8 inference_precision = FP16 diff --git a/src/Detector/ONNXTensorRTDetector.cpp b/src/Detector/ONNXTensorRTDetector.cpp index 9d28da54..b0a734a7 100644 --- a/src/Detector/ONNXTensorRTDetector.cpp +++ b/src/Detector/ONNXTensorRTDetector.cpp @@ -72,6 +72,7 @@ bool ONNXTensorRTDetector::Init(const config_t& config) dictPrecision["INT8"] = tensor_rt::INT8; dictPrecision["FP16"] = tensor_rt::FP16; dictPrecision["FP32"] = tensor_rt::FP32; + dictPrecision["FP8"] = tensor_rt::FP8; auto precision = dictPrecision.find(inference_precision->second); if (precision != dictPrecision.end()) m_localConfig.m_inferencePrecision = precision->second; diff --git a/src/Detector/tensorrt_onnx/YoloONNX.cpp b/src/Detector/tensorrt_onnx/YoloONNX.cpp index aa4d23a6..bec31df9 100644 --- a/src/Detector/tensorrt_onnx/YoloONNX.cpp +++ b/src/Detector/tensorrt_onnx/YoloONNX.cpp @@ -19,6 +19,7 @@ bool YoloONNX::Init(const SampleYoloParams& params) m_params = params; + sample::setReportableSeverity(sample::Logger::Severity::kINFO); initLibNvInferPlugins(&sample::gLogger.getTRTLogger(), ""); auto GetBindings = [&]() @@ -79,15 +80,16 @@ bool YoloONNX::Init(const SampleYoloParams& params) file.close(); } - nvinfer1::IRuntime* infer = nvinfer1::createInferRuntime(sample::gLogger); + m_inferRuntime = std::shared_ptr(nvinfer1::createInferRuntime(sample::gLogger)); if (m_params.m_dlaCore >= 0) - infer->setDLACore(m_params.m_dlaCore); + m_inferRuntime->setDLACore(m_params.m_dlaCore); - m_engine = std::shared_ptr(infer->deserializeCudaEngine(trtModelStream.data(), size), samplesCommon::InferDeleter()); + m_engine = std::shared_ptr(m_inferRuntime->deserializeCudaEngine(trtModelStream.data(), size), samplesCommon::InferDeleter()); #if (NV_TENSORRT_MAJOR < 8) - infer->destroy(); + m_inferRuntime->destroy(); + m_inferRuntime.reset(); #else - //delete infer; + //m_inferRuntime.reset(); #endif if (m_engine) @@ -233,6 +235,12 @@ bool YoloONNX::ConstructNetwork(YoloONNXUniquePtr& builder, { case tensor_rt::Precision::FP16: config->setFlag(nvinfer1::BuilderFlag::kFP16); + sample::gLogInfo << "config->setFlag(nvinfer1::BuilderFlag::kFP16)" << std::endl; + break; + + case tensor_rt::Precision::FP8: + config->setFlag(nvinfer1::BuilderFlag::kFP8); + sample::gLogInfo << "config->setFlag(nvinfer1::BuilderFlag::kFP8)" << std::endl; break; case tensor_rt::Precision::INT8: @@ -243,6 +251,7 @@ bool YoloONNX::ConstructNetwork(YoloONNXUniquePtr& builder, BatchStream calibrationStream(m_params.m_explicitBatchSize, m_params.m_nbCalBatches, m_params.m_calibrationBatches, m_params.m_dataDirs); calibrator.reset(new Int8EntropyCalibrator2(calibrationStream, 0, "Yolo", m_params.m_inputTensorNames[0].c_str())); config->setFlag(nvinfer1::BuilderFlag::kINT8); + sample::gLogInfo << "config->setFlag(nvinfer1::BuilderFlag::kINT8)" << std::endl; config->setInt8Calibrator(calibrator.get()); } break; diff --git a/src/Detector/tensorrt_onnx/YoloONNX.hpp b/src/Detector/tensorrt_onnx/YoloONNX.hpp index 2452f61d..cae188c5 100644 --- a/src/Detector/tensorrt_onnx/YoloONNX.hpp +++ b/src/Detector/tensorrt_onnx/YoloONNX.hpp @@ -86,6 +86,7 @@ class YoloONNX private: std::shared_ptr m_engine; //!< The TensorRT engine used to run the network + std::shared_ptr m_inferRuntime; cv::Mat m_resized; std::vector m_resizedBatch; diff --git a/src/Detector/tensorrt_onnx/class_detector.cpp b/src/Detector/tensorrt_onnx/class_detector.cpp index 70f2aa48..f5f4fb66 100644 --- a/src/Detector/tensorrt_onnx/class_detector.cpp +++ b/src/Detector/tensorrt_onnx/class_detector.cpp @@ -134,6 +134,8 @@ namespace tensor_rt dictprecision[tensor_rt::INT8] = "kINT8"; dictprecision[tensor_rt::FP16] = "kHALF"; dictprecision[tensor_rt::FP32] = "kFLOAT"; + dictprecision[tensor_rt::FP8] = "kFP8"; + auto precision = dictprecision.find(m_params.m_precision); if (precision != dictprecision.end()) precisionStr = precision->second; diff --git a/src/Detector/tensorrt_onnx/class_detector.h b/src/Detector/tensorrt_onnx/class_detector.h index 29780685..7ea989bc 100644 --- a/src/Detector/tensorrt_onnx/class_detector.h +++ b/src/Detector/tensorrt_onnx/class_detector.h @@ -76,7 +76,8 @@ namespace tensor_rt { INT8 = 0, FP16, - FP32 + FP32, + FP8 }; /// diff --git a/src/Detector/tensorrt_onnx/common/sampleInference.cpp b/src/Detector/tensorrt_onnx/common/sampleInference.cpp index f0470bf7..b131ca32 100644 --- a/src/Detector/tensorrt_onnx/common/sampleInference.cpp +++ b/src/Detector/tensorrt_onnx/common/sampleInference.cpp @@ -46,6 +46,7 @@ #include "sampleOptions.h" #include "sampleReporting.h" #include "sampleUtils.h" +#include using namespace nvinfer1; namespace sample { @@ -1320,7 +1321,15 @@ void Binding::fill() fillBuffer(buffer->getHostBuffer(), volume, 0, 255); break; } - case nvinfer1::DataType::kFP8: ASSERT(false && "FP8 is not supported"); + case nvinfer1::DataType::kFP8: + { +#if 0 + ASSERT(false && "FP8 is not supported"); +#else + fillBuffer<__nv_fp8_e4m3>(buffer->getHostBuffer(), volume, __nv_fp8_e4m3(- 1.0f), __nv_fp8_e4m3(1.0f)); +#endif + break; + } #if (NV_TENSORRT_MAJOR > 8) case nvinfer1::DataType::kINT4: ASSERT(false && "INT4 is not supported"); #endif @@ -1388,7 +1397,15 @@ void Binding::dump(std::ostream& os, Dims dims, Dims strides, int32_t vectorDim, break; } #endif - case nvinfer1::DataType::kFP8: ASSERT(false && "FP8 is not supported"); + case nvinfer1::DataType::kFP8: + { +#if 0 + ASSERT(false && "FP8 is not supported"); +#else + dumpBuffer<__nv_fp8_e4m3>(outputBuffer, separator, os, dims, strides, vectorDim, spv); +#endif + break; + } #if (NV_TENSORRT_MAJOR > 8) case nvinfer1::DataType::kINT4: ASSERT(false && "INT4 is not supported"); #endif diff --git a/src/Detector/tensorrt_onnx/common/sampleUtils.cpp b/src/Detector/tensorrt_onnx/common/sampleUtils.cpp index 8f172afe..89a128ee 100644 --- a/src/Detector/tensorrt_onnx/common/sampleUtils.cpp +++ b/src/Detector/tensorrt_onnx/common/sampleUtils.cpp @@ -18,6 +18,7 @@ #include "sampleUtils.h" #include "bfloat16.h" #include "half.h" +#include using namespace nvinfer1; @@ -433,6 +434,11 @@ void print(std::ostream& os, __half v) os << static_cast(v); } +void print(std::ostream& os, __nv_fp8_e4m3 v) +{ + os << static_cast(v); +} + template void dumpBuffer(void const* buffer, std::string const& separator, std::ostream& os, Dims const& dims, Dims const& strides, int32_t vectorDim, int32_t spv) @@ -482,6 +488,8 @@ template void dumpBuffer(void const* buffer, std::string const& separat Dims const& strides, int32_t vectorDim, int32_t spv); template void dumpBuffer(void const* buffer, std::string const& separator, std::ostream& os, Dims const& dims, Dims const& strides, int32_t vectorDim, int32_t spv); +template void dumpBuffer<__nv_fp8_e4m3>(void const* buffer, std::string const& separator, std::ostream& os, Dims const& dims, + Dims const& strides, int32_t vectorDim, int32_t spv); template void sparsify(T const* values, int64_t count, int32_t k, int32_t trs, std::vector& sparseWeights) @@ -566,7 +574,7 @@ void fillBuffer(void* buffer, int64_t volume, T min, T max) { T* typedBuffer = static_cast(buffer); std::default_random_engine engine; - std::uniform_real_distribution distribution(min, max); + std::uniform_real_distribution distribution((float)min, (float)max); auto generator = [&engine, &distribution]() { return static_cast(distribution(engine)); }; std::generate(typedBuffer, typedBuffer + volume, generator); } @@ -580,6 +588,7 @@ template void fillBuffer(void* buffer, int64_t volume, int8_t min, int8_ template void fillBuffer<__half>(void* buffer, int64_t volume, __half min, __half max); template void fillBuffer(void* buffer, int64_t volume, BFloat16 min, BFloat16 max); template void fillBuffer(void* buffer, int64_t volume, uint8_t min, uint8_t max); +template void fillBuffer<__nv_fp8_e4m3>(void* buffer, int64_t volume, __nv_fp8_e4m3 min, __nv_fp8_e4m3 max); bool matchStringWithOneWildcard(std::string const& pattern, std::string const& target) { From 0e935b0b49edfdef8ef8737c4c27b36f8b9b7d87 Mon Sep 17 00:00:00 2001 From: Nuzhny007 Date: Sun, 19 Apr 2026 06:39:31 +0300 Subject: [PATCH 2/5] Change default params for very small objects --- example/MotionDetectorExample.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/example/MotionDetectorExample.h b/example/MotionDetectorExample.h index 7136b758..76afa05e 100644 --- a/example/MotionDetectorExample.h +++ b/example/MotionDetectorExample.h @@ -17,7 +17,7 @@ class MotionDetectorExample final : public VideoExample { public: MotionDetectorExample(const cv::CommandLineParser& parser) - : VideoExample(parser), m_minObjWidth(10) + : VideoExample(parser) { #ifdef USE_CLIP std::string clipModel = "C:/work/clip/ruclip_/CLIP/data/ruclip-vit-large-patch14-336"; @@ -38,8 +38,7 @@ class MotionDetectorExample final : public VideoExample { m_logger->info("MotionDetectorExample::InitDetector"); - //m_minObjWidth = frame.cols / 20; - m_minObjWidth = 4; + m_minObjWidth = 2; config_t config; config.emplace("useRotatedRect", "0"); @@ -97,7 +96,7 @@ class MotionDetectorExample final : public VideoExample if (!m_trackerSettingsLoaded) { - m_trackerSettings.SetDistance(tracking::DistJaccard); + m_trackerSettings.SetDistance(tracking::DistCenters); m_trackerSettings.m_kalmanType = tracking::KalmanLinear; m_trackerSettings.m_filterGoal = tracking::FilterCenter; m_trackerSettings.m_lostTrackType = tracking::TrackNone; // Use visual objects tracker for collisions resolving. Used if m_filterGoal == tracking::FilterRect From 6eb553291cd46989bb31c4d6be34575d32f1cfe1 Mon Sep 17 00:00:00 2001 From: Nuzhny007 Date: Wed, 29 Apr 2026 06:28:07 +0300 Subject: [PATCH 3/5] Fixed some warnings --- src/Detector/tensorrt_onnx/YoloONNXv26_bb.hpp | 2 +- src/Detector/tensorrt_onnx/YoloONNXv26_instance.hpp | 2 +- src/Detector/tensorrt_onnx/YoloONNXv26_obb.hpp | 2 +- src/Tracker/TrackerSettings.cpp | 8 ++++---- src/Tracker/byte_track/BYTETracker.cpp | 13 +++---------- 5 files changed, 10 insertions(+), 17 deletions(-) diff --git a/src/Detector/tensorrt_onnx/YoloONNXv26_bb.hpp b/src/Detector/tensorrt_onnx/YoloONNXv26_bb.hpp index 7677244b..19cdd67a 100644 --- a/src/Detector/tensorrt_onnx/YoloONNXv26_bb.hpp +++ b/src/Detector/tensorrt_onnx/YoloONNXv26_bb.hpp @@ -43,7 +43,7 @@ class YOLOv26_bb_onnx : public YoloONNX auto ind = i * m_outpuDims[0].d[2]; float classConf = output[ind + 4]; - int64_t classId = output[ind + 5]; + int classId = static_cast(output[ind + 5]); if (classConf >= m_params.m_confThreshold) { diff --git a/src/Detector/tensorrt_onnx/YoloONNXv26_instance.hpp b/src/Detector/tensorrt_onnx/YoloONNXv26_instance.hpp index abcb3b4f..9ec2d27e 100644 --- a/src/Detector/tensorrt_onnx/YoloONNXv26_instance.hpp +++ b/src/Detector/tensorrt_onnx/YoloONNXv26_instance.hpp @@ -65,7 +65,7 @@ class YOLOv26_instance_onnx : public YoloONNX size_t k = i * dimensions; float objectConf = output[k + 4]; - int classId = output[k + 5]; + int classId = static_cast(output[k + 5]); if (objectConf >= m_params.m_confThreshold) { diff --git a/src/Detector/tensorrt_onnx/YoloONNXv26_obb.hpp b/src/Detector/tensorrt_onnx/YoloONNXv26_obb.hpp index 0b70b509..8a097f2f 100644 --- a/src/Detector/tensorrt_onnx/YoloONNXv26_obb.hpp +++ b/src/Detector/tensorrt_onnx/YoloONNXv26_obb.hpp @@ -43,7 +43,7 @@ class YOLOv26_obb_onnx : public YoloONNX auto ind = i * m_outpuDims[0].d[2]; float classConf = output[ind + 4]; - int64_t classId = output[ind + 5]; + int classId = static_cast(output[ind + 5]); if (classConf >= m_params.m_confThreshold) { diff --git a/src/Tracker/TrackerSettings.cpp b/src/Tracker/TrackerSettings.cpp index e76dd72e..e06f2311 100644 --- a/src/Tracker/TrackerSettings.cpp +++ b/src/Tracker/TrackerSettings.cpp @@ -57,12 +57,12 @@ bool ParseTrackerSettings(const std::string& settingsFile, TrackerSettings& trac trackerSettings.m_useAbandonedDetection = reader.GetInteger("tracking", "detect_abandoned", 0) != 0; trackerSettings.m_minStaticTime = reader.GetInteger("tracking", "min_static_time", 5); trackerSettings.m_maxStaticTime = reader.GetInteger("tracking", "max_static_time", 25); - trackerSettings.m_maxSpeedForStatic = reader.GetInteger("tracking", "max_speed_for_static", 10); + trackerSettings.m_maxSpeedForStatic = static_cast(reader.GetReal("tracking", "max_speed_for_static", 0.5)); trackerSettings.m_byteTrackSettings.m_trackBuffer = reader.GetInteger("tracking", "bytetrack_track_buffer", 30); - trackerSettings.m_byteTrackSettings.m_trackThresh = reader.GetReal("tracking", "bytetrack_track_thresh", 0.5); - trackerSettings.m_byteTrackSettings.m_highThresh = reader.GetReal("tracking", "bytetrack_high_thresh", 0.5); - trackerSettings.m_byteTrackSettings.m_matchThresh = reader.GetReal("tracking", "bytetrack_match_thresh", 0.8); + trackerSettings.m_byteTrackSettings.m_trackThresh = static_cast(reader.GetReal("tracking", "bytetrack_track_thresh", 0.5)); + trackerSettings.m_byteTrackSettings.m_highThresh = static_cast(reader.GetReal("tracking", "bytetrack_high_thresh", 0.5)); + trackerSettings.m_byteTrackSettings.m_matchThresh = static_cast(reader.GetReal("tracking", "bytetrack_match_thresh", 0.8)); // Read detection settings trackerSettings.m_nnWeights = reader.GetString("detection", "nn_weights", "data/yolov4-tiny_best.weights"); diff --git a/src/Tracker/byte_track/BYTETracker.cpp b/src/Tracker/byte_track/BYTETracker.cpp index 8a63cf56..144490b1 100644 --- a/src/Tracker/byte_track/BYTETracker.cpp +++ b/src/Tracker/byte_track/BYTETracker.cpp @@ -310,8 +310,8 @@ void byte_track::BYTETracker::removeDuplicateStracks(const std::vector a_overlapping(a_stracks.size(), false), b_overlapping(b_stracks.size(), false); for (const auto &[a_idx, b_idx] : overlapping_combinations) { - const int timep = a_stracks[a_idx]->getFrameId() - a_stracks[a_idx]->getStartFrameId(); - const int timeq = b_stracks[b_idx]->getFrameId() - b_stracks[b_idx]->getStartFrameId(); + const size_t timep = a_stracks[a_idx]->getFrameId() - a_stracks[a_idx]->getStartFrameId(); + const size_t timeq = b_stracks[b_idx]->getFrameId() - b_stracks[b_idx]->getStartFrameId(); if (timep > timeq) b_overlapping[b_idx] = true; else @@ -359,16 +359,9 @@ void byte_track::BYTETracker::linearAssignment(const std::vector= 0) - { - std::vector match; - match.push_back(i); - match.push_back(rowsol[i]); - matches.push_back(match); - } + matches.push_back({ (int)i, rowsol[i] }); else - { a_unmatched.push_back(i); - } } for (size_t i = 0; i < colsol.size(); i++) From 2a5550fb92c23a35b3733c00399313e12eef7788 Mon Sep 17 00:00:00 2001 From: Nuzhny007 Date: Thu, 30 Apr 2026 19:11:18 +0300 Subject: [PATCH 4/5] More debug logs --- example/CarsCounting.cpp | 2 +- example/MotionDetectorExample.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/example/CarsCounting.cpp b/example/CarsCounting.cpp index cf801901..5ff8aca5 100644 --- a/example/CarsCounting.cpp +++ b/example/CarsCounting.cpp @@ -230,7 +230,7 @@ bool CarsCounting::InitTracker(cv::UMat frame) /// void CarsCounting::DrawData(cv::Mat frame, const std::vector& tracks, int framesCounter, int currTime) { - m_logger->info("Frame ({1}): tracks = {2}, time = {3}", framesCounter, tracks.size(), currTime); + m_logger->info("Frame {0} ({1}): tracks = {2}, time = {3}", framesCounter, m_framesCount, tracks.size(), currTime); #if 1 // Debug output if (!m_geoParams.Empty()) diff --git a/example/MotionDetectorExample.h b/example/MotionDetectorExample.h index 76afa05e..7319c407 100644 --- a/example/MotionDetectorExample.h +++ b/example/MotionDetectorExample.h @@ -140,7 +140,7 @@ class MotionDetectorExample final : public VideoExample /// void DrawData(cv::Mat frame, const std::vector& tracks, int framesCounter, int currTime) override { - m_logger->info("Frame ({0}): tracks = {1}, time = {2}", framesCounter, tracks.size(), currTime); + m_logger->info("Frame {0} ({1}): tracks = {2}, time = {3}", framesCounter, m_framesCount, tracks.size(), currTime); #ifdef USE_CLIP std::vector clipResult; From 08503429a4b250418f29f16dc6c982e40d169d65 Mon Sep 17 00:00:00 2001 From: Nuzhny007 Date: Thu, 30 Apr 2026 22:26:03 +0300 Subject: [PATCH 5/5] Build with CUDA 13 --- src/Detector/tensorrt_onnx/common/sampleDevice.cpp | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/Detector/tensorrt_onnx/common/sampleDevice.cpp b/src/Detector/tensorrt_onnx/common/sampleDevice.cpp index 7964aeb5..1e7ee17a 100644 --- a/src/Detector/tensorrt_onnx/common/sampleDevice.cpp +++ b/src/Detector/tensorrt_onnx/common/sampleDevice.cpp @@ -107,8 +107,17 @@ void setCudaDevice(int32_t device, std::ostream& os) os << "Shared Memory per SM: " << (properties.sharedMemPerMultiprocessor >> 10) << " KiB" << std::endl; os << "Memory Bus Width: " << properties.memoryBusWidth << " bits" << " (ECC " << (properties.ECCEnabled != 0 ? "enabled" : "disabled") << ")" << std::endl; +#if (CUDA_VERSION < 13000) os << "Application Compute Clock Rate: " << properties.clockRate / 1000000.0F << " GHz" << std::endl; os << "Application Memory Clock Rate: " << properties.memoryClockRate / 1000000.0F << " GHz" << std::endl; +#else + int clockRateKHz = 0; + cudaDeviceGetAttribute(&clockRateKHz, cudaDevAttrClockRate, device); + int memoryClockRateKHz = 0; + cudaDeviceGetAttribute(&memoryClockRateKHz, cudaDevAttrMemoryClockRate, device); + os << "Application Compute Clock Rate: " << clockRateKHz / 1000000.0F << " GHz" << std::endl; + os << "Application Memory Clock Rate: " << memoryClockRateKHz / 1000000.0F << " GHz" << std::endl; +#endif os << std::endl; os << "Note: The application clock rates do not reflect the actual clock rates that the GPU is " << "currently running at." << std::endl;