Pandinosaurus · pull · Aug 9, 2024 · Aug 11, 2024 · Aug 11, 2024 · Oct 2, 2024
diff --git a/README.md b/README.md
@@ -5,6 +5,7 @@
 [![CodeQL](https://github.com/Smorodov/Multitarget-tracker/actions/workflows/codeql-analysis.yml/badge.svg)](https://github.com/Smorodov/Multitarget-tracker/actions/workflows/codeql-analysis.yml)
 
 ## Latest Features
+- Instance segmentation model from RF-DETR detector works with TensorRT! Export pre-trained PyTorch models [here (roboflow/rf-detr)](https://github.com/roboflow/rf-detr) to ONNX format and run Multitarget-tracker with `-e=6` example
 - New linear assignment algorithm - [Jonker-Volgenant / LAPJV algorithm](https://github.com/yongyanghz/LAPJV-algorithm-c) used in [scipy](https://docs.scipy.org/doc/scipy/reference/generated/scipy.optimize.linear_sum_assignment.html) as alternative for Hungarian allgorithm
 - D-FINE detector works with TensorRT! Export pre-trained PyTorch models [here (Peterande/D-FINE)](https://github.com/Peterande/D-FINE) to ONNX format and run Multitarget-tracker with `-e=6` example
 - RF-DETR detector works with TensorRT! Export pre-trained PyTorch models [here (roboflow/rf-detr)](https://github.com/roboflow/rf-detr) to ONNX format and run Multitarget-tracker with `-e=6` example
@@ -20,6 +21,8 @@
 ## Demo Videos
 
 ### Detection & Tracking
+
+[![RF-DETR: detection vs instance segmentation](https://img.youtube.com/vi/oKy7jEKT83c/0.jpg)](https://youtu.be/oKy7jEKT83c)
 [![Satellite planes detection and tracking with YOLOv11-obb](https://img.youtube.com/vi/gTpWnkMF7Lg/0.jpg)](https://youtu.be/gTpWnkMF7Lg)
 [![4-in-1 latest SOTA detectors](https://img.youtube.com/vi/Pb_HnejRpY4/0.jpg)](https://youtu.be/Pb_HnejRpY4)
 [![YOLOv8-obb detection with rotated boxes](https://img.youtube.com/vi/1e6ur57Fhzs/0.jpg)](https://youtu.be/1e6ur57Fhzs)

diff --git a/data/settings_rfdetr_seg.ini b/data/settings_rfdetr_seg.ini
@@ -0,0 +1,142 @@
+[detection]
+
+#-----------------------------
+# opencv_dnn = 12
+# darknet_cudnn = 10
+# tensorrt = 11
+detector_backend = 11
+
+#-----------------------------
+# Target and backend for opencv_dnn detector
+# DNN_TARGET_CPU
+# DNN_TARGET_OPENCL
+# DNN_TARGET_OPENCL_FP16
+# DNN_TARGET_MYRIAD
+# DNN_TARGET_CUDA
+# DNN_TARGET_CUDA_FP16
+ocv_dnn_target = DNN_TARGET_CPU
+
+# DNN_BACKEND_DEFAULT
+# DNN_BACKEND_HALIDE
+# DNN_BACKEND_INFERENCE_ENGINE
+# DNN_BACKEND_OPENCV
+# DNN_BACKEND_VKCOM
+# DNN_BACKEND_CUDA
+# DNN_BACKEND_INFERENCE_ENGINE_NGRAPH
+# DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019
+ocv_dnn_backend = DNN_BACKEND_OPENCV
+
+#-----------------------------
+nn_weights = C:/work/home/mtracker/Multitarget-tracker/data/coco/rfdetr_seg_coco.onnx
+nn_config = C:/work/home/mtracker/Multitarget-tracker/data/coco/rfdetr_seg_coco.onnx
+class_names = C:/work/home/mtracker/Multitarget-tracker/data/coco/coco_91.names
+
+#-----------------------------
+confidence_threshold = 0.5
+
+max_crop_ratio = 0
+max_batch = 1
+gpu_id = 0
+
+#-----------------------------
+# YOLOV3 
+# YOLOV4 
+# YOLOV5 
+net_type = RFDETR_IS
+
+#-----------------------------
+# INT8
+# FP16
+# FP32
+inference_precision = FP16
+
+
+[tracking]
+
+#-----------------------------
+# DistCenters = 0   // Euclidean distance between centers, pixels
+# DistRects = 1     // Euclidean distance between bounding rectangles, pixels
+# DistJaccard = 2   // Intersection over Union, IoU, [0, 1]
+# DistHist = 3      // Bhattacharyya distance between histograms, [0, 1]
+
+distance_type = 0
+
+#-----------------------------
+# KalmanLinear = 0
+# KalmanUnscented = 1
+
+kalman_type = 0
+
+#-----------------------------
+# FilterCenter = 0
+# FilterRect = 1
+# FilterRRect = 2
+
+filter_goal = 0
+
+#-----------------------------
+# TrackNone = 0
+# TrackKCF = 1
+# TrackMIL = 2
+# TrackMedianFlow = 3
+# TrackGOTURN = 4
+# TrackMOSSE = 5
+# TrackCSRT = 6
+# TrackDAT = 7
+# TrackSTAPLE = 8
+# TrackLDES = 9
+# TrackDaSiamRPN = 10
+# Used if filter_goal == FilterRect
+
+lost_track_type = 0
+
+#-----------------------------
+# MatchHungrian = 0
+# MatchBipart = 1
+
+match_type = 0
+
+#-----------------------------
+# Use constant acceleration motion model:
+# 0 - unused (stable)
+# 1 - use acceleration in Kalman filter (experimental)
+use_aceleration = 0
+
+#-----------------------------
+# Delta time for Kalman filter
+delta_time = 0.4
+
+#-----------------------------
+# Accel noise magnitude for Kalman filter
+accel_noise = 0.2
+
+#-----------------------------
+# Distance threshold between region and object on two frames
+dist_thresh = 0.8 
+
+#-----------------------------
+# If this value > 0 then a circle with this radius will be used
+# If this value <= 0 then an ellipse with size (3*vx, 3*vy) will be used, where vx and vy are the horizontal and vertical speed in pixels
+min_area_radius_pix = -1
+
+#-----------------------------
+# Minimal area radius as a ratio of the object size. Used if min_area_radius_pix < 0
+min_area_radius_k = 0.8
+
+#-----------------------------
+# If the object is not assigned for more than this many seconds, it will be removed
+max_lost_time = 2
+
+#-----------------------------
+# The maximum trajectory length
+max_trace_len = 2
+
+#-----------------------------
+# Detection of abandoned objects
+detect_abandoned = 0
+# After this time (in seconds) the object is considered abandoned
+min_static_time = 5
+# After this time (in seconds) the abandoned object will be removed
+max_static_time = 25
+# Speed in pixels. If the speed of the object is more than this value, the object is considered non-static
+max_speed_for_static = 10
diff --git a/src/Detector/BaseDetector.h b/src/Detector/BaseDetector.h
@@ -167,17 +167,25 @@ class BaseDetector
         cv::Mat foreground(m_motionMap.size(), CV_8UC1, cv::Scalar(0, 0, 0));
         for (const auto& region : m_regions)
         {
+            if (region.m_boxMask.empty())
+            {
 #if (CV_VERSION_MAJOR < 4)
-            cv::ellipse(foreground, region.m_rrect, cv::Scalar(255, 255, 255), CV_FILLED);
+                cv::ellipse(foreground, region.m_rrect, cv::Scalar(255, 255, 255), CV_FILLED);
 #else
-            cv::ellipse(foreground, region.m_rrect, cv::Scalar(255, 255, 255), cv::FILLED);
+                cv::ellipse(foreground, region.m_rrect, cv::Scalar(255, 255, 255), cv::FILLED);
 #endif
+            }
+            else
+            {
+                cv::Rect brect = Clamp(cv::Rect(region.m_brect.x, region.m_brect.y, region.m_boxMask.cols, region.m_boxMask.rows), foreground.size());
+                region.m_boxMask.copyTo(foreground(brect));
+            }
         }
         if (!m_ignoreMask.empty())
             cv::bitwise_and(foreground, m_ignoreMask, foreground);
         cv::normalize(foreground, m_normFor, 255, 0, cv::NORM_MINMAX, m_motionMap.type());
 
-        double alpha = 0.95;
+        double alpha = 0.9;
         cv::addWeighted(m_motionMap, alpha, m_normFor, 1 - alpha, 0, m_motionMap);
 
         const int chans = frame.channels();

diff --git a/src/Detector/OCVDNNDetector.cpp b/src/Detector/OCVDNNDetector.cpp
@@ -169,6 +169,7 @@ bool OCVDNNDetector::Init(const config_t& config)
         dictNetType["YOLOV11Mask"] = ModelType::YOLOV11Mask;
         dictNetType["YOLOV12"] = ModelType::YOLOV12;
         dictNetType["RFDETR"] = ModelType::RFDETR;
+        dictNetType["RFDETR_IS"] = ModelType::RFDETR_IS;
         dictNetType["DFINE"] = ModelType::DFINE;
 
         auto netType = dictNetType.find(net_type->second);
@@ -414,6 +415,10 @@ void OCVDNNDetector::DetectInCrop(const cv::UMat& colorFrame, const cv::Rect& cr
         ParseRFDETR(crop, detections, tmpRegions);
         break;
 
+    case ModelType::RFDETR_IS:
+        ParseRFDETR_IS(crop, detections, tmpRegions);
+        break;
+
     case ModelType::DFINE:
         ParseDFINE(crop, detections, tmpRegions);
         break;
@@ -934,6 +939,70 @@ void OCVDNNDetector::ParseRFDETR(const cv::Rect& crop, std::vector<cv::Mat>& det
     }
 }
 
+///
+/// \brief OCVDNNDetector::ParseRFDETR_IS Turns RF-DETR (instance segmentation variant) outputs into bounding-box regions; only detections[0] and detections[1] are read here
+/// \param crop Rectangle whose width/height scale the boxes relative to m_inWidth/m_inHeight and whose x/y are added to every box origin
+/// \param detections Network outputs: [0] is read as boxes (4 floats used per row), [1] as raw per-class scores for the same rows
+/// \param tmpRegions Output container; every accepted detection is appended to it
+///
+void OCVDNNDetector::ParseRFDETR_IS(const cv::Rect& crop, std::vector<cv::Mat>& detections, regions_t& tmpRegions)
+{
+    int rows = detections[0].size[1]; // number of candidate rows to scan
+    int dimensionsDets = detections[0].size[2]; // stride (in floats) between consecutive box rows
+    int dimensionsLabels = detections[1].size[2]; // stride (in floats) between consecutive score rows == scores per row
+
+    // Model tensors with example shapes: 0: name: input, size : 1x3x560x560
+    // 1: name: dets, size : 1x300x4 (read below as detections[0])
+    // 2: name: labels, size : 1x300x91 (read below as detections[1])
+
+    float* dets = (float*)detections[0].data; // assumes float elements - TODO confirm the blob depth is CV_32F
+    float* labels = (float*)detections[1].data;
+
+    float x_factor = crop.width / static_cast<float>(m_inWidth); // horizontal ratio crop size / network input size
+    float y_factor = crop.height / static_cast<float>(m_inHeight); // vertical ratio crop size / network input size
+
+    auto L2Conf = [](float v) // logistic sigmoid: maps a raw score to a value in (0, 1)
+    {
+        return 1.f / (1.f + std::exp(-v));
+    };
+
+    for (int i = 0; i < rows; ++i)
+    {
+        float maxClassScore = L2Conf(labels[0]); // start the arg-max search from score index 0
+        size_t classId = 0;
+        for (size_t cli = 1; cli < static_cast<size_t>(dimensionsLabels); ++cli)
+        {
+            auto conf = L2Conf(labels[cli]);
+            if (maxClassScore < conf)
+            {
+                maxClassScore = conf;
+                classId = cli;
+            }
+        }
+        if (classId > 0) // NOTE(review): shifts every non-zero index down by one - presumably score index 0 is a background/placeholder slot; confirm against the class names file
+            --classId;
+
+        if (maxClassScore > m_confidenceThreshold)
+        {
+            float x = dets[0]; // x, y are used as the box center (see left/top below)
+            float y = dets[1];
+            float w = dets[2]; // w, h are used as the full box width and height
+            float h = dets[3];
+
+            int left = cvRound((x - 0.5f * w) * x_factor); // center/size -> top-left corner, then scale
+            int top = cvRound((y - 0.5f * h) * y_factor);
+
+            int width = cvRound(w * x_factor);
+            int height = cvRound(h * y_factor);
+
+            if (m_classesWhiteList.empty() || m_classesWhiteList.find(T2T(classId)) != std::end(m_classesWhiteList)) // an empty white list accepts every class
+                tmpRegions.emplace_back(cv::Rect(left + crop.x, top + crop.y, width, height), T2T(classId), static_cast<float>(maxClassScore)); // box is offset by the crop origin
+        }
+        dets += dimensionsDets; // advance both pointers to the next candidate row
+        labels += dimensionsLabels;
+    }
+}
+
 ///
 /// \brief OCVDNNDetector::ParseDFINE
 /// \param crop

diff --git a/src/Detector/OCVDNNDetector.h b/src/Detector/OCVDNNDetector.h
@@ -50,6 +50,7 @@ class OCVDNNDetector final : public BaseDetector
         YOLOV11Mask,
         YOLOV12,
         RFDETR,
+        RFDETR_IS,
         DFINE
     };
 
@@ -85,6 +86,7 @@ class OCVDNNDetector final : public BaseDetector
     void ParseYOLOv5_8_11_obb(const cv::Rect& crop, std::vector<cv::Mat>& detections, regions_t& tmpRegions);
     void ParseYOLOv5_8_11_seg(const cv::Rect& crop, std::vector<cv::Mat>& detections, regions_t& tmpRegions);
     void ParseRFDETR(const cv::Rect& crop, std::vector<cv::Mat>& detections, regions_t& tmpRegions);
+    void ParseRFDETR_IS(const cv::Rect& crop, std::vector<cv::Mat>& detections, regions_t& tmpRegions);
     void ParseDFINE(const cv::Rect& crop, std::vector<cv::Mat>& detections, regions_t& tmpRegions);
 };
 

diff --git a/src/Detector/YoloTensorRTDetector.cpp b/src/Detector/YoloTensorRTDetector.cpp
@@ -112,6 +112,7 @@ bool YoloTensorRTDetector::Init(const config_t& config)
 		dictNetType["YOLOV11Mask"] = tensor_rt::YOLOV11Mask;
 		dictNetType["YOLOV12"] = tensor_rt::YOLOV12;
 		dictNetType["RFDETR"] = tensor_rt::RFDETR;
+		dictNetType["RFDETR_IS"] = tensor_rt::RFDETR_IS;
 		dictNetType["DFINE"] = tensor_rt::DFINE;
 
 		auto netType = dictNetType.find(net_type->second);