fix: add roi overlapping segment

badai-nguyen · badai-nguyen · commit aede063edf50 · 2023-11-20T11:21:09.000+09:00
Signed-off-by: badai-nguyen <dai.nguyen@tier4.jp>
diff --git a/perception/tensorrt_yolox/include/tensorrt_yolox/tensorrt_yolox.hpp b/perception/tensorrt_yolox/include/tensorrt_yolox/tensorrt_yolox.hpp
@@ -78,6 +78,7 @@ class TrtYoloX
    * @param[in] build_config configuration including precision, calibration method, DLA, remaining
    * fp16 for first layer,  remaining fp16 for last layer and profiler for builder
    * @param[in] use_gpu_preprocess whether use cuda gpu for preprocessing
+   * @param[in] publish_color_mask whether to publish color_mask for debugging and visualization
    * @param[in] calibration_image_list_file path for calibration files (only require for
    * quantization)
    * @param[in] norm_factor scaling factor for preprocess
@@ -90,8 +91,9 @@ class TrtYoloX
     const std::string & color_map_path, const int num_class = 8, const float score_threshold = 0.3,
     const float nms_threshold = 0.7,
     const tensorrt_common::BuildConfig build_config = tensorrt_common::BuildConfig(),
-    const bool use_gpu_preprocess = false, std::string calibration_image_list_file = std::string(),
-    const double norm_factor = 1.0, [[maybe_unused]] const std::string & cache_dir = "",
+    const bool use_gpu_preprocess = false, const bool publish_color_mask = false,
+    std::string calibration_image_list_file = std::string(), const double norm_factor = 1.0,
+    [[maybe_unused]] const std::string & cache_dir = "",
     const tensorrt_common::BatchConfig & batch_config = {1, 1, 1},
     const size_t max_workspace_size = (1 << 30));
   /**
@@ -105,8 +107,8 @@ class TrtYoloX
    * @param[in] images batched images
    */
   bool doInference(
-    const std::vector<cv::Mat> & images, ObjectArrays & objects, cv::Mat & mask,
-    cv::Mat & color_mask);
+    const std::vector<cv::Mat> & images, ObjectArrays & objects, std::vector<cv::Mat> & masks,
+    std::vector<cv::Mat> & color_masks);
 
   /**
    * @brief run inference including pre-process and post-process
@@ -201,8 +203,8 @@ class TrtYoloX
 
   bool feedforward(const std::vector<cv::Mat> & images, ObjectArrays & objects);
   bool feedforwardAndDecode(
-    const std::vector<cv::Mat> & images, ObjectArrays & objects, cv::Mat & mask,
-    cv::Mat & color_mask);
+    const std::vector<cv::Mat> & images, ObjectArrays & objects, std::vector<cv::Mat> & masks,
+    std::vector<cv::Mat> & color_masks);
   void decodeOutputs(float * prob, ObjectArray & objects, float scale, cv::Size & img_size) const;
   void generateGridsAndStride(
     const int target_w, const int target_h, const std::vector<int> & strides,
@@ -307,7 +309,11 @@ class TrtYoloX
   CudaUniquePtrHost<unsigned char[]> argmax_buf_h_;
   // device buffer for argmax postprocessing  on GPU
   CudaUniquePtr<unsigned char[]> argmax_buf_d_;
-  std::vector<tensorrt_yolox::Colormap> color_map_;
+  std::vector<tensorrt_yolox::Colormap> sematic_color_map_;
+  // flag whether to overlay the segmentation mask with roi classes
+  bool roi_overlap_segment_;
+  // flag whether to publish the color mask for debugging and visualization
+  bool publish_color_mask_;
 };
 
 }  // namespace tensorrt_yolox
diff --git a/perception/tensorrt_yolox/include/tensorrt_yolox/tensorrt_yolox_node.hpp b/perception/tensorrt_yolox/include/tensorrt_yolox/tensorrt_yolox_node.hpp
@@ -51,7 +51,8 @@ class TrtYoloXNode : public rclcpp::Node
   void onImage(const sensor_msgs::msg::Image::ConstSharedPtr msg);
   bool readLabelFile(const std::string & label_path);
   void replaceLabelMap();
-
+  void overlapSegmentByRoi(const tensorrt_yolox::Object & object, cv::Mat & mask);
+  int mapRoiLabel2SegLabel(const int32_t roi_label_index);
   image_transport::Publisher image_pub_;
   image_transport::Publisher mask_pub_;
   image_transport::Publisher color_mask_pub_;
@@ -64,6 +65,9 @@ class TrtYoloXNode : public rclcpp::Node
 
   LabelMap label_map_;
   std::unique_ptr<tensorrt_yolox::TrtYoloX> trt_yolox_;
+  bool is_roi_overlap_segment_;
+  bool is_publish_color_mask_;
+  float overlap_roi_score_threshold_;
 };
 
 }  // namespace tensorrt_yolox
diff --git a/perception/tensorrt_yolox/launch/yolox_s_plus_opt.launch.xml b/perception/tensorrt_yolox/launch/yolox_s_plus_opt.launch.xml
@@ -32,7 +32,9 @@
   <arg name="calibration_image_list_path" default="" description="Path to a file which contains path to images. Those images will be used for int8 quantization."/>
   <arg name="use_decompress" default="true" description="use image decompress"/>
   <arg name="build_only" default="false" description="exit after trt engine is built"/>
-
+  <arg name="is_roi_overlap_segment" default="true" description="refine the segmentation mask by overlaying roi classes; disable when semantic segmentation accuracy is good enough"/>
+  <arg name="is_publish_color_mask" default="false" description="publish color mask for result visualization"/>
+  <arg name="overlap_roi_score_threshold" default="0.3" description="minimum roi object existence probability required to replace the segmentation mask with the roi class"/>
   <node pkg="image_transport_decompressor" exec="image_transport_decompressor_node" name="image_transport_decompressor_node" if="$(var use_decompress)">
     <remap from="~/input/compressed_image" to="$(var input/image)/compressed"/>
     <remap from="~/output/raw_image" to="$(var input/image)"/>
@@ -56,6 +58,9 @@
     <param name="preprocess_on_gpu" value="$(var preprocess_on_gpu)"/>
     <param name="calibration_image_list_path" value="$(var calibration_image_list_path)"/>
     <param name="build_only" value="$(var build_only)"/>
-    <param name="color_map_path" value="$(var model_path)/bdd100k_semseg.csv"/>
+    <param name="color_map_path" value="$(var model_path)/semseg_color_map.csv"/>
+    <param name="is_roi_overlap_segment" value="$(var is_roi_overlap_segment)"/>
+    <param name="is_publish_color_mask" value="$(var is_publish_color_mask)"/>
+    <param name="overlap_roi_score_threshold" value="$(var overlap_roi_score_threshold)"/>
   </node>
 </launch>
diff --git a/perception/tensorrt_yolox/src/tensorrt_yolox.cpp b/perception/tensorrt_yolox/src/tensorrt_yolox.cpp
@@ -109,7 +109,7 @@ std::vector<tensorrt_yolox::Colormap> get_seg_colormap(const std::string & filen
   std::vector<tensorrt_yolox::Colormap> seg_cmap;
   if (filename != "not-specified") {
     std::vector<std::string> color_list = loadListFromTextFile(filename);
-    for (int i = 0; i < (int)color_list.size(); i++) {
+    for (int i = 0; i < static_cast<int>(color_list.size()); i++) {
       if (i == 0) {
         // Skip header
         continue;
@@ -120,7 +120,7 @@ std::vector<tensorrt_yolox::Colormap> get_seg_colormap(const std::string & filen
       size_t npos = colormapString.find_first_of(',');
       assert(npos != std::string::npos);
       std::string substr = colormapString.substr(0, npos);
-      int id = (int)std::stoi(trim(substr));
+      int id = static_cast<int>(std::stoi(trim(substr)));
       colormapString.erase(0, npos + 1);
 
       npos = colormapString.find_first_of(',');
@@ -157,7 +157,7 @@ namespace tensorrt_yolox
 TrtYoloX::TrtYoloX(
   const std::string & model_path, const std::string & precision, const std::string & color_map_path,
   const int num_class, const float score_threshold, const float nms_threshold,
-  tensorrt_common::BuildConfig build_config, const bool use_gpu_preprocess,
+  tensorrt_common::BuildConfig build_config, const bool use_gpu_preprocess, bool publish_color_mask,
   std::string calibration_image_list_path, const double norm_factor,
   [[maybe_unused]] const std::string & cache_dir, const tensorrt_common::BatchConfig & batch_config,
   const size_t max_workspace_size)
@@ -167,7 +167,8 @@ TrtYoloX::TrtYoloX(
   norm_factor_ = norm_factor;
   batch_size_ = batch_config[2];
   multitask_ = 0;
-  color_map_ = get_seg_colormap(color_map_path);
+  sematic_color_map_ = get_seg_colormap(color_map_path);
+  publish_color_mask_ = publish_color_mask;
   if (precision == "int8") {
     if (build_config.clip_value <= 0.0) {
       if (calibration_image_list_path.empty()) {
@@ -388,13 +389,14 @@ void TrtYoloX::initPreprocessBuffer(int width, int height)
       for (int m = 0; m < multitask_; m++) {
         const auto output_dims =
           trt_common_->getBindingDimensions(m + 2);  // 0 : input, 1 : output for detections
-        const float scale =
-          std::min(output_dims.d[3] / float(width), output_dims.d[2] / float(height));
-        int out_w = (int)(width * scale);
-        int out_h = (int)(height * scale);
-        //	size_t out_elem_num = std::accumulate(
+        const float scale = std::min(
+          output_dims.d[3] / static_cast<float>(width),
+          output_dims.d[2] / static_cast<float>(height));
+        int out_w = static_cast<int>(width * scale);
+        int out_h = static_cast<int>(height * scale);
+        // size_t out_elem_num = std::accumulate(
         // output_dims.d + 1, output_dims.d + output_dims.nbDims, 1, std::multiplies<int>());
-        //	out_elem_num = out_elem_num * batch_size_;
+        // out_elem_num = out_elem_num * batch_size_;
         size_t out_elem_num = out_w * out_h * batch_size_;
         argmax_out_elem_num += out_elem_num;
       }
@@ -468,8 +470,9 @@ void TrtYoloX::preprocessGpu(const std::vector<cv::Mat> & images)
       for (int m = 0; m < multitask_; m++) {
         const auto output_dims =
           trt_common_->getBindingDimensions(m + 2);  // 0: input, 1: output for detections
-        const float scale =
-          std::min(output_dims.d[3] / float(image.cols), output_dims.d[2] / float(image.rows));
+        const float scale = std::min(
+          output_dims.d[3] / static_cast<float>(image.cols),
+          output_dims.d[2] / static_cast<float>(image.rows));
         int out_w = static_cast<int>(image.cols * scale);
         int out_h = static_cast<int>(image.rows * scale);
         argmax_out_elem_num += out_w * out_h * batch_size;
@@ -545,8 +548,8 @@ void TrtYoloX::preprocess(const std::vector<cv::Mat> & images)
 }
 
 bool TrtYoloX::doInference(
-  const std::vector<cv::Mat> & images, ObjectArrays & objects, cv::Mat & mask,
-  [[maybe_unused]] cv::Mat & color_mask)
+  const std::vector<cv::Mat> & images, ObjectArrays & objects, std::vector<cv::Mat> & masks,
+  [[maybe_unused]] std::vector<cv::Mat> & color_masks)
 {
   if (!trt_common_->isInitialized()) {
     return false;
@@ -559,7 +562,7 @@ bool TrtYoloX::doInference(
   }
 
   if (needs_output_decode_) {
-    return feedforwardAndDecode(images, objects, mask, color_mask);
+    return feedforwardAndDecode(images, objects, masks, color_masks);
   } else {
     return feedforward(images, objects);
   }
@@ -799,8 +802,8 @@ void TrtYoloX::multiScalePreprocess(const cv::Mat & image, const std::vector<cv:
 bool TrtYoloX::doInferenceWithRoi(
   const std::vector<cv::Mat> & images, ObjectArrays & objects, const std::vector<cv::Rect> & rois)
 {
-  cv::Mat mask;
-  cv::Mat color_mask;
+  std::vector<cv::Mat> masks;
+  std::vector<cv::Mat> color_masks;
   if (!trt_common_->isInitialized()) {
     return false;
   }
@@ -811,7 +814,7 @@ bool TrtYoloX::doInferenceWithRoi(
   }
 
   if (needs_output_decode_) {
-    return feedforwardAndDecode(images, objects, mask, color_mask);
+    return feedforwardAndDecode(images, objects, masks, color_masks);
   } else {
     return feedforward(images, objects);
   }
@@ -890,8 +893,8 @@ bool TrtYoloX::feedforward(const std::vector<cv::Mat> & images, ObjectArrays & o
 }
 
 bool TrtYoloX::feedforwardAndDecode(
-  const std::vector<cv::Mat> & images, ObjectArrays & objects, cv::Mat & out_mask,
-  [[maybe_unused]] cv::Mat & color_mask)
+  const std::vector<cv::Mat> & images, ObjectArrays & objects, std::vector<cv::Mat> & out_masks,
+  [[maybe_unused]] std::vector<cv::Mat> & color_masks)
 {
   std::vector<void *> buffers = {input_d_.get(), out_prob_d_.get()};
   if (multitask_) {
@@ -914,26 +917,31 @@ bool TrtYoloX::feedforwardAndDecode(
 
   for (size_t i = 0; i < batch_size; ++i) {
     auto image_size = images[i].size();
+    auto & out_mask = out_masks[i];
+    auto & color_mask = color_masks[i];
     float * batch_prob = out_prob_h_.get() + (i * out_elem_num_per_batch_);
     ObjectArray object_array;
     decodeOutputs(batch_prob, object_array, scales_[i], image_size);
+    // add refine mask using object
     objects.emplace_back(object_array);
     if (multitask_) {
       segmentation_masks_.clear();
       float * segmentation_results =
         segmentation_out_prob_h_.get() + (i * segmentation_out_elem_num_per_batch_);
       size_t counter = 0;
-      int batch = (int)(segmentation_out_elem_num_ / segmentation_out_elem_num_per_batch_);
+      int batch =
+        static_cast<int>(segmentation_out_elem_num_ / segmentation_out_elem_num_per_batch_);
       for (int m = 0; m < multitask_; m++) {
         const auto output_dims =
           trt_common_->getBindingDimensions(m + 2);  // 0 : input, 1 : output for detections
         size_t out_elem_num = std::accumulate(
           output_dims.d + 1, output_dims.d + output_dims.nbDims, 1, std::multiplies<int>());
         out_elem_num = out_elem_num * batch;
         const float scale = std::min(
-          output_dims.d[3] / float(image_size.width), output_dims.d[2] / float(image_size.height));
-        int out_w = (int)(image_size.width * scale);
-        int out_h = (int)(image_size.height * scale);
+          output_dims.d[3] / static_cast<float>(image_size.width),
+          output_dims.d[2] / static_cast<float>(image_size.height));
+        int out_w = static_cast<int>(image_size.width * scale);
+        int out_h = static_cast<int>(image_size.height * scale);
         cv::Mat mask;
         if (use_gpu_preprocess_) {
           float * d_segmentation_results =
@@ -945,8 +953,16 @@ bool TrtYoloX::feedforwardAndDecode(
         segmentation_masks_.push_back(mask);
         counter += out_elem_num;
       }
-      out_mask = segmentation_masks_.at(0);
-      color_mask = getColorizedMask(0, color_map_);
+    } else {
+      continue;
+    }
+    // Assume semantic segmentation is the first task
+    // This should be removed when the segmentation accuracy is high
+    out_mask = segmentation_masks_.at(0);
+
+    // publish color mask for visualization
+    if (publish_color_mask_) {
+      color_mask = getColorizedMask(0, sematic_color_map_);
     }
   }
   return true;
diff --git a/perception/tensorrt_yolox/src/tensorrt_yolox_node.cpp b/perception/tensorrt_yolox/src/tensorrt_yolox_node.cpp
@@ -94,6 +94,10 @@ TrtYoloXNode::TrtYoloXNode(const rclcpp::NodeOptions & node_options)
     RCLCPP_ERROR(this->get_logger(), "Could not find label file");
     rclcpp::shutdown();
   }
+
+  is_roi_overlap_segment_ = declare_parameter<bool>("is_roi_overlap_segment");
+  is_publish_color_mask_ = declare_parameter<bool>("is_publish_color_mask");
+  overlap_roi_score_threshold_ = declare_parameter<float>("overlap_roi_score_threshold");
   replaceLabelMap();
 
   tensorrt_common::BuildConfig build_config(
@@ -102,7 +106,7 @@ TrtYoloXNode::TrtYoloXNode(const rclcpp::NodeOptions & node_options)
 
   trt_yolox_ = std::make_unique<tensorrt_yolox::TrtYoloX>(
     model_path, precision, color_map_path, label_map_.size(), score_threshold, nms_threshold,
-    build_config, preprocess_on_gpu, calibration_image_list_path);
+    build_config, preprocess_on_gpu, is_publish_color_mask_, calibration_image_list_path);
 
   timer_ =
     rclcpp::create_timer(this, get_clock(), 100ms, std::bind(&TrtYoloXNode::onConnect, this));
@@ -150,13 +154,19 @@ void TrtYoloXNode::onImage(const sensor_msgs::msg::Image::ConstSharedPtr msg)
   const auto height = in_image_ptr->image.rows;
 
   tensorrt_yolox::ObjectArrays objects;
-  cv::Mat mask(cv::Size(height, width), CV_8UC1, cv::Scalar(0));
-  cv::Mat color_mask(cv::Size(height, width), CV_8UC3, cv::Scalar(0, 0, 0));
+  std::vector<cv::Mat> masks = {cv::Mat(cv::Size(height, width), CV_8UC1, cv::Scalar(0))};
+  std::vector<cv::Mat> color_masks = {
+    cv::Mat(cv::Size(height, width), CV_8UC3, cv::Scalar(0, 0, 0))};
 
-  if (!trt_yolox_->doInference({in_image_ptr->image}, objects, mask, color_mask)) {
+  if (!trt_yolox_->doInference({in_image_ptr->image}, objects, masks, color_masks)) {
     RCLCPP_WARN(this->get_logger(), "Fail to inference");
     return;
   }
+  auto & mask = masks.at(0);
+  cv::resize(
+    mask, mask, cv::Size(in_image_ptr->image.cols, in_image_ptr->image.rows), 0, 0,
+    cv::INTER_NEAREST);
+
   for (const auto & yolox_object : objects.at(0)) {
     tier4_perception_msgs::msg::DetectedObjectWithFeature object;
     object.feature.roi.x_offset = yolox_object.x_offset;
@@ -176,29 +186,32 @@ void TrtYoloXNode::onImage(const sensor_msgs::msg::Image::ConstSharedPtr msg)
     cv::rectangle(
       in_image_ptr->image, cv::Point(left, top), cv::Point(right, bottom), cv::Scalar(0, 0, 255), 3,
       8, 0);
+    // Refine mask: replacing segmentation mask by roi class
+    if (is_roi_overlap_segment_) {
+      overlapSegmentByRoi(yolox_object, mask);
+    }
   }
-  cv::resize(
-    mask, mask, cv::Size(in_image_ptr->image.cols, in_image_ptr->image.rows), 0, 0,
-    cv::INTER_NEAREST);
   sensor_msgs::msg::Image::SharedPtr out_mask_msg =
     cv_bridge::CvImage(std_msgs::msg::Header(), sensor_msgs::image_encodings::MONO8, mask)
       .toImageMsg();
   out_mask_msg->header = msg->header;
   mask_pub_.publish(out_mask_msg);
 
-  cv::resize(
-    color_mask, color_mask, cv::Size(in_image_ptr->image.cols, in_image_ptr->image.rows), 0, 0,
-    cv::INTER_NEAREST);
-  sensor_msgs::msg::Image::SharedPtr output_color_mask_msg =
-    cv_bridge::CvImage(std_msgs::msg::Header(), sensor_msgs::image_encodings::BGR8, color_mask)
-      .toImageMsg();
-  output_color_mask_msg->header = msg->header;
-  color_mask_pub_.publish(output_color_mask_msg);
-
   image_pub_.publish(in_image_ptr->toImageMsg());
-
   out_objects.header = msg->header;
   objects_pub_->publish(out_objects);
+
+  if (is_publish_color_mask_) {
+    auto & color_mask = color_masks.at(0);
+    cv::resize(
+      color_mask, color_mask, cv::Size(in_image_ptr->image.cols, in_image_ptr->image.rows), 0, 0,
+      cv::INTER_NEAREST);
+    sensor_msgs::msg::Image::SharedPtr output_color_mask_msg =
+      cv_bridge::CvImage(std_msgs::msg::Header(), sensor_msgs::image_encodings::BGR8, color_mask)
+        .toImageMsg();
+    output_color_mask_msg->header = msg->header;
+    color_mask_pub_.publish(output_color_mask_msg);
+  }
 }
 
 bool TrtYoloXNode::readLabelFile(const std::string & label_path)
@@ -235,6 +248,61 @@ void TrtYoloXNode::replaceLabelMap()
   }
 }
 
+/**
+ * @brief Translate a detection (roi) label index into a segmentation class index.
+ *
+ * The label name is looked up in label_map_ and matched against a fixed set of names.
+ * NOTE(review): the returned indices presumably follow the ordering of the segmentation color
+ * map file, and the CAR/BUS/TRUCK branch relies on roi_label_index + 11 landing on the matching
+ * vehicle class - confirm both against the label file and the color map in use.
+ *
+ * @param[in] roi_label_index label index of the detected object
+ * @return segmentation class index, or -1 when the label has no segmentation counterpart
+ */
+int TrtYoloXNode::mapRoiLabel2SegLabel(const int32_t roi_label_index)
+{
+  const auto & roi_label = label_map_[roi_label_index];
+  if (roi_label == "CAR" || roi_label == "BUS" || roi_label == "TRUCK") {
+    return static_cast<int>(roi_label_index + 11);
+  }
+  if (roi_label == "PEDESTRIAN") {
+    return 11;  // person index in segment_color_map
+  }
+  if (roi_label == "MOTORCYCLE") {
+    return 17;  // motorcycle index in segment_color_map
+  }
+  if (roi_label == "BICYCLE") {
+    return 18;  // bicycle index in segment_color_map
+  }
+  return -1;
+}
+
+/**
+ * @brief Overwrite the segmentation mask inside a detected roi with the roi's class index.
+ *
+ * Nothing is written when the object's score is below overlap_roi_score_threshold_, when its
+ * label has no segmentation counterpart, or when the roi lies entirely outside the mask.
+ *
+ * @param[in] roi_object detected object whose offset and size are used as mask pixel coordinates
+ * @param[in,out] mask segmentation mask updated in place
+ */
+void TrtYoloXNode::overlapSegmentByRoi(const tensorrt_yolox::Object & roi_object, cv::Mat & mask)
+{
+  if (roi_object.score < overlap_roi_score_threshold_) return;
+  const int seg_class_index = mapRoiLabel2SegLabel(roi_object.type);
+  if (seg_class_index < 0) return;
+
+  // Clip the roi to the mask bounds so border-touching boxes stay inside the image.
+  const cv::Rect roi_rect(
+    static_cast<int>(roi_object.x_offset), static_cast<int>(roi_object.y_offset),
+    static_cast<int>(roi_object.width), static_cast<int>(roi_object.height));
+  const cv::Rect clipped_roi = roi_rect & cv::Rect(0, 0, mask.cols, mask.rows);
+  if (clipped_roi.width <= 0 || clipped_roi.height <= 0) return;
+
+  // Fill the region in place; the roi is given as offset + size, not as start/end indices.
+  mask(clipped_roi).setTo(cv::Scalar(seg_class_index));
+}
+
 }  // namespace tensorrt_yolox
 
 #include "rclcpp_components/register_node_macro.hpp"
diff --git a/perception/tensorrt_yolox/src/yolox_single_image_inference_node.cpp b/perception/tensorrt_yolox/src/yolox_single_image_inference_node.cpp