From a3cd7cda2d5742b245900c598e09cf4191e39a47 Mon Sep 17 00:00:00 2001
From: Kenzo Lobos Tsunekawa <kenzo.lobos@tier4.jp>
Date: Thu, 20 Jun 2024 10:05:59 +0900
Subject: [PATCH 01/15] feat(lidar_centerpoint): accelerated preprocessing for
 centerpoint (#6989)

* feat: accelerated preprocessing for centerpoint

Signed-off-by: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp>

* chore: changed includes to quotes instead of brackets

Signed-off-by: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp>

* chore: fixed variable name

Signed-off-by: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp>

* fix: addressed the case where the points in the cache is over the maximum capacity

Signed-off-by: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp>

* fix: fixed the offset of the output buffer in the kernel

Signed-off-by: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp>

* chore: reimplemented old logic in the pointpainting package since the new centerpoint logic is incompatible

Signed-off-by: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp>

* fix: eigen matrices are column major by default. this commit fixes the code that assumed row-major, implements check for data layout, and updates tests

Signed-off-by: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp>

* chore: fixed missing virtual destructor

Signed-off-by: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp>

---------

Signed-off-by: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp>
Co-authored-by: kminoda <44218668+kminoda@users.noreply.github.com>
---
 .../CMakeLists.txt                            |   1 +
 .../pointcloud_densification.hpp              |  72 ++++++++++++
 .../pointpainting_fusion/voxel_generator.hpp  |  22 +++-
 .../pointcloud_densification.cpp              | 103 ++++++++++++++++++
 .../pointpainting_fusion/voxel_generator.cpp  |  26 +++++
 .../lidar_centerpoint/centerpoint_trt.hpp     |  17 ++-
 .../include/lidar_centerpoint/cuda_utils.hpp  |   2 +-
 .../detection_class_remapper.hpp              |   8 +-
 .../lidar_centerpoint/network/network_trt.hpp |   4 +-
 .../network/scatter_kernel.hpp                |   4 +-
 .../network/tensorrt_wrapper.hpp              |   9 +-
 .../include/lidar_centerpoint/node.hpp        |   4 +-
 .../postprocess/circle_nms_kernel.hpp         |   5 +-
 .../postprocess/postprocess_kernel.hpp        |  11 +-
 .../preprocess/pointcloud_densification.hpp   |  27 +++--
 .../preprocess/preprocess_kernel.hpp          |   8 +-
 .../preprocess/voxel_generator.hpp            |  13 ++-
 .../include/lidar_centerpoint/ros_utils.hpp   |   7 +-
 .../lidar_centerpoint/lib/centerpoint_trt.cpp |  14 +--
 .../lib/detection_class_remapper.cpp          |   2 +-
 .../lib/network/scatter_kernel.cu             |   3 +-
 .../lib/network/tensorrt_wrapper.cpp          |   2 +-
 .../lib/postprocess/circle_nms_kernel.cu      |   8 +-
 .../postprocess/non_maximum_suppression.cpp   |   7 +-
 .../lib/postprocess/postprocess_kernel.cu     |  10 +-
 .../preprocess/pointcloud_densification.cpp   |  40 ++++---
 .../lib/preprocess/preprocess_kernel.cu       |  49 ++++++++-
 .../lib/preprocess/voxel_generator.cpp        |  48 ++++----
 perception/lidar_centerpoint/src/node.cpp     |  23 ++--
 .../test/test_postprocess_kernel.hpp          |  18 +--
 .../test/test_preprocess_kernel.hpp           |   2 +-
 .../test/test_voxel_generator.cpp             |  49 +++++++--
 .../test/test_voxel_generator.hpp             |  46 ++++----
 33 files changed, 491 insertions(+), 173 deletions(-)
 create mode 100644 perception/image_projection_based_fusion/include/image_projection_based_fusion/pointpainting_fusion/pointcloud_densification.hpp
 mode change 100755 => 100644 perception/image_projection_based_fusion/include/image_projection_based_fusion/pointpainting_fusion/voxel_generator.hpp
 create mode 100644 perception/image_projection_based_fusion/src/pointpainting_fusion/pointcloud_densification.cpp

diff --git a/perception/image_projection_based_fusion/CMakeLists.txt b/perception/image_projection_based_fusion/CMakeLists.txt
index 0f4cb7112f74a..b6dfbaf402cc8 100644
--- a/perception/image_projection_based_fusion/CMakeLists.txt
+++ b/perception/image_projection_based_fusion/CMakeLists.txt
@@ -127,6 +127,7 @@ if(TRT_AVAIL AND CUDA_AVAIL AND CUDNN_AVAIL)
 
   ament_auto_add_library(pointpainting_lib SHARED
     src/pointpainting_fusion/node.cpp
+    src/pointpainting_fusion/pointcloud_densification.cpp
     src/pointpainting_fusion/pointpainting_trt.cpp
     src/pointpainting_fusion/voxel_generator.cpp
   )
diff --git a/perception/image_projection_based_fusion/include/image_projection_based_fusion/pointpainting_fusion/pointcloud_densification.hpp b/perception/image_projection_based_fusion/include/image_projection_based_fusion/pointpainting_fusion/pointcloud_densification.hpp
new file mode 100644
index 0000000000000..03609eb18e689
--- /dev/null
+++ b/perception/image_projection_based_fusion/include/image_projection_based_fusion/pointpainting_fusion/pointcloud_densification.hpp
@@ -0,0 +1,72 @@
+// Copyright 2024 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef IMAGE_PROJECTION_BASED_FUSION__POINTPAINTING_FUSION__POINTCLOUD_DENSIFICATION_HPP_
+#define IMAGE_PROJECTION_BASED_FUSION__POINTPAINTING_FUSION__POINTCLOUD_DENSIFICATION_HPP_
+
+#include <tf2_ros/buffer.h>
+#include <tf2_ros/transform_listener.h>
+#ifdef ROS_DISTRO_GALACTIC
+#include <tf2_sensor_msgs/tf2_sensor_msgs.h>
+#else
+#include <tf2_sensor_msgs/tf2_sensor_msgs.hpp>
+#endif
+
+#include <lidar_centerpoint/preprocess/pointcloud_densification.hpp>
+
+#include <list>
+#include <string>
+#include <utility>
+
+namespace image_projection_based_fusion
+{
+struct PointCloudWithTransform
+{
+  sensor_msgs::msg::PointCloud2 pointcloud_msg;
+  Eigen::Affine3f affine_past2world;
+};
+
+class PointCloudDensification
+{
+public:
+  explicit PointCloudDensification(const centerpoint::DensificationParam & param);
+
+  bool enqueuePointCloud(
+    const sensor_msgs::msg::PointCloud2 & input_pointcloud_msg, const tf2_ros::Buffer & tf_buffer);
+
+  double getCurrentTimestamp() const { return current_timestamp_; }
+  Eigen::Affine3f getAffineWorldToCurrent() const { return affine_world2current_; }
+  std::list<PointCloudWithTransform>::iterator getPointCloudCacheIter()
+  {
+    return pointcloud_cache_.begin();
+  }
+  bool isCacheEnd(std::list<PointCloudWithTransform>::iterator iter)
+  {
+    return iter == pointcloud_cache_.end();
+  }
+  unsigned int pointcloud_cache_size() const { return param_.pointcloud_cache_size(); }
+
+private:
+  void enqueue(const sensor_msgs::msg::PointCloud2 & msg, const Eigen::Affine3f & affine);
+  void dequeue();
+
+  centerpoint::DensificationParam param_;
+  double current_timestamp_{0.0};
+  Eigen::Affine3f affine_world2current_;
+  std::list<PointCloudWithTransform> pointcloud_cache_;
+};
+
+}  // namespace image_projection_based_fusion
+
+#endif  // IMAGE_PROJECTION_BASED_FUSION__POINTPAINTING_FUSION__POINTCLOUD_DENSIFICATION_HPP_
diff --git a/perception/image_projection_based_fusion/include/image_projection_based_fusion/pointpainting_fusion/voxel_generator.hpp b/perception/image_projection_based_fusion/include/image_projection_based_fusion/pointpainting_fusion/voxel_generator.hpp
old mode 100755
new mode 100644
index d0f44b9d58706..4cdca8e49ac7e
--- a/perception/image_projection_based_fusion/include/image_projection_based_fusion/pointpainting_fusion/voxel_generator.hpp
+++ b/perception/image_projection_based_fusion/include/image_projection_based_fusion/pointpainting_fusion/voxel_generator.hpp
@@ -15,19 +15,35 @@
 #ifndef IMAGE_PROJECTION_BASED_FUSION__POINTPAINTING_FUSION__VOXEL_GENERATOR_HPP_
 #define IMAGE_PROJECTION_BASED_FUSION__POINTPAINTING_FUSION__VOXEL_GENERATOR_HPP_
 
+#include <image_projection_based_fusion/pointpainting_fusion/pointcloud_densification.hpp>
+#include <lidar_centerpoint/preprocess/pointcloud_densification.hpp>
 #include <lidar_centerpoint/preprocess/voxel_generator.hpp>
 
 #include <bitset>
+#include <memory>
 #include <vector>
 
 namespace image_projection_based_fusion
 {
-class VoxelGenerator : public centerpoint::VoxelGenerator
+
+class VoxelGenerator
 {
 public:
-  using centerpoint::VoxelGenerator::VoxelGenerator;
+  explicit VoxelGenerator(
+    const centerpoint::DensificationParam & param, const centerpoint::CenterPointConfig & config);
+
+  bool enqueuePointCloud(
+    const sensor_msgs::msg::PointCloud2 & input_pointcloud_msg, const tf2_ros::Buffer & tf_buffer);
+
+  std::size_t generateSweepPoints(std::vector<float> & points);
+
+protected:
+  std::unique_ptr<PointCloudDensification> pd_ptr_{nullptr};
 
-  std::size_t generateSweepPoints(std::vector<float> & points) override;
+  centerpoint::CenterPointConfig config_;
+  std::array<float, 6> range_;
+  std::array<int, 3> grid_size_;
+  std::array<float, 3> recip_voxel_size_;
 };
 }  // namespace image_projection_based_fusion
 
diff --git a/perception/image_projection_based_fusion/src/pointpainting_fusion/pointcloud_densification.cpp b/perception/image_projection_based_fusion/src/pointpainting_fusion/pointcloud_densification.cpp
new file mode 100644
index 0000000000000..9edf6a73cd59b
--- /dev/null
+++ b/perception/image_projection_based_fusion/src/pointpainting_fusion/pointcloud_densification.cpp
@@ -0,0 +1,103 @@
+// Copyright 2024 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "image_projection_based_fusion/pointpainting_fusion/pointcloud_densification.hpp"
+
+#include <pcl_ros/transforms.hpp>
+
+#include <boost/optional.hpp>
+
+#include <pcl_conversions/pcl_conversions.h>
+#ifdef ROS_DISTRO_GALACTIC
+#include <tf2_eigen/tf2_eigen.h>
+#else
+#include <tf2_eigen/tf2_eigen.hpp>
+#endif
+
+#include <string>
+#include <utility>
+
+namespace
+{
+boost::optional<geometry_msgs::msg::Transform> getTransform(
+  const tf2_ros::Buffer & tf_buffer, const std::string & target_frame_id,
+  const std::string & source_frame_id, const rclcpp::Time & time)
+{
+  try {
+    geometry_msgs::msg::TransformStamped transform_stamped;
+    transform_stamped = tf_buffer.lookupTransform(
+      target_frame_id, source_frame_id, time, rclcpp::Duration::from_seconds(0.5));
+    return transform_stamped.transform;
+  } catch (tf2::TransformException & ex) {
+    RCLCPP_WARN_STREAM(rclcpp::get_logger("lidar_centerpoint"), ex.what());
+    return boost::none;
+  }
+}
+
+Eigen::Affine3f transformToEigen(const geometry_msgs::msg::Transform & t)
+{
+  Eigen::Affine3f a;
+  a.matrix() = tf2::transformToEigen(t).matrix().cast<float>();
+  return a;
+}
+
+}  // namespace
+
+namespace image_projection_based_fusion
+{
+PointCloudDensification::PointCloudDensification(const centerpoint::DensificationParam & param)
+: param_(param)
+{
+}
+
+bool PointCloudDensification::enqueuePointCloud(
+  const sensor_msgs::msg::PointCloud2 & pointcloud_msg, const tf2_ros::Buffer & tf_buffer)
+{
+  const auto header = pointcloud_msg.header;
+
+  if (param_.pointcloud_cache_size() > 1) {
+    auto transform_world2current =
+      getTransform(tf_buffer, header.frame_id, param_.world_frame_id(), header.stamp);
+    if (!transform_world2current) {
+      return false;
+    }
+    auto affine_world2current = transformToEigen(transform_world2current.get());
+
+    enqueue(pointcloud_msg, affine_world2current);
+  } else {
+    enqueue(pointcloud_msg, Eigen::Affine3f::Identity());
+  }
+
+  dequeue();
+
+  return true;
+}
+
+void PointCloudDensification::enqueue(
+  const sensor_msgs::msg::PointCloud2 & msg, const Eigen::Affine3f & affine_world2current)
+{
+  affine_world2current_ = affine_world2current;
+  current_timestamp_ = rclcpp::Time(msg.header.stamp).seconds();
+  PointCloudWithTransform pointcloud = {msg, affine_world2current.inverse()};
+  pointcloud_cache_.push_front(pointcloud);
+}
+
+void PointCloudDensification::dequeue()
+{
+  if (pointcloud_cache_.size() > param_.pointcloud_cache_size()) {
+    pointcloud_cache_.pop_back();
+  }
+}
+
+}  // namespace image_projection_based_fusion
diff --git a/perception/image_projection_based_fusion/src/pointpainting_fusion/voxel_generator.cpp b/perception/image_projection_based_fusion/src/pointpainting_fusion/voxel_generator.cpp
index ea10cb1237436..cb3fc33d3e022 100755
--- a/perception/image_projection_based_fusion/src/pointpainting_fusion/voxel_generator.cpp
+++ b/perception/image_projection_based_fusion/src/pointpainting_fusion/voxel_generator.cpp
@@ -18,6 +18,32 @@
 
 namespace image_projection_based_fusion
 {
+
+VoxelGenerator::VoxelGenerator(
+  const centerpoint::DensificationParam & param, const centerpoint::CenterPointConfig & config)
+: config_(config)
+{
+  pd_ptr_ = std::make_unique<PointCloudDensification>(param);
+  range_[0] = config.range_min_x_;
+  range_[1] = config.range_min_y_;
+  range_[2] = config.range_min_z_;
+  range_[3] = config.range_max_x_;
+  range_[4] = config.range_max_y_;
+  range_[5] = config.range_max_z_;
+  grid_size_[0] = config.grid_size_x_;
+  grid_size_[1] = config.grid_size_y_;
+  grid_size_[2] = config.grid_size_z_;
+  recip_voxel_size_[0] = 1 / config.voxel_size_x_;
+  recip_voxel_size_[1] = 1 / config.voxel_size_y_;
+  recip_voxel_size_[2] = 1 / config.voxel_size_z_;
+}
+
+bool VoxelGenerator::enqueuePointCloud(
+  const sensor_msgs::msg::PointCloud2 & input_pointcloud_msg, const tf2_ros::Buffer & tf_buffer)
+{
+  return pd_ptr_->enqueuePointCloud(input_pointcloud_msg, tf_buffer);
+}
+
 size_t VoxelGenerator::generateSweepPoints(std::vector<float> & points)
 {
   Eigen::Vector3f point_current, point_past;
diff --git a/perception/lidar_centerpoint/include/lidar_centerpoint/centerpoint_trt.hpp b/perception/lidar_centerpoint/include/lidar_centerpoint/centerpoint_trt.hpp
index 383e8ef719d4d..df8100fb6b80e 100644
--- a/perception/lidar_centerpoint/include/lidar_centerpoint/centerpoint_trt.hpp
+++ b/perception/lidar_centerpoint/include/lidar_centerpoint/centerpoint_trt.hpp
@@ -15,15 +15,14 @@
 #ifndef LIDAR_CENTERPOINT__CENTERPOINT_TRT_HPP_
 #define LIDAR_CENTERPOINT__CENTERPOINT_TRT_HPP_
 
-#include <lidar_centerpoint/cuda_utils.hpp>
-#include <lidar_centerpoint/network/network_trt.hpp>
-#include <lidar_centerpoint/postprocess/postprocess_kernel.hpp>
-#include <lidar_centerpoint/preprocess/voxel_generator.hpp>
-
-#include <sensor_msgs/msg/point_cloud2.hpp>
-
-#include <pcl/point_cloud.h>
-#include <pcl/point_types.h>
+#include "lidar_centerpoint/cuda_utils.hpp"
+#include "lidar_centerpoint/network/network_trt.hpp"
+#include "lidar_centerpoint/postprocess/postprocess_kernel.hpp"
+#include "lidar_centerpoint/preprocess/voxel_generator.hpp"
+#include "pcl/point_cloud.h"
+#include "pcl/point_types.h"
+
+#include "sensor_msgs/msg/point_cloud2.hpp"
 
 #include <memory>
 #include <string>
diff --git a/perception/lidar_centerpoint/include/lidar_centerpoint/cuda_utils.hpp b/perception/lidar_centerpoint/include/lidar_centerpoint/cuda_utils.hpp
index 5ab26d75a0a41..efe3bbb9a5482 100644
--- a/perception/lidar_centerpoint/include/lidar_centerpoint/cuda_utils.hpp
+++ b/perception/lidar_centerpoint/include/lidar_centerpoint/cuda_utils.hpp
@@ -42,7 +42,7 @@
 #ifndef LIDAR_CENTERPOINT__CUDA_UTILS_HPP_
 #define LIDAR_CENTERPOINT__CUDA_UTILS_HPP_
 
-#include <cuda_runtime_api.h>
+#include "cuda_runtime_api.h"
 
 #include <memory>
 #include <sstream>
diff --git a/perception/lidar_centerpoint/include/lidar_centerpoint/detection_class_remapper.hpp b/perception/lidar_centerpoint/include/lidar_centerpoint/detection_class_remapper.hpp
index cd1a0e58e115e..d961dd998af76 100644
--- a/perception/lidar_centerpoint/include/lidar_centerpoint/detection_class_remapper.hpp
+++ b/perception/lidar_centerpoint/include/lidar_centerpoint/detection_class_remapper.hpp
@@ -17,10 +17,10 @@
 
 #include <Eigen/Core>
 
-#include <autoware_perception_msgs/msg/detected_object_kinematics.hpp>
-#include <autoware_perception_msgs/msg/detected_objects.hpp>
-#include <autoware_perception_msgs/msg/object_classification.hpp>
-#include <autoware_perception_msgs/msg/shape.hpp>
+#include "autoware_perception_msgs/msg/detected_object_kinematics.hpp"
+#include "autoware_perception_msgs/msg/detected_objects.hpp"
+#include "autoware_perception_msgs/msg/object_classification.hpp"
+#include "autoware_perception_msgs/msg/shape.hpp"
 
 #include <vector>
 
diff --git a/perception/lidar_centerpoint/include/lidar_centerpoint/network/network_trt.hpp b/perception/lidar_centerpoint/include/lidar_centerpoint/network/network_trt.hpp
index 639aa0ba8bad3..1bf77248efe08 100644
--- a/perception/lidar_centerpoint/include/lidar_centerpoint/network/network_trt.hpp
+++ b/perception/lidar_centerpoint/include/lidar_centerpoint/network/network_trt.hpp
@@ -15,8 +15,8 @@
 #ifndef LIDAR_CENTERPOINT__NETWORK__NETWORK_TRT_HPP_
 #define LIDAR_CENTERPOINT__NETWORK__NETWORK_TRT_HPP_
 
-#include <lidar_centerpoint/centerpoint_config.hpp>
-#include <lidar_centerpoint/network/tensorrt_wrapper.hpp>
+#include "lidar_centerpoint/centerpoint_config.hpp"
+#include "lidar_centerpoint/network/tensorrt_wrapper.hpp"
 
 #include <vector>
 
diff --git a/perception/lidar_centerpoint/include/lidar_centerpoint/network/scatter_kernel.hpp b/perception/lidar_centerpoint/include/lidar_centerpoint/network/scatter_kernel.hpp
index 3dd00c25fd9e7..ad5e222ce01e8 100644
--- a/perception/lidar_centerpoint/include/lidar_centerpoint/network/scatter_kernel.hpp
+++ b/perception/lidar_centerpoint/include/lidar_centerpoint/network/scatter_kernel.hpp
@@ -15,8 +15,8 @@
 #ifndef LIDAR_CENTERPOINT__NETWORK__SCATTER_KERNEL_HPP_
 #define LIDAR_CENTERPOINT__NETWORK__SCATTER_KERNEL_HPP_
 
-#include <cuda.h>
-#include <cuda_runtime_api.h>
+#include "cuda.h"
+#include "cuda_runtime_api.h"
 
 namespace centerpoint
 {
diff --git a/perception/lidar_centerpoint/include/lidar_centerpoint/network/tensorrt_wrapper.hpp b/perception/lidar_centerpoint/include/lidar_centerpoint/network/tensorrt_wrapper.hpp
index 7f55ab6f5e414..4af5aac7ff7fe 100644
--- a/perception/lidar_centerpoint/include/lidar_centerpoint/network/tensorrt_wrapper.hpp
+++ b/perception/lidar_centerpoint/include/lidar_centerpoint/network/tensorrt_wrapper.hpp
@@ -15,10 +15,9 @@
 #ifndef LIDAR_CENTERPOINT__NETWORK__TENSORRT_WRAPPER_HPP_
 #define LIDAR_CENTERPOINT__NETWORK__TENSORRT_WRAPPER_HPP_
 
-#include <lidar_centerpoint/centerpoint_config.hpp>
-#include <tensorrt_common/tensorrt_common.hpp>
-
-#include <NvInfer.h>
+#include "NvInfer.h"
+#include "lidar_centerpoint/centerpoint_config.hpp"
+#include "tensorrt_common/tensorrt_common.hpp"
 
 #include <iostream>
 #include <memory>
@@ -32,7 +31,7 @@ class TensorRTWrapper
 public:
   explicit TensorRTWrapper(const CenterPointConfig & config);
 
-  ~TensorRTWrapper();
+  virtual ~TensorRTWrapper();
 
   bool init(
     const std::string & onnx_path, const std::string & engine_path, const std::string & precision);
diff --git a/perception/lidar_centerpoint/include/lidar_centerpoint/node.hpp b/perception/lidar_centerpoint/include/lidar_centerpoint/node.hpp
index 4ccfa908e12d7..154ff97cdb887 100644
--- a/perception/lidar_centerpoint/include/lidar_centerpoint/node.hpp
+++ b/perception/lidar_centerpoint/include/lidar_centerpoint/node.hpp
@@ -15,13 +15,13 @@
 #ifndef LIDAR_CENTERPOINT__NODE_HPP_
 #define LIDAR_CENTERPOINT__NODE_HPP_
 
+#include "lidar_centerpoint/centerpoint_trt.hpp"
+#include "lidar_centerpoint/detection_class_remapper.hpp"
 #include "lidar_centerpoint/postprocess/non_maximum_suppression.hpp"
 
 #include <autoware/universe_utils/ros/debug_publisher.hpp>
 #include <autoware/universe_utils/ros/published_time_publisher.hpp>
 #include <autoware/universe_utils/system/stop_watch.hpp>
-#include <lidar_centerpoint/centerpoint_trt.hpp>
-#include <lidar_centerpoint/detection_class_remapper.hpp>
 #include <rclcpp/rclcpp.hpp>
 
 #include <autoware_perception_msgs/msg/detected_object_kinematics.hpp>
diff --git a/perception/lidar_centerpoint/include/lidar_centerpoint/postprocess/circle_nms_kernel.hpp b/perception/lidar_centerpoint/include/lidar_centerpoint/postprocess/circle_nms_kernel.hpp
index cd2d71d33806d..5b55636a3eec4 100644
--- a/perception/lidar_centerpoint/include/lidar_centerpoint/postprocess/circle_nms_kernel.hpp
+++ b/perception/lidar_centerpoint/include/lidar_centerpoint/postprocess/circle_nms_kernel.hpp
@@ -15,9 +15,8 @@
 #ifndef LIDAR_CENTERPOINT__POSTPROCESS__CIRCLE_NMS_KERNEL_HPP_
 #define LIDAR_CENTERPOINT__POSTPROCESS__CIRCLE_NMS_KERNEL_HPP_
 
-#include <lidar_centerpoint/utils.hpp>
-
-#include <thrust/device_vector.h>
+#include "lidar_centerpoint/utils.hpp"
+#include "thrust/device_vector.h"
 
 namespace centerpoint
 {
diff --git a/perception/lidar_centerpoint/include/lidar_centerpoint/postprocess/postprocess_kernel.hpp b/perception/lidar_centerpoint/include/lidar_centerpoint/postprocess/postprocess_kernel.hpp
index d790e896dea9a..e15d3022c947c 100644
--- a/perception/lidar_centerpoint/include/lidar_centerpoint/postprocess/postprocess_kernel.hpp
+++ b/perception/lidar_centerpoint/include/lidar_centerpoint/postprocess/postprocess_kernel.hpp
@@ -15,12 +15,11 @@
 #ifndef LIDAR_CENTERPOINT__POSTPROCESS__POSTPROCESS_KERNEL_HPP_
 #define LIDAR_CENTERPOINT__POSTPROCESS__POSTPROCESS_KERNEL_HPP_
 
-#include <lidar_centerpoint/centerpoint_config.hpp>
-#include <lidar_centerpoint/utils.hpp>
-
-#include <cuda.h>
-#include <cuda_runtime_api.h>
-#include <thrust/device_vector.h>
+#include "cuda.h"
+#include "cuda_runtime_api.h"
+#include "lidar_centerpoint/centerpoint_config.hpp"
+#include "lidar_centerpoint/utils.hpp"
+#include "thrust/device_vector.h"
 
 #include <vector>
 
diff --git a/perception/lidar_centerpoint/include/lidar_centerpoint/preprocess/pointcloud_densification.hpp b/perception/lidar_centerpoint/include/lidar_centerpoint/preprocess/pointcloud_densification.hpp
index 209fc74e6c58f..c1d6a6b060955 100644
--- a/perception/lidar_centerpoint/include/lidar_centerpoint/preprocess/pointcloud_densification.hpp
+++ b/perception/lidar_centerpoint/include/lidar_centerpoint/preprocess/pointcloud_densification.hpp
@@ -15,17 +15,19 @@
 #ifndef LIDAR_CENTERPOINT__PREPROCESS__POINTCLOUD_DENSIFICATION_HPP_
 #define LIDAR_CENTERPOINT__PREPROCESS__POINTCLOUD_DENSIFICATION_HPP_
 
-#include <tf2_ros/buffer.h>
-#include <tf2_ros/transform_listener.h>
-#ifdef ROS_DISTRO_GALACTIC
-#include <tf2_sensor_msgs/tf2_sensor_msgs.h>
-#else
-#include <tf2_sensor_msgs/tf2_sensor_msgs.hpp>
-#endif
+#include "tf2_ros/buffer.h"
+#include "tf2_ros/transform_listener.h"
 
 #include <list>
 #include <string>
 #include <utility>
+#ifdef ROS_DISTRO_GALACTIC
+#include "tf2_sensor_msgs/tf2_sensor_msgs.h"
+#else
+#include "tf2_sensor_msgs/tf2_sensor_msgs.hpp"
+#endif
+
+#include "lidar_centerpoint/cuda_utils.hpp"
 
 namespace centerpoint
 {
@@ -48,7 +50,10 @@ class DensificationParam
 
 struct PointCloudWithTransform
 {
-  sensor_msgs::msg::PointCloud2 pointcloud_msg;
+  cuda::unique_ptr<float[]> points_d{nullptr};
+  std_msgs::msg::Header header;
+  size_t num_points{0};
+  size_t point_step{0};
   Eigen::Affine3f affine_past2world;
 };
 
@@ -58,7 +63,8 @@ class PointCloudDensification
   explicit PointCloudDensification(const DensificationParam & param);
 
   bool enqueuePointCloud(
-    const sensor_msgs::msg::PointCloud2 & input_pointcloud_msg, const tf2_ros::Buffer & tf_buffer);
+    const sensor_msgs::msg::PointCloud2 & input_pointcloud_msg, const tf2_ros::Buffer & tf_buffer,
+    cudaStream_t stream);
 
   double getCurrentTimestamp() const { return current_timestamp_; }
   Eigen::Affine3f getAffineWorldToCurrent() const { return affine_world2current_; }
@@ -73,7 +79,8 @@ class PointCloudDensification
   unsigned int pointcloud_cache_size() const { return param_.pointcloud_cache_size(); }
 
 private:
-  void enqueue(const sensor_msgs::msg::PointCloud2 & msg, const Eigen::Affine3f & affine);
+  void enqueue(
+    const sensor_msgs::msg::PointCloud2 & msg, const Eigen::Affine3f & affine, cudaStream_t stream);
   void dequeue();
 
   DensificationParam param_;
diff --git a/perception/lidar_centerpoint/include/lidar_centerpoint/preprocess/preprocess_kernel.hpp b/perception/lidar_centerpoint/include/lidar_centerpoint/preprocess/preprocess_kernel.hpp
index 9488b67475509..3abcd89cb5c55 100644
--- a/perception/lidar_centerpoint/include/lidar_centerpoint/preprocess/preprocess_kernel.hpp
+++ b/perception/lidar_centerpoint/include/lidar_centerpoint/preprocess/preprocess_kernel.hpp
@@ -15,11 +15,15 @@
 #ifndef LIDAR_CENTERPOINT__PREPROCESS__PREPROCESS_KERNEL_HPP_
 #define LIDAR_CENTERPOINT__PREPROCESS__PREPROCESS_KERNEL_HPP_
 
-#include <cuda.h>
-#include <cuda_runtime_api.h>
+#include "cuda.h"
+#include "cuda_runtime_api.h"
 
 namespace centerpoint
 {
+cudaError_t generateSweepPoints_launch(
+  const float * input_points, size_t points_size, int input_point_step, float time_lag,
+  const float * transform, int num_features, float * output_points, cudaStream_t stream);
+
 cudaError_t generateVoxels_random_launch(
   const float * points, size_t points_size, float min_x_range, float max_x_range, float min_y_range,
   float max_y_range, float min_z_range, float max_z_range, float pillar_x_size, float pillar_y_size,
diff --git a/perception/lidar_centerpoint/include/lidar_centerpoint/preprocess/voxel_generator.hpp b/perception/lidar_centerpoint/include/lidar_centerpoint/preprocess/voxel_generator.hpp
index be15d51e91715..3ade5e677cdbe 100644
--- a/perception/lidar_centerpoint/include/lidar_centerpoint/preprocess/voxel_generator.hpp
+++ b/perception/lidar_centerpoint/include/lidar_centerpoint/preprocess/voxel_generator.hpp
@@ -15,10 +15,10 @@
 #ifndef LIDAR_CENTERPOINT__PREPROCESS__VOXEL_GENERATOR_HPP_
 #define LIDAR_CENTERPOINT__PREPROCESS__VOXEL_GENERATOR_HPP_
 
-#include <lidar_centerpoint/centerpoint_config.hpp>
-#include <lidar_centerpoint/preprocess/pointcloud_densification.hpp>
+#include "lidar_centerpoint/centerpoint_config.hpp"
+#include "lidar_centerpoint/preprocess/pointcloud_densification.hpp"
 
-#include <sensor_msgs/msg/point_cloud2.hpp>
+#include "sensor_msgs/msg/point_cloud2.hpp"
 
 #include <memory>
 #include <vector>
@@ -31,10 +31,11 @@ class VoxelGeneratorTemplate
   explicit VoxelGeneratorTemplate(
     const DensificationParam & param, const CenterPointConfig & config);
 
-  virtual std::size_t generateSweepPoints(std::vector<float> & points) = 0;
+  virtual std::size_t generateSweepPoints(float * d_points, cudaStream_t stream) = 0;
 
   bool enqueuePointCloud(
-    const sensor_msgs::msg::PointCloud2 & input_pointcloud_msg, const tf2_ros::Buffer & tf_buffer);
+    const sensor_msgs::msg::PointCloud2 & input_pointcloud_msg, const tf2_ros::Buffer & tf_buffer,
+    cudaStream_t stream);
 
 protected:
   std::unique_ptr<PointCloudDensification> pd_ptr_{nullptr};
@@ -50,7 +51,7 @@ class VoxelGenerator : public VoxelGeneratorTemplate
 public:
   using VoxelGeneratorTemplate::VoxelGeneratorTemplate;
 
-  std::size_t generateSweepPoints(std::vector<float> & points) override;
+  std::size_t generateSweepPoints(float * d_points, cudaStream_t stream) override;
 };
 
 }  // namespace centerpoint
diff --git a/perception/lidar_centerpoint/include/lidar_centerpoint/ros_utils.hpp b/perception/lidar_centerpoint/include/lidar_centerpoint/ros_utils.hpp
index e33d86cd0aba7..484fbcfd36657 100644
--- a/perception/lidar_centerpoint/include/lidar_centerpoint/ros_utils.hpp
+++ b/perception/lidar_centerpoint/include/lidar_centerpoint/ros_utils.hpp
@@ -17,12 +17,9 @@
 
 // ros packages cannot be included from cuda.
 
-#include <lidar_centerpoint/utils.hpp>
+#include "lidar_centerpoint/utils.hpp"
 
-#include <autoware_perception_msgs/msg/detected_object_kinematics.hpp>
-#include <autoware_perception_msgs/msg/detected_objects.hpp>
-#include <autoware_perception_msgs/msg/object_classification.hpp>
-#include <autoware_perception_msgs/msg/shape.hpp>
+#include "autoware_perception_msgs/msg/detected_objects.hpp"
 
 #include <string>
 #include <vector>
diff --git a/perception/lidar_centerpoint/lib/centerpoint_trt.cpp b/perception/lidar_centerpoint/lib/centerpoint_trt.cpp
index a4281aa8260d7..0e3791e061f62 100644
--- a/perception/lidar_centerpoint/lib/centerpoint_trt.cpp
+++ b/perception/lidar_centerpoint/lib/centerpoint_trt.cpp
@@ -14,10 +14,11 @@
 
 #include "lidar_centerpoint/centerpoint_trt.hpp"
 
+#include "lidar_centerpoint/centerpoint_config.hpp"
+#include "lidar_centerpoint/network/scatter_kernel.hpp"
+#include "lidar_centerpoint/preprocess/preprocess_kernel.hpp"
+
 #include <autoware/universe_utils/math/constants.hpp>
-#include <lidar_centerpoint/centerpoint_config.hpp>
-#include <lidar_centerpoint/network/scatter_kernel.hpp>
-#include <lidar_centerpoint/preprocess/preprocess_kernel.hpp>
 
 #include <iostream>
 #include <memory>
@@ -143,14 +144,11 @@ bool CenterPointTRT::detect(
 bool CenterPointTRT::preprocess(
   const sensor_msgs::msg::PointCloud2 & input_pointcloud_msg, const tf2_ros::Buffer & tf_buffer)
 {
-  bool is_success = vg_ptr_->enqueuePointCloud(input_pointcloud_msg, tf_buffer);
+  bool is_success = vg_ptr_->enqueuePointCloud(input_pointcloud_msg, tf_buffer, stream_);
   if (!is_success) {
     return false;
   }
-  const auto count = vg_ptr_->generateSweepPoints(points_);
-  CHECK_CUDA_ERROR(cudaMemcpyAsync(
-    points_d_.get(), points_.data(), count * config_.point_feature_size_ * sizeof(float),
-    cudaMemcpyHostToDevice, stream_));
+  const auto count = vg_ptr_->generateSweepPoints(points_d_.get(), stream_);
   CHECK_CUDA_ERROR(cudaMemsetAsync(num_voxels_d_.get(), 0, sizeof(unsigned int), stream_));
   CHECK_CUDA_ERROR(
     cudaMemsetAsync(voxels_buffer_d_.get(), 0, voxels_buffer_size_ * sizeof(float), stream_));
diff --git a/perception/lidar_centerpoint/lib/detection_class_remapper.cpp b/perception/lidar_centerpoint/lib/detection_class_remapper.cpp
index ecfbb4360ecf0..9805fc7a661d1 100644
--- a/perception/lidar_centerpoint/lib/detection_class_remapper.cpp
+++ b/perception/lidar_centerpoint/lib/detection_class_remapper.cpp
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include <lidar_centerpoint/detection_class_remapper.hpp>
+#include "lidar_centerpoint/detection_class_remapper.hpp"
 
 namespace centerpoint
 {
diff --git a/perception/lidar_centerpoint/lib/network/scatter_kernel.cu b/perception/lidar_centerpoint/lib/network/scatter_kernel.cu
index 09cc83bf606fe..f139e230426ca 100644
--- a/perception/lidar_centerpoint/lib/network/scatter_kernel.cu
+++ b/perception/lidar_centerpoint/lib/network/scatter_kernel.cu
@@ -13,8 +13,7 @@
 // limitations under the License.
 
 #include "lidar_centerpoint/network/scatter_kernel.hpp"
-
-#include <lidar_centerpoint/utils.hpp>
+#include "lidar_centerpoint/utils.hpp"
 
 namespace
 {
diff --git a/perception/lidar_centerpoint/lib/network/tensorrt_wrapper.cpp b/perception/lidar_centerpoint/lib/network/tensorrt_wrapper.cpp
index 91fcce9a80f78..bdfde42a12a02 100644
--- a/perception/lidar_centerpoint/lib/network/tensorrt_wrapper.cpp
+++ b/perception/lidar_centerpoint/lib/network/tensorrt_wrapper.cpp
@@ -14,7 +14,7 @@
 
 #include "lidar_centerpoint/network/tensorrt_wrapper.hpp"
 
-#include <NvOnnxParser.h>
+#include "NvOnnxParser.h"
 
 #include <fstream>
 #include <memory>
diff --git a/perception/lidar_centerpoint/lib/postprocess/circle_nms_kernel.cu b/perception/lidar_centerpoint/lib/postprocess/circle_nms_kernel.cu
index 398e75a55c44b..c208eefe135fb 100644
--- a/perception/lidar_centerpoint/lib/postprocess/circle_nms_kernel.cu
+++ b/perception/lidar_centerpoint/lib/postprocess/circle_nms_kernel.cu
@@ -21,12 +21,10 @@ Written by Shaoshuai Shi
 All Rights Reserved 2019-2020.
 */
 
+#include "lidar_centerpoint/cuda_utils.hpp"
 #include "lidar_centerpoint/postprocess/circle_nms_kernel.hpp"
-
-#include <lidar_centerpoint/cuda_utils.hpp>
-#include <lidar_centerpoint/utils.hpp>
-
-#include <thrust/host_vector.h>
+#include "lidar_centerpoint/utils.hpp"
+#include "thrust/host_vector.h"
 
 namespace
 {
diff --git a/perception/lidar_centerpoint/lib/postprocess/non_maximum_suppression.cpp b/perception/lidar_centerpoint/lib/postprocess/non_maximum_suppression.cpp
index d750e4adab0e1..30a54b08b8ede 100644
--- a/perception/lidar_centerpoint/lib/postprocess/non_maximum_suppression.cpp
+++ b/perception/lidar_centerpoint/lib/postprocess/non_maximum_suppression.cpp
@@ -14,9 +14,10 @@
 
 #include "lidar_centerpoint/postprocess/non_maximum_suppression.hpp"
 
-#include "autoware/universe_utils/geometry/geometry.hpp"
-#include "object_recognition_utils/geometry.hpp"
-#include "object_recognition_utils/object_recognition_utils.hpp"
+#include <autoware/universe_utils/geometry/geometry.hpp>
+#include <object_recognition_utils/geometry.hpp>
+#include <object_recognition_utils/object_recognition_utils.hpp>
+
 namespace centerpoint
 {
 using Label = autoware_perception_msgs::msg::ObjectClassification;
diff --git a/perception/lidar_centerpoint/lib/postprocess/postprocess_kernel.cu b/perception/lidar_centerpoint/lib/postprocess/postprocess_kernel.cu
index 34bbd2811041c..12835bab038a6 100644
--- a/perception/lidar_centerpoint/lib/postprocess/postprocess_kernel.cu
+++ b/perception/lidar_centerpoint/lib/postprocess/postprocess_kernel.cu
@@ -13,12 +13,10 @@
 // limitations under the License.
 
 #include "lidar_centerpoint/postprocess/circle_nms_kernel.hpp"
-
-#include <lidar_centerpoint/postprocess/postprocess_kernel.hpp>
-
-#include <thrust/count.h>
-#include <thrust/device_vector.h>
-#include <thrust/sort.h>
+#include "lidar_centerpoint/postprocess/postprocess_kernel.hpp"
+#include "thrust/count.h"
+#include "thrust/device_vector.h"
+#include "thrust/sort.h"
 
 namespace
 {
diff --git a/perception/lidar_centerpoint/lib/preprocess/pointcloud_densification.cpp b/perception/lidar_centerpoint/lib/preprocess/pointcloud_densification.cpp
index 66c348fda50b4..2859d58d8f669 100644
--- a/perception/lidar_centerpoint/lib/preprocess/pointcloud_densification.cpp
+++ b/perception/lidar_centerpoint/lib/preprocess/pointcloud_densification.cpp
@@ -14,20 +14,19 @@
 
 #include "lidar_centerpoint/preprocess/pointcloud_densification.hpp"
 
-#include <pcl_ros/transforms.hpp>
+#include "pcl_conversions/pcl_conversions.h"
+#include "pcl_ros/transforms.hpp"
 
-#include <boost/optional.hpp>
+#include "boost/optional.hpp"
 
-#include <pcl_conversions/pcl_conversions.h>
+#include <string>
+#include <utility>
 #ifdef ROS_DISTRO_GALACTIC
-#include <tf2_eigen/tf2_eigen.h>
+#include "tf2_eigen/tf2_eigen.h"
 #else
-#include <tf2_eigen/tf2_eigen.hpp>
+#include "tf2_eigen/tf2_eigen.hpp"
 #endif
 
-#include <string>
-#include <utility>
-
 namespace
 {
 boost::optional<geometry_msgs::msg::Transform> getTransform(
@@ -61,7 +60,8 @@ PointCloudDensification::PointCloudDensification(const DensificationParam & para
 }
 
 bool PointCloudDensification::enqueuePointCloud(
-  const sensor_msgs::msg::PointCloud2 & pointcloud_msg, const tf2_ros::Buffer & tf_buffer)
+  const sensor_msgs::msg::PointCloud2 & pointcloud_msg, const tf2_ros::Buffer & tf_buffer,
+  cudaStream_t stream)
 {
   const auto header = pointcloud_msg.header;
 
@@ -73,9 +73,9 @@ bool PointCloudDensification::enqueuePointCloud(
     }
     auto affine_world2current = transformToEigen(transform_world2current.get());
 
-    enqueue(pointcloud_msg, affine_world2current);
+    enqueue(pointcloud_msg, affine_world2current, stream);
   } else {
-    enqueue(pointcloud_msg, Eigen::Affine3f::Identity());
+    enqueue(pointcloud_msg, Eigen::Affine3f::Identity(), stream);
   }
 
   dequeue();
@@ -84,12 +84,24 @@ bool PointCloudDensification::enqueuePointCloud(
 }
 
 void PointCloudDensification::enqueue(
-  const sensor_msgs::msg::PointCloud2 & msg, const Eigen::Affine3f & affine_world2current)
+  const sensor_msgs::msg::PointCloud2 & msg, const Eigen::Affine3f & affine_world2current,
+  cudaStream_t stream)
 {
   affine_world2current_ = affine_world2current;
   current_timestamp_ = rclcpp::Time(msg.header.stamp).seconds();
-  PointCloudWithTransform pointcloud = {msg, affine_world2current.inverse()};
-  pointcloud_cache_.push_front(pointcloud);
+
+  assert(sizeof(uint8_t) * msg.width * msg.height * msg.point_step % sizeof(float) == 0);
+  auto points_d = cuda::make_unique<float[]>(
+    sizeof(uint8_t) * msg.width * msg.height * msg.point_step / sizeof(float));
+  CHECK_CUDA_ERROR(cudaMemcpyAsync(
+    points_d.get(), msg.data.data(), sizeof(uint8_t) * msg.width * msg.height * msg.point_step,
+    cudaMemcpyHostToDevice, stream));
+
+  PointCloudWithTransform pointcloud = {
+    std::move(points_d), msg.header, msg.width * msg.height, msg.point_step,
+    affine_world2current.inverse()};
+
+  pointcloud_cache_.push_front(std::move(pointcloud));
 }
 
 void PointCloudDensification::dequeue()
diff --git a/perception/lidar_centerpoint/lib/preprocess/preprocess_kernel.cu b/perception/lidar_centerpoint/lib/preprocess/preprocess_kernel.cu
index 218aaee125c02..4e78176739c26 100644
--- a/perception/lidar_centerpoint/lib/preprocess/preprocess_kernel.cu
+++ b/perception/lidar_centerpoint/lib/preprocess/preprocess_kernel.cu
@@ -28,9 +28,11 @@
  * limitations under the License.
  */
 
+#include "lidar_centerpoint/cuda_utils.hpp"
 #include "lidar_centerpoint/preprocess/preprocess_kernel.hpp"
+#include "lidar_centerpoint/utils.hpp"
 
-#include <lidar_centerpoint/utils.hpp>
+#include <cassert>
 
 namespace
 {
@@ -41,6 +43,51 @@ const std::size_t ENCODER_IN_FEATURE_SIZE = 9;  // the same as encoder_in_featur
 
 namespace centerpoint
 {
+
+__global__ void generateSweepPoints_kernel(
+  const float * input_points, size_t points_size, int input_point_step, float time_lag,
+  const float * transform_array, int num_features, float * output_points)
+{
+  int point_idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (point_idx >= points_size) return;
+
+  const float input_x = input_points[point_idx * input_point_step + 0];
+  const float input_y = input_points[point_idx * input_point_step + 1];
+  const float input_z = input_points[point_idx * input_point_step + 2];
+
+  // transform_array is expected to be column-major
+  output_points[point_idx * num_features + 0] = transform_array[0] * input_x +
+                                                transform_array[4] * input_y +
+                                                transform_array[8] * input_z + transform_array[12];
+  output_points[point_idx * num_features + 1] = transform_array[1] * input_x +
+                                                transform_array[5] * input_y +
+                                                transform_array[9] * input_z + transform_array[13];
+  output_points[point_idx * num_features + 2] = transform_array[2] * input_x +
+                                                transform_array[6] * input_y +
+                                                transform_array[10] * input_z + transform_array[14];
+  output_points[point_idx * num_features + 3] = time_lag;
+}
+
+cudaError_t generateSweepPoints_launch(
+  const float * input_points, size_t points_size, int input_point_step, float time_lag,
+  const float * transform_array, int num_features, float * output_points, cudaStream_t stream)
+{
+  auto transform_d = cuda::make_unique<float[]>(16);
+  CHECK_CUDA_ERROR(cudaMemcpyAsync(
+    transform_d.get(), transform_array, 16 * sizeof(float), cudaMemcpyHostToDevice, stream));
+
+  dim3 blocks((points_size + 256 - 1) / 256);
+  dim3 threads(256);
+  assert(num_features == 4);
+
+  generateSweepPoints_kernel<<<blocks, threads, 0, stream>>>(
+    input_points, points_size, input_point_step, time_lag, transform_d.get(), num_features,
+    output_points);
+
+  cudaError_t err = cudaGetLastError();
+  return err;
+}
+
 __global__ void generateVoxels_random_kernel(
   const float * points, size_t points_size, float min_x_range, float max_x_range, float min_y_range,
   float max_y_range, float min_z_range, float max_z_range, float pillar_x_size, float pillar_y_size,
diff --git a/perception/lidar_centerpoint/lib/preprocess/voxel_generator.cpp b/perception/lidar_centerpoint/lib/preprocess/voxel_generator.cpp
index 07a1a2de725f5..e90474fa07327 100644
--- a/perception/lidar_centerpoint/lib/preprocess/voxel_generator.cpp
+++ b/perception/lidar_centerpoint/lib/preprocess/voxel_generator.cpp
@@ -14,7 +14,12 @@
 
 #include "lidar_centerpoint/preprocess/voxel_generator.hpp"
 
-#include <sensor_msgs/point_cloud2_iterator.hpp>
+#include "lidar_centerpoint/centerpoint_trt.hpp"
+#include "lidar_centerpoint/preprocess/preprocess_kernel.hpp"
+
+#include "sensor_msgs/point_cloud2_iterator.hpp"
+
+#include <type_traits>
 
 namespace centerpoint
 {
@@ -38,35 +43,40 @@ VoxelGeneratorTemplate::VoxelGeneratorTemplate(
 }
 
 bool VoxelGeneratorTemplate::enqueuePointCloud(
-  const sensor_msgs::msg::PointCloud2 & input_pointcloud_msg, const tf2_ros::Buffer & tf_buffer)
+  const sensor_msgs::msg::PointCloud2 & input_pointcloud_msg, const tf2_ros::Buffer & tf_buffer,
+  cudaStream_t stream)
 {
-  return pd_ptr_->enqueuePointCloud(input_pointcloud_msg, tf_buffer);
+  return pd_ptr_->enqueuePointCloud(input_pointcloud_msg, tf_buffer, stream);
 }
 
-std::size_t VoxelGenerator::generateSweepPoints(std::vector<float> & points)
+std::size_t VoxelGenerator::generateSweepPoints(float * points_d, cudaStream_t stream)
 {
-  Eigen::Vector3f point_current, point_past;
-  size_t point_counter{};
+  size_t point_counter = 0;
   for (auto pc_cache_iter = pd_ptr_->getPointCloudCacheIter(); !pd_ptr_->isCacheEnd(pc_cache_iter);
        pc_cache_iter++) {
-    auto pc_msg = pc_cache_iter->pointcloud_msg;
+    auto sweep_num_points = pc_cache_iter->num_points;
+    auto point_step = pc_cache_iter->point_step;
     auto affine_past2current =
       pd_ptr_->getAffineWorldToCurrent() * pc_cache_iter->affine_past2world;
     float time_lag = static_cast<float>(
-      pd_ptr_->getCurrentTimestamp() - rclcpp::Time(pc_msg.header.stamp).seconds());
-
-    for (sensor_msgs::PointCloud2ConstIterator<float> x_iter(pc_msg, "x"), y_iter(pc_msg, "y"),
-         z_iter(pc_msg, "z");
-         x_iter != x_iter.end(); ++x_iter, ++y_iter, ++z_iter) {
-      point_past << *x_iter, *y_iter, *z_iter;
-      point_current = affine_past2current * point_past;
+      pd_ptr_->getCurrentTimestamp() - rclcpp::Time(pc_cache_iter->header.stamp).seconds());
 
-      points.at(point_counter * config_.point_feature_size_) = point_current.x();
-      points.at(point_counter * config_.point_feature_size_ + 1) = point_current.y();
-      points.at(point_counter * config_.point_feature_size_ + 2) = point_current.z();
-      points.at(point_counter * config_.point_feature_size_ + 3) = time_lag;
-      ++point_counter;
+    if (point_counter + sweep_num_points > CAPACITY_POINT) {
+      RCLCPP_WARN_STREAM(
+        rclcpp::get_logger("lidar_centerpoint"),
+        "Requested number of points exceeds the maximum capacity. Current points = "
+          << point_counter);
+      break;
     }
+
+    static_assert(std::is_same<decltype(affine_past2current.matrix()), Eigen::Matrix4f &>::value);
+    static_assert(!Eigen::Matrix4f::IsRowMajor, "matrices should be col-major.");
+    generateSweepPoints_launch(
+      pc_cache_iter->points_d.get(), sweep_num_points, point_step / sizeof(float), time_lag,
+      affine_past2current.matrix().data(), config_.point_feature_size_,
+      points_d + config_.point_feature_size_ * point_counter, stream);
+
+    point_counter += sweep_num_points;
   }
   return point_counter;
 }
diff --git a/perception/lidar_centerpoint/src/node.cpp b/perception/lidar_centerpoint/src/node.cpp
index f9687a6342c9c..8bf62e3e0168b 100644
--- a/perception/lidar_centerpoint/src/node.cpp
+++ b/perception/lidar_centerpoint/src/node.cpp
@@ -14,17 +14,7 @@
 
 #include "lidar_centerpoint/node.hpp"
 
-#include <lidar_centerpoint/centerpoint_config.hpp>
-#include <lidar_centerpoint/preprocess/pointcloud_densification.hpp>
-#include <lidar_centerpoint/ros_utils.hpp>
-#include <lidar_centerpoint/utils.hpp>
-#include <pcl_ros/transforms.hpp>
-
-#ifdef ROS_DISTRO_GALACTIC
-#include <tf2_geometry_msgs/tf2_geometry_msgs.h>
-#else
-#include <tf2_geometry_msgs/tf2_geometry_msgs.hpp>
-#endif
+#include "pcl_ros/transforms.hpp"
 
 #include <Eigen/Dense>
 #include <Eigen/Geometry>
@@ -33,6 +23,17 @@
 #include <string>
 #include <vector>
 
+#ifdef ROS_DISTRO_GALACTIC
+#include "tf2_geometry_msgs/tf2_geometry_msgs.h"
+#else
+#include "tf2_geometry_msgs/tf2_geometry_msgs.hpp"
+#endif
+
+#include "lidar_centerpoint/centerpoint_config.hpp"
+#include "lidar_centerpoint/preprocess/pointcloud_densification.hpp"
+#include "lidar_centerpoint/ros_utils.hpp"
+#include "lidar_centerpoint/utils.hpp"
+
 namespace centerpoint
 {
 LidarCenterPointNode::LidarCenterPointNode(const rclcpp::NodeOptions & node_options)
diff --git a/perception/lidar_centerpoint/test/test_postprocess_kernel.hpp b/perception/lidar_centerpoint/test/test_postprocess_kernel.hpp
index d0c9123da9e77..8f35b98119afe 100644
--- a/perception/lidar_centerpoint/test/test_postprocess_kernel.hpp
+++ b/perception/lidar_centerpoint/test/test_postprocess_kernel.hpp
@@ -31,17 +31,17 @@ class PostprocessKernelTest : public testing::Test
   void SetUp() override;
   void TearDown() override;
 
-  cudaStream_t stream_{nullptr};
+  cudaStream_t stream_{};
 
-  std::unique_ptr<PostProcessCUDA> postprocess_cuda_ptr_;
-  std::unique_ptr<centerpoint::CenterPointConfig> config_ptr_;
+  std::unique_ptr<PostProcessCUDA> postprocess_cuda_ptr_{};
+  std::unique_ptr<centerpoint::CenterPointConfig> config_ptr_{};
 
-  cuda::unique_ptr<float[]> head_out_heatmap_d_;
-  cuda::unique_ptr<float[]> head_out_offset_d_;
-  cuda::unique_ptr<float[]> head_out_z_d_;
-  cuda::unique_ptr<float[]> head_out_dim_d_;
-  cuda::unique_ptr<float[]> head_out_rot_d_;
-  cuda::unique_ptr<float[]> head_out_vel_d_;
+  cuda::unique_ptr<float[]> head_out_heatmap_d_{};
+  cuda::unique_ptr<float[]> head_out_offset_d_{};
+  cuda::unique_ptr<float[]> head_out_z_d_{};
+  cuda::unique_ptr<float[]> head_out_dim_d_{};
+  cuda::unique_ptr<float[]> head_out_rot_d_{};
+  cuda::unique_ptr<float[]> head_out_vel_d_{};
 };
 
 }  // namespace centerpoint
diff --git a/perception/lidar_centerpoint/test/test_preprocess_kernel.hpp b/perception/lidar_centerpoint/test/test_preprocess_kernel.hpp
index 57ff51966a417..1a3e850026886 100644
--- a/perception/lidar_centerpoint/test/test_preprocess_kernel.hpp
+++ b/perception/lidar_centerpoint/test/test_preprocess_kernel.hpp
@@ -28,7 +28,7 @@ class PreprocessKernelTest : public testing::Test
   void SetUp() override;
   void TearDown() override;
 
-  cudaStream_t stream_{nullptr};
+  cudaStream_t stream_{};
 
   std::size_t max_voxel_size_{};
   std::size_t max_point_in_voxel_size_{};
diff --git a/perception/lidar_centerpoint/test/test_voxel_generator.cpp b/perception/lidar_centerpoint/test/test_voxel_generator.cpp
index 5b0b3cd112e6c..6bded105536dc 100644
--- a/perception/lidar_centerpoint/test/test_voxel_generator.cpp
+++ b/perception/lidar_centerpoint/test/test_voxel_generator.cpp
@@ -14,9 +14,9 @@
 
 #include "test_voxel_generator.hpp"
 
-#include <sensor_msgs/point_cloud2_iterator.hpp>
+#include "gtest/gtest.h"
 
-#include <gtest/gtest.h>
+#include "sensor_msgs/point_cloud2_iterator.hpp"
 
 void VoxelGeneratorTest::SetUp()
 {
@@ -95,6 +95,8 @@ void VoxelGeneratorTest::SetUp()
   transform2_ = transform1_;
   transform2_.header.stamp = cloud2_->header.stamp;
   transform2_.transform.translation.x = world_origin_x + delta_pointcloud_x_;
+
+  cudaStreamCreate(&stream_);
 }
 
 void VoxelGeneratorTest::TearDown()
@@ -117,8 +119,17 @@ TEST_F(VoxelGeneratorTest, SingleFrame)
   points.resize(capacity_ * config.point_feature_size_);
   std::fill(points.begin(), points.end(), std::nan(""));
 
-  bool status1 = voxel_generator.enqueuePointCloud(*cloud1_, *tf2_buffer_);
-  std::size_t generated_points_num = voxel_generator.generateSweepPoints(points);
+  auto points_d = cuda::make_unique<float[]>(capacity_ * config.point_feature_size_);
+  cudaMemcpy(
+    points_d.get(), points.data(), capacity_ * config.point_feature_size_ * sizeof(float),
+    cudaMemcpyHostToDevice);
+
+  bool status1 = voxel_generator.enqueuePointCloud(*cloud1_, *tf2_buffer_, stream_);
+  std::size_t generated_points_num = voxel_generator.generateSweepPoints(points_d.get(), stream_);
+
+  cudaMemcpy(
+    points.data(), points_d.get(), capacity_ * config.point_feature_size_ * sizeof(float),
+    cudaMemcpyDeviceToHost);
 
   EXPECT_TRUE(status1);
   EXPECT_EQ(points_per_pointcloud_, generated_points_num);
@@ -155,9 +166,18 @@ TEST_F(VoxelGeneratorTest, TwoFramesNoTf)
   points.resize(capacity_ * config.point_feature_size_);
   std::fill(points.begin(), points.end(), std::nan(""));
 
-  bool status1 = voxel_generator.enqueuePointCloud(*cloud1_, *tf2_buffer_);
-  bool status2 = voxel_generator.enqueuePointCloud(*cloud2_, *tf2_buffer_);
-  std::size_t generated_points_num = voxel_generator.generateSweepPoints(points);
+  auto points_d = cuda::make_unique<float[]>(capacity_ * config.point_feature_size_);
+  cudaMemcpy(
+    points_d.get(), points.data(), capacity_ * config.point_feature_size_ * sizeof(float),
+    cudaMemcpyHostToDevice);
+
+  bool status1 = voxel_generator.enqueuePointCloud(*cloud1_, *tf2_buffer_, stream_);
+  bool status2 = voxel_generator.enqueuePointCloud(*cloud2_, *tf2_buffer_, stream_);
+  std::size_t generated_points_num = voxel_generator.generateSweepPoints(points_d.get(), stream_);
+
+  cudaMemcpy(
+    points.data(), points_d.get(), capacity_ * config.point_feature_size_ * sizeof(float),
+    cudaMemcpyDeviceToHost);
 
   EXPECT_FALSE(status1);
   EXPECT_FALSE(status2);
@@ -180,12 +200,21 @@ TEST_F(VoxelGeneratorTest, TwoFrames)
   points.resize(capacity_ * config.point_feature_size_);
   std::fill(points.begin(), points.end(), std::nan(""));
 
+  auto points_d = cuda::make_unique<float[]>(capacity_ * config.point_feature_size_);
+  cudaMemcpy(
+    points_d.get(), points.data(), capacity_ * config.point_feature_size_ * sizeof(float),
+    cudaMemcpyHostToDevice);
+
   tf2_buffer_->setTransform(transform1_, "authority1");
   tf2_buffer_->setTransform(transform2_, "authority1");
 
-  bool status1 = voxel_generator.enqueuePointCloud(*cloud1_, *tf2_buffer_);
-  bool status2 = voxel_generator.enqueuePointCloud(*cloud2_, *tf2_buffer_);
-  std::size_t generated_points_num = voxel_generator.generateSweepPoints(points);
+  bool status1 = voxel_generator.enqueuePointCloud(*cloud1_, *tf2_buffer_, stream_);
+  bool status2 = voxel_generator.enqueuePointCloud(*cloud2_, *tf2_buffer_, stream_);
+  std::size_t generated_points_num = voxel_generator.generateSweepPoints(points_d.get(), stream_);
+
+  cudaMemcpy(
+    points.data(), points_d.get(), capacity_ * config.point_feature_size_ * sizeof(float),
+    cudaMemcpyDeviceToHost);
 
   EXPECT_TRUE(status1);
   EXPECT_TRUE(status2);
diff --git a/perception/lidar_centerpoint/test/test_voxel_generator.hpp b/perception/lidar_centerpoint/test/test_voxel_generator.hpp
index 8fb7d8dffa44d..48355b8a331ba 100644
--- a/perception/lidar_centerpoint/test/test_voxel_generator.hpp
+++ b/perception/lidar_centerpoint/test/test_voxel_generator.hpp
@@ -33,34 +33,36 @@ class VoxelGeneratorTest : public testing::Test
   void TearDown() override;
 
   // These need to be public so that they can be accessed in the test cases
-  rclcpp::Node::SharedPtr node_;
+  rclcpp::Node::SharedPtr node_{};
 
-  std::unique_ptr<sensor_msgs::msg::PointCloud2> cloud1_, cloud2_;
-  geometry_msgs::msg::TransformStamped transform1_, transform2_;
+  std::unique_ptr<sensor_msgs::msg::PointCloud2> cloud1_{}, cloud2_{};
+  geometry_msgs::msg::TransformStamped transform1_{}, transform2_{};
 
-  std::unique_ptr<centerpoint::DensificationParam> densification_param_ptr_;
-  std::unique_ptr<centerpoint::CenterPointConfig> config_ptr_;
+  std::unique_ptr<centerpoint::DensificationParam> densification_param_ptr_{};
+  std::unique_ptr<centerpoint::CenterPointConfig> config_ptr_{};
 
-  std::unique_ptr<tf2_ros::Buffer> tf2_buffer_;
+  std::unique_ptr<tf2_ros::Buffer> tf2_buffer_{};
 
-  std::string world_frame_;
-  std::string lidar_frame_;
-  std::size_t points_per_pointcloud_;
-  std::size_t capacity_;
-  double delta_pointcloud_x_;
+  std::string world_frame_{};
+  std::string lidar_frame_{};
+  std::size_t points_per_pointcloud_{};
+  std::size_t capacity_{};
+  double delta_pointcloud_x_{};
 
-  std::size_t class_size_;
-  float point_feature_size_;
-  std::size_t max_voxel_size_;
+  std::size_t class_size_{};
+  float point_feature_size_{};
+  std::size_t max_voxel_size_{};
 
-  std::vector<double> point_cloud_range_;
-  std::vector<double> voxel_size_;
-  std::size_t downsample_factor_;
-  std::size_t encoder_in_feature_size_;
-  float score_threshold_;
-  float circle_nms_dist_threshold_;
-  std::vector<double> yaw_norm_thresholds_;
-  bool has_variance_;
+  std::vector<double> point_cloud_range_{};
+  std::vector<double> voxel_size_{};
+  std::size_t downsample_factor_{};
+  std::size_t encoder_in_feature_size_{};
+  float score_threshold_{};
+  float circle_nms_dist_threshold_{};
+  std::vector<double> yaw_norm_thresholds_{};
+  bool has_variance_{};
+
+  cudaStream_t stream_{};
 };
 
 #endif  // TEST_VOXEL_GENERATOR_HPP_

From 233df1720fd7236ef298cbd9840f541b2648786d Mon Sep 17 00:00:00 2001
From: Ryuta Kambe <ryuta.kambe@tier4.jp>
Date: Tue, 25 Jun 2024 22:49:38 +0900
Subject: [PATCH 02/15] fix(lidar_centerpoint): fix constexpr related bugs
 (#7686)

Signed-off-by: Ryuta Kambe <ryuta.kambe@tier4.jp>
---
 .../test/test_postprocess_kernel.cpp             | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/perception/lidar_centerpoint/test/test_postprocess_kernel.cpp b/perception/lidar_centerpoint/test/test_postprocess_kernel.cpp
index fde4fcbee3d6c..e75aff416b8aa 100644
--- a/perception/lidar_centerpoint/test/test_postprocess_kernel.cpp
+++ b/perception/lidar_centerpoint/test/test_postprocess_kernel.cpp
@@ -94,12 +94,12 @@ TEST_F(PostprocessKernelTest, SingleDetectionTest)
   constexpr float detection_x = 70.f;
   constexpr float detection_y = -38.4f;
   constexpr float detection_z = 1.0;
-  constexpr float detection_log_w = std::log(7.0);
-  constexpr float detection_log_l = std::log(1.0);
-  constexpr float detection_log_h = std::log(2.0);
+  const float detection_log_w = std::log(7.0);
+  const float detection_log_l = std::log(1.0);
+  const float detection_log_h = std::log(2.0);
   constexpr float detection_yaw = M_PI_4;
-  constexpr float detection_yaw_sin = std::sin(detection_yaw);
-  constexpr float detection_yaw_cos = std::sin(detection_yaw);
+  const float detection_yaw_sin = std::sin(detection_yaw);
+  const float detection_yaw_cos = std::sin(detection_yaw);
   constexpr float detection_vel_x = 5.0;
   constexpr float detection_vel_y = -5.0;
 
@@ -240,9 +240,9 @@ TEST_F(PostprocessKernelTest, CircleNMSTest)
   constexpr float detection_x = 70.f;
   constexpr float detection_y = -38.4f;
   constexpr float detection_z = 1.0;
-  constexpr float detection_log_w = std::log(7.0);
-  constexpr float detection_log_l = std::log(1.0);
-  constexpr float detection_log_h = std::log(2.0);
+  const float detection_log_w = std::log(7.0);
+  const float detection_log_l = std::log(1.0);
+  const float detection_log_h = std::log(2.0);
   constexpr float detection_yaw1_sin = 0.0;
   constexpr float detection_yaw1_cos = 1.0;
   constexpr float detection_yaw2_sin = 1.0;

From 94e633d6f19669520c4eee75082d11b8a5b64dfd Mon Sep 17 00:00:00 2001
From: Amadeusz Szymko <amadeusz.szymko.2@tier4.jp>
Date: Mon, 1 Jul 2024 15:51:21 +0900
Subject: [PATCH 03/15] fix(lidar_centerpoint): remove python3-open3d depend
 (#7758)

Signed-off-by: Amadeusz Szymko <amadeusz.szymko.2@tier4.jp>
---
 perception/lidar_centerpoint/package.xml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/perception/lidar_centerpoint/package.xml b/perception/lidar_centerpoint/package.xml
index 280e2219fb1f1..4c6fa471c297a 100644
--- a/perception/lidar_centerpoint/package.xml
+++ b/perception/lidar_centerpoint/package.xml
@@ -16,7 +16,6 @@
   <depend>autoware_universe_utils</depend>
   <depend>object_recognition_utils</depend>
   <depend>pcl_ros</depend>
-  <depend>python3-open3d</depend>
   <depend>rclcpp</depend>
   <depend>rclcpp_components</depend>
   <depend>tensorrt_common</depend>

From 2c763b8a86b9588b918391e4a59f908e73b93761 Mon Sep 17 00:00:00 2001
From: Kenzo Lobos Tsunekawa <kenzo.lobos@tier4.jp>
Date: Wed, 17 Jul 2024 17:10:04 +0900
Subject: [PATCH 04/15] fix: fixed compilation errors after cherrypicks

fix: ml detector buffer capacity and parameterization (#7936)

* fix: increased the buffer capacity in ML-based detectors (1M -> 2M due to new models) but parameterized to users can adapt it to their use cases

Signed-off-by: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp>

* chore: added schema parts

Signed-off-by: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp>

* chore: points are countable !

Signed-off-by: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp>

---------

Signed-off-by: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp>
---
 .../config/pointpainting.param.yaml                 |  1 +
 .../pointpainting_fusion/pointpainting_trt.hpp      |  1 -
 .../src/pointpainting_fusion/node.cpp               |  7 ++++---
 .../lidar_centerpoint/config/centerpoint.param.yaml |  1 +
 .../config/centerpoint_tiny.param.yaml              |  1 +
 .../lidar_centerpoint/centerpoint_config.hpp        | 13 ++++++++-----
 .../include/lidar_centerpoint/centerpoint_trt.hpp   |  2 --
 .../lidar_centerpoint/lib/centerpoint_trt.cpp       |  4 ++--
 .../lib/preprocess/voxel_generator.cpp              |  2 +-
 .../schema/centerpoint.schemal.json                 |  6 ++++++
 perception/lidar_centerpoint/src/node.cpp           |  7 ++++---
 .../lidar_transfusion/config/transfusion.param.yaml |  1 +
 .../lidar_transfusion/transfusion_config.hpp        |  9 ++++++---
 .../schema/transfusion.schema.json                  |  6 ++++++
 .../src/lidar_transfusion_node.cpp                  |  8 ++++++--
 15 files changed, 47 insertions(+), 22 deletions(-)

diff --git a/perception/image_projection_based_fusion/config/pointpainting.param.yaml b/perception/image_projection_based_fusion/config/pointpainting.param.yaml
index 30136bc8f47d0..9227e4fa9319a 100644
--- a/perception/image_projection_based_fusion/config/pointpainting.param.yaml
+++ b/perception/image_projection_based_fusion/config/pointpainting.param.yaml
@@ -5,6 +5,7 @@
     head_onnx_path: "$(var model_path)/pts_backbone_neck_head_$(var model_name).onnx"
     head_engine_path: "$(var model_path)/pts_backbone_neck_head_$(var model_name).engine"
     trt_precision: fp16
+    cloud_capacity: 2000000
     post_process_params:
       # post-process params
       circle_nms_dist_threshold: 0.3
diff --git a/perception/image_projection_based_fusion/include/image_projection_based_fusion/pointpainting_fusion/pointpainting_trt.hpp b/perception/image_projection_based_fusion/include/image_projection_based_fusion/pointpainting_fusion/pointpainting_trt.hpp
index d5892de6d7edb..c8318f79bbeed 100644
--- a/perception/image_projection_based_fusion/include/image_projection_based_fusion/pointpainting_fusion/pointpainting_trt.hpp
+++ b/perception/image_projection_based_fusion/include/image_projection_based_fusion/pointpainting_fusion/pointpainting_trt.hpp
@@ -23,7 +23,6 @@
 
 namespace image_projection_based_fusion
 {
-static constexpr size_t CAPACITY_POINT = 2000000;
 class PointPaintingTRT : public centerpoint::CenterPointTRT
 {
 public:
diff --git a/perception/image_projection_based_fusion/src/pointpainting_fusion/node.cpp b/perception/image_projection_based_fusion/src/pointpainting_fusion/node.cpp
index 3fb2aca2171f7..4df660e65a8c1 100644
--- a/perception/image_projection_based_fusion/src/pointpainting_fusion/node.cpp
+++ b/perception/image_projection_based_fusion/src/pointpainting_fusion/node.cpp
@@ -110,6 +110,7 @@ PointPaintingFusionNode::PointPaintingFusionNode(const rclcpp::NodeOptions & opt
     this->declare_parameter<int>("densification_params.num_past_frames");
   // network param
   const std::string trt_precision = this->declare_parameter<std::string>("trt_precision");
+  const std::size_t cloud_capacity = this->declare_parameter<std::int64_t>("cloud_capacity");
   const std::string encoder_onnx_path = this->declare_parameter<std::string>("encoder_onnx_path");
   const std::string encoder_engine_path =
     this->declare_parameter<std::string>("encoder_engine_path");
@@ -188,9 +189,9 @@ PointPaintingFusionNode::PointPaintingFusionNode(const rclcpp::NodeOptions & opt
   centerpoint::DensificationParam densification_param(
     densification_world_frame_id, densification_num_past_frames);
   centerpoint::CenterPointConfig config(
-    class_names_.size(), point_feature_size, max_voxel_size, pointcloud_range, voxel_size,
-    downsample_factor, encoder_in_feature_size, score_threshold, circle_nms_dist_threshold,
-    yaw_norm_thresholds, has_variance_);
+    class_names_.size(), point_feature_size, cloud_capacity, max_voxel_size, pointcloud_range,
+    voxel_size, downsample_factor, encoder_in_feature_size, score_threshold,
+    circle_nms_dist_threshold, yaw_norm_thresholds, has_variance_);
 
   // create detector
   detector_ptr_ = std::make_unique<image_projection_based_fusion::PointPaintingTRT>(
diff --git a/perception/lidar_centerpoint/config/centerpoint.param.yaml b/perception/lidar_centerpoint/config/centerpoint.param.yaml
index 17ed9a447cdb8..53d77e8d1c42c 100644
--- a/perception/lidar_centerpoint/config/centerpoint.param.yaml
+++ b/perception/lidar_centerpoint/config/centerpoint.param.yaml
@@ -6,6 +6,7 @@
     head_onnx_path: "$(var model_path)/pts_backbone_neck_head_$(var model_name).onnx"
     head_engine_path: "$(var model_path)/pts_backbone_neck_head_$(var model_name).engine"
     trt_precision: fp16
+    cloud_capacity: 2000000
     post_process_params:
       # post-process params
       circle_nms_dist_threshold: 0.5
diff --git a/perception/lidar_centerpoint/config/centerpoint_tiny.param.yaml b/perception/lidar_centerpoint/config/centerpoint_tiny.param.yaml
index 17ed9a447cdb8..53d77e8d1c42c 100644
--- a/perception/lidar_centerpoint/config/centerpoint_tiny.param.yaml
+++ b/perception/lidar_centerpoint/config/centerpoint_tiny.param.yaml
@@ -6,6 +6,7 @@
     head_onnx_path: "$(var model_path)/pts_backbone_neck_head_$(var model_name).onnx"
     head_engine_path: "$(var model_path)/pts_backbone_neck_head_$(var model_name).engine"
     trt_precision: fp16
+    cloud_capacity: 2000000
     post_process_params:
       # post-process params
       circle_nms_dist_threshold: 0.5
diff --git a/perception/lidar_centerpoint/include/lidar_centerpoint/centerpoint_config.hpp b/perception/lidar_centerpoint/include/lidar_centerpoint/centerpoint_config.hpp
index 19fdbe7a8b9c2..7b560cf46e2e3 100644
--- a/perception/lidar_centerpoint/include/lidar_centerpoint/centerpoint_config.hpp
+++ b/perception/lidar_centerpoint/include/lidar_centerpoint/centerpoint_config.hpp
@@ -24,14 +24,16 @@ class CenterPointConfig
 {
 public:
   explicit CenterPointConfig(
-    const std::size_t class_size, const float point_feature_size, const std::size_t max_voxel_size,
-    const std::vector<double> & point_cloud_range, const std::vector<double> & voxel_size,
-    const std::size_t downsample_factor, const std::size_t encoder_in_feature_size,
-    const float score_threshold, const float circle_nms_dist_threshold,
-    const std::vector<double> yaw_norm_thresholds, const bool has_variance)
+    const std::size_t class_size, const float point_feature_size, const std::size_t cloud_capacity,
+    const std::size_t max_voxel_size, const std::vector<double> & point_cloud_range,
+    const std::vector<double> & voxel_size, const std::size_t downsample_factor,
+    const std::size_t encoder_in_feature_size, const float score_threshold,
+    const float circle_nms_dist_threshold, const std::vector<double> yaw_norm_thresholds,
+    const bool has_variance)
   {
     class_size_ = class_size;
     point_feature_size_ = point_feature_size;
+    cloud_capacity_ = cloud_capacity;
     max_voxel_size_ = max_voxel_size;
     if (point_cloud_range.size() == 6) {
       range_min_x_ = static_cast<float>(point_cloud_range[0]);
@@ -85,6 +87,7 @@ class CenterPointConfig
   };
 
   // input params
+  std::size_t cloud_capacity_{};
   std::size_t class_size_{3};
   const std::size_t point_dim_size_{3};  // x, y and z
   std::size_t point_feature_size_{4};    // x, y, z and time-lag
diff --git a/perception/lidar_centerpoint/include/lidar_centerpoint/centerpoint_trt.hpp b/perception/lidar_centerpoint/include/lidar_centerpoint/centerpoint_trt.hpp
index df8100fb6b80e..52ae86951c7cf 100644
--- a/perception/lidar_centerpoint/include/lidar_centerpoint/centerpoint_trt.hpp
+++ b/perception/lidar_centerpoint/include/lidar_centerpoint/centerpoint_trt.hpp
@@ -31,8 +31,6 @@
 
 namespace centerpoint
 {
-static constexpr size_t CAPACITY_POINT = 2000000;
-
 class NetworkParam
 {
 public:
diff --git a/perception/lidar_centerpoint/lib/centerpoint_trt.cpp b/perception/lidar_centerpoint/lib/centerpoint_trt.cpp
index 0e3791e061f62..7047bf25711de 100644
--- a/perception/lidar_centerpoint/lib/centerpoint_trt.cpp
+++ b/perception/lidar_centerpoint/lib/centerpoint_trt.cpp
@@ -85,7 +85,7 @@ void CenterPointTRT::initPtr()
   mask_size_ = config_.grid_size_x_ * config_.grid_size_y_;
 
   // host
-  points_.resize(CAPACITY_POINT * config_.point_feature_size_);
+  points_.resize(config_.cloud_capacity_ * config_.point_feature_size_);
 
   // device
   voxels_d_ = cuda::make_unique<float[]>(voxels_size_);
@@ -100,7 +100,7 @@ void CenterPointTRT::initPtr()
   head_out_dim_d_ = cuda::make_unique<float[]>(grid_xy_size * config_.head_out_dim_size_);
   head_out_rot_d_ = cuda::make_unique<float[]>(grid_xy_size * config_.head_out_rot_size_);
   head_out_vel_d_ = cuda::make_unique<float[]>(grid_xy_size * config_.head_out_vel_size_);
-  points_d_ = cuda::make_unique<float[]>(CAPACITY_POINT * config_.point_feature_size_);
+  points_d_ = cuda::make_unique<float[]>(config_.cloud_capacity_ * config_.point_feature_size_);
   voxels_buffer_d_ = cuda::make_unique<float[]>(voxels_buffer_size_);
   mask_d_ = cuda::make_unique<unsigned int[]>(mask_size_);
   num_voxels_d_ = cuda::make_unique<unsigned int[]>(1);
diff --git a/perception/lidar_centerpoint/lib/preprocess/voxel_generator.cpp b/perception/lidar_centerpoint/lib/preprocess/voxel_generator.cpp
index e90474fa07327..8e91c3b0b36cc 100644
--- a/perception/lidar_centerpoint/lib/preprocess/voxel_generator.cpp
+++ b/perception/lidar_centerpoint/lib/preprocess/voxel_generator.cpp
@@ -61,7 +61,7 @@ std::size_t VoxelGenerator::generateSweepPoints(float * points_d, cudaStream_t s
     float time_lag = static_cast<float>(
       pd_ptr_->getCurrentTimestamp() - rclcpp::Time(pc_cache_iter->header.stamp).seconds());
 
-    if (point_counter + sweep_num_points > CAPACITY_POINT) {
+    if (point_counter + sweep_num_points > config_.cloud_capacity_) {
       RCLCPP_WARN_STREAM(
         rclcpp::get_logger("lidar_centerpoint"),
         "Requested number of points exceeds the maximum capacity. Current points = "
diff --git a/perception/lidar_centerpoint/schema/centerpoint.schemal.json b/perception/lidar_centerpoint/schema/centerpoint.schemal.json
index b11c115cc2466..c3268f0e90d9b 100644
--- a/perception/lidar_centerpoint/schema/centerpoint.schemal.json
+++ b/perception/lidar_centerpoint/schema/centerpoint.schemal.json
@@ -12,6 +12,12 @@
           "default": "fp16",
           "enum": ["fp32", "fp16"]
         },
+        "cloud_capacity": {
+          "type": "integer",
+          "description": "Capacity of the point cloud buffer (should be set to at least the maximum theoretical number of points).",
+          "default": 2000000,
+          "minimum": 1
+        },
         "post_process_params": {
           "type": "object",
           "properties": {
diff --git a/perception/lidar_centerpoint/src/node.cpp b/perception/lidar_centerpoint/src/node.cpp
index 8bf62e3e0168b..53554d0a3becf 100644
--- a/perception/lidar_centerpoint/src/node.cpp
+++ b/perception/lidar_centerpoint/src/node.cpp
@@ -50,6 +50,7 @@ LidarCenterPointNode::LidarCenterPointNode(const rclcpp::NodeOptions & node_opti
   const int densification_num_past_frames =
     this->declare_parameter<int>("densification_params.num_past_frames");
   const std::string trt_precision = this->declare_parameter<std::string>("trt_precision");
+  const std::size_t cloud_capacity = this->declare_parameter<std::int64_t>("cloud_capacity");
   const std::string encoder_onnx_path = this->declare_parameter<std::string>("encoder_onnx_path");
   const std::string encoder_engine_path =
     this->declare_parameter<std::string>("encoder_engine_path");
@@ -104,9 +105,9 @@ LidarCenterPointNode::LidarCenterPointNode(const rclcpp::NodeOptions & node_opti
       "The size of voxel_size != 3: use the default parameters.");
   }
   CenterPointConfig config(
-    class_names_.size(), point_feature_size, max_voxel_size, point_cloud_range, voxel_size,
-    downsample_factor, encoder_in_feature_size, score_threshold, circle_nms_dist_threshold,
-    yaw_norm_thresholds, has_variance_);
+    class_names_.size(), point_feature_size, cloud_capacity, max_voxel_size, point_cloud_range,
+    voxel_size, downsample_factor, encoder_in_feature_size, score_threshold,
+    circle_nms_dist_threshold, yaw_norm_thresholds, has_variance_);
   detector_ptr_ =
     std::make_unique<CenterPointTRT>(encoder_param, head_param, densification_param, config);
 
diff --git a/perception/lidar_transfusion/config/transfusion.param.yaml b/perception/lidar_transfusion/config/transfusion.param.yaml
index feabe359caf1f..2f1d56872dad3 100644
--- a/perception/lidar_transfusion/config/transfusion.param.yaml
+++ b/perception/lidar_transfusion/config/transfusion.param.yaml
@@ -3,6 +3,7 @@
     # network
     class_names: ["CAR", "TRUCK", "BUS", "BICYCLE", "PEDESTRIAN"]
     trt_precision: fp16
+    cloud_capacity: 2000000
     voxels_num: [5000, 30000, 60000]  # [min, opt, max]
     point_cloud_range: [-76.8, -76.8, -3.0, 76.8, 76.8, 5.0]  # [x_min, y_min, z_min, x_max, y_max, z_max]
     voxel_size: [0.3, 0.3, 8.0]  # [x, y, z]
diff --git a/perception/lidar_transfusion/include/lidar_transfusion/transfusion_config.hpp b/perception/lidar_transfusion/include/lidar_transfusion/transfusion_config.hpp
index 31976de56a9da..c6a55d1043a92 100644
--- a/perception/lidar_transfusion/include/lidar_transfusion/transfusion_config.hpp
+++ b/perception/lidar_transfusion/include/lidar_transfusion/transfusion_config.hpp
@@ -25,10 +25,13 @@ class TransfusionConfig
 {
 public:
   TransfusionConfig(
-    const std::vector<int64_t> & voxels_num, const std::vector<double> & point_cloud_range,
-    const std::vector<double> & voxel_size, const float circle_nms_dist_threshold,
+    const std::size_t cloud_capacity, const std::vector<int64_t> & voxels_num,
+    const std::vector<double> & point_cloud_range, const std::vector<double> & voxel_size,
+    const std::size_t num_proposals, const float circle_nms_dist_threshold,
     const std::vector<double> & yaw_norm_thresholds, const float score_threshold)
   {
+    cloud_capacity_ = cloud_capacity;
+
     if (voxels_num.size() == 3) {
       max_voxels_ = voxels_num[2];
 
@@ -79,7 +82,7 @@ class TransfusionConfig
   }
 
   ///// INPUT PARAMETERS /////
-  const std::size_t cloud_capacity_{1000000};
+  std::size_t cloud_capacity_{};
   ///// KERNEL PARAMETERS /////
   const std::size_t threads_for_voxel_{256};  // threads number for a block
   const std::size_t points_per_voxel_{20};
diff --git a/perception/lidar_transfusion/schema/transfusion.schema.json b/perception/lidar_transfusion/schema/transfusion.schema.json
index 41d8d887236a8..0bbbdc888a261 100644
--- a/perception/lidar_transfusion/schema/transfusion.schema.json
+++ b/perception/lidar_transfusion/schema/transfusion.schema.json
@@ -21,6 +21,12 @@
           "default": "fp16",
           "enum": ["fp16", "fp32"]
         },
+        "cloud_capacity": {
+          "type": "integer",
+          "description": "Capacity of the point cloud buffer (should be set to at least the maximum theoretical number of points).",
+          "default": 2000000,
+          "minimum": 1
+        },
         "voxels_num": {
           "type": "array",
           "items": {
diff --git a/perception/lidar_transfusion/src/lidar_transfusion_node.cpp b/perception/lidar_transfusion/src/lidar_transfusion_node.cpp
index e3ea6b3780de8..36cd7e2693413 100644
--- a/perception/lidar_transfusion/src/lidar_transfusion_node.cpp
+++ b/perception/lidar_transfusion/src/lidar_transfusion_node.cpp
@@ -27,10 +27,14 @@ LidarTransfusionNode::LidarTransfusionNode(const rclcpp::NodeOptions & options)
   class_names_ = this->declare_parameter<std::vector<std::string>>("class_names", descriptor);
   const std::string trt_precision =
     this->declare_parameter<std::string>("trt_precision", descriptor);
+  const std::size_t cloud_capacity =
+    this->declare_parameter<std::int64_t>("cloud_capacity", descriptor);
   const auto voxels_num = this->declare_parameter<std::vector<int64_t>>("voxels_num", descriptor);
   const auto point_cloud_range =
     this->declare_parameter<std::vector<double>>("point_cloud_range", descriptor);
   const auto voxel_size = this->declare_parameter<std::vector<double>>("voxel_size", descriptor);
+  const std::size_t num_proposals =
+    this->declare_parameter<std::int64_t>("num_proposals", descriptor);
   const std::string onnx_path = this->declare_parameter<std::string>("onnx_path", descriptor);
   const std::string engine_path = this->declare_parameter<std::string>("engine_path", descriptor);
 
@@ -73,8 +77,8 @@ LidarTransfusionNode::LidarTransfusionNode(const rclcpp::NodeOptions & options)
   DensificationParam densification_param(
     densification_world_frame_id, densification_num_past_frames);
   TransfusionConfig config(
-    voxels_num, point_cloud_range, voxel_size, circle_nms_dist_threshold, yaw_norm_thresholds,
-    score_threshold);
+    voxels_num, point_cloud_range, voxel_size, num_proposals, circle_nms_dist_threshold,
+    yaw_norm_thresholds, score_threshold);
 
   const auto allow_remapping_by_area_matrix =
     this->declare_parameter<std::vector<int64_t>>("allow_remapping_by_area_matrix", descriptor);

From 0b6ea664659e787db0999b9642b1ad3f7f5945bc Mon Sep 17 00:00:00 2001
From: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp>
Date: Mon, 16 Dec 2024 11:53:36 +0900
Subject: [PATCH 05/15] fix: fixed cherrycommit induced errors

Signed-off-by: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp>
---
 .../test/test_postprocess_kernel.cpp          |  3 ++-
 .../test/test_voxel_generator.cpp             | 19 ++++++++++---------
 .../test/test_voxel_generator.hpp             |  1 +
 .../lidar_transfusion/transfusion_config.hpp  |  5 ++++-
 .../src/lidar_transfusion_node.cpp            |  4 ++--
 5 files changed, 19 insertions(+), 13 deletions(-)

diff --git a/perception/lidar_centerpoint/test/test_postprocess_kernel.cpp b/perception/lidar_centerpoint/test/test_postprocess_kernel.cpp
index e75aff416b8aa..cf09df79a96d3 100644
--- a/perception/lidar_centerpoint/test/test_postprocess_kernel.cpp
+++ b/perception/lidar_centerpoint/test/test_postprocess_kernel.cpp
@@ -29,6 +29,7 @@ void PostprocessKernelTest::SetUp()
 
   constexpr std::size_t class_size{5};
   constexpr std::size_t point_feature_size{4};
+  const std::size_t cloud_capacity{2000000};
   constexpr std::size_t max_voxel_size{100000000};
   const std::vector<double> point_cloud_range{-76.8, -76.8, -4.0, 76.8, 76.8, 6.0};
   const std::vector<double> voxel_size{0.32, 0.32, 10.0};
@@ -40,7 +41,7 @@ void PostprocessKernelTest::SetUp()
   constexpr bool has_variance{false};
 
   config_ptr_ = std::make_unique<centerpoint::CenterPointConfig>(
-    class_size, point_feature_size, max_voxel_size, point_cloud_range, voxel_size,
+    class_size, point_feature_size, cloud_capacity, max_voxel_size, point_cloud_range, voxel_size,
     downsample_factor, encoder_in_feature_size, score_threshold, circle_nms_dist_threshold,
     yaw_norm_thresholds, has_variance);
 
diff --git a/perception/lidar_centerpoint/test/test_voxel_generator.cpp b/perception/lidar_centerpoint/test/test_voxel_generator.cpp
index 6bded105536dc..ea4565aba5a5c 100644
--- a/perception/lidar_centerpoint/test/test_voxel_generator.cpp
+++ b/perception/lidar_centerpoint/test/test_voxel_generator.cpp
@@ -31,6 +31,7 @@ void VoxelGeneratorTest::SetUp()
 
   class_size_ = 5;
   point_feature_size_ = 4;
+  cloud_capacity_ = 2000000;
   max_voxel_size_ = 100000000;
   point_cloud_range_ = std::vector<double>{-76.8, -76.8, -4.0, 76.8, 76.8, 6.0};
   voxel_size_ = std::vector<double>{0.32, 0.32, 10.0};
@@ -110,9 +111,9 @@ TEST_F(VoxelGeneratorTest, SingleFrame)
   centerpoint::DensificationParam param(world_frame_, num_past_frames);
 
   centerpoint::CenterPointConfig config(
-    class_size_, point_feature_size_, max_voxel_size_, point_cloud_range_, voxel_size_,
-    downsample_factor_, encoder_in_feature_size_, score_threshold_, circle_nms_dist_threshold_,
-    yaw_norm_thresholds_, has_variance_);
+    class_size_, point_feature_size_, cloud_capacity_, max_voxel_size_, point_cloud_range_,
+    voxel_size_, downsample_factor_, encoder_in_feature_size_, score_threshold_,
+    circle_nms_dist_threshold_, yaw_norm_thresholds_, has_variance_);
 
   centerpoint::VoxelGenerator voxel_generator(param, config);
   std::vector<float> points;
@@ -157,9 +158,9 @@ TEST_F(VoxelGeneratorTest, TwoFramesNoTf)
   centerpoint::DensificationParam param(world_frame_, num_past_frames);
 
   centerpoint::CenterPointConfig config(
-    class_size_, point_feature_size_, max_voxel_size_, point_cloud_range_, voxel_size_,
-    downsample_factor_, encoder_in_feature_size_, score_threshold_, circle_nms_dist_threshold_,
-    yaw_norm_thresholds_, has_variance_);
+    class_size_, point_feature_size_, cloud_capacity_, max_voxel_size_, point_cloud_range_,
+    voxel_size_, downsample_factor_, encoder_in_feature_size_, score_threshold_,
+    circle_nms_dist_threshold_, yaw_norm_thresholds_, has_variance_);
 
   centerpoint::VoxelGenerator voxel_generator(param, config);
   std::vector<float> points;
@@ -191,9 +192,9 @@ TEST_F(VoxelGeneratorTest, TwoFrames)
   centerpoint::DensificationParam param(world_frame_, num_past_frames);
 
   centerpoint::CenterPointConfig config(
-    class_size_, point_feature_size_, max_voxel_size_, point_cloud_range_, voxel_size_,
-    downsample_factor_, encoder_in_feature_size_, score_threshold_, circle_nms_dist_threshold_,
-    yaw_norm_thresholds_, has_variance_);
+    class_size_, point_feature_size_, cloud_capacity_, max_voxel_size_, point_cloud_range_,
+    voxel_size_, downsample_factor_, encoder_in_feature_size_, score_threshold_,
+    circle_nms_dist_threshold_, yaw_norm_thresholds_, has_variance_);
 
   centerpoint::VoxelGenerator voxel_generator(param, config);
   std::vector<float> points;
diff --git a/perception/lidar_centerpoint/test/test_voxel_generator.hpp b/perception/lidar_centerpoint/test/test_voxel_generator.hpp
index 48355b8a331ba..5ca97fedc368c 100644
--- a/perception/lidar_centerpoint/test/test_voxel_generator.hpp
+++ b/perception/lidar_centerpoint/test/test_voxel_generator.hpp
@@ -51,6 +51,7 @@ class VoxelGeneratorTest : public testing::Test
 
   std::size_t class_size_{};
   float point_feature_size_{};
+  std::size_t cloud_capacity_{};
   std::size_t max_voxel_size_{};
 
   std::vector<double> point_cloud_range_{};
diff --git a/perception/lidar_transfusion/include/lidar_transfusion/transfusion_config.hpp b/perception/lidar_transfusion/include/lidar_transfusion/transfusion_config.hpp
index c6a55d1043a92..4b1262c545533 100644
--- a/perception/lidar_transfusion/include/lidar_transfusion/transfusion_config.hpp
+++ b/perception/lidar_transfusion/include/lidar_transfusion/transfusion_config.hpp
@@ -64,6 +64,9 @@ class TransfusionConfig
       voxel_y_size_ = static_cast<float>(voxel_size[1]);
       voxel_z_size_ = static_cast<float>(voxel_size[2]);
     }
+    if (num_proposals > 0) {
+      num_proposals_ = num_proposals;
+    }
     if (score_threshold > 0.0) {
       score_threshold_ = score_threshold;
     }
@@ -110,7 +113,7 @@ class TransfusionConfig
   const std::size_t out_size_factor_{4};
   const std::size_t max_num_points_per_pillar_{points_per_voxel_};
   const std::size_t num_point_values_{4};
-  const std::size_t num_proposals_{200};
+  std::size_t num_proposals_{200};
   // the number of feature maps for pillar scatter
   const std::size_t num_feature_scatter_{pillar_feature_size_};
   // the score threshold for classification
diff --git a/perception/lidar_transfusion/src/lidar_transfusion_node.cpp b/perception/lidar_transfusion/src/lidar_transfusion_node.cpp
index 36cd7e2693413..a07e385208748 100644
--- a/perception/lidar_transfusion/src/lidar_transfusion_node.cpp
+++ b/perception/lidar_transfusion/src/lidar_transfusion_node.cpp
@@ -77,8 +77,8 @@ LidarTransfusionNode::LidarTransfusionNode(const rclcpp::NodeOptions & options)
   DensificationParam densification_param(
     densification_world_frame_id, densification_num_past_frames);
   TransfusionConfig config(
-    voxels_num, point_cloud_range, voxel_size, num_proposals, circle_nms_dist_threshold,
-    yaw_norm_thresholds, score_threshold);
+    cloud_capacity, voxels_num, point_cloud_range, voxel_size, num_proposals,
+    circle_nms_dist_threshold, yaw_norm_thresholds, score_threshold);
 
   const auto allow_remapping_by_area_matrix =
     this->declare_parameter<std::vector<int64_t>>("allow_remapping_by_area_matrix", descriptor);

From ef582c727011c09e79d3f054d32ce30ff9d42681 Mon Sep 17 00:00:00 2001
From: kminoda <44218668+kminoda@users.noreply.github.com>
Date: Wed, 31 Jul 2024 12:17:24 +0900
Subject: [PATCH 06/15] docs(centerpoint): add description for ml package
 params (#8187)

---
 perception/lidar_centerpoint/README.md | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/perception/lidar_centerpoint/README.md b/perception/lidar_centerpoint/README.md
index fa42655b6ca1f..d7e81f7add6ea 100644
--- a/perception/lidar_centerpoint/README.md
+++ b/perception/lidar_centerpoint/README.md
@@ -28,6 +28,22 @@ We trained the models using <https://github.com/open-mmlab/mmdetection3d>.
 
 ## Parameters
 
+### ML Model Parameters
+
+Note that these parameters are associated with ONNX file, predefined during the training phase. Be careful to change ONNX file as well when changing this parameter. Also, whenever you update the ONNX file, do NOT forget to check these values.
+
+| Name                                   | Type         | Default Value                                    | Description                                                           |
+| -------------------------------------- | ------------ | ------------------------------------------------ | --------------------------------------------------------------------- |
+| `model_params.class_names`             | list[string] | ["CAR", "TRUCK", "BUS", "BICYCLE", "PEDESTRIAN"] | list of class names for model outputs                                 |
+| `model_params.point_feature_size`      | int          | `4`                                              | number of features per point in the point cloud                       |
+| `model_params.max_voxel_size`          | int          | `40000`                                          | maximum number of voxels                                              |
+| `model_params.point_cloud_range`       | list[double] | [-76.8, -76.8, -4.0, 76.8, 76.8, 6.0]            | detection range [min_x, min_y, min_z, max_x, max_y, max_z] [m]        |
+| `model_params.voxel_size`              | list[double] | [0.32, 0.32, 10.0]                               | size of each voxel [x, y, z] [m]                                      |
+| `model_params.downsample_factor`       | int          | `1`                                              | downsample factor for coordinates                                     |
+| `model_params.encoder_in_feature_size` | int          | `9`                                              | number of input features to the encoder                               |
+| `model_params.has_variance`            | bool         | `false`                                          | true if the model outputs pose variance as well as pose for each bbox |
+| `model_params.has_twist`               | bool         | `false`                                          | true if the model outputs velocity as well as pose for each bbox      |
+
 ### Core Parameters
 
 | Name                                             | Type         | Default Value             | Description                                                   |

From a5097b4529499cf6579a5516006b6cadf800c7fa Mon Sep 17 00:00:00 2001
From: Amadeusz Szymko <amadeusz.szymko.2@tier4.jp>
Date: Thu, 1 Aug 2024 10:04:54 +0900
Subject: [PATCH 07/15] fix(autoware_lidar_centerpoint): place device vector in
 CUDA device system (#8272)

Signed-off-by: amadeuszsz <amadeusz.szymko.2@tier4.jp>
Signed-off-by: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp>
---
 .../postprocess/postprocess_kernel.hpp          |  3 ---
 .../lib/postprocess/postprocess_kernel.cu       | 17 ++++++++---------
 2 files changed, 8 insertions(+), 12 deletions(-)

diff --git a/perception/lidar_centerpoint/include/lidar_centerpoint/postprocess/postprocess_kernel.hpp b/perception/lidar_centerpoint/include/lidar_centerpoint/postprocess/postprocess_kernel.hpp
index e15d3022c947c..58431b48eb917 100644
--- a/perception/lidar_centerpoint/include/lidar_centerpoint/postprocess/postprocess_kernel.hpp
+++ b/perception/lidar_centerpoint/include/lidar_centerpoint/postprocess/postprocess_kernel.hpp
@@ -19,7 +19,6 @@
 #include "cuda_runtime_api.h"
 #include "lidar_centerpoint/centerpoint_config.hpp"
 #include "lidar_centerpoint/utils.hpp"
-#include "thrust/device_vector.h"
 
 #include <vector>
 
@@ -37,8 +36,6 @@ class PostProcessCUDA
 
 private:
   CenterPointConfig config_;
-  thrust::device_vector<Box3D> boxes3d_d_;
-  thrust::device_vector<float> yaw_norm_thresholds_d_;
 };
 
 }  // namespace centerpoint
diff --git a/perception/lidar_centerpoint/lib/postprocess/postprocess_kernel.cu b/perception/lidar_centerpoint/lib/postprocess/postprocess_kernel.cu
index 12835bab038a6..fb8e364b352fc 100644
--- a/perception/lidar_centerpoint/lib/postprocess/postprocess_kernel.cu
+++ b/perception/lidar_centerpoint/lib/postprocess/postprocess_kernel.cu
@@ -137,10 +137,6 @@ __global__ void generateBoxes3D_kernel(
 
 PostProcessCUDA::PostProcessCUDA(const CenterPointConfig & config) : config_(config)
 {
-  const auto num_raw_boxes3d = config.down_grid_size_y_ * config.down_grid_size_x_;
-  boxes3d_d_ = thrust::device_vector<Box3D>(num_raw_boxes3d);
-  yaw_norm_thresholds_d_ = thrust::device_vector<float>(
-    config_.yaw_norm_thresholds_.begin(), config_.yaw_norm_thresholds_.end());
 }
 
 // cspell: ignore divup
@@ -153,23 +149,26 @@ cudaError_t PostProcessCUDA::generateDetectedBoxes3D_launch(
     divup(config_.down_grid_size_y_, THREADS_PER_BLOCK),
     divup(config_.down_grid_size_x_, THREADS_PER_BLOCK));
   dim3 threads(THREADS_PER_BLOCK, THREADS_PER_BLOCK);
+  auto boxes3d_d =
+    thrust::device_vector<Box3D>(config_.down_grid_size_y_ * config_.down_grid_size_x_);
+  auto yaw_norm_thresholds_d = thrust::device_vector<float>(
+    config_.yaw_norm_thresholds_.begin(), config_.yaw_norm_thresholds_.end());
   generateBoxes3D_kernel<<<blocks, threads, 0, stream>>>(
     out_heatmap, out_offset, out_z, out_dim, out_rot, out_vel, config_.voxel_size_x_,
     config_.voxel_size_y_, config_.range_min_x_, config_.range_min_y_, config_.down_grid_size_x_,
     config_.down_grid_size_y_, config_.downsample_factor_, config_.class_size_,
-    config_.has_variance_, thrust::raw_pointer_cast(yaw_norm_thresholds_d_.data()),
-    thrust::raw_pointer_cast(boxes3d_d_.data()));
+    config_.has_variance_, thrust::raw_pointer_cast(yaw_norm_thresholds_d.data()),
+    thrust::raw_pointer_cast(boxes3d_d.data()));
 
   // suppress by score
   const auto num_det_boxes3d = thrust::count_if(
-    thrust::device, boxes3d_d_.begin(), boxes3d_d_.end(),
-    is_score_greater(config_.score_threshold_));
+    thrust::device, boxes3d_d.begin(), boxes3d_d.end(), is_score_greater(config_.score_threshold_));
   if (num_det_boxes3d == 0) {
     return cudaGetLastError();
   }
   thrust::device_vector<Box3D> det_boxes3d_d(num_det_boxes3d);
   thrust::copy_if(
-    thrust::device, boxes3d_d_.begin(), boxes3d_d_.end(), det_boxes3d_d.begin(),
+    thrust::device, boxes3d_d.begin(), boxes3d_d.end(), det_boxes3d_d.begin(),
     is_score_greater(config_.score_threshold_));
 
   // sort by score

From ac1838f2edb1e9fd7eeeb81e76f4623118b07366 Mon Sep 17 00:00:00 2001
From: kobayu858 <129580202+kobayu858@users.noreply.github.com>
Date: Thu, 22 Aug 2024 16:58:57 +0900
Subject: [PATCH 08/15] fix(autoware_lidar_centerpoint): fix unusedFunction
 (#8572)

fix:unusedFunction

Signed-off-by: kobayu858 <yutaro.kobayashi@tier4.jp>
---
 perception/lidar_centerpoint/lib/utils.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/perception/lidar_centerpoint/lib/utils.cpp b/perception/lidar_centerpoint/lib/utils.cpp
index b6e0a54ab6de9..ade0bce8a4ff3 100644
--- a/perception/lidar_centerpoint/lib/utils.cpp
+++ b/perception/lidar_centerpoint/lib/utils.cpp
@@ -19,7 +19,7 @@
 namespace centerpoint
 {
 // cspell: ignore divup
-std::size_t divup(const std::size_t a, const std::size_t b)
+std::size_t divup(const std::size_t a, const std::size_t b)  // cppcheck-suppress unusedFunction
 {
   if (a == 0) {
     throw std::runtime_error("A dividend of divup isn't positive.");

From 3b73b5ba9657d05b26b2d4a8b39b6236104adb2f Mon Sep 17 00:00:00 2001
From: Yoshi Ri <yoshiyoshidetteiu@gmail.com>
Date: Tue, 3 Sep 2024 19:26:49 +0900
Subject: [PATCH 09/15] chore(autoware_lidar_centerpoint): add centerpoint
 sigma parameter (#8731)

add centerpoint sigma parameter

Signed-off-by: yoshiri <yoshiyoshidetteiu@gmail.com>
Signed-off-by: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp>
---
 .../config/centerpoint_sigma.param.yaml       | 21 +++++++++++++++++++
 1 file changed, 21 insertions(+)
 create mode 100644 perception/lidar_centerpoint/config/centerpoint_sigma.param.yaml

diff --git a/perception/lidar_centerpoint/config/centerpoint_sigma.param.yaml b/perception/lidar_centerpoint/config/centerpoint_sigma.param.yaml
new file mode 100644
index 0000000000000..bd5fc5f3567a5
--- /dev/null
+++ b/perception/lidar_centerpoint/config/centerpoint_sigma.param.yaml
@@ -0,0 +1,21 @@
+/**:
+  ros__parameters:
+
+    # weight files
+    encoder_onnx_path: "$(var model_path)/pts_voxel_encoder_$(var model_name).onnx"
+    encoder_engine_path: "$(var model_path)/pts_voxel_encoder_$(var model_name).engine"
+    head_onnx_path: "$(var model_path)/pts_backbone_neck_head_$(var model_name).onnx"
+    head_engine_path: "$(var model_path)/pts_backbone_neck_head_$(var model_name).engine"
+    trt_precision: fp16
+    cloud_capacity: 2000000
+    post_process_params:
+      # post-process params
+      circle_nms_dist_threshold: 0.5
+      iou_nms_target_class_names: ["CAR"]
+      iou_nms_search_distance_2d: 10.0
+      iou_nms_threshold: 0.1
+      yaw_norm_thresholds: [0.3, 0.3, 0.3, 0.3, 0.0]
+      score_threshold: 0.35
+    densification_params:
+      world_frame_id: map
+      num_past_frames: 1

From d2c763f9b7237918dc798abf4b05f5d2c46929db Mon Sep 17 00:00:00 2001
From: Kenzo Lobos Tsunekawa <kenzo.lobos@tier4.jp>
Date: Tue, 10 Sep 2024 17:14:27 +0900
Subject: [PATCH 10/15] feat(autoware_lidar_centerpoint): shuffled points
 before feeding them to the model (#8814)

* feat: shuffling points before feeding them into the model to achieve uniform sampling into the voxels

Signed-off-by: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp>

* Update perception/autoware_lidar_centerpoint/src/node.cpp

Co-authored-by: kminoda <44218668+kminoda@users.noreply.github.com>

* Update perception/autoware_lidar_centerpoint/src/node.cpp

Co-authored-by: kminoda <44218668+kminoda@users.noreply.github.com>

* Update perception/autoware_lidar_centerpoint/lib/centerpoint_trt.cpp

Co-authored-by: kminoda <44218668+kminoda@users.noreply.github.com>

* Update perception/autoware_lidar_centerpoint/include/autoware/lidar_centerpoint/centerpoint_config.hpp

Co-authored-by: kminoda <44218668+kminoda@users.noreply.github.com>

---------

Signed-off-by: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp>
Co-authored-by: kminoda <44218668+kminoda@users.noreply.github.com>
Signed-off-by: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp>
---
 .../lidar_centerpoint/centerpoint_trt.hpp     |  2 +
 .../include/lidar_centerpoint/cuda_utils.hpp  | 12 +++---
 .../preprocess/preprocess_kernel.hpp          |  5 +++
 .../lidar_centerpoint/lib/centerpoint_trt.cpp | 35 +++++++++++++---
 .../lib/preprocess/preprocess_kernel.cu       | 42 +++++++++++++++++++
 5 files changed, 85 insertions(+), 11 deletions(-)

diff --git a/perception/lidar_centerpoint/include/lidar_centerpoint/centerpoint_trt.hpp b/perception/lidar_centerpoint/include/lidar_centerpoint/centerpoint_trt.hpp
index 52ae86951c7cf..cbbf01c1e6405 100644
--- a/perception/lidar_centerpoint/include/lidar_centerpoint/centerpoint_trt.hpp
+++ b/perception/lidar_centerpoint/include/lidar_centerpoint/centerpoint_trt.hpp
@@ -102,9 +102,11 @@ class CenterPointTRT
   cuda::unique_ptr<float[]> head_out_rot_d_{nullptr};
   cuda::unique_ptr<float[]> head_out_vel_d_{nullptr};
   cuda::unique_ptr<float[]> points_d_{nullptr};
+  cuda::unique_ptr<float[]> points_aux_d_{nullptr};
   cuda::unique_ptr<float[]> voxels_buffer_d_{nullptr};
   cuda::unique_ptr<unsigned int[]> mask_d_{nullptr};
   cuda::unique_ptr<unsigned int[]> num_voxels_d_{nullptr};
+  cuda::unique_ptr<unsigned int[]> shuffle_indices_d_{nullptr};
 };
 
 }  // namespace centerpoint
diff --git a/perception/lidar_centerpoint/include/lidar_centerpoint/cuda_utils.hpp b/perception/lidar_centerpoint/include/lidar_centerpoint/cuda_utils.hpp
index efe3bbb9a5482..494d499dc2924 100644
--- a/perception/lidar_centerpoint/include/lidar_centerpoint/cuda_utils.hpp
+++ b/perception/lidar_centerpoint/include/lidar_centerpoint/cuda_utils.hpp
@@ -89,13 +89,13 @@ cuda::unique_ptr<T> make_unique()
   return cuda::unique_ptr<T>{p};
 }
 
-constexpr size_t CUDA_ALIGN = 256;
+constexpr std::size_t CUDA_ALIGN = 256;
 
 template <typename T>
-inline size_t get_size_aligned(size_t num_elem)
+inline std::size_t get_size_aligned(std::size_t num_elem)
 {
-  size_t size = num_elem * sizeof(T);
-  size_t extra_align = 0;
+  std::size_t size = num_elem * sizeof(T);
+  std::size_t extra_align = 0;
   if (size % CUDA_ALIGN != 0) {
     extra_align = CUDA_ALIGN - size % CUDA_ALIGN;
   }
@@ -103,9 +103,9 @@ inline size_t get_size_aligned(size_t num_elem)
 }
 
 template <typename T>
-inline T * get_next_ptr(size_t num_elem, void *& workspace, size_t & workspace_size)
+inline T * get_next_ptr(std::size_t num_elem, void *& workspace, std::size_t & workspace_size)
 {
-  size_t size = get_size_aligned<T>(num_elem);
+  std::size_t size = get_size_aligned<T>(num_elem);
   if (size > workspace_size) {
     throw ::std::runtime_error("Workspace is too small!");
   }
diff --git a/perception/lidar_centerpoint/include/lidar_centerpoint/preprocess/preprocess_kernel.hpp b/perception/lidar_centerpoint/include/lidar_centerpoint/preprocess/preprocess_kernel.hpp
index 3abcd89cb5c55..85ed58d6c840c 100644
--- a/perception/lidar_centerpoint/include/lidar_centerpoint/preprocess/preprocess_kernel.hpp
+++ b/perception/lidar_centerpoint/include/lidar_centerpoint/preprocess/preprocess_kernel.hpp
@@ -24,6 +24,11 @@ cudaError_t generateSweepPoints_launch(
   const float * input_points, size_t points_size, int input_point_step, float time_lag,
   const float * transform, int num_features, float * output_points, cudaStream_t stream);
 
+cudaError_t shufflePoints_launch(
+  const float * points, const unsigned int * indices, float * shuffled_points,
+  const std::size_t points_size, const std::size_t max_size, const std::size_t offset,
+  cudaStream_t stream);
+
 cudaError_t generateVoxels_random_launch(
   const float * points, size_t points_size, float min_x_range, float max_x_range, float min_y_range,
   float max_y_range, float min_z_range, float max_z_range, float pillar_x_size, float pillar_y_size,
diff --git a/perception/lidar_centerpoint/lib/centerpoint_trt.cpp b/perception/lidar_centerpoint/lib/centerpoint_trt.cpp
index 7047bf25711de..a80d00a1f004f 100644
--- a/perception/lidar_centerpoint/lib/centerpoint_trt.cpp
+++ b/perception/lidar_centerpoint/lib/centerpoint_trt.cpp
@@ -20,8 +20,12 @@
 
 #include <autoware/universe_utils/math/constants.hpp>
 
+#include <algorithm>
+#include <cstdlib>
+#include <ctime>
 #include <iostream>
 #include <memory>
+#include <random>
 #include <string>
 #include <vector>
 
@@ -104,6 +108,21 @@ void CenterPointTRT::initPtr()
   voxels_buffer_d_ = cuda::make_unique<float[]>(voxels_buffer_size_);
   mask_d_ = cuda::make_unique<unsigned int[]>(mask_size_);
   num_voxels_d_ = cuda::make_unique<unsigned int[]>(1);
+
+  points_aux_d_ = cuda::make_unique<float[]>(config_.cloud_capacity_ * config_.point_feature_size_);
+  shuffle_indices_d_ = cuda::make_unique<unsigned int[]>(config_.cloud_capacity_);
+
+  std::vector<unsigned int> indexes(config_.cloud_capacity_);
+  std::iota(indexes.begin(), indexes.end(), 0);
+
+  std::default_random_engine e(0);
+  std::shuffle(indexes.begin(), indexes.end(), e);
+
+  std::srand(std::time(nullptr));
+
+  CHECK_CUDA_ERROR(cudaMemcpyAsync(
+    shuffle_indices_d_.get(), indexes.data(), config_.cloud_capacity_ * sizeof(unsigned int),
+    cudaMemcpyHostToDevice, stream_));
 }
 
 bool CenterPointTRT::detect(
@@ -148,7 +167,13 @@ bool CenterPointTRT::preprocess(
   if (!is_success) {
     return false;
   }
-  const auto count = vg_ptr_->generateSweepPoints(points_d_.get(), stream_);
+
+  const std::size_t count = vg_ptr_->generateSweepPoints(points_aux_d_.get(), stream_);
+  const std::size_t random_offset = std::rand() % config_.cloud_capacity_;
+  CHECK_CUDA_ERROR(shufflePoints_launch(
+    points_aux_d_.get(), shuffle_indices_d_.get(), points_d_.get(), count, config_.cloud_capacity_,
+    random_offset, stream_));
+
   CHECK_CUDA_ERROR(cudaMemsetAsync(num_voxels_d_.get(), 0, sizeof(unsigned int), stream_));
   CHECK_CUDA_ERROR(
     cudaMemsetAsync(voxels_buffer_d_.get(), 0, voxels_buffer_size_ * sizeof(float), stream_));
@@ -160,10 +185,10 @@ bool CenterPointTRT::preprocess(
     num_points_per_voxel_d_.get(), 0, config_.max_voxel_size_ * sizeof(float), stream_));
 
   CHECK_CUDA_ERROR(generateVoxels_random_launch(
-    points_d_.get(), count, config_.range_min_x_, config_.range_max_x_, config_.range_min_y_,
-    config_.range_max_y_, config_.range_min_z_, config_.range_max_z_, config_.voxel_size_x_,
-    config_.voxel_size_y_, config_.voxel_size_z_, config_.grid_size_y_, config_.grid_size_x_,
-    mask_d_.get(), voxels_buffer_d_.get(), stream_));
+    points_d_.get(), config_.cloud_capacity_, config_.range_min_x_, config_.range_max_x_,
+    config_.range_min_y_, config_.range_max_y_, config_.range_min_z_, config_.range_max_z_,
+    config_.voxel_size_x_, config_.voxel_size_y_, config_.voxel_size_z_, config_.grid_size_y_,
+    config_.grid_size_x_, mask_d_.get(), voxels_buffer_d_.get(), stream_));
 
   CHECK_CUDA_ERROR(generateBaseFeatures_launch(
     mask_d_.get(), voxels_buffer_d_.get(), config_.grid_size_y_, config_.grid_size_x_,
diff --git a/perception/lidar_centerpoint/lib/preprocess/preprocess_kernel.cu b/perception/lidar_centerpoint/lib/preprocess/preprocess_kernel.cu
index 4e78176739c26..17f224055e28e 100644
--- a/perception/lidar_centerpoint/lib/preprocess/preprocess_kernel.cu
+++ b/perception/lidar_centerpoint/lib/preprocess/preprocess_kernel.cu
@@ -33,6 +33,7 @@
 #include "lidar_centerpoint/utils.hpp"
 
 #include <cassert>
+#include <cmath>
 
 namespace
 {
@@ -88,6 +89,47 @@ cudaError_t generateSweepPoints_launch(
   return err;
 }
 
+__global__ void shufflePoints_kernel(
+  const float * points, const unsigned int * indices, float * shuffled_points,
+  const std::size_t points_size, const std::size_t max_size, const std::size_t offset)
+{
+  int point_idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (point_idx >= max_size) return;
+
+  int src_idx = indices[(point_idx + offset) % max_size];
+  int dst_idx = point_idx;
+
+  if (dst_idx >= points_size) {
+    shuffled_points[4 * dst_idx + 0] = INFINITY;
+    shuffled_points[4 * dst_idx + 1] = INFINITY;
+    shuffled_points[4 * dst_idx + 2] = INFINITY;
+    shuffled_points[4 * dst_idx + 3] = INFINITY;
+  } else {
+    shuffled_points[4 * dst_idx + 0] = points[4 * src_idx + 0];
+    shuffled_points[4 * dst_idx + 1] = points[4 * src_idx + 1];
+    shuffled_points[4 * dst_idx + 2] = points[4 * src_idx + 2];
+    shuffled_points[4 * dst_idx + 3] = points[4 * src_idx + 3];
+  }
+}
+
+cudaError_t shufflePoints_launch(
+  const float * points, const unsigned int * indices, float * shuffled_points,
+  const std::size_t points_size, const std::size_t max_size, const std::size_t offset,
+  cudaStream_t stream)
+{
+  dim3 blocks((max_size + 256 - 1) / 256);
+  dim3 threads(256);
+
+  if (blocks.x == 0) {
+    return cudaGetLastError();
+  }
+
+  shufflePoints_kernel<<<blocks, threads, 0, stream>>>(
+    points, indices, shuffled_points, points_size, max_size, offset);
+  cudaError_t err = cudaGetLastError();
+  return err;
+}
+
 __global__ void generateVoxels_random_kernel(
   const float * points, size_t points_size, float min_x_range, float max_x_range, float min_y_range,
   float max_y_range, float min_z_range, float max_z_range, float pillar_x_size, float pillar_y_size,

From a2bf67608266b4ded779c7eaecfb3df11c7a7438 Mon Sep 17 00:00:00 2001
From: kminoda <44218668+kminoda@users.noreply.github.com>
Date: Tue, 10 Sep 2024 14:54:43 +0900
Subject: [PATCH 11/15] refactor(autoware_lidar_centerpoint): use std::size_t
 instead of size_t (#8820)

* refactor(autoware_lidar_centerpoint): use std::size_t instead of size_t

Signed-off-by: kminoda <koji.minoda@tier4.jp>

* style(pre-commit): autofix

---------

Signed-off-by: kminoda <koji.minoda@tier4.jp>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
---
 .../network/tensorrt_wrapper.hpp              |  2 +-
 .../preprocess/pointcloud_densification.hpp   |  4 ++--
 .../preprocess/preprocess_kernel.hpp          | 10 +++++-----
 .../lib/network/tensorrt_wrapper.cpp          |  2 +-
 .../lib/preprocess/preprocess_kernel.cu       | 19 ++++++++++---------
 .../lib/preprocess/voxel_generator.cpp        |  2 +-
 .../include/lidar_transfusion/cuda_utils.hpp  | 12 ++++++------
 7 files changed, 26 insertions(+), 25 deletions(-)

diff --git a/perception/lidar_centerpoint/include/lidar_centerpoint/network/tensorrt_wrapper.hpp b/perception/lidar_centerpoint/include/lidar_centerpoint/network/tensorrt_wrapper.hpp
index 4af5aac7ff7fe..7415dc192d7ab 100644
--- a/perception/lidar_centerpoint/include/lidar_centerpoint/network/tensorrt_wrapper.hpp
+++ b/perception/lidar_centerpoint/include/lidar_centerpoint/network/tensorrt_wrapper.hpp
@@ -49,7 +49,7 @@ class TensorRTWrapper
 private:
   bool parseONNX(
     const std::string & onnx_path, const std::string & engine_path, const std::string & precision,
-    size_t workspace_size = (1ULL << 30));
+    std::size_t workspace_size = (1ULL << 30));
 
   bool saveEngine(const std::string & engine_path);
 
diff --git a/perception/lidar_centerpoint/include/lidar_centerpoint/preprocess/pointcloud_densification.hpp b/perception/lidar_centerpoint/include/lidar_centerpoint/preprocess/pointcloud_densification.hpp
index c1d6a6b060955..5b7b3beb343cc 100644
--- a/perception/lidar_centerpoint/include/lidar_centerpoint/preprocess/pointcloud_densification.hpp
+++ b/perception/lidar_centerpoint/include/lidar_centerpoint/preprocess/pointcloud_densification.hpp
@@ -52,8 +52,8 @@ struct PointCloudWithTransform
 {
   cuda::unique_ptr<float[]> points_d{nullptr};
   std_msgs::msg::Header header;
-  size_t num_points{0};
-  size_t point_step{0};
+  std::size_t num_points{0};
+  std::size_t point_step{0};
   Eigen::Affine3f affine_past2world;
 };
 
diff --git a/perception/lidar_centerpoint/include/lidar_centerpoint/preprocess/preprocess_kernel.hpp b/perception/lidar_centerpoint/include/lidar_centerpoint/preprocess/preprocess_kernel.hpp
index 85ed58d6c840c..62495d655723e 100644
--- a/perception/lidar_centerpoint/include/lidar_centerpoint/preprocess/preprocess_kernel.hpp
+++ b/perception/lidar_centerpoint/include/lidar_centerpoint/preprocess/preprocess_kernel.hpp
@@ -21,7 +21,7 @@
 namespace centerpoint
 {
 cudaError_t generateSweepPoints_launch(
-  const float * input_points, size_t points_size, int input_point_step, float time_lag,
+  const float * input_points, std::size_t points_size, int input_point_step, float time_lag,
   const float * transform, int num_features, float * output_points, cudaStream_t stream);
 
 cudaError_t shufflePoints_launch(
@@ -30,10 +30,10 @@ cudaError_t shufflePoints_launch(
   cudaStream_t stream);
 
 cudaError_t generateVoxels_random_launch(
-  const float * points, size_t points_size, float min_x_range, float max_x_range, float min_y_range,
-  float max_y_range, float min_z_range, float max_z_range, float pillar_x_size, float pillar_y_size,
-  float pillar_z_size, int grid_y_size, int grid_x_size, unsigned int * mask, float * voxels,
-  cudaStream_t stream);
+  const float * points, std::size_t points_size, float min_x_range, float max_x_range,
+  float min_y_range, float max_y_range, float min_z_range, float max_z_range, float pillar_x_size,
+  float pillar_y_size, float pillar_z_size, int grid_y_size, int grid_x_size, unsigned int * mask,
+  float * voxels, cudaStream_t stream);
 
 cudaError_t generateBaseFeatures_launch(
   unsigned int * mask, float * voxels, int grid_y_size, int grid_x_size, int max_voxel_size,
diff --git a/perception/lidar_centerpoint/lib/network/tensorrt_wrapper.cpp b/perception/lidar_centerpoint/lib/network/tensorrt_wrapper.cpp
index bdfde42a12a02..3fc522eea15c9 100644
--- a/perception/lidar_centerpoint/lib/network/tensorrt_wrapper.cpp
+++ b/perception/lidar_centerpoint/lib/network/tensorrt_wrapper.cpp
@@ -80,7 +80,7 @@ bool TensorRTWrapper::createContext()
 
 bool TensorRTWrapper::parseONNX(
   const std::string & onnx_path, const std::string & engine_path, const std::string & precision,
-  const size_t workspace_size)
+  const std::size_t workspace_size)
 {
   auto builder =
     tensorrt_common::TrtUniquePtr<nvinfer1::IBuilder>(nvinfer1::createInferBuilder(logger_));
diff --git a/perception/lidar_centerpoint/lib/preprocess/preprocess_kernel.cu b/perception/lidar_centerpoint/lib/preprocess/preprocess_kernel.cu
index 17f224055e28e..01f62f22e892b 100644
--- a/perception/lidar_centerpoint/lib/preprocess/preprocess_kernel.cu
+++ b/perception/lidar_centerpoint/lib/preprocess/preprocess_kernel.cu
@@ -46,7 +46,7 @@ namespace centerpoint
 {
 
 __global__ void generateSweepPoints_kernel(
-  const float * input_points, size_t points_size, int input_point_step, float time_lag,
+  const float * input_points, std::size_t points_size, int input_point_step, float time_lag,
   const float * transform_array, int num_features, float * output_points)
 {
   int point_idx = blockIdx.x * blockDim.x + threadIdx.x;
@@ -70,7 +70,7 @@ __global__ void generateSweepPoints_kernel(
 }
 
 cudaError_t generateSweepPoints_launch(
-  const float * input_points, size_t points_size, int input_point_step, float time_lag,
+  const float * input_points, std::size_t points_size, int input_point_step, float time_lag,
   const float * transform_array, int num_features, float * output_points, cudaStream_t stream)
 {
   auto transform_d = cuda::make_unique<float[]>(16);
@@ -131,9 +131,10 @@ cudaError_t shufflePoints_launch(
 }
 
 __global__ void generateVoxels_random_kernel(
-  const float * points, size_t points_size, float min_x_range, float max_x_range, float min_y_range,
-  float max_y_range, float min_z_range, float max_z_range, float pillar_x_size, float pillar_y_size,
-  float pillar_z_size, int grid_y_size, int grid_x_size, unsigned int * mask, float * voxels)
+  const float * points, std::size_t points_size, float min_x_range, float max_x_range,
+  float min_y_range, float max_y_range, float min_z_range, float max_z_range, float pillar_x_size,
+  float pillar_y_size, float pillar_z_size, int grid_y_size, int grid_x_size, unsigned int * mask,
+  float * voxels)
 {
   int point_idx = blockIdx.x * blockDim.x + threadIdx.x;
   if (point_idx >= points_size) return;
@@ -160,10 +161,10 @@ __global__ void generateVoxels_random_kernel(
 }
 
 cudaError_t generateVoxels_random_launch(
-  const float * points, size_t points_size, float min_x_range, float max_x_range, float min_y_range,
-  float max_y_range, float min_z_range, float max_z_range, float pillar_x_size, float pillar_y_size,
-  float pillar_z_size, int grid_y_size, int grid_x_size, unsigned int * mask, float * voxels,
-  cudaStream_t stream)
+  const float * points, std::size_t points_size, float min_x_range, float max_x_range,
+  float min_y_range, float max_y_range, float min_z_range, float max_z_range, float pillar_x_size,
+  float pillar_y_size, float pillar_z_size, int grid_y_size, int grid_x_size, unsigned int * mask,
+  float * voxels, cudaStream_t stream)
 {
   dim3 blocks((points_size + 256 - 1) / 256);
   dim3 threads(256);
diff --git a/perception/lidar_centerpoint/lib/preprocess/voxel_generator.cpp b/perception/lidar_centerpoint/lib/preprocess/voxel_generator.cpp
index 8e91c3b0b36cc..9148ed993c9c3 100644
--- a/perception/lidar_centerpoint/lib/preprocess/voxel_generator.cpp
+++ b/perception/lidar_centerpoint/lib/preprocess/voxel_generator.cpp
@@ -51,7 +51,7 @@ bool VoxelGeneratorTemplate::enqueuePointCloud(
 
 std::size_t VoxelGenerator::generateSweepPoints(float * points_d, cudaStream_t stream)
 {
-  size_t point_counter = 0;
+  std::size_t point_counter = 0;
   for (auto pc_cache_iter = pd_ptr_->getPointCloudCacheIter(); !pd_ptr_->isCacheEnd(pc_cache_iter);
        pc_cache_iter++) {
     auto sweep_num_points = pc_cache_iter->num_points;
diff --git a/perception/lidar_transfusion/include/lidar_transfusion/cuda_utils.hpp b/perception/lidar_transfusion/include/lidar_transfusion/cuda_utils.hpp
index 5c25a936d5392..0f7c72d065193 100644
--- a/perception/lidar_transfusion/include/lidar_transfusion/cuda_utils.hpp
+++ b/perception/lidar_transfusion/include/lidar_transfusion/cuda_utils.hpp
@@ -89,13 +89,13 @@ cuda::unique_ptr<T> make_unique()
   return cuda::unique_ptr<T>{p};
 }
 
-constexpr size_t CUDA_ALIGN = 256;
+constexpr std::size_t CUDA_ALIGN = 256;
 
 template <typename T>
-inline size_t get_size_aligned(size_t num_elem)
+inline std::size_t get_size_aligned(size_t num_elem)
 {
-  size_t size = num_elem * sizeof(T);
-  size_t extra_align = 0;
+  std::size_t size = num_elem * sizeof(T);
+  std::size_t extra_align = 0;
   if (size % CUDA_ALIGN != 0) {
     extra_align = CUDA_ALIGN - size % CUDA_ALIGN;
   }
@@ -103,9 +103,9 @@ inline size_t get_size_aligned(size_t num_elem)
 }
 
 template <typename T>
-inline T * get_next_ptr(size_t num_elem, void *& workspace, size_t & workspace_size)
+inline T * get_next_ptr(size_t num_elem, void *& workspace, std::size_t & workspace_size)
 {
-  size_t size = get_size_aligned<T>(num_elem);
+  std::size_t size = get_size_aligned<T>(num_elem);
   if (size > workspace_size) {
     throw ::std::runtime_error("Workspace is too small!");
   }

From ead817ed7b12f14aec85f9419c765df212b0ccd8 Mon Sep 17 00:00:00 2001
From: Taekjin LEE <taekjin.lee@tier4.jp>
Date: Mon, 30 Sep 2024 10:59:53 +0900
Subject: [PATCH 12/15] fix(autoware_lidar_centerpoint): convert object's
 velocity to follow its definition (#8980)

* fix: convert object's velocity to follow its definition in box3DToDetectedObject function

Signed-off-by: Taekjin LEE <taekjin.lee@tier4.jp>

* Update perception/autoware_lidar_centerpoint/lib/ros_utils.cpp

Co-authored-by: Kenzo Lobos Tsunekawa <kenzo.lobos@tier4.jp>

---------

Signed-off-by: Taekjin LEE <taekjin.lee@tier4.jp>
Co-authored-by: Kenzo Lobos Tsunekawa <kenzo.lobos@tier4.jp>
---
 perception/lidar_centerpoint/lib/ros_utils.cpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/perception/lidar_centerpoint/lib/ros_utils.cpp b/perception/lidar_centerpoint/lib/ros_utils.cpp
index 039d340cc2dc4..3e05e8215e8e3 100644
--- a/perception/lidar_centerpoint/lib/ros_utils.cpp
+++ b/perception/lidar_centerpoint/lib/ros_utils.cpp
@@ -68,8 +68,10 @@ void box3DToDetectedObject(
     float vel_x = box3d.vel_x;
     float vel_y = box3d.vel_y;
     geometry_msgs::msg::Twist twist;
-    twist.linear.x = std::sqrt(std::pow(vel_x, 2) + std::pow(vel_y, 2));
-    twist.angular.z = 2 * (std::atan2(vel_y, vel_x) - yaw);
+    twist.linear.x = std::cos(yaw) * vel_x + std::sin(yaw) * vel_y;
+    twist.linear.y = -std::sin(yaw) * vel_x + std::cos(yaw) * vel_y;
+    twist.angular.z = 0;  // angular velocity is not supported
+
     obj.kinematics.twist_with_covariance.twist = twist;
     obj.kinematics.has_twist = has_twist;
     if (has_variance) {

From 22c33529d1355d41506701107204dfd47d272841 Mon Sep 17 00:00:00 2001
From: Taekjin LEE <taekjin.lee@tier4.jp>
Date: Tue, 1 Oct 2024 18:58:12 +0900
Subject: [PATCH 13/15] fix(autoware_lidar_centerpoint): fix twist covariance
 orientation (#8996)

* fix(autoware_lidar_centerpoint): fix covariance converter considering the twist covariance matrix is based on the object coordinate

Signed-off-by: Taekjin LEE <taekjin.lee@tier4.jp>

fix style

* fix: update test of box3DToDetectedObject function

Signed-off-by: Taekjin LEE <taekjin.lee@tier4.jp>

---------

Signed-off-by: Taekjin LEE <taekjin.lee@tier4.jp>
Signed-off-by: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp>
---
 .../include/lidar_centerpoint/ros_utils.hpp   |  2 +-
 .../lidar_centerpoint/lib/ros_utils.cpp       | 21 ++++++++++++++-----
 .../lidar_centerpoint/test/test_ros_utils.cpp |  7 ++++---
 3 files changed, 21 insertions(+), 9 deletions(-)

diff --git a/perception/lidar_centerpoint/include/lidar_centerpoint/ros_utils.hpp b/perception/lidar_centerpoint/include/lidar_centerpoint/ros_utils.hpp
index 484fbcfd36657..102359e1a43f4 100644
--- a/perception/lidar_centerpoint/include/lidar_centerpoint/ros_utils.hpp
+++ b/perception/lidar_centerpoint/include/lidar_centerpoint/ros_utils.hpp
@@ -35,7 +35,7 @@ uint8_t getSemanticType(const std::string & class_name);
 
 std::array<double, 36> convertPoseCovarianceMatrix(const Box3D & box3d);
 
-std::array<double, 36> convertTwistCovarianceMatrix(const Box3D & box3d);
+std::array<double, 36> convertTwistCovarianceMatrix(const Box3D & box3d, const float yaw);
 
 bool isCarLikeVehicleLabel(const uint8_t label);
 
diff --git a/perception/lidar_centerpoint/lib/ros_utils.cpp b/perception/lidar_centerpoint/lib/ros_utils.cpp
index 3e05e8215e8e3..f9dfbee73a88c 100644
--- a/perception/lidar_centerpoint/lib/ros_utils.cpp
+++ b/perception/lidar_centerpoint/lib/ros_utils.cpp
@@ -50,7 +50,7 @@ void box3DToDetectedObject(
 
   // pose and shape
   // mmdet3d yaw format to ros yaw format
-  float yaw = -box3d.yaw - autoware::universe_utils::pi / 2;
+  const float yaw = -box3d.yaw - autoware::universe_utils::pi / 2;
   obj.kinematics.pose_with_covariance.pose.position =
     autoware::universe_utils::createPoint(box3d.x, box3d.y, box3d.z);
   obj.kinematics.pose_with_covariance.pose.orientation =
@@ -67,6 +67,8 @@ void box3DToDetectedObject(
   if (has_twist) {
     float vel_x = box3d.vel_x;
     float vel_y = box3d.vel_y;
+
+    // twist of the object is based on the object coordinate system
     geometry_msgs::msg::Twist twist;
     twist.linear.x = std::cos(yaw) * vel_x + std::sin(yaw) * vel_y;
     twist.linear.y = -std::sin(yaw) * vel_x + std::cos(yaw) * vel_y;
@@ -76,7 +78,7 @@ void box3DToDetectedObject(
     obj.kinematics.has_twist = has_twist;
     if (has_variance) {
       obj.kinematics.has_twist_covariance = has_variance;
-      obj.kinematics.twist_with_covariance.covariance = convertTwistCovarianceMatrix(box3d);
+      obj.kinematics.twist_with_covariance.covariance = convertTwistCovarianceMatrix(box3d, yaw);
     }
   }
 }
@@ -113,12 +115,21 @@ std::array<double, 36> convertPoseCovarianceMatrix(const Box3D & box3d)
   return pose_covariance;
 }
 
-std::array<double, 36> convertTwistCovarianceMatrix(const Box3D & box3d)
+std::array<double, 36> convertTwistCovarianceMatrix(const Box3D & box3d, const float yaw)
 {
   using POSE_IDX = autoware::universe_utils::xyzrpy_covariance_index::XYZRPY_COV_IDX;
+
+  // twist covariance matrix is based on the object coordinate system
   std::array<double, 36> twist_covariance{};
-  twist_covariance[POSE_IDX::X_X] = box3d.vel_x_variance;
-  twist_covariance[POSE_IDX::Y_Y] = box3d.vel_y_variance;
+  const float cos_yaw = std::cos(yaw);
+  const float sin_yaw = std::sin(yaw);
+  twist_covariance[POSE_IDX::X_X] =
+    box3d.vel_x_variance * cos_yaw * cos_yaw + box3d.vel_y_variance * sin_yaw * sin_yaw;
+  twist_covariance[POSE_IDX::X_Y] =
+    (box3d.vel_y_variance - box3d.vel_x_variance) * sin_yaw * cos_yaw;
+  twist_covariance[POSE_IDX::Y_X] = twist_covariance[POSE_IDX::X_Y];
+  twist_covariance[POSE_IDX::Y_Y] =
+    box3d.vel_x_variance * sin_yaw * sin_yaw + box3d.vel_y_variance * cos_yaw * cos_yaw;
   return twist_covariance;
 }
 
diff --git a/perception/lidar_centerpoint/test/test_ros_utils.cpp b/perception/lidar_centerpoint/test/test_ros_utils.cpp
index 6044a3b6e77c9..2c900da4e2e0f 100644
--- a/perception/lidar_centerpoint/test/test_ros_utils.cpp
+++ b/perception/lidar_centerpoint/test/test_ros_utils.cpp
@@ -122,12 +122,13 @@ TEST(TestSuite, convertPoseCovarianceMatrix)
 TEST(TestSuite, convertTwistCovarianceMatrix)
 {
   centerpoint::Box3D box3d;
-  box3d.vel_x_variance = 0.1;
+  box3d.vel_x_variance = 0.5;
   box3d.vel_y_variance = 0.2;
+  float yaw = 0;
 
-  std::array<double, 36> twist_covariance = centerpoint::convertTwistCovarianceMatrix(box3d);
+  std::array<double, 36> twist_covariance = centerpoint::convertTwistCovarianceMatrix(box3d, yaw);
 
-  EXPECT_FLOAT_EQ(twist_covariance[0], 0.1);
+  EXPECT_FLOAT_EQ(twist_covariance[0], 0.5);
   EXPECT_FLOAT_EQ(twist_covariance[7], 0.2);
 }
 

From ced678992c7c76fab39f31b6676135546642b9b8 Mon Sep 17 00:00:00 2001
From: kobayu858 <129580202+kobayu858@users.noreply.github.com>
Date: Tue, 26 Nov 2024 20:54:54 +0900
Subject: [PATCH 14/15] fix(autoware_lidar_centerpoint): fix
 clang-diagnostic-unused-private-field (#9471)

Signed-off-by: kobayu858 <yutaro.kobayashi@tier4.jp>
---
 perception/lidar_centerpoint/include/lidar_centerpoint/node.hpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/perception/lidar_centerpoint/include/lidar_centerpoint/node.hpp b/perception/lidar_centerpoint/include/lidar_centerpoint/node.hpp
index 154ff97cdb887..ddc1a072b76d0 100644
--- a/perception/lidar_centerpoint/include/lidar_centerpoint/node.hpp
+++ b/perception/lidar_centerpoint/include/lidar_centerpoint/node.hpp
@@ -51,7 +51,6 @@ class LidarCenterPointNode : public rclcpp::Node
   rclcpp::Subscription<sensor_msgs::msg::PointCloud2>::SharedPtr pointcloud_sub_;
   rclcpp::Publisher<autoware_perception_msgs::msg::DetectedObjects>::SharedPtr objects_pub_;
 
-  float score_threshold_{0.0};
   std::vector<std::string> class_names_;
   bool has_variance_{false};
   bool has_twist_{false};

From ef19cc493843778a66a27e0f7ab8845ea5b15574 Mon Sep 17 00:00:00 2001
From: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp>
Date: Mon, 16 Dec 2024 14:16:36 +0900
Subject: [PATCH 15/15] fix: the cloud capacity is not yet on the launcher's
 config file. For now I wil leave it as a default parameter

Signed-off-by: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp>
---
 perception/lidar_centerpoint/src/node.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/perception/lidar_centerpoint/src/node.cpp b/perception/lidar_centerpoint/src/node.cpp
index 53554d0a3becf..c42ef785aed51 100644
--- a/perception/lidar_centerpoint/src/node.cpp
+++ b/perception/lidar_centerpoint/src/node.cpp
@@ -50,7 +50,9 @@ LidarCenterPointNode::LidarCenterPointNode(const rclcpp::NodeOptions & node_opti
   const int densification_num_past_frames =
     this->declare_parameter<int>("densification_params.num_past_frames");
   const std::string trt_precision = this->declare_parameter<std::string>("trt_precision");
-  const std::size_t cloud_capacity = this->declare_parameter<std::int64_t>("cloud_capacity");
+  const std::size_t cloud_capacity = this->declare_parameter<std::int64_t>(
+    "cloud_capacity", 2000000);  // NOTE(knzo25): this is a temporal hack since the launcher for
+                                 // this release does not have the corresponding parameter yet
   const std::string encoder_onnx_path = this->declare_parameter<std::string>("encoder_onnx_path");
   const std::string encoder_engine_path =
     this->declare_parameter<std::string>("encoder_engine_path");