feat: accelerated preprocessing for centerpoint

knzo25 · knzo25 · commit ca1370df58c0 · 2024-05-13T00:36:09.000+09:00
Signed-off-by: Kenzo Lobos-Tsunekawa &lt;kenzo.lobos@tier4.jp&gt;
diff --git a/perception/lidar_centerpoint/include/lidar_centerpoint/preprocess/pointcloud_densification.hpp b/perception/lidar_centerpoint/include/lidar_centerpoint/preprocess/pointcloud_densification.hpp
@@ -15,6 +15,8 @@
 #ifndef LIDAR_CENTERPOINT__PREPROCESS__POINTCLOUD_DENSIFICATION_HPP_
 #define LIDAR_CENTERPOINT__PREPROCESS__POINTCLOUD_DENSIFICATION_HPP_
 
+#include <lidar_centerpoint/cuda_utils.hpp>
+
 #include <tf2_ros/buffer.h>
 #include <tf2_ros/transform_listener.h>
 #ifdef ROS_DISTRO_GALACTIC
@@ -48,7 +50,10 @@ class DensificationParam
 
 struct PointCloudWithTransform
 {
-  sensor_msgs::msg::PointCloud2 pointcloud_msg;
+  cuda::unique_ptr<float[]> points_d{nullptr};
+  std_msgs::msg::Header header;
+  size_t num_points{0};
+  size_t point_step{0};
   Eigen::Affine3f affine_past2world;
 };
 
@@ -58,7 +63,8 @@ class PointCloudDensification
   explicit PointCloudDensification(const DensificationParam & param);
 
   bool enqueuePointCloud(
-    const sensor_msgs::msg::PointCloud2 & input_pointcloud_msg, const tf2_ros::Buffer & tf_buffer);
+    const sensor_msgs::msg::PointCloud2 & input_pointcloud_msg, const tf2_ros::Buffer & tf_buffer,
+    cudaStream_t streams);
 
   double getCurrentTimestamp() const { return current_timestamp_; }
   Eigen::Affine3f getAffineWorldToCurrent() const { return affine_world2current_; }
@@ -73,7 +79,8 @@ class PointCloudDensification
   unsigned int pointcloud_cache_size() const { return param_.pointcloud_cache_size(); }
 
 private:
-  void enqueue(const sensor_msgs::msg::PointCloud2 & msg, const Eigen::Affine3f & affine);
+  void enqueue(
+    const sensor_msgs::msg::PointCloud2 & msg, const Eigen::Affine3f & affine, cudaStream_t stream);
   void dequeue();
 
   DensificationParam param_;
diff --git a/perception/lidar_centerpoint/include/lidar_centerpoint/preprocess/preprocess_kernel.hpp b/perception/lidar_centerpoint/include/lidar_centerpoint/preprocess/preprocess_kernel.hpp
@@ -20,6 +20,10 @@
 
 namespace centerpoint
 {
+cudaError_t generateSweepPoints_launch(
+  const float * input_points, size_t points_size, int input_point_step, float time_lag,
+  const float * transform, int num_features, float * output_points, cudaStream_t stream);
+
 cudaError_t generateVoxels_random_launch(
   const float * points, size_t points_size, float min_x_range, float max_x_range, float min_y_range,
   float max_y_range, float min_z_range, float max_z_range, float pillar_x_size, float pillar_y_size,
diff --git a/perception/lidar_centerpoint/include/lidar_centerpoint/preprocess/voxel_generator.hpp b/perception/lidar_centerpoint/include/lidar_centerpoint/preprocess/voxel_generator.hpp
@@ -31,10 +31,11 @@ class VoxelGeneratorTemplate
   explicit VoxelGeneratorTemplate(
     const DensificationParam & param, const CenterPointConfig & config);
 
-  virtual std::size_t generateSweepPoints(std::vector<float> & points) = 0;
+  virtual std::size_t generateSweepPoints(float * d_points, cudaStream_t stream) = 0;
 
   bool enqueuePointCloud(
-    const sensor_msgs::msg::PointCloud2 & input_pointcloud_msg, const tf2_ros::Buffer & tf_buffer);
+    const sensor_msgs::msg::PointCloud2 & input_pointcloud_msg, const tf2_ros::Buffer & tf_buffer,
+    cudaStream_t stream);
 
 protected:
   std::unique_ptr<PointCloudDensification> pd_ptr_{nullptr};
@@ -50,7 +51,7 @@ class VoxelGenerator : public VoxelGeneratorTemplate
 public:
   using VoxelGeneratorTemplate::VoxelGeneratorTemplate;
 
-  std::size_t generateSweepPoints(std::vector<float> & points) override;
+  std::size_t generateSweepPoints(float * d_points, cudaStream_t stream) override;
 };
 
 }  // namespace centerpoint
diff --git a/perception/lidar_centerpoint/lib/centerpoint_trt.cpp b/perception/lidar_centerpoint/lib/centerpoint_trt.cpp
@@ -130,14 +130,11 @@ bool CenterPointTRT::detect(
 bool CenterPointTRT::preprocess(
   const sensor_msgs::msg::PointCloud2 & input_pointcloud_msg, const tf2_ros::Buffer & tf_buffer)
 {
-  bool is_success = vg_ptr_->enqueuePointCloud(input_pointcloud_msg, tf_buffer);
+  bool is_success = vg_ptr_->enqueuePointCloud(input_pointcloud_msg, tf_buffer, stream_);
   if (!is_success) {
     return false;
   }
-  const auto count = vg_ptr_->generateSweepPoints(points_);
-  CHECK_CUDA_ERROR(cudaMemcpyAsync(
-    points_d_.get(), points_.data(), count * config_.point_feature_size_ * sizeof(float),
-    cudaMemcpyHostToDevice, stream_));
+  const auto count = vg_ptr_->generateSweepPoints(points_d_.get(), stream_);
   CHECK_CUDA_ERROR(cudaMemsetAsync(num_voxels_d_.get(), 0, sizeof(unsigned int), stream_));
   CHECK_CUDA_ERROR(
     cudaMemsetAsync(voxels_buffer_d_.get(), 0, voxels_buffer_size_ * sizeof(float), stream_));
diff --git a/perception/lidar_centerpoint/lib/preprocess/pointcloud_densification.cpp b/perception/lidar_centerpoint/lib/preprocess/pointcloud_densification.cpp
@@ -61,7 +61,8 @@ PointCloudDensification::PointCloudDensification(const DensificationParam & para
 }
 
 bool PointCloudDensification::enqueuePointCloud(
-  const sensor_msgs::msg::PointCloud2 & pointcloud_msg, const tf2_ros::Buffer & tf_buffer)
+  const sensor_msgs::msg::PointCloud2 & pointcloud_msg, const tf2_ros::Buffer & tf_buffer,
+  cudaStream_t stream)
 {
   const auto header = pointcloud_msg.header;
 
@@ -73,9 +74,9 @@ bool PointCloudDensification::enqueuePointCloud(
     }
     auto affine_world2current = transformToEigen(transform_world2current.get());
 
-    enqueue(pointcloud_msg, affine_world2current);
+    enqueue(pointcloud_msg, affine_world2current, stream);
   } else {
-    enqueue(pointcloud_msg, Eigen::Affine3f::Identity());
+    enqueue(pointcloud_msg, Eigen::Affine3f::Identity(), stream);
   }
 
   dequeue();
@@ -84,12 +85,24 @@ bool PointCloudDensification::enqueuePointCloud(
 }
 
 void PointCloudDensification::enqueue(
-  const sensor_msgs::msg::PointCloud2 & msg, const Eigen::Affine3f & affine_world2current)
+  const sensor_msgs::msg::PointCloud2 & msg, const Eigen::Affine3f & affine_world2current,
+  cudaStream_t stream)
 {
   affine_world2current_ = affine_world2current;
   current_timestamp_ = rclcpp::Time(msg.header.stamp).seconds();
-  PointCloudWithTransform pointcloud = {msg, affine_world2current.inverse()};
-  pointcloud_cache_.push_front(pointcloud);
+
+  assert(sizeof(uint8_t) * msg.width * msg.height * msg.point_step % sizeof(float) == 0);
+  auto points_d = cuda::make_unique<float[]>(
+    sizeof(uint8_t) * msg.width * msg.height * msg.point_step / sizeof(float));
+  CHECK_CUDA_ERROR(cudaMemcpyAsync(
+    points_d.get(), msg.data.data(), sizeof(uint8_t) * msg.width * msg.height * msg.point_step,
+    cudaMemcpyHostToDevice, stream));
+
+  PointCloudWithTransform pointcloud = {
+    std::move(points_d), msg.header, msg.width * msg.height, msg.point_step,
+    affine_world2current.inverse()};
+
+  pointcloud_cache_.push_front(std::move(pointcloud));
 }
 
 void PointCloudDensification::dequeue()
diff --git a/perception/lidar_centerpoint/lib/preprocess/preprocess_kernel.cu b/perception/lidar_centerpoint/lib/preprocess/preprocess_kernel.cu
@@ -28,10 +28,11 @@
  * limitations under the License.
  */
 
-#include "lidar_centerpoint/preprocess/preprocess_kernel.hpp"
-
+#include <lidar_centerpoint/cuda_utils.hpp>
+#include <lidar_centerpoint/preprocess/preprocess_kernel.hpp>
 #include <lidar_centerpoint/utils.hpp>
 
+#include <cassert>
 namespace
 {
 const std::size_t MAX_POINT_IN_VOXEL_SIZE = 32;  // the same as max_point_in_voxel_size_ in config
@@ -41,6 +42,50 @@ const std::size_t ENCODER_IN_FEATURE_SIZE = 9;  // the same as encoder_in_featur
 
 namespace centerpoint
 {
+
+__global__ void generateSweepPoints_kernel(
+  const float * input_points, size_t points_size, int input_point_step, float time_lag,
+  const float * transform_array, int num_features, float * output_points)
+{
+  int point_idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (point_idx >= points_size) return;
+
+  const float input_x = input_points[point_idx * input_point_step + 0];
+  const float input_y = input_points[point_idx * input_point_step + 1];
+  const float input_z = input_points[point_idx * input_point_step + 2];
+
+  output_points[point_idx * num_features + 0] = transform_array[0] * input_x +
+                                                transform_array[1] * input_y +
+                                                transform_array[2] * input_z + transform_array[3];
+  output_points[point_idx * num_features + 1] = transform_array[4] * input_x +
+                                                transform_array[5] * input_y +
+                                                transform_array[6] * input_z + transform_array[7];
+  output_points[point_idx * num_features + 2] = transform_array[8] * input_x +
+                                                transform_array[9] * input_y +
+                                                transform_array[10] * input_z + transform_array[11];
+  output_points[point_idx * num_features + 3] = time_lag;
+}
+
+cudaError_t generateSweepPoints_launch(
+  const float * input_points, size_t points_size, int input_point_step, float time_lag,
+  const float * transform_array, int num_features, float * output_points, cudaStream_t stream)
+{
+  auto transform_d = cuda::make_unique<float[]>(16);
+  CHECK_CUDA_ERROR(cudaMemcpyAsync(
+    transform_d.get(), transform_array, 16 * sizeof(float), cudaMemcpyHostToDevice, stream));
+
+  dim3 blocks((points_size + 256 - 1) / 256);
+  dim3 threads(256);
+  assert(num_features == 4);
+
+  generateSweepPoints_kernel<<<blocks, threads, 0, stream>>>(
+    input_points, points_size, input_point_step, time_lag, transform_d.get(), num_features,
+    output_points);
+
+  cudaError_t err = cudaGetLastError();
+  return err;
+}
+
 __global__ void generateVoxels_random_kernel(
   const float * points, size_t points_size, float min_x_range, float max_x_range, float min_y_range,
   float max_y_range, float min_z_range, float max_z_range, float pillar_x_size, float pillar_y_size,
diff --git a/perception/lidar_centerpoint/lib/preprocess/voxel_generator.cpp b/perception/lidar_centerpoint/lib/preprocess/voxel_generator.cpp
@@ -14,6 +14,8 @@
 
 #include "lidar_centerpoint/preprocess/voxel_generator.hpp"
 
+#include <lidar_centerpoint/preprocess/preprocess_kernel.hpp>
+
 #include <sensor_msgs/point_cloud2_iterator.hpp>
 
 namespace centerpoint
@@ -38,35 +40,30 @@ VoxelGeneratorTemplate::VoxelGeneratorTemplate(
 }
 
 bool VoxelGeneratorTemplate::enqueuePointCloud(
-  const sensor_msgs::msg::PointCloud2 & input_pointcloud_msg, const tf2_ros::Buffer & tf_buffer)
+  const sensor_msgs::msg::PointCloud2 & input_pointcloud_msg, const tf2_ros::Buffer & tf_buffer,
+  cudaStream_t stream)
 {
-  return pd_ptr_->enqueuePointCloud(input_pointcloud_msg, tf_buffer);
+  return pd_ptr_->enqueuePointCloud(input_pointcloud_msg, tf_buffer, stream);
 }
 
-std::size_t VoxelGenerator::generateSweepPoints(std::vector<float> & points)
+std::size_t VoxelGenerator::generateSweepPoints(float * points_d, cudaStream_t stream)
 {
-  Eigen::Vector3f point_current, point_past;
-  size_t point_counter{};
+  size_t point_counter = 0;
   for (auto pc_cache_iter = pd_ptr_->getPointCloudCacheIter(); !pd_ptr_->isCacheEnd(pc_cache_iter);
        pc_cache_iter++) {
-    auto pc_msg = pc_cache_iter->pointcloud_msg;
+    auto sweep_num_points = pc_cache_iter->num_points;
+    auto point_step = pc_cache_iter->point_step;
     auto affine_past2current =
       pd_ptr_->getAffineWorldToCurrent() * pc_cache_iter->affine_past2world;
     float time_lag = static_cast<float>(
-      pd_ptr_->getCurrentTimestamp() - rclcpp::Time(pc_msg.header.stamp).seconds());
+      pd_ptr_->getCurrentTimestamp() - rclcpp::Time(pc_cache_iter->header.stamp).seconds());
 
-    for (sensor_msgs::PointCloud2ConstIterator<float> x_iter(pc_msg, "x"), y_iter(pc_msg, "y"),
-         z_iter(pc_msg, "z");
-         x_iter != x_iter.end(); ++x_iter, ++y_iter, ++z_iter) {
-      point_past << *x_iter, *y_iter, *z_iter;
-      point_current = affine_past2current * point_past;
+    generateSweepPoints_launch(
+      pc_cache_iter->points_d.get(), sweep_num_points, point_step / sizeof(float), time_lag,
+      affine_past2current.matrix().data(), config_.point_feature_size_, points_d + point_counter,
+      stream);
 
-      points.at(point_counter * config_.point_feature_size_) = point_current.x();
-      points.at(point_counter * config_.point_feature_size_ + 1) = point_current.y();
-      points.at(point_counter * config_.point_feature_size_ + 2) = point_current.z();
-      points.at(point_counter * config_.point_feature_size_ + 3) = time_lag;
-      ++point_counter;
-    }
+    point_counter += sweep_num_points;
   }
   return point_counter;
 }