Skip to content

Commit ca1370d

Browse files
committed
feat: accelerated preprocessing for centerpoint
Signed-off-by: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp>
1 parent 569d56c commit ca1370d

File tree

7 files changed

+101
-37
lines changed

7 files changed

+101
-37
lines changed

perception/lidar_centerpoint/include/lidar_centerpoint/preprocess/pointcloud_densification.hpp

+10-3
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@
1515
#ifndef LIDAR_CENTERPOINT__PREPROCESS__POINTCLOUD_DENSIFICATION_HPP_
1616
#define LIDAR_CENTERPOINT__PREPROCESS__POINTCLOUD_DENSIFICATION_HPP_
1717

18+
#include <lidar_centerpoint/cuda_utils.hpp>
19+
1820
#include <tf2_ros/buffer.h>
1921
#include <tf2_ros/transform_listener.h>
2022
#ifdef ROS_DISTRO_GALACTIC
@@ -48,7 +50,10 @@ class DensificationParam
4850

4951
struct PointCloudWithTransform
5052
{
51-
sensor_msgs::msg::PointCloud2 pointcloud_msg;
53+
cuda::unique_ptr<float[]> points_d{nullptr};
54+
std_msgs::msg::Header header;
55+
size_t num_points{0};
56+
size_t point_step{0};
5257
Eigen::Affine3f affine_past2world;
5358
};
5459

@@ -58,7 +63,8 @@ class PointCloudDensification
5863
explicit PointCloudDensification(const DensificationParam & param);
5964

6065
bool enqueuePointCloud(
61-
const sensor_msgs::msg::PointCloud2 & input_pointcloud_msg, const tf2_ros::Buffer & tf_buffer);
66+
const sensor_msgs::msg::PointCloud2 & input_pointcloud_msg, const tf2_ros::Buffer & tf_buffer,
67+
cudaStream_t streams);
6268

6369
double getCurrentTimestamp() const { return current_timestamp_; }
6470
Eigen::Affine3f getAffineWorldToCurrent() const { return affine_world2current_; }
@@ -73,7 +79,8 @@ class PointCloudDensification
7379
unsigned int pointcloud_cache_size() const { return param_.pointcloud_cache_size(); }
7480

7581
private:
76-
void enqueue(const sensor_msgs::msg::PointCloud2 & msg, const Eigen::Affine3f & affine);
82+
void enqueue(
83+
const sensor_msgs::msg::PointCloud2 & msg, const Eigen::Affine3f & affine, cudaStream_t stream);
7784
void dequeue();
7885

7986
DensificationParam param_;

perception/lidar_centerpoint/include/lidar_centerpoint/preprocess/preprocess_kernel.hpp

+4
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,10 @@
2020

2121
namespace centerpoint
2222
{
23+
cudaError_t generateSweepPoints_launch(
24+
const float * input_points, size_t points_size, int input_point_step, float time_lag,
25+
const float * transform, int num_features, float * output_points, cudaStream_t stream);
26+
2327
cudaError_t generateVoxels_random_launch(
2428
const float * points, size_t points_size, float min_x_range, float max_x_range, float min_y_range,
2529
float max_y_range, float min_z_range, float max_z_range, float pillar_x_size, float pillar_y_size,

perception/lidar_centerpoint/include/lidar_centerpoint/preprocess/voxel_generator.hpp

+4-3
Original file line numberDiff line numberDiff line change
@@ -31,10 +31,11 @@ class VoxelGeneratorTemplate
3131
explicit VoxelGeneratorTemplate(
3232
const DensificationParam & param, const CenterPointConfig & config);
3333

34-
virtual std::size_t generateSweepPoints(std::vector<float> & points) = 0;
34+
virtual std::size_t generateSweepPoints(float * d_points, cudaStream_t stream) = 0;
3535

3636
bool enqueuePointCloud(
37-
const sensor_msgs::msg::PointCloud2 & input_pointcloud_msg, const tf2_ros::Buffer & tf_buffer);
37+
const sensor_msgs::msg::PointCloud2 & input_pointcloud_msg, const tf2_ros::Buffer & tf_buffer,
38+
cudaStream_t stream);
3839

3940
protected:
4041
std::unique_ptr<PointCloudDensification> pd_ptr_{nullptr};
@@ -50,7 +51,7 @@ class VoxelGenerator : public VoxelGeneratorTemplate
5051
public:
5152
using VoxelGeneratorTemplate::VoxelGeneratorTemplate;
5253

53-
std::size_t generateSweepPoints(std::vector<float> & points) override;
54+
std::size_t generateSweepPoints(float * d_points, cudaStream_t stream) override;
5455
};
5556

5657
} // namespace centerpoint

perception/lidar_centerpoint/lib/centerpoint_trt.cpp

+2-5
Original file line numberDiff line numberDiff line change
@@ -130,14 +130,11 @@ bool CenterPointTRT::detect(
130130
bool CenterPointTRT::preprocess(
131131
const sensor_msgs::msg::PointCloud2 & input_pointcloud_msg, const tf2_ros::Buffer & tf_buffer)
132132
{
133-
bool is_success = vg_ptr_->enqueuePointCloud(input_pointcloud_msg, tf_buffer);
133+
bool is_success = vg_ptr_->enqueuePointCloud(input_pointcloud_msg, tf_buffer, stream_);
134134
if (!is_success) {
135135
return false;
136136
}
137-
const auto count = vg_ptr_->generateSweepPoints(points_);
138-
CHECK_CUDA_ERROR(cudaMemcpyAsync(
139-
points_d_.get(), points_.data(), count * config_.point_feature_size_ * sizeof(float),
140-
cudaMemcpyHostToDevice, stream_));
137+
const auto count = vg_ptr_->generateSweepPoints(points_d_.get(), stream_);
141138
CHECK_CUDA_ERROR(cudaMemsetAsync(num_voxels_d_.get(), 0, sizeof(unsigned int), stream_));
142139
CHECK_CUDA_ERROR(
143140
cudaMemsetAsync(voxels_buffer_d_.get(), 0, voxels_buffer_size_ * sizeof(float), stream_));

perception/lidar_centerpoint/lib/preprocess/pointcloud_densification.cpp

+19-6
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,8 @@ PointCloudDensification::PointCloudDensification(const DensificationParam & para
6161
}
6262

6363
bool PointCloudDensification::enqueuePointCloud(
64-
const sensor_msgs::msg::PointCloud2 & pointcloud_msg, const tf2_ros::Buffer & tf_buffer)
64+
const sensor_msgs::msg::PointCloud2 & pointcloud_msg, const tf2_ros::Buffer & tf_buffer,
65+
cudaStream_t stream)
6566
{
6667
const auto header = pointcloud_msg.header;
6768

@@ -73,9 +74,9 @@ bool PointCloudDensification::enqueuePointCloud(
7374
}
7475
auto affine_world2current = transformToEigen(transform_world2current.get());
7576

76-
enqueue(pointcloud_msg, affine_world2current);
77+
enqueue(pointcloud_msg, affine_world2current, stream);
7778
} else {
78-
enqueue(pointcloud_msg, Eigen::Affine3f::Identity());
79+
enqueue(pointcloud_msg, Eigen::Affine3f::Identity(), stream);
7980
}
8081

8182
dequeue();
@@ -84,12 +85,24 @@ bool PointCloudDensification::enqueuePointCloud(
8485
}
8586

8687
void PointCloudDensification::enqueue(
87-
const sensor_msgs::msg::PointCloud2 & msg, const Eigen::Affine3f & affine_world2current)
88+
const sensor_msgs::msg::PointCloud2 & msg, const Eigen::Affine3f & affine_world2current,
89+
cudaStream_t stream)
8890
{
8991
affine_world2current_ = affine_world2current;
9092
current_timestamp_ = rclcpp::Time(msg.header.stamp).seconds();
91-
PointCloudWithTransform pointcloud = {msg, affine_world2current.inverse()};
92-
pointcloud_cache_.push_front(pointcloud);
93+
94+
assert(sizeof(uint8_t) * msg.width * msg.height * msg.point_step % sizeof(float) == 0);
95+
auto points_d = cuda::make_unique<float[]>(
96+
sizeof(uint8_t) * msg.width * msg.height * msg.point_step / sizeof(float));
97+
CHECK_CUDA_ERROR(cudaMemcpyAsync(
98+
points_d.get(), msg.data.data(), sizeof(uint8_t) * msg.width * msg.height * msg.point_step,
99+
cudaMemcpyHostToDevice, stream));
100+
101+
PointCloudWithTransform pointcloud = {
102+
std::move(points_d), msg.header, msg.width * msg.height, msg.point_step,
103+
affine_world2current.inverse()};
104+
105+
pointcloud_cache_.push_front(std::move(pointcloud));
93106
}
94107

95108
void PointCloudDensification::dequeue()

perception/lidar_centerpoint/lib/preprocess/preprocess_kernel.cu

+47-2
Original file line numberDiff line numberDiff line change
@@ -28,10 +28,11 @@
2828
* limitations under the License.
2929
*/
3030

31-
#include "lidar_centerpoint/preprocess/preprocess_kernel.hpp"
32-
31+
#include <lidar_centerpoint/cuda_utils.hpp>
32+
#include <lidar_centerpoint/preprocess/preprocess_kernel.hpp>
3333
#include <lidar_centerpoint/utils.hpp>
3434

35+
#include <cassert>
3536
namespace
3637
{
3738
const std::size_t MAX_POINT_IN_VOXEL_SIZE = 32; // the same as max_point_in_voxel_size_ in config
@@ -41,6 +42,50 @@ const std::size_t ENCODER_IN_FEATURE_SIZE = 9; // the same as encoder_in_featur
4142

4243
namespace centerpoint
4344
{
45+
46+
__global__ void generateSweepPoints_kernel(
47+
const float * input_points, size_t points_size, int input_point_step, float time_lag,
48+
const float * transform_array, int num_features, float * output_points)
49+
{
50+
int point_idx = blockIdx.x * blockDim.x + threadIdx.x;
51+
if (point_idx >= points_size) return;
52+
53+
const float input_x = input_points[point_idx * input_point_step + 0];
54+
const float input_y = input_points[point_idx * input_point_step + 1];
55+
const float input_z = input_points[point_idx * input_point_step + 2];
56+
57+
output_points[point_idx * num_features + 0] = transform_array[0] * input_x +
58+
transform_array[1] * input_y +
59+
transform_array[2] * input_z + transform_array[3];
60+
output_points[point_idx * num_features + 1] = transform_array[4] * input_x +
61+
transform_array[5] * input_y +
62+
transform_array[6] * input_z + transform_array[7];
63+
output_points[point_idx * num_features + 2] = transform_array[8] * input_x +
64+
transform_array[9] * input_y +
65+
transform_array[10] * input_z + transform_array[11];
66+
output_points[point_idx * num_features + 3] = time_lag;
67+
}
68+
69+
cudaError_t generateSweepPoints_launch(
70+
const float * input_points, size_t points_size, int input_point_step, float time_lag,
71+
const float * transform_array, int num_features, float * output_points, cudaStream_t stream)
72+
{
73+
auto transform_d = cuda::make_unique<float[]>(16);
74+
CHECK_CUDA_ERROR(cudaMemcpyAsync(
75+
transform_d.get(), transform_array, 16 * sizeof(float), cudaMemcpyHostToDevice, stream));
76+
77+
dim3 blocks((points_size + 256 - 1) / 256);
78+
dim3 threads(256);
79+
assert(num_features == 4);
80+
81+
generateSweepPoints_kernel<<<blocks, threads, 0, stream>>>(
82+
input_points, points_size, input_point_step, time_lag, transform_d.get(), num_features,
83+
output_points);
84+
85+
cudaError_t err = cudaGetLastError();
86+
return err;
87+
}
88+
4489
__global__ void generateVoxels_random_kernel(
4590
const float * points, size_t points_size, float min_x_range, float max_x_range, float min_y_range,
4691
float max_y_range, float min_z_range, float max_z_range, float pillar_x_size, float pillar_y_size,

perception/lidar_centerpoint/lib/preprocess/voxel_generator.cpp

+15-18
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,8 @@
1414

1515
#include "lidar_centerpoint/preprocess/voxel_generator.hpp"
1616

17+
#include <lidar_centerpoint/preprocess/preprocess_kernel.hpp>
18+
1719
#include <sensor_msgs/point_cloud2_iterator.hpp>
1820

1921
namespace centerpoint
@@ -38,35 +40,30 @@ VoxelGeneratorTemplate::VoxelGeneratorTemplate(
3840
}
3941

4042
bool VoxelGeneratorTemplate::enqueuePointCloud(
41-
const sensor_msgs::msg::PointCloud2 & input_pointcloud_msg, const tf2_ros::Buffer & tf_buffer)
43+
const sensor_msgs::msg::PointCloud2 & input_pointcloud_msg, const tf2_ros::Buffer & tf_buffer,
44+
cudaStream_t stream)
4245
{
43-
return pd_ptr_->enqueuePointCloud(input_pointcloud_msg, tf_buffer);
46+
return pd_ptr_->enqueuePointCloud(input_pointcloud_msg, tf_buffer, stream);
4447
}
4548

46-
std::size_t VoxelGenerator::generateSweepPoints(std::vector<float> & points)
49+
std::size_t VoxelGenerator::generateSweepPoints(float * points_d, cudaStream_t stream)
4750
{
48-
Eigen::Vector3f point_current, point_past;
49-
size_t point_counter{};
51+
size_t point_counter = 0;
5052
for (auto pc_cache_iter = pd_ptr_->getPointCloudCacheIter(); !pd_ptr_->isCacheEnd(pc_cache_iter);
5153
pc_cache_iter++) {
52-
auto pc_msg = pc_cache_iter->pointcloud_msg;
54+
auto sweep_num_points = pc_cache_iter->num_points;
55+
auto point_step = pc_cache_iter->point_step;
5356
auto affine_past2current =
5457
pd_ptr_->getAffineWorldToCurrent() * pc_cache_iter->affine_past2world;
5558
float time_lag = static_cast<float>(
56-
pd_ptr_->getCurrentTimestamp() - rclcpp::Time(pc_msg.header.stamp).seconds());
59+
pd_ptr_->getCurrentTimestamp() - rclcpp::Time(pc_cache_iter->header.stamp).seconds());
5760

58-
for (sensor_msgs::PointCloud2ConstIterator<float> x_iter(pc_msg, "x"), y_iter(pc_msg, "y"),
59-
z_iter(pc_msg, "z");
60-
x_iter != x_iter.end(); ++x_iter, ++y_iter, ++z_iter) {
61-
point_past << *x_iter, *y_iter, *z_iter;
62-
point_current = affine_past2current * point_past;
61+
generateSweepPoints_launch(
62+
pc_cache_iter->points_d.get(), sweep_num_points, point_step / sizeof(float), time_lag,
63+
affine_past2current.matrix().data(), config_.point_feature_size_, points_d + point_counter,
64+
stream);
6365

64-
points.at(point_counter * config_.point_feature_size_) = point_current.x();
65-
points.at(point_counter * config_.point_feature_size_ + 1) = point_current.y();
66-
points.at(point_counter * config_.point_feature_size_ + 2) = point_current.z();
67-
points.at(point_counter * config_.point_feature_size_ + 3) = time_lag;
68-
++point_counter;
69-
}
66+
point_counter += sweep_num_points;
7067
}
7168
return point_counter;
7269
}

0 commit comments

Comments
 (0)