From af2e8843a2395f20edcee6cb09047be62628c3eb Mon Sep 17 00:00:00 2001 From: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp> Date: Mon, 25 Nov 2024 17:08:54 +0900 Subject: [PATCH 01/55] feat: moved the cuda pointcloud preprocessor and organized from a personal repository Signed-off-by: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp> --- .../CMakeLists.txt | 100 +++ .../README.md | 20 + .../cuda_pointcloud_preprocessor.param.yaml | 22 + .../docs/cuda-organized-pointcloud-adapter.md | 41 ++ .../docs/cuda-pointcloud-preprocessor.md | 46 ++ ...cuda_organized_pointcloud_adapter_node.hpp | 77 ++ .../cuda_pointcloud_preprocessor.hpp | 154 ++++ .../cuda_pointcloud_preprocessor_node.hpp | 114 +++ .../point_types.hpp | 50 ++ .../cuda_pointcloud_preprocessor.launch.xml | 16 + .../package.xml | 32 + .../cuda_pointcloud_preprocessor.schema.json | 140 ++++ ...cuda_organized_pointcloud_adapter_node.cpp | 203 +++++ .../cuda_pointcloud_preprocessor.cu | 692 ++++++++++++++++++ .../cuda_pointcloud_preprocessor_node.cpp | 253 +++++++ 15 files changed, 1960 insertions(+) create mode 100644 sensing/autoware_cuda_pointcloud_preprocessor/CMakeLists.txt create mode 100644 sensing/autoware_cuda_pointcloud_preprocessor/README.md create mode 100644 sensing/autoware_cuda_pointcloud_preprocessor/config/cuda_pointcloud_preprocessor.param.yaml create mode 100644 sensing/autoware_cuda_pointcloud_preprocessor/docs/cuda-organized-pointcloud-adapter.md create mode 100644 sensing/autoware_cuda_pointcloud_preprocessor/docs/cuda-pointcloud-preprocessor.md create mode 100644 sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_organized_pointcloud_adapter/cuda_organized_pointcloud_adapter_node.hpp create mode 100644 sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.hpp create mode 100644 sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.hpp create mode 100644 sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/point_types.hpp create mode 100644 sensing/autoware_cuda_pointcloud_preprocessor/launch/cuda_pointcloud_preprocessor.launch.xml create mode 100644 sensing/autoware_cuda_pointcloud_preprocessor/package.xml create mode 100644 sensing/autoware_cuda_pointcloud_preprocessor/schema/cuda_pointcloud_preprocessor.schema.json create mode 100644 sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_organized_pointcloud_adapter/cuda_organized_pointcloud_adapter_node.cpp create mode 100644 sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu create mode 100644 sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.cpp diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/CMakeLists.txt b/sensing/autoware_cuda_pointcloud_preprocessor/CMakeLists.txt new file mode 100644 index 0000000000000..06b1f5d67540f --- /dev/null +++ b/sensing/autoware_cuda_pointcloud_preprocessor/CMakeLists.txt @@ -0,0 +1,100 @@ +cmake_minimum_required(VERSION 3.14) +project(autoware_cuda_pointcloud_preprocessor) + +find_package(ament_cmake_auto REQUIRED) +find_package(CUDA) +find_package(PCL REQUIRED) + +if(NOT ${CUDA_FOUND}) + message(WARNING "cuda was not found, so the autoware_cuda_pointcloud_preprocessor package will not be built.") + return() +endif() + +ament_auto_find_build_dependencies() + +# Default to C++17 +if (NOT CMAKE_CXX_STANDARD) + set(CMAKE_CXX_STANDARD 17) +endif () + +if (CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang") + add_compile_options(-Wall -Wextra -Wpedantic -Wunused-function) +endif () + +if(BUILD_TESTING) + find_package(ament_lint_auto REQUIRED) + ament_lint_auto_find_test_dependencies() +endif() + +include_directories( + include + SYSTEM + ${CUDA_INCLUDE_DIRS} +) + +list(APPEND CUDA_NVCC_FLAGS "--expt-relaxed-constexpr -diag-suppress 20012") + +cuda_add_library(cuda_pointcloud_preprocessor_lib SHARED + src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu +) + +target_link_libraries(cuda_pointcloud_preprocessor_lib + ${PCL_LIBRARIES} +) + +target_include_directories(cuda_pointcloud_preprocessor_lib SYSTEM PRIVATE + ${autoware_point_types_INCLUDE_DIRS} + ${cuda_blackboard_INCLUDE_DIRS} + ${geometry_msgs_INCLUDE_DIRS} + ${nebula_common_INCLUDE_DIRS} + ${rclcpp_INCLUDE_DIRS} + ${rcl_interfaces_INCLUDE_DIRS} + ${sensor_msgs_INCLUDE_DIRS} + ${tf2_INCLUDE_DIRS} +) + +# Targets +ament_auto_add_library(cuda_organized_pointcloud_adapter SHARED + src/cuda_organized_pointcloud_adapter/cuda_organized_pointcloud_adapter_node.cpp +) + +target_link_libraries(cuda_organized_pointcloud_adapter + ${CUDA_LIBRARIES} +) + +ament_auto_add_library(cuda_pointcloud_preprocessor SHARED + src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.cpp +) + +target_link_libraries(cuda_pointcloud_preprocessor + ${CUDA_LIBRARIES} + cuda_pointcloud_preprocessor_lib +) + +rclcpp_components_register_node(cuda_organized_pointcloud_adapter + PLUGIN "autoware::cuda_organized_pointcloud_adapter::CudaOrganizedPointcloudAdapterNode" + EXECUTABLE cuda_organized_pointcloud_adapter_node +) + +rclcpp_components_register_node(cuda_pointcloud_preprocessor + PLUGIN "autoware::cuda_pointcloud_preprocessor::CudaPointcloudPreprocessorNode" + EXECUTABLE cuda_pointcloud_preprocessor_node +) + +install(DIRECTORY launch config + DESTINATION share/${PROJECT_NAME} +) + +ament_auto_package() + +# Set ROS_DISTRO macros +set(ROS_DISTRO $ENV{ROS_DISTRO}) +if(${ROS_DISTRO} STREQUAL "rolling") + add_compile_definitions(ROS_DISTRO_ROLLING) +elseif(${ROS_DISTRO} STREQUAL "foxy") + add_compile_definitions(ROS_DISTRO_FOXY) +elseif(${ROS_DISTRO} STREQUAL "galactic") + add_compile_definitions(ROS_DISTRO_GALACTIC) +elseif(${ROS_DISTRO} STREQUAL "humble") + add_compile_definitions(ROS_DISTRO_HUMBLE) +endif() diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/README.md b/sensing/autoware_cuda_pointcloud_preprocessor/README.md new file mode 100644 index 0000000000000..d15082b69a661 --- /dev/null +++ b/sensing/autoware_cuda_pointcloud_preprocessor/README.md @@ -0,0 +1,20 @@ +# autoware_cuda_pointcloud_preprocessor + +## Purpose + +The pointcloud preprocessing implemented in `autoware_pointcloud_preprocessor` has been thoroughly tested in autoware. However, the latency it introduces does not scale well with modern LiDAR devices due to the high number of points they introduce. + +To alleviate this issue, this package reimplements most of the pipeline presented in `autoware_pointcloud_preprocessor` leveraging the use of GPGPUs. In particular, this package makes use of CUDA to provide accelerated versions of the already establishes implementations, while also maintaining compatibility with normal ROS nodes/topics. + +## Inner-workings / Algorithms + +A detailed description of each filter's algorithm is available in the following links. + +| Filter Name | Description | Detail | +| --------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------- | +| cuda_organized_pointcloud_adapter | Organizes a pointcloud per ring/channel, so that the memory layout allows parallel processing in cuda | [link](docs/cuda-organized-pointcloud-adapter.md) | +| cuda_pointcloud_preprocessor | Implements the cropping, distortion correction, and outlier filtering (ring-based) of the `autoware_pointcloud_preprocessor`'s cpu versions. | [link](docs/cuda-pointcloud-preprocessor.md) | + +## (Optional) Future extensions / Unimplemented parts + +The subsample filters implemented in `autoware_pointcloud_preprocessor` will have similar counterparts in this package. diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/config/cuda_pointcloud_preprocessor.param.yaml b/sensing/autoware_cuda_pointcloud_preprocessor/config/cuda_pointcloud_preprocessor.param.yaml new file mode 100644 index 0000000000000..22358cfadf2a7 --- /dev/null +++ b/sensing/autoware_cuda_pointcloud_preprocessor/config/cuda_pointcloud_preprocessor.param.yaml @@ -0,0 +1,22 @@ +/**: + ros__parameters: + + base_frame: base_link + use_imu: true + use_3d_distortion_correction: false + distance_ratio: 1.03 + object_length_threshold: 0.1 + num_points_threshold: 4 + + self_crop.min_x: 1.0 + self_crop.min_y: 1.0 + self_crop.min_z: 1.0 + self_crop.max_x: -1.0 + self_crop.max_y: -1.0 + self_crop.max_z: -1.0 + mirror_crop.min_x: 1.0 + mirror_crop.min_y: 1.0 + mirror_crop.min_z: 1.0 + mirror_crop.max_x: -1.0 + mirror_crop.max_y: -1.0 + mirror_crop.max_z: -1.0 diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/docs/cuda-organized-pointcloud-adapter.md b/sensing/autoware_cuda_pointcloud_preprocessor/docs/cuda-organized-pointcloud-adapter.md new file mode 100644 index 0000000000000..634cdfb3e14f0 --- /dev/null +++ b/sensing/autoware_cuda_pointcloud_preprocessor/docs/cuda-organized-pointcloud-adapter.md @@ -0,0 +1,41 @@ +# cuda_organized_pointcloud_adapter + +## Purpose + +The node `cuda_pointcloud_preprocessor` expects a 2D pointcloud where every row represents a single channel/ring and each row's points are in non decreasing azimuth order. + +To utilize the previously mentioned node, this node provides an adapter to convert standard flat pointclouds (`height` equals to 1) to the required 2D tensor. + +In addition, this node uses the `cuda_blackboard`, a cuda transport layer that enables a zero-copy mechanism between GPU and GPU memory for both input and output. + +## Inner-workings / Algorithms + +To create the required 2D tensor, this node iterates the input pointcloud sequentially, filling the output 2D tensor depending on the input point's channel. + +The output tensor's size is also estimated in this node, based on the largest `channel` value and the maximum number of points per channel observed so far. + +## Inputs / Outputs + +### Input + +| Name | Type | Description | +| -------------------- | ------------------------------- | ------------------------- | +| `~/input/pointcloud` | `sensor_msgs::msg::PointCloud2` | Input pointcloud's topic. | + +### Output + +| Name | Type | Description | +| -------------------------- | ------------------------------------------------ | ---------------------------------------- | +| `~/output/pointcloud` | `sensor_msgs::msg::PointCloud2` | Processed pointcloud's topic | +| `~/output/pointcloud/cuda` | `negotiated_interfaces/msg/NegotiatedTopicsInfo` | Processed pointcloud's negotiation topic | + +## Parameters + +### Core Parameters + +{{ json_to_markdown("sensing/autoware_cuda_pointcloud_preprocessor/schema/cuda_pointcloud_preprocessor.schema.schema.json") }} + +## Assumptions / Known limits + +- This algorithm assumes that the input points will be in non-decreasing azimuth order (per ring). +- This node expects that the input pointcloud is flat (`height` equals to 1) and follows the `autoware::point_types::PointXYZIRC` layout defined in the `autoware_point_types` package. diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/docs/cuda-pointcloud-preprocessor.md b/sensing/autoware_cuda_pointcloud_preprocessor/docs/cuda-pointcloud-preprocessor.md new file mode 100644 index 0000000000000..459f15569d7ca --- /dev/null +++ b/sensing/autoware_cuda_pointcloud_preprocessor/docs/cuda-pointcloud-preprocessor.md @@ -0,0 +1,46 @@ +# cuda_pointcloud_preprocessor + +## Purpose + +This node implements all standard pointcloud preprocessing algorithms applied to a single LiDAR's pointcloud in CUDA. +In particular, this node implements: + +- crop boxing (ego-vehicle and ego-vehicle's mirrors) +- distortion correction +- ring-based outlier filtering + +## Inner-workings / Algorithms + +As this node reimplements the functionalities of the CPU-version algorithms, please have a look at the documentations of [crop-box](/sensing/autoware_pointcloud_preprocessor/docs/crop-box-filter.md), [distortion correction](/sensing/autoware_pointcloud_preprocessor/docs/distortion-corrector.md), and [ring-based outlier filter](/sensing/autoware_pointcloud_preprocessor/docs/ring-outlier-filter.md) for more information about these algorithms. + +In addition to the individual algorithms previously mentioned, this node uses the `cuda_blackboard`, a cuda transport layer that enables a zero-copy mechanism between GPU and GPU memory for both input and output. + +## Inputs / Outputs + +### Input + +| Name | Type | Description | +| ------------------------- | ------------------------------------------------ | ----------------------------------------- | +| `~/input/pointcloud` | `sensor_msgs::msg::PointCloud2` | Input pointcloud's topic. | +| `~/input/pointcloud/cuda` | `negotiated_interfaces/msg/NegotiatedTopicsInfo` | Input pointcloud's type negotiation topic | +| `~/input/twist` | `geometry_msgs::msg::TwistWithCovarianceStamped` | Topic of the twist information. | +| `~/input/imu` | `sensor_msgs::msg::Imu` | Topic of the IMU data. | + +### Output + +| Name | Type | Description | +| -------------------------- | ------------------------------------------------ | ---------------------------------------- | +| `~/output/pointcloud` | `sensor_msgs::msg::PointCloud2` | Processed pointcloud's topic | +| `~/output/pointcloud/cuda` | `negotiated_interfaces/msg/NegotiatedTopicsInfo` | Processed pointcloud's negotiation topic | + +## Parameters + +### Core Parameters + +{{ json_to_markdown("sensing/autoware_cuda_pointcloud_preprocessor/schema/cuda_pointcloud_preprocessor.schema.schema.json") }} + +## Assumptions / Known limits + +- The CUDA implementations, while following the original CPU ones, will not offer the same numerical results, and small approximations were needed to maximize the GPU use. +- This node expects that the input pointcloud follows the `autoware::point_types::PointXYZIRCAEDT` layout and the output pointcloud will use the `autoware::point_types::PointXYZIRC` layout defined in the `autoware_point_types` package. +- The input pointcloud must be a 2D tensor where each row represents a different channel/ring with its points in non-decreasing azimuth order. Invalid points should contain 0-values. The `cuda_organized_pointcloud_adapter` provides such a pointcloud. diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_organized_pointcloud_adapter/cuda_organized_pointcloud_adapter_node.hpp b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_organized_pointcloud_adapter/cuda_organized_pointcloud_adapter_node.hpp new file mode 100644 index 0000000000000..8930a1f87e63b --- /dev/null +++ b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_organized_pointcloud_adapter/cuda_organized_pointcloud_adapter_node.hpp @@ -0,0 +1,77 @@ +// Copyright 2024 TIER IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef AUTOWARE__CUDA_ORGANIZED_POINTCLOUD_ADAPTER__CUDA_ORGANIZED_POINTCLOUD_ADAPTER_NODE_HPP_ +#define AUTOWARE__CUDA_ORGANIZED_POINTCLOUD_ADAPTER__CUDA_ORGANIZED_POINTCLOUD_ADAPTER_NODE_HPP_ + +#include <autoware/point_types/types.hpp> +#include <autoware/universe_utils/ros/debug_publisher.hpp> +#include <autoware/universe_utils/system/stop_watch.hpp> +#include <cuda_blackboard/cuda_adaptation.hpp> +#include <cuda_blackboard/cuda_blackboard_publisher.hpp> +#include <cuda_blackboard/cuda_pointcloud2.hpp> +#include <rclcpp/rclcpp.hpp> + +#include <sensor_msgs/msg/point_cloud2.hpp> + +#include <tf2/transform_datatypes.h> +#include <tf2_ros/buffer.h> +#include <tf2_ros/transform_listener.h> + +#include <chrono> +#include <deque> +#include <memory> +#include <vector> + +namespace autoware::cuda_organized_pointcloud_adapter +{ + +class CudaOrganizedPointcloudAdapterNode : public rclcpp::Node +{ +public: + explicit CudaOrganizedPointcloudAdapterNode(const rclcpp::NodeOptions & node_options); + ~CudaOrganizedPointcloudAdapterNode() = default; + +private: + void estimatePointcloudRingInfo( + const sensor_msgs::msg::PointCloud2::ConstSharedPtr input_pointcloud_msg_ptr); + + bool orderPointcloud( + const sensor_msgs::msg::PointCloud2::ConstSharedPtr input_pointcloud_msg_ptr); + + // Callback + void pointcloudCallback( + const sensor_msgs::msg::PointCloud2::ConstSharedPtr input_pointcloud_msg_ptr); + + // Subscriber + rclcpp::Subscription<sensor_msgs::msg::PointCloud2>::SharedPtr pointcloud_sub_{}; + + // CUDA pub + std::unique_ptr<cuda_blackboard::CudaBlackboardPublisher<cuda_blackboard::CudaPointCloud2>> pub_; + + std::size_t num_rings_{0}; + std::size_t max_points_per_ring_{0}; + + std::vector<std::size_t> next_ring_index_; + std::vector<autoware::point_types::PointXYZIRCAEDT> buffer_; + // autoware::point_types::PointXYZIRCAEDT * device_buffer_; + cuda_blackboard::CudaUniquePtr<std::uint8_t[]> device_buffer_; + + std::unique_ptr<autoware::universe_utils::StopWatch<std::chrono::milliseconds>> stop_watch_ptr_; + std::unique_ptr<autoware::universe_utils::DebugPublisher> debug_publisher_; +}; + +} // namespace autoware::cuda_organized_pointcloud_adapter + +#endif // AUTOWARE__CUDA_ORGANIZED_POINTCLOUD_ADAPTER__CUDA_ORGANIZED_POINTCLOUD_ADAPTER_NODE_HPP_ diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.hpp b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.hpp new file mode 100644 index 0000000000000..b4ae4eecc304e --- /dev/null +++ b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.hpp @@ -0,0 +1,154 @@ +// Copyright 2024 TIER IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef AUTOWARE__CUDA_POINTCLOUD_PREPROCESSOR__CUDA_POINTCLOUD_PREPROCESSOR_HPP_ +#define AUTOWARE__CUDA_POINTCLOUD_PREPROCESSOR__CUDA_POINTCLOUD_PREPROCESSOR_HPP_ + +#include "autoware/cuda_pointcloud_preprocessor/point_types.hpp" + +#include <Eigen/Core> +#include <Eigen/Geometry> +#include <cuda_blackboard/cuda_pointcloud2.hpp> + +#include <geometry_msgs/msg/transform_stamped.hpp> +#include <geometry_msgs/msg/twist_with_covariance_stamped.hpp> +#include <geometry_msgs/msg/vector3_stamped.hpp> +#include <sensor_msgs/msg/point_cloud2.hpp> + +#include <thrust/device_vector.h> + +#include <cmath> +#include <deque> + +namespace autoware::cuda_pointcloud_preprocessor +{ + +struct TwistStruct2D +{ + float cum_x; + float cum_y; + float cum_theta; + float cum_cos_theta; + float cum_sin_theta; + uint32_t last_stamp_nsec; // relative to the start of the pointcloud + uint32_t stamp_nsec; // relative to the start of the pointcloud + float vx; + float vtheta; +}; + +struct TwistStruct3D +{ + // Eigen::Affine3f cum_transform; + float cum_transform_buffer[16]; + /* float cum_x; + float cum_y; + float cum_theta; */ + /* float cum_cos_theta; + float cum_sin_theta; */ + uint32_t last_stamp_nsec; // relative to the start of the pointcloud + uint32_t stamp_nsec; // relative to the start of the pointcloud + /* float vx; + float vtheta; */ + float v[3]; + float w[3]; +}; + +struct TransformStruct +{ + float x; + float y; + float z; + float m11; + float m12; + float m13; + float m21; + float m22; + float m23; + float m31; + float m32; + float m33; +}; + +struct CropBoxParameters +{ + float min_x{std::nanf("")}; + float max_x{std::nanf("")}; + float min_y{std::nanf("")}; + float max_y{std::nanf("")}; + float min_z{std::nanf("")}; + float max_z{std::nanf("")}; +}; + +struct RingOutlierFilterParameters +{ + float distance_ratio{std::nanf("")}; + float object_length_threshold{std::nanf("")}; + std::size_t num_points_threshold{0}; +}; + +class CudaPointcloudPreprocessor +{ +public: + CudaPointcloudPreprocessor(); + ~CudaPointcloudPreprocessor() = default; + + void setCropBoxParameters( + const CropBoxParameters & self_crop_box_parameters, + const CropBoxParameters & mirror_crop_box_parameters); + void setRingOutlierFilterParameters(const RingOutlierFilterParameters & ring_outlier_parameters); + void set3DUndistortion(bool value); + + void preallocateOutput(); + + std::unique_ptr<cuda_blackboard::CudaPointCloud2> process( + const cuda_blackboard::CudaPointCloud2 & input_pointcloud_msg, + const geometry_msgs::msg::TransformStamped & transform_msg, + const std::deque<geometry_msgs::msg::TwistWithCovarianceStamped> & twist_queue, + const std::deque<geometry_msgs::msg::Vector3Stamped> & angular_velocity_queue); + +private: + void setupTwist2DStructs( + const cuda_blackboard::CudaPointCloud2 & input_pointcloud_msg, + const std::deque<geometry_msgs::msg::TwistWithCovarianceStamped> & twist_queue, + const std::deque<geometry_msgs::msg::Vector3Stamped> & angular_velocity_queue); + + void setupTwist3DStructs( + const cuda_blackboard::CudaPointCloud2 & input_pointcloud_msg, + const std::deque<geometry_msgs::msg::TwistWithCovarianceStamped> & twist_queue, + const std::deque<geometry_msgs::msg::Vector3Stamped> & angular_velocity_queue); + + CropBoxParameters self_crop_box_parameters_{}; + CropBoxParameters mirror_crop_box_parameters_{}; + RingOutlierFilterParameters ring_outlier_parameters_{}; + bool use_3d_undistortion_{false}; + + int max_rings_{}; + int max_points_per_ring_{}; + + std::vector<sensor_msgs::msg::PointField> point_fields_{}; + std::unique_ptr<cuda_blackboard::CudaPointCloud2> output_pointcloud_ptr_{}; + + thrust::device_vector<InputPointType> device_transformed_points_{}; + thrust::device_vector<OutputPointType> device_output_points_{}; + thrust::device_vector<uint32_t> device_self_crop_mask_{}; + thrust::device_vector<uint32_t> device_mirror_crop_mask_{}; + thrust::device_vector<uint32_t> device_ring_outlier_mask_{}; + thrust::device_vector<uint32_t> device_indices_{}; + thrust::device_vector<TwistStruct2D> device_twist_2d_structs_{}; + thrust::device_vector<TwistStruct3D> device_twist_3d_structs_{}; +}; + +} // namespace autoware::cuda_pointcloud_preprocessor + +#endif // AUTOWARE__CUDA_POINTCLOUD_PREPROCESSOR__CUDA_POINTCLOUD_PREPROCESSOR_HPP_ diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.hpp b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.hpp new file mode 100644 index 0000000000000..5fdf4258eebcb --- /dev/null +++ b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.hpp @@ -0,0 +1,114 @@ +// Copyright 2024 TIER IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef AUTOWARE__CUDA_POINTCLOUD_PREPROCESSOR__CUDA_POINTCLOUD_PREPROCESSOR_NODE_HPP_ +#define AUTOWARE__CUDA_POINTCLOUD_PREPROCESSOR__CUDA_POINTCLOUD_PREPROCESSOR_NODE_HPP_ + +#include "autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.hpp" +#include "autoware/cuda_pointcloud_preprocessor/point_types.hpp" + +#include <autoware/point_types/types.hpp> +#include <autoware/universe_utils/ros/debug_publisher.hpp> +#include <autoware/universe_utils/system/stop_watch.hpp> +#include <cuda_blackboard/cuda_adaptation.hpp> +#include <cuda_blackboard/cuda_blackboard_publisher.hpp> +#include <cuda_blackboard/cuda_blackboard_subscriber.hpp> +#include <cuda_blackboard/cuda_pointcloud2.hpp> +#include <rclcpp/rclcpp.hpp> + +#include <geometry_msgs/msg/twist_with_covariance_stamped.hpp> +#include <geometry_msgs/msg/vector3_stamped.hpp> +#include <sensor_msgs/msg/imu.hpp> +#include <sensor_msgs/msg/point_cloud2.hpp> + +#include <tf2/transform_datatypes.h> +#include <tf2_ros/buffer.h> +#include <tf2_ros/transform_listener.h> + +#include <chrono> +#include <deque> +#include <memory> +#include <vector> + +#define CHECK_OFFSET(structure1, structure2, field) \ + static_assert( \ + offsetof(structure1, field) == offsetof(structure2, field), \ + "Offset of " #field " in " #structure1 " does not match expected offset.") + +namespace autoware::cuda_pointcloud_preprocessor +{ + +static_assert(sizeof(InputPointType) == sizeof(autoware::point_types::PointXYZIRCAEDT)); +static_assert(sizeof(OutputPointType) == sizeof(autoware::point_types::PointXYZIRC)); + +CHECK_OFFSET(InputPointType, autoware::point_types::PointXYZIRCAEDT, x); +CHECK_OFFSET(InputPointType, autoware::point_types::PointXYZIRCAEDT, y); +CHECK_OFFSET(InputPointType, autoware::point_types::PointXYZIRCAEDT, z); +CHECK_OFFSET(InputPointType, autoware::point_types::PointXYZIRCAEDT, intensity); +CHECK_OFFSET(InputPointType, autoware::point_types::PointXYZIRCAEDT, return_type); +CHECK_OFFSET(InputPointType, autoware::point_types::PointXYZIRCAEDT, channel); +CHECK_OFFSET(InputPointType, autoware::point_types::PointXYZIRCAEDT, azimuth); +CHECK_OFFSET(InputPointType, autoware::point_types::PointXYZIRCAEDT, elevation); +CHECK_OFFSET(InputPointType, autoware::point_types::PointXYZIRCAEDT, distance); +CHECK_OFFSET(InputPointType, autoware::point_types::PointXYZIRCAEDT, time_stamp); + +CHECK_OFFSET(OutputPointType, autoware::point_types::PointXYZIRCAEDT, x); +CHECK_OFFSET(OutputPointType, autoware::point_types::PointXYZIRCAEDT, y); +CHECK_OFFSET(OutputPointType, autoware::point_types::PointXYZIRCAEDT, z); +CHECK_OFFSET(OutputPointType, autoware::point_types::PointXYZIRCAEDT, intensity); +CHECK_OFFSET(OutputPointType, autoware::point_types::PointXYZIRCAEDT, return_type); +CHECK_OFFSET(OutputPointType, autoware::point_types::PointXYZIRCAEDT, channel); + +class CudaPointcloudPreprocessorNode : public rclcpp::Node +{ +public: + explicit CudaPointcloudPreprocessorNode(const rclcpp::NodeOptions & node_options); + ~CudaPointcloudPreprocessorNode() = default; + +private: + bool getTransform( + const std::string & target_frame, const std::string & source_frame, + tf2::Transform * tf2_transform_ptr); + + // Callback + void pointcloudCallback(const std::shared_ptr<const cuda_blackboard::CudaPointCloud2> cuda_msg); + void twistCallback( + const geometry_msgs::msg::TwistWithCovarianceStamped::ConstSharedPtr pointcloud_msg); + void imuCallback(const sensor_msgs::msg::Imu::ConstSharedPtr imu_msg); + + tf2_ros::Buffer tf2_buffer_; + tf2_ros::TransformListener tf2_listener_; + + std::string base_frame_; + bool imu_tranform_valid_{false}; + std::deque<geometry_msgs::msg::TwistWithCovarianceStamped> twist_queue_; + std::deque<geometry_msgs::msg::Vector3Stamped> angular_velocity_queue_; + + // Subscriber + rclcpp::Subscription<geometry_msgs::msg::TwistWithCovarianceStamped>::SharedPtr twist_sub_{}; + rclcpp::Subscription<sensor_msgs::msg::Imu>::SharedPtr imu_sub_{}; + + // CUDA pub & sub + std::unique_ptr<cuda_blackboard::CudaBlackboardPublisher<cuda_blackboard::CudaPointCloud2>> pub_; + std::unique_ptr<cuda_blackboard::CudaBlackboardSubscriber<cuda_blackboard::CudaPointCloud2>> sub_; + + std::unique_ptr<CudaPointcloudPreprocessor> cuda_pointcloud_preprocessor_; + + std::unique_ptr<autoware::universe_utils::StopWatch<std::chrono::milliseconds>> stop_watch_ptr_; + std::unique_ptr<autoware::universe_utils::DebugPublisher> debug_publisher_; +}; + +} // namespace autoware::cuda_pointcloud_preprocessor + +#endif // AUTOWARE__CUDA_POINTCLOUD_PREPROCESSOR__CUDA_POINTCLOUD_PREPROCESSOR_NODE_HPP_ diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/point_types.hpp b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/point_types.hpp new file mode 100644 index 0000000000000..50010ac0f2b67 --- /dev/null +++ b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/point_types.hpp @@ -0,0 +1,50 @@ + +// Copyright 2024 TIER IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#ifndef AUTOWARE__CUDA_POINTCLOUD_PREPROCESSOR__POINT_TYPES_HPP_ +#define AUTOWARE__CUDA_POINTCLOUD_PREPROCESSOR__POINT_TYPES_HPP_ + +#include <cstdint> + +namespace autoware::cuda_pointcloud_preprocessor +{ + +// Note: We can not use PCL nor uniform initialization here because of thrust +struct OutputPointType +{ + float x; + float y; + float z; + std::uint8_t intensity; + std::uint8_t return_type; + std::uint16_t channel; +}; + +struct InputPointType +{ + float x; + float y; + float z; + std::uint8_t intensity; + std::uint8_t return_type; + std::uint16_t channel; + float azimuth; + float elevation; + float distance; + std::uint32_t time_stamp; +}; + +} // namespace autoware::cuda_pointcloud_preprocessor + +#endif // AUTOWARE__CUDA_POINTCLOUD_PREPROCESSOR__POINT_TYPES_HPP_ diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/launch/cuda_pointcloud_preprocessor.launch.xml b/sensing/autoware_cuda_pointcloud_preprocessor/launch/cuda_pointcloud_preprocessor.launch.xml new file mode 100644 index 0000000000000..e8843bd804bf0 --- /dev/null +++ b/sensing/autoware_cuda_pointcloud_preprocessor/launch/cuda_pointcloud_preprocessor.launch.xml @@ -0,0 +1,16 @@ +<launch> + <arg name="input/pointcloud" default="/sensing/lidar/top/pointcloud_raw_ex"/> + <arg name="input/imu" default="/sensing/imu/imu_data"/> + <arg name="input/twist" default="/sensing/vehicle_velocity_converter/twist_with_covariance"/> + <arg name="output/pointcloud" default="/sensing/lidar/top/test"/> + + <arg name="cuda_pointcloud_preprocessor_param_file" default="$(find-pkg-share cuda_pointcloud_preprocessor)/config/cuda_pointcloud_preprocessor.param.yaml"/> + + <node pkg="cuda_pointcloud_preprocessor" exec="cuda_pointcloud_preprocessor_node" name="cuda_pointcloud_preprocessor" output="screen"> + <remap from="~/input/pointcloud" to="$(var input/pointcloud)"/> + <remap from="~/input/imu" to="$(var input/imu)"/> + <remap from="~/input/twist" to="$(var input/twist)"/> + <remap from="~/output/pointcloud" to="$(var output/pointcloud)"/> + <param from="$(var cuda_pointcloud_preprocessor_param_file)"/> + </node> +</launch> diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/package.xml b/sensing/autoware_cuda_pointcloud_preprocessor/package.xml new file mode 100644 index 0000000000000..edbd7ff61c874 --- /dev/null +++ b/sensing/autoware_cuda_pointcloud_preprocessor/package.xml @@ -0,0 +1,32 @@ +<?xml version="1.0"?> +<package format="3"> + <name>autoware_cuda_pointcloud_preprocessor</name> + <version>0.1.0</version> + <description>autoware_cuda_pointcloud_preprocessor</description> + <maintainer email="kenzo.lobos@tier4.jp">Kenzo Lobos-Tsunekawa</maintainer> + <author email="kenzo.lobos@tier4.jp">Kenzo Lobos-Tsunekawa</author> + <license>Apache License 2.0</license> + + <buildtool_depend>ament_cmake_auto</buildtool_depend> + <buildtool_depend>autoware_cmake</buildtool_depend> + + <depend>autoware_point_types</depend> + <depend>autoware_universe_utils</depend> + <depend>cuda_blackboard</depend> + <depend>pcl_conversions</depend> + <depend>pcl_ros</depend> + <depend>rclcpp</depend> + <depend>rclcpp_components</depend> + <depend>sensor_msgs</depend> + <depend>sophus</depend> + <depend>tf2</depend> + <depend>tier4_debug_msgs</depend> + + <test_depend>ament_clang_format</test_depend> + <test_depend>ament_lint_auto</test_depend> + <test_depend>ament_lint_common</test_depend> + + <export> + <build_type>ament_cmake</build_type> + </export> +</package> diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/schema/cuda_pointcloud_preprocessor.schema.json b/sensing/autoware_cuda_pointcloud_preprocessor/schema/cuda_pointcloud_preprocessor.schema.json new file mode 100644 index 0000000000000..196f455ffe75b --- /dev/null +++ b/sensing/autoware_cuda_pointcloud_preprocessor/schema/cuda_pointcloud_preprocessor.schema.json @@ -0,0 +1,140 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "title": "Parameters for the cuda_pointcloud_preprocessor", + "type": "object", + "definitions": { + "crop_box_filter": { + "type": "object", + "properties": { + "base_frame": { + "type": "string", + "description": "The undistortion algorithm is based on a base frame, which must be the same as the twist frame.", + "default": "base_link" + }, + "use_imu": { + "type": "boolean", + "description": "Use IMU angular velocity, otherwise, use twist angular velocity.", + "default": "true" + }, + "use_3d_distortion_correction": { + "type": "boolean", + "description": "Use 3d distortion correction algorithm, otherwise, use 2d distortion correction algorithm.", + "default": "false" + }, + "distance_ratio": { + "type": "number", + "description": "distance_ratio", + "default": "1.03", + "minimum": 0.0 + }, + "object_length_threshold": { + "type": "number", + "description": "object_length_threshold", + "default": "0.1", + "minimum": 0.0 + }, + "num_points_threshold": { + "type": "integer", + "description": "num_points_threshold", + "default": "4", + "minimum": 0 + }, + "self_crop.min_x": { + "type": "number", + "description": "minimum x-coordinate value for self cropping in meters", + "default": "-1.0" + }, + "self_crop.min_y": { + "type": "number", + "description": "minimum y-coordinate value for self cropping in meters", + "default": "-1.0" + }, + "self_crop.min_z": { + "type": "number", + "description": "minimum z-coordinate value for self cropping in meters", + "default": "-1.0" + }, + "self_crop.max_x": { + "type": "number", + "description": "maximum x-coordinate value for self cropping in meters", + "default": "1.0" + }, + "self_crop.max_y": { + "type": "number", + "description": "maximum y-coordinate value for self cropping in meters", + "default": "1.0" + }, + "self_crop.max_z": { + "type": "number", + "description": "maximum z-coordinate value for self cropping in meters", + "default": "1.0" + }, + "mirror_crop.min_x": { + "type": "number", + "description": "minimum x-coordinate value for mirror cropping in meters", + "default": "-1.0" + }, + "mirror_crop.min_y": { + "type": "number", + "description": "minimum y-coordinate value for mirror cropping in meters", + "default": "-1.0" + }, + "mirror_crop.min_z": { + "type": "number", + "description": "minimum z-coordinate value for mirror cropping in meters", + "default": "-1.0" + }, + "mirror_crop.max_x": { + "type": "number", + "description": "maximum x-coordinate value for mirror cropping in meters", + "default": "1.0" + }, + "mirror_crop.max_y": { + "type": "number", + "description": "maximum y-coordinate value for mirror cropping in meters", + "default": "1.0" + }, + "mirror_crop.max_z": { + "type": "number", + "description": "maximum z-coordinate value for mirror cropping in meters", + "default": "1.0" + } + }, + "required": [ + "base_frame", + "use_imu", + "use_3d_distortion_correction", + "distance_ratio", + "object_length_threshold", + "num_points_threshold", + "self_crop.min_x", + "self_crop.min_y", + "self_crop.min_z", + "self_crop.max_x", + "self_crop.max_y", + "self_crop.max_z", + "mirror_crop.min_x", + "mirror_crop.min_y", + "mirror_crop.min_z", + "mirror_crop.max_x", + "mirror_crop.max_y", + "mirror_crop.max_z" + ], + "additionalProperties": false + } + }, + "properties": { + "/**": { + "type": "object", + "properties": { + "ros__parameters": { + "$ref": "#/definitions/cuda_pointcloud_preprocessor" + } + }, + "required": ["ros__parameters"], + "additionalProperties": false + } + }, + "required": ["/**"], + "additionalProperties": false +} diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_organized_pointcloud_adapter/cuda_organized_pointcloud_adapter_node.cpp b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_organized_pointcloud_adapter/cuda_organized_pointcloud_adapter_node.cpp new file mode 100644 index 0000000000000..bd5717fbbcb5e --- /dev/null +++ b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_organized_pointcloud_adapter/cuda_organized_pointcloud_adapter_node.cpp @@ -0,0 +1,203 @@ +// Copyright 2024 TIER IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "autoware/cuda_organized_pointcloud_adapter/cuda_organized_pointcloud_adapter_node.hpp" + +#include <cuda_runtime.h> + +#include <vector> + +namespace autoware::cuda_organized_pointcloud_adapter +{ +using sensor_msgs::msg::PointCloud2; + +CudaOrganizedPointcloudAdapterNode::CudaOrganizedPointcloudAdapterNode( + const rclcpp::NodeOptions & node_options) +: Node("cuda_organized_pointcloud_adapter", node_options) +{ + pub_ = + std::make_unique<cuda_blackboard::CudaBlackboardPublisher<cuda_blackboard::CudaPointCloud2>>( + *this, "~/output/pointcloud"); + + rclcpp::SubscriptionOptions sub_options; + sub_options.use_intra_process_comm = rclcpp::IntraProcessSetting::Enable; + + pointcloud_sub_ = this->create_subscription<sensor_msgs::msg::PointCloud2>( + "~/input/pointcloud", rclcpp::SensorDataQoS{}.keep_last(1), + std::bind(&CudaOrganizedPointcloudAdapterNode::pointcloudCallback, this, std::placeholders::_1), + sub_options); + + // initialize debug tool + { + using autoware::universe_utils::DebugPublisher; + using autoware::universe_utils::StopWatch; + + stop_watch_ptr_ = std::make_unique<StopWatch<std::chrono::milliseconds>>(); + debug_publisher_ = std::make_unique<DebugPublisher>(this, "cuda_organized_pointcloud_adapter"); + stop_watch_ptr_->tic("processing_time"); + } +} + +void CudaOrganizedPointcloudAdapterNode::estimatePointcloudRingInfo( + const sensor_msgs::msg::PointCloud2::ConstSharedPtr input_pointcloud_msg_ptr) +{ + const autoware::point_types::PointXYZIRCAEDT * input_buffer = + reinterpret_cast<const autoware::point_types::PointXYZIRCAEDT *>( + input_pointcloud_msg_ptr->data.data()); + + std::size_t max_ring = 0; + + for (std::size_t i = 0; i < input_pointcloud_msg_ptr->width * input_pointcloud_msg_ptr->height; + i++) { + const autoware::point_types::PointXYZIRCAEDT & point = input_buffer[i]; + const std::size_t ring = static_cast<std::size_t>(point.channel); + max_ring = std::max(max_ring, ring); + } + + // Set max rings to the next power of two + num_rings_ = std::pow(2, std::ceil(std::log2(max_ring + 1))); + num_rings_ = std::max(num_rings_, static_cast<std::size_t>(16)); + std::vector<std::size_t> ring_points(num_rings_, 0); + + for (std::size_t i = 0; i < input_pointcloud_msg_ptr->width * input_pointcloud_msg_ptr->height; + i++) { + const autoware::point_types::PointXYZIRCAEDT & point = input_buffer[i]; + const std::size_t ring = point.channel; + ring_points[ring]++; + } + + // Set max points per ring to the next multiple of 512 + max_points_per_ring_ = *std::max_element(ring_points.begin(), ring_points.end()); + max_points_per_ring_ = std::max(max_points_per_ring_, static_cast<std::size_t>(512)); + max_points_per_ring_ = (max_points_per_ring_ + 511) / 512 * 512; + + next_ring_index_.resize(num_rings_); + std::fill(next_ring_index_.begin(), next_ring_index_.end(), 0); + buffer_.resize(num_rings_ * max_points_per_ring_); + + /* if (device_buffer_ != nullptr) { + cudaFree(device_buffer_); + } */ + + device_buffer_ = cuda_blackboard::make_unique<std::uint8_t[]>( + num_rings_ * max_points_per_ring_ * sizeof(autoware::point_types::PointXYZIRCAEDT)); + + /* cudaMalloc( + device_buffer_, + num_rings_ * max_points_per_ring_ * sizeof(autoware::point_types::PointXYZIRCAEDT)); */ + + RCLCPP_INFO_STREAM( + get_logger(), "Estimated rings: " << num_rings_ + << ", max_points_per_ring: " << max_points_per_ring_ + << ". This should only be done during the first iterations. " + "Otherwise, performance will be affected."); +} + +bool CudaOrganizedPointcloudAdapterNode::orderPointcloud( + const sensor_msgs::msg::PointCloud2::ConstSharedPtr input_pointcloud_msg_ptr) +{ + const autoware::point_types::PointXYZIRCAEDT * input_buffer = + reinterpret_cast<const autoware::point_types::PointXYZIRCAEDT *>( + input_pointcloud_msg_ptr->data.data()); + + bool ring_overflow = false; + bool point_overflow = false; + + for (std::size_t i = 0; i < input_pointcloud_msg_ptr->width * input_pointcloud_msg_ptr->height; + i++) { + const autoware::point_types::PointXYZIRCAEDT & point = input_buffer[i]; + const std::size_t raw_ring = point.channel; + const std::size_t ring = raw_ring % num_rings_; + + const std::size_t raw_index = next_ring_index_[ring]; + const std::size_t index = raw_index % max_points_per_ring_; + + ring_overflow |= raw_ring >= num_rings_; + point_overflow |= raw_index >= max_points_per_ring_; + + buffer_[ring * max_points_per_ring_ + index] = point; + next_ring_index_[ring] = (index + 1) % max_points_per_ring_; + } + + return !ring_overflow && !point_overflow; +} + +void CudaOrganizedPointcloudAdapterNode::pointcloudCallback( + const sensor_msgs::msg::PointCloud2::ConstSharedPtr input_pointcloud_msg_ptr) +{ + stop_watch_ptr_->toc("processing_time", true); + + // TODO(knzo25): check the pointcloud layout at least once + + assert(input_pointcloud_msg_ptr->point_step == sizeof(autoware::point_types::PointXYZIRCAEDT)); + + if (num_rings_ == 0 || max_points_per_ring_ == 0) { + estimatePointcloudRingInfo(input_pointcloud_msg_ptr); + } + + if (!orderPointcloud(input_pointcloud_msg_ptr)) { + estimatePointcloudRingInfo(input_pointcloud_msg_ptr); + orderPointcloud(input_pointcloud_msg_ptr); + } + + // Copy to cuda memory + cudaMemcpy( + device_buffer_.get(), buffer_.data(), + num_rings_ * max_points_per_ring_ * sizeof(autoware::point_types::PointXYZIRCAEDT), + cudaMemcpyHostToDevice); + + auto cuda_pointcloud_msg_ptr = std::make_unique<cuda_blackboard::CudaPointCloud2>(); + cuda_pointcloud_msg_ptr->width = max_points_per_ring_; + cuda_pointcloud_msg_ptr->height = num_rings_; + cuda_pointcloud_msg_ptr->point_step = sizeof(autoware::point_types::PointXYZIRCAEDT); + cuda_pointcloud_msg_ptr->row_step = + max_points_per_ring_ * sizeof(autoware::point_types::PointXYZIRCAEDT); + cuda_pointcloud_msg_ptr->data = + std::move(device_buffer_); /*reinterpret_cast<uint8_t *>(device_buffer_)*/ + ; + cuda_pointcloud_msg_ptr->is_dense = input_pointcloud_msg_ptr->is_dense; + cuda_pointcloud_msg_ptr->header = input_pointcloud_msg_ptr->header; + + pub_->publish(std::move(cuda_pointcloud_msg_ptr)); + + if (debug_publisher_) { + const double processing_time_ms = stop_watch_ptr_->toc("processing_time", true); + debug_publisher_->publish<tier4_debug_msgs::msg::Float64Stamped>( + "debug/processing_time_ms", processing_time_ms); + + double now_stamp_seconds = rclcpp::Time(this->get_clock()->now()).seconds(); + double cloud_stamp_seconds = rclcpp::Time(input_pointcloud_msg_ptr->header.stamp).seconds(); + + debug_publisher_->publish<tier4_debug_msgs::msg::Float64Stamped>( + "debug/latency_ms", 1000.f * (now_stamp_seconds - cloud_stamp_seconds)); + } + + // Allocate cuda memory + device_buffer_ = cuda_blackboard::make_unique<std::uint8_t[]>( + num_rings_ * max_points_per_ring_ * sizeof(autoware::point_types::PointXYZIRCAEDT)); + /* cudaMalloc( + &device_buffer_, + num_rings_ * max_points_per_ring_ * sizeof(autoware::point_types::PointXYZIRCAEDT)); */ + // Clear indexes + std::fill(next_ring_index_.begin(), next_ring_index_.end(), 0); + + // Clear pointcloud buffer + std::fill(buffer_.begin(), buffer_.end(), autoware::point_types::PointXYZIRCAEDT{}); +} + +} // namespace autoware::cuda_organized_pointcloud_adapter + +#include "rclcpp_components/register_node_macro.hpp" +RCLCPP_COMPONENTS_REGISTER_NODE( + autoware::cuda_organized_pointcloud_adapter::CudaOrganizedPointcloudAdapterNode) diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu new file mode 100644 index 0000000000000..ad8910aef5b76 --- /dev/null +++ b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu @@ -0,0 +1,692 @@ +// Copyright 2024 TIER IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.hpp" +#include "autoware/cuda_pointcloud_preprocessor/point_types.hpp" + +#include <Eigen/Core> +#include <Eigen/Dense> +#include <Eigen/Geometry> + +#include <cuda_runtime.h> +#include <tf2/utils.h> +#include <thrust/execution_policy.h> +#include <thrust/host_vector.h> +#include <thrust/reduce.h> +#include <thrust/scan.h> + +namespace autoware::cuda_pointcloud_preprocessor +{ + +__host__ __device__ Eigen::Matrix3f skewSymmetric(const Eigen::Vector3f & v) +{ + Eigen::Matrix3f m; + m << 0, -v.z(), v.y(), v.z(), 0, -v.x(), -v.y(), v.x(), 0; + return m; +} + +__host__ __device__ Eigen::Matrix3f leftJacobianSO3(const Eigen::Vector3f & omega) +{ + double theta = omega.norm(); + if (std::abs(theta) < 1e-6) { + return Eigen::Matrix3f::Identity(); + } + + Eigen::Matrix3f Omega = skewSymmetric(omega); + + Eigen::Matrix3f Omega2 = Omega * Omega; + double theta2 = theta * theta; + double theta3 = theta2 * theta; + + // Rodrigues' formula for Jacobian + return Eigen::Matrix3f::Identity() + ((1 - cos(theta)) / theta2) * Omega + + ((theta - sin(theta)) / theta3) * Omega2; +} + +__host__ __device__ Eigen::Matrix4f transformationMatrixFromVelocity( + const Eigen::Vector3f & linear_velocity, const Eigen::Vector3f & angular_velocity, double dt) +{ + Eigen::Matrix3f R = Eigen::AngleAxisf(angular_velocity.norm() * dt, angular_velocity.normalized()) + .toRotationMatrix(); + Eigen::Matrix3f J = leftJacobianSO3(angular_velocity * dt); + + Eigen::Vector3f translation = J * (linear_velocity * dt); + + Eigen::Matrix4f transformation = Eigen::Matrix4f::Identity(); + transformation.block<3, 3>(0, 0) = R; + transformation.block<3, 1>(0, 3) = translation; + + return transformation; +} + +__global__ void transformPointsKernel( + const InputPointType * input_points, InputPointType * output_points, int num_points, + TransformStruct transform) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < num_points) { + output_points[idx] = input_points[idx]; + + const float x = input_points[idx].x; + const float y = input_points[idx].y; + const float z = input_points[idx].z; + + output_points[idx].x = transform.m11 * x + transform.m12 * y + transform.m13 * z + transform.x; + output_points[idx].y = transform.m21 * x + transform.m22 * y + transform.m23 * z + transform.y; + output_points[idx].z = transform.m31 * x + transform.m32 * y + transform.m33 * z + transform.z; + } +} + +__global__ void cropBoxKernel( + InputPointType * d_points, uint32_t * output_mask, int num_points, float min_x, float min_y, + float min_z, float max_x, float max_y, float max_z) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < num_points) { + const float x = d_points[idx].x; + const float y = d_points[idx].y; + const float z = d_points[idx].z; + + if (x > min_x && x < max_x && y > min_y && y < max_y && z > min_z && z < max_z) { + output_mask[idx] = 0; + } else { + output_mask[idx] = 1; + } + } +} + +__global__ void combineMasksKernel( + uint32_t * mask1, uint32_t * mask2, uint32_t * mask3, int num_points, uint32_t * output_mask) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < num_points) { + output_mask[idx] = mask1[idx] & mask2[idx] & mask3[idx]; + } +} + +__global__ void extractInputPointIndicesKernel( + InputPointType * input_points, uint32_t * masks, uint32_t * indices, int num_points, + InputPointType * output_points) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < num_points && masks[idx] == 1) { + output_points[indices[idx] - 1] = input_points[idx]; + } +} + +__global__ void extractOutputPointIndicesKernel( + OutputPointType * input_points, uint32_t * masks, uint32_t * indices, int num_points, + OutputPointType * output_points) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < num_points && masks[idx] == 1) { + output_points[indices[idx] - 1] = input_points[idx]; + } +} + +__global__ void extractInputPointsToOutputPoints_indicesKernel( + InputPointType * input_points, uint32_t * masks, uint32_t * indices, int num_points, + OutputPointType * output_points) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < num_points && masks[idx] == 1) { + InputPointType & input_point = input_points[idx]; + OutputPointType & output_point = output_points[indices[idx] - 1]; + output_point.x = input_point.x; + output_point.y = input_point.y; + output_point.z = input_point.z; + output_point.intensity = input_point.intensity; + output_point.return_type = input_point.return_type; + output_point.channel = input_point.channel; + } +} + +__global__ void undistort2dKernel( + InputPointType * input_points, int num_points, TwistStruct2D * twist_structs, int num_twists) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < num_points) { + InputPointType & point = input_points[idx]; + + // The twist must always be newer than the point ! (or it was the last twist) + int twist_index = 0; + while (twist_index < num_twists && twist_structs[twist_index].stamp_nsec < point.time_stamp) { + twist_index++; + } + + twist_index = min(twist_index, num_twists - 1); + + TwistStruct2D twist = twist_structs[twist_index]; + float x = twist.cum_x; + float y = twist.cum_y; + float theta = twist.cum_theta; + + double dt_nsec = + point.time_stamp > twist.last_stamp_nsec ? point.time_stamp - twist.last_stamp_nsec : 0; + double dt = 1e-9 * (dt_nsec); + + theta += twist.vtheta * dt; + float d = twist.vx * dt; + x += d * cos(theta); + y += d * sin(theta); + + float distorted_x = point.x; + float distorted_y = point.y; + + point.x = distorted_x * cos(theta) - distorted_y * sin(theta) + x; + point.y = distorted_x * sin(theta) + distorted_y * cos(theta) + y; + } +} + +__global__ void undistort3dKernel( + InputPointType * input_points, int num_points, TwistStruct3D * twist_structs, int num_twists) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < num_points) { + InputPointType & point = input_points[idx]; + + // The twist must always be newer than the point ! (or it was the last twist) + int twist_index = 0; + while (twist_index < num_twists && twist_structs[twist_index].stamp_nsec < point.time_stamp) { + twist_index++; + } + + twist_index = min(twist_index, num_twists - 1); + + TwistStruct3D twist = twist_structs[twist_index]; + Eigen::Map<Eigen::Matrix4f> cum_transform_buffer_map(twist.cum_transform_buffer); + Eigen::Map<Eigen::Vector3f> v_map(twist.v); + Eigen::Map<Eigen::Vector3f> w_map(twist.w); + + double dt_nsec = + point.time_stamp > twist.last_stamp_nsec ? point.time_stamp - twist.last_stamp_nsec : 0; + double dt = 1e-9 * (dt_nsec); + + Eigen::Matrix4f transform = + cum_transform_buffer_map * transformationMatrixFromVelocity(v_map, w_map, dt); + Eigen::Vector3f p(point.x, point.y, point.z); + p = transform.block<3, 3>(0, 0) * p + transform.block<3, 1>(0, 3); + + point.x = p.x(); + point.y = p.y(); + point.z = p.z(); + } +} + +__global__ void ringOutlierFilterKernel( + const InputPointType * d_points, uint32_t * output_mask, int num_rings, int max_points_per_ring, + float distance_ratio, float object_length_threshold_squared, int num_points_threshold) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + int j = idx / max_points_per_ring; + int i = idx % max_points_per_ring; + + if (j >= num_rings || i >= max_points_per_ring) { + return; + } + + int min_i = max(i - num_points_threshold, 0); + int max_i = min(i + num_points_threshold, max_points_per_ring); + + int walk_size = 1; + int left_idx = min_i; + int right_idx = min_i + 1; + + for (int k = min_i; k < max_i - 1; k++) { + const InputPointType & left_point = d_points[j * max_points_per_ring + k]; + const InputPointType & right_point = d_points[j * max_points_per_ring + k + 1]; + + // Find biggest walk that passes through i + float azimuth_diff = right_point.azimuth - left_point.azimuth; + azimuth_diff = azimuth_diff < 0.f ? azimuth_diff + 2 * M_PI : azimuth_diff; + + if ( + max(left_point.distance, right_point.distance) < + min(left_point.distance, right_point.distance) * distance_ratio && + azimuth_diff < 1.f * M_PI / 180.f) { + // Determined to be included in the same walk + walk_size++; + right_idx++; + } else if (k >= i) { + break; + } else { + walk_size = 1; + left_idx = k + 1; + right_idx = k + 2; // this is safe since we break if k >= i + } + } + + const InputPointType & left_point = d_points[j * max_points_per_ring + left_idx]; + const InputPointType & right_point = d_points[j * max_points_per_ring + right_idx - 1]; + const float x = left_point.x - right_point.x; + const float y = left_point.y - right_point.y; + const float z = left_point.z - right_point.z; + + output_mask[j * max_points_per_ring + i] = + ((walk_size > num_points_threshold) || + (x * x + y * y + z * z >= object_length_threshold_squared)) + ? 1 + : 0; +} + +__global__ void transformPointTypeKernel( + const InputPointType * device_input_points, int num_points, OutputPointType * device_ouput_points) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < num_points) { + const InputPointType & input_point = device_input_points[idx]; + OutputPointType & output_point = device_ouput_points[idx]; + + output_point.x = input_point.x; + output_point.y = input_point.y; + output_point.z = input_point.z; + output_point.intensity = (float)input_point.intensity; + } +} + +CudaPointcloudPreprocessor::CudaPointcloudPreprocessor() +{ + sensor_msgs::msg::PointField x_field, y_field, z_field, intensity_field, return_type_field, + channel_field, azimuth_field, elevation_field, distance_field, time_stamp_field; + x_field.name = "x"; + x_field.offset = 0; + x_field.datatype = sensor_msgs::msg::PointField::FLOAT32; + x_field.count = 1; + + y_field.name = "y"; + y_field.offset = 4; + y_field.datatype = sensor_msgs::msg::PointField::FLOAT32; + y_field.count = 1; + + z_field.name = "z"; + z_field.offset = 8; + z_field.datatype = sensor_msgs::msg::PointField::FLOAT32; + z_field.count = 1; + + intensity_field.name = "intensity"; + intensity_field.offset = 12; + intensity_field.datatype = sensor_msgs::msg::PointField::UINT8; + intensity_field.count = 1; + + return_type_field.name = "return_type"; + return_type_field.offset = 13; + return_type_field.datatype = sensor_msgs::msg::PointField::UINT8; + return_type_field.count = 1; + + channel_field.name = "channel"; + channel_field.offset = 14; + channel_field.datatype = sensor_msgs::msg::PointField::UINT16; + channel_field.count = 1; + + static_assert(sizeof(OutputPointType) == 16, "OutputPointType size is not 16 bytes"); + static_assert(offsetof(OutputPointType, x) == 0); + static_assert(offsetof(OutputPointType, y) == 4); + static_assert(offsetof(OutputPointType, z) == 8); + static_assert(offsetof(OutputPointType, intensity) == 12); + static_assert(offsetof(OutputPointType, return_type) == 13); + static_assert(offsetof(OutputPointType, channel) == 14); + + point_fields_.push_back(x_field); + point_fields_.push_back(y_field); + point_fields_.push_back(z_field); + point_fields_.push_back(intensity_field); + point_fields_.push_back(return_type_field); + point_fields_.push_back(channel_field); +} + +void CudaPointcloudPreprocessor::setCropBoxParameters( + const CropBoxParameters & self_crop_box_parameters, + const CropBoxParameters & mirror_crop_box_parameters) +{ + self_crop_box_parameters_ = self_crop_box_parameters; + mirror_crop_box_parameters_ = mirror_crop_box_parameters; +} + +void CudaPointcloudPreprocessor::setRingOutlierFilterParameters( + const RingOutlierFilterParameters & ring_outlier_parameters) +{ + ring_outlier_parameters_ = ring_outlier_parameters; +} + +void CudaPointcloudPreprocessor::set3DUndistortion(bool use_3d_undistortion) +{ + use_3d_undistortion_ = use_3d_undistortion; +} + +void CudaPointcloudPreprocessor::preallocateOutput() +{ + output_pointcloud_ptr_ = std::make_unique<cuda_blackboard::CudaPointCloud2>(); + output_pointcloud_ptr_->data = cuda_blackboard::make_unique<std::uint8_t[]>( + max_rings_ * max_points_per_ring_ * sizeof(OutputPointType)); +} + +void CudaPointcloudPreprocessor::setupTwist2DStructs( + const cuda_blackboard::CudaPointCloud2 & input_pointcloud_msg, + const std::deque<geometry_msgs::msg::TwistWithCovarianceStamped> & twist_queue, + const std::deque<geometry_msgs::msg::Vector3Stamped> & angular_velocity_queue) +{ + const InputPointType * device_input_points = + reinterpret_cast<const InputPointType *>(input_pointcloud_msg.data.get()); + InputPointType first_point; + cudaMemcpy(&first_point, &device_input_points[0], sizeof(InputPointType), cudaMemcpyDeviceToHost); + + // Twist preprocessing + + uint64_t pointcloud_stamp_nsec = 1'000'000'000 * input_pointcloud_msg.header.stamp.sec + + input_pointcloud_msg.header.stamp.nanosec; + + thrust::host_vector<TwistStruct2D> host_twist_structs; + + float cum_x = 0; + float cum_y = 0; + float cum_theta = 0; + // All time stamps from now on are in nsec from the "beginning of the pointcloud" + uint32_t last_stamp_nsec = first_point.time_stamp; + + std::size_t twist_index = 0; + std::size_t angular_velocity_index = 0; + + for (; twist_index < twist_queue.size() || + angular_velocity_index < angular_velocity_queue.size();) { + uint64_t twist_stamp, input_twist_global_stamp_nsec, angular_velocity_global_stamp_nsec; + float vx, vtheta; + + if (twist_index < twist_queue.size()) { + input_twist_global_stamp_nsec = + 1'000'000'000 * static_cast<uint64_t>(twist_queue[twist_index].header.stamp.sec) + + static_cast<uint64_t>(twist_queue[twist_index].header.stamp.nanosec); + vx = twist_queue[twist_index].twist.twist.linear.x; + } else { + input_twist_global_stamp_nsec = std::numeric_limits<uint64_t>::max(); + vx = 0.0; + } + + if (angular_velocity_index < angular_velocity_queue.size()) { + angular_velocity_global_stamp_nsec = + 1'000'000'000 * + static_cast<uint64_t>(angular_velocity_queue[angular_velocity_index].header.stamp.sec) + + static_cast<uint64_t>(angular_velocity_queue[angular_velocity_index].header.stamp.nanosec); + vtheta = angular_velocity_queue[angular_velocity_index].vector.z; + } else { + angular_velocity_global_stamp_nsec = std::numeric_limits<uint64_t>::max(); + vtheta = 0.0; + } + + if (input_twist_global_stamp_nsec < angular_velocity_global_stamp_nsec) { + twist_stamp = input_twist_global_stamp_nsec; + twist_index++; + } else if (input_twist_global_stamp_nsec > angular_velocity_global_stamp_nsec) { + twist_stamp = angular_velocity_global_stamp_nsec; + angular_velocity_index++; + } else { + twist_index++; + angular_velocity_index++; + } + + TwistStruct2D twist; + twist.cum_x = cum_x; + twist.cum_y = cum_y; + twist.cum_theta = cum_theta; + + uint64_t twist_global_stamp_nsec = twist_stamp; + assert(twist_global_stamp_nsec > pointcloud_stamp_nsec); // by construction + uint32_t twist_from_pointcloud_start_nsec = twist_global_stamp_nsec - pointcloud_stamp_nsec; + + twist.stamp_nsec = twist_from_pointcloud_start_nsec; + twist.vx = vx; + twist.vtheta = vtheta; + twist.last_stamp_nsec = last_stamp_nsec; + host_twist_structs.push_back(twist); + + double dt_seconds = 1e-9 * (twist.stamp_nsec - last_stamp_nsec); + last_stamp_nsec = twist.stamp_nsec; + cum_theta += vtheta * dt_seconds; + float d = twist.vx * dt_seconds; + cum_x += d * cos(cum_theta); + cum_y += d * sin(cum_theta); + } + + // Copy to device + device_twist_2d_structs_ = host_twist_structs; +} + +void CudaPointcloudPreprocessor::setupTwist3DStructs( + const cuda_blackboard::CudaPointCloud2 & input_pointcloud_msg, + const std::deque<geometry_msgs::msg::TwistWithCovarianceStamped> & twist_queue, + const std::deque<geometry_msgs::msg::Vector3Stamped> & angular_velocity_queue) +{ + const InputPointType * device_input_points = + reinterpret_cast<const InputPointType *>(input_pointcloud_msg.data.get()); + InputPointType first_point; + cudaMemcpy(&first_point, &device_input_points[0], sizeof(InputPointType), cudaMemcpyDeviceToHost); + + // Twist preprocessing + + uint64_t pointcloud_stamp_nsec = 1'000'000'000 * input_pointcloud_msg.header.stamp.sec + + input_pointcloud_msg.header.stamp.nanosec; + + thrust::host_vector<TwistStruct3D> host_twist_structs; + + Eigen::Matrix4f cum_transform = Eigen::Matrix4f::Identity(); + Eigen::Vector3f v = Eigen::Vector3f::Zero(); + Eigen::Vector3f w = Eigen::Vector3f::Zero(); + + // All time stamps from now on are in nsec from the "beginning of the pointcloud" + uint32_t last_stamp_nsec = first_point.time_stamp; + + std::size_t twist_index = 0; + std::size_t angular_velocity_index = 0; + + for (; twist_index < twist_queue.size() || + angular_velocity_index < angular_velocity_queue.size();) { + uint64_t twist_stamp, input_twist_global_stamp_nsec, angular_velocity_global_stamp_nsec; + + if (twist_index < twist_queue.size()) { + input_twist_global_stamp_nsec = + 1'000'000'000 * static_cast<uint64_t>(twist_queue[twist_index].header.stamp.sec) + + static_cast<uint64_t>(twist_queue[twist_index].header.stamp.nanosec); + v.x() = twist_queue[twist_index].twist.twist.linear.x; + v.y() = twist_queue[twist_index].twist.twist.linear.y; + v.z() = twist_queue[twist_index].twist.twist.linear.z; + } else { + input_twist_global_stamp_nsec = std::numeric_limits<uint64_t>::max(); + v = Eigen::Vector3f::Zero(); + } + + if (angular_velocity_index < angular_velocity_queue.size()) { + angular_velocity_global_stamp_nsec = + 1'000'000'000 * + static_cast<uint64_t>(angular_velocity_queue[angular_velocity_index].header.stamp.sec) + + static_cast<uint64_t>(angular_velocity_queue[angular_velocity_index].header.stamp.nanosec); + w.x() = angular_velocity_queue[angular_velocity_index].vector.x; + w.y() = angular_velocity_queue[angular_velocity_index].vector.y; + w.z() = angular_velocity_queue[angular_velocity_index].vector.z; + } else { + angular_velocity_global_stamp_nsec = std::numeric_limits<uint64_t>::max(); + w = Eigen::Vector3f::Zero(); + } + + if (input_twist_global_stamp_nsec < angular_velocity_global_stamp_nsec) { + twist_stamp = input_twist_global_stamp_nsec; + twist_index++; + } else if (input_twist_global_stamp_nsec > angular_velocity_global_stamp_nsec) { + twist_stamp = angular_velocity_global_stamp_nsec; + angular_velocity_index++; + } else { + twist_index++; + angular_velocity_index++; + } + + TwistStruct3D twist; + + Eigen::Map<Eigen::Matrix4f> cum_transform_buffer_map(twist.cum_transform_buffer); + Eigen::Map<Eigen::Vector3f> v_map(twist.v); + Eigen::Map<Eigen::Vector3f> w_map(twist.w); + cum_transform_buffer_map = cum_transform; + + uint64_t twist_global_stamp_nsec = twist_stamp; + assert(twist_global_stamp_nsec > pointcloud_stamp_nsec); // by construction + uint32_t twist_from_pointcloud_start_nsec = twist_global_stamp_nsec - pointcloud_stamp_nsec; + + twist.stamp_nsec = twist_from_pointcloud_start_nsec; + v_map = v; + w_map = w; + twist.last_stamp_nsec = last_stamp_nsec; + host_twist_structs.push_back(twist); + + double dt_seconds = 1e-9 * (twist.stamp_nsec - last_stamp_nsec); + last_stamp_nsec = twist.stamp_nsec; + + auto delta_transform = transformationMatrixFromVelocity(v, w, dt_seconds); + cum_transform = cum_transform * delta_transform; + } + + // Copy to device + device_twist_3d_structs_ = host_twist_structs; +} + +std::unique_ptr<cuda_blackboard::CudaPointCloud2> CudaPointcloudPreprocessor::process( + const cuda_blackboard::CudaPointCloud2 & input_pointcloud_msg, + const geometry_msgs::msg::TransformStamped & transform_msg, + const std::deque<geometry_msgs::msg::TwistWithCovarianceStamped> & twist_queue, + const std::deque<geometry_msgs::msg::Vector3Stamped> & angular_velocity_queue) +{ + auto frame_id = input_pointcloud_msg.header.frame_id; + + auto num_input_points = input_pointcloud_msg.width * input_pointcloud_msg.height; + if ( + input_pointcloud_msg.width * input_pointcloud_msg.height > max_rings_ * max_points_per_ring_) { + max_rings_ = input_pointcloud_msg.height; + max_points_per_ring_ = input_pointcloud_msg.width; + + device_transformed_points_.resize(num_input_points); + device_self_crop_mask_.resize(num_input_points); + device_mirror_crop_mask_.resize(num_input_points); + device_ring_outlier_mask_.resize(num_input_points); + device_indices_.resize(num_input_points); + + preallocateOutput(); + } + + tf2::Quaternion rotation_quaternion( + transform_msg.transform.rotation.x, transform_msg.transform.rotation.y, + transform_msg.transform.rotation.z, transform_msg.transform.rotation.w); + tf2::Matrix3x3 rotation_matrix; + rotation_matrix.setRotation(rotation_quaternion); + + TransformStruct transform_struct; + transform_struct.x = static_cast<float>(transform_msg.transform.translation.x); + transform_struct.y = static_cast<float>(transform_msg.transform.translation.y); + transform_struct.z = static_cast<float>(transform_msg.transform.translation.z); + transform_struct.m11 = static_cast<float>(rotation_matrix[0][0]); + transform_struct.m12 = static_cast<float>(rotation_matrix[0][1]); + transform_struct.m13 = static_cast<float>(rotation_matrix[0][2]); + transform_struct.m21 = static_cast<float>(rotation_matrix[1][0]); + transform_struct.m22 = static_cast<float>(rotation_matrix[1][1]); + transform_struct.m23 = static_cast<float>(rotation_matrix[1][2]); + transform_struct.m31 = static_cast<float>(rotation_matrix[2][0]); + transform_struct.m32 = static_cast<float>(rotation_matrix[2][1]); + transform_struct.m33 = static_cast<float>(rotation_matrix[2][2]); + + // Twist preprocessing + const InputPointType * device_input_points = + reinterpret_cast<const InputPointType *>(input_pointcloud_msg.data.get()); + + if (use_3d_undistortion_) { + setupTwist3DStructs(input_pointcloud_msg, twist_queue, angular_velocity_queue); + } else { + setupTwist2DStructs(input_pointcloud_msg, twist_queue, angular_velocity_queue); + } + + // Obtain raw pointers for the kernels + TwistStruct2D * device_twist_2d_structs = + thrust::raw_pointer_cast(device_twist_2d_structs_.data()); + TwistStruct3D * device_twist_3d_structs = + thrust::raw_pointer_cast(device_twist_3d_structs_.data()); + InputPointType * device_transformed_points = + thrust::raw_pointer_cast(device_transformed_points_.data()); + uint32_t * device_self_crop_mask = thrust::raw_pointer_cast(device_self_crop_mask_.data()); + uint32_t * device_mirror_crop_mask = thrust::raw_pointer_cast(device_mirror_crop_mask_.data()); + uint32_t * device_ring_outlier_mask = thrust::raw_pointer_cast(device_ring_outlier_mask_.data()); + uint32_t * device_indices = thrust::raw_pointer_cast(device_indices_.data()); + + const int threadsPerBlock = 256; + const int blocksPerGrid = (num_input_points + threadsPerBlock - 1) / threadsPerBlock; + + transformPointsKernel<<<blocksPerGrid, threadsPerBlock>>>( + device_input_points, device_transformed_points, num_input_points, transform_struct); + + cropBoxKernel<<<blocksPerGrid, threadsPerBlock>>>( + device_transformed_points, device_self_crop_mask, num_input_points, + self_crop_box_parameters_.min_x, self_crop_box_parameters_.min_y, + self_crop_box_parameters_.min_z, self_crop_box_parameters_.max_x, + self_crop_box_parameters_.max_y, self_crop_box_parameters_.max_z); + + cropBoxKernel<<<blocksPerGrid, threadsPerBlock>>>( + device_transformed_points, device_mirror_crop_mask, num_input_points, + mirror_crop_box_parameters_.min_x, mirror_crop_box_parameters_.min_y, + mirror_crop_box_parameters_.min_z, mirror_crop_box_parameters_.max_x, + mirror_crop_box_parameters_.max_y, mirror_crop_box_parameters_.max_z); + + if (use_3d_undistortion_ && device_twist_3d_structs_.size() > 0) { + undistort3dKernel<<<blocksPerGrid, threadsPerBlock>>>( + device_transformed_points, num_input_points, device_twist_3d_structs, + device_twist_3d_structs_.size()); + } else if (!use_3d_undistortion_ && device_twist_2d_structs_.size() > 0) { + undistort2dKernel<<<blocksPerGrid, threadsPerBlock>>>( + device_transformed_points, num_input_points, device_twist_2d_structs, + device_twist_2d_structs_.size()); + } + + ringOutlierFilterKernel<<<blocksPerGrid, threadsPerBlock>>>( + device_transformed_points, device_ring_outlier_mask, max_rings_, max_points_per_ring_, + ring_outlier_parameters_.distance_ratio, + ring_outlier_parameters_.object_length_threshold * + ring_outlier_parameters_.object_length_threshold, + ring_outlier_parameters_.num_points_threshold); + + combineMasksKernel<<<blocksPerGrid, threadsPerBlock>>>( + device_self_crop_mask, device_mirror_crop_mask, device_ring_outlier_mask, num_input_points, + device_ring_outlier_mask); + + thrust::inclusive_scan( + thrust::device, device_ring_outlier_mask, device_ring_outlier_mask + num_input_points, + device_indices); + + uint32_t num_output_points; + cudaMemcpy( + &num_output_points, device_indices + num_input_points - 1, sizeof(uint32_t), + cudaMemcpyDeviceToHost); + + if (num_output_points > 0) { + extractInputPointsToOutputPoints_indicesKernel<<<blocksPerGrid, threadsPerBlock>>>( + device_transformed_points, device_ring_outlier_mask, device_indices, num_input_points, + reinterpret_cast<OutputPointType *>(output_pointcloud_ptr_->data.get())); + } + + // Copy the transformed points back + output_pointcloud_ptr_->row_step = num_output_points * sizeof(OutputPointType); + output_pointcloud_ptr_->width = num_output_points; + output_pointcloud_ptr_->height = 1; + + output_pointcloud_ptr_->fields = point_fields_; + output_pointcloud_ptr_->is_dense = true; + output_pointcloud_ptr_->is_bigendian = input_pointcloud_msg.is_bigendian; + output_pointcloud_ptr_->point_step = sizeof(OutputPointType); + output_pointcloud_ptr_->header.stamp = input_pointcloud_msg.header.stamp; + + return std::move(output_pointcloud_ptr_); +} + +} // namespace autoware::cuda_pointcloud_preprocessor diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.cpp b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.cpp new file mode 100644 index 0000000000000..fdfae020f1e19 --- /dev/null +++ b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.cpp @@ -0,0 +1,253 @@ +// Copyright 2024 TIER IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.hpp" + +#include "autoware/cuda_pointcloud_preprocessor/point_types.hpp" + +#include <autoware/point_types/types.hpp> + +#include <geometry_msgs/msg/transform_stamped.hpp> +#include <tf2_geometry_msgs/tf2_geometry_msgs.hpp> + +#include <cuda_runtime.h> +#include <pcl/pcl_base.h> +#include <pcl/point_cloud.h> +#include <pcl/point_types.h> + +#include <chrono> +#include <string> +#include <vector> + +namespace autoware::cuda_pointcloud_preprocessor +{ +using sensor_msgs::msg::PointCloud2; + +CudaPointcloudPreprocessorNode::CudaPointcloudPreprocessorNode( + const rclcpp::NodeOptions & node_options) +: Node("cuda_pointcloud_preprocessor", node_options), + tf2_buffer_(this->get_clock()), + tf2_listener_(tf2_buffer_) +{ + using std::placeholders::_1; + + // Parameters + base_frame_ = declare_parameter<std::string>("base_frame"); + + RingOutlierFilterParameters ring_outlier_filter_parameters; + ring_outlier_filter_parameters.distance_ratio = declare_parameter<float>("distance_ratio"); + ring_outlier_filter_parameters.object_length_threshold = + declare_parameter<float>("object_length_threshold"); + ring_outlier_filter_parameters.num_points_threshold = + declare_parameter<int>("num_points_threshold"); + + CropBoxParameters self_crop_box_parameters, mirror_crop_box_parameters; + self_crop_box_parameters.min_x = declare_parameter<float>("self_crop.min_x"); + self_crop_box_parameters.min_y = declare_parameter<float>("self_crop.min_y"); + self_crop_box_parameters.min_z = declare_parameter<float>("self_crop.min_z"); + self_crop_box_parameters.max_x = declare_parameter<float>("self_crop.max_x"); + self_crop_box_parameters.max_y = declare_parameter<float>("self_crop.max_y"); + self_crop_box_parameters.max_z = declare_parameter<float>("self_crop.max_z"); + + mirror_crop_box_parameters.min_x = declare_parameter<float>("mirror_crop.min_x"); + mirror_crop_box_parameters.min_y = declare_parameter<float>("mirror_crop.min_y"); + mirror_crop_box_parameters.min_z = declare_parameter<float>("mirror_crop.min_z"); + mirror_crop_box_parameters.max_x = declare_parameter<float>("mirror_crop.max_x"); + mirror_crop_box_parameters.max_y = declare_parameter<float>("mirror_crop.max_y"); + mirror_crop_box_parameters.max_z = declare_parameter<float>("mirror_crop.max_z"); + + bool use_3d_undistortion = declare_parameter<bool>("use_3d_distortion_correction"); + bool use_imu = declare_parameter<bool>("use_imu"); + + // Subscriber + sub_ = + std::make_unique<cuda_blackboard::CudaBlackboardSubscriber<cuda_blackboard::CudaPointCloud2>>( + *this, "~/input/pointcloud", false, + std::bind(&CudaPointcloudPreprocessorNode::pointcloudCallback, this, _1)); + + // Publisher + pub_ = + std::make_unique<cuda_blackboard::CudaBlackboardPublisher<cuda_blackboard::CudaPointCloud2>>( + *this, "~/output/pointcloud"); + twist_sub_ = this->create_subscription<geometry_msgs::msg::TwistWithCovarianceStamped>( + "~/input/twist", 10, + std::bind(&CudaPointcloudPreprocessorNode::twistCallback, this, std::placeholders::_1)); + + if (use_imu) { + imu_sub_ = this->create_subscription<sensor_msgs::msg::Imu>( + "~/input/imu", 10, + std::bind(&CudaPointcloudPreprocessorNode::imuCallback, this, std::placeholders::_1)); + } + + cuda_pointcloud_preprocessor_ = std::make_unique<CudaPointcloudPreprocessor>(); + cuda_pointcloud_preprocessor_->setRingOutlierFilterParameters(ring_outlier_filter_parameters); + cuda_pointcloud_preprocessor_->setCropBoxParameters( + self_crop_box_parameters, mirror_crop_box_parameters); + cuda_pointcloud_preprocessor_->set3DUndistortion(use_3d_undistortion); + + // initialize debug tool + { + using autoware::universe_utils::DebugPublisher; + using autoware::universe_utils::StopWatch; + + stop_watch_ptr_ = std::make_unique<StopWatch<std::chrono::milliseconds>>(); + debug_publisher_ = std::make_unique<DebugPublisher>(this, "cuda_pointcloud_preprocessor"); + stop_watch_ptr_->tic("processing_time"); + } +} + +bool CudaPointcloudPreprocessorNode::getTransform( + const std::string & target_frame, const std::string & source_frame, + tf2::Transform * tf2_transform_ptr) +{ + if (target_frame == source_frame) { + tf2_transform_ptr->setOrigin(tf2::Vector3(0.0, 0.0, 0.0)); + tf2_transform_ptr->setRotation(tf2::Quaternion(0.0, 0.0, 0.0, 1.0)); + return true; + } + + try { + const auto transform_msg = + tf2_buffer_.lookupTransform(target_frame, source_frame, tf2::TimePointZero); + tf2::convert(transform_msg.transform, *tf2_transform_ptr); + } catch (const tf2::TransformException & ex) { + RCLCPP_WARN(get_logger(), "%s", ex.what()); + RCLCPP_ERROR( + get_logger(), "Please publish TF %s to %s", target_frame.c_str(), source_frame.c_str()); + + tf2_transform_ptr->setOrigin(tf2::Vector3(0.0, 0.0, 0.0)); + tf2_transform_ptr->setRotation(tf2::Quaternion(0.0, 0.0, 0.0, 1.0)); + return false; + } + return true; +} + +void CudaPointcloudPreprocessorNode::twistCallback( + const geometry_msgs::msg::TwistWithCovarianceStamped::ConstSharedPtr twist_msg_ptr) +{ + twist_queue_.push_back(*twist_msg_ptr); + + while (!twist_queue_.empty()) { + // for replay rosbag + if ( + rclcpp::Time(twist_queue_.front().header.stamp) > rclcpp::Time(twist_msg_ptr->header.stamp)) { + twist_queue_.pop_front(); + } else if ( // NOLINT + rclcpp::Time(twist_queue_.front().header.stamp) < + rclcpp::Time(twist_msg_ptr->header.stamp) - rclcpp::Duration::from_seconds(1.0)) { + twist_queue_.pop_front(); + } + break; + } +} + +void CudaPointcloudPreprocessorNode::imuCallback( + const sensor_msgs::msg::Imu::ConstSharedPtr imu_msg) +{ + tf2::Transform tf2_imu_link_to_base_link{}; + getTransform(base_frame_, imu_msg->header.frame_id, &tf2_imu_link_to_base_link); + geometry_msgs::msg::TransformStamped::SharedPtr tf_base2imu_ptr = + std::make_shared<geometry_msgs::msg::TransformStamped>(); + tf_base2imu_ptr->transform.rotation = tf2::toMsg(tf2_imu_link_to_base_link.getRotation()); + + geometry_msgs::msg::Vector3Stamped angular_velocity; + angular_velocity.vector = imu_msg->angular_velocity; + + geometry_msgs::msg::Vector3Stamped transformed_angular_velocity; + tf2::doTransform(angular_velocity, transformed_angular_velocity, *tf_base2imu_ptr); + transformed_angular_velocity.header = imu_msg->header; + angular_velocity_queue_.push_back(transformed_angular_velocity); + + while (!angular_velocity_queue_.empty()) { + // for rosbag replay + if ( + rclcpp::Time(angular_velocity_queue_.front().header.stamp) > + rclcpp::Time(imu_msg->header.stamp)) { + angular_velocity_queue_.pop_front(); + } else if ( // NOLINT + rclcpp::Time(angular_velocity_queue_.front().header.stamp) < + rclcpp::Time(imu_msg->header.stamp) - rclcpp::Duration::from_seconds(1.0)) { + angular_velocity_queue_.pop_front(); + } + break; + } +} + +void CudaPointcloudPreprocessorNode::pointcloudCallback( + const std::shared_ptr<const cuda_blackboard::CudaPointCloud2> input_pointcloud_msg_ptr) +{ + static_assert( + sizeof(InputPointType) == sizeof(autoware::point_types::PointXYZIRCAEDT), + "PointStruct and PointXYZIRCAEDT must have the same size"); + + stop_watch_ptr_->toc("processing_time", true); + + // Make sure that the first twist is newer than the first point + InputPointType first_point; + cudaMemcpy( + &first_point, input_pointcloud_msg_ptr->data.get(), sizeof(InputPointType), + cudaMemcpyDeviceToHost); + double first_point_stamp = input_pointcloud_msg_ptr->header.stamp.sec + + input_pointcloud_msg_ptr->header.stamp.nanosec * 1e-9 + + first_point.time_stamp * 1e-9; + + while (twist_queue_.size() > 1 && + rclcpp::Time(twist_queue_.front().header.stamp).seconds() < first_point_stamp) { + twist_queue_.pop_front(); + } + + while (angular_velocity_queue_.size() > 1 && + rclcpp::Time(angular_velocity_queue_.front().header.stamp).seconds() < first_point_stamp) { + angular_velocity_queue_.pop_front(); + } + + // Obtain the base link to input pointcloud transform + geometry_msgs::msg::TransformStamped transform_msg; + + try { + transform_msg = tf2_buffer_.lookupTransform( + base_frame_, input_pointcloud_msg_ptr->header.frame_id, tf2::TimePointZero); + } catch (const tf2::TransformException & ex) { + RCLCPP_WARN(get_logger(), "%s", ex.what()); + return; + } + + auto output_pointcloud_ptr = cuda_pointcloud_preprocessor_->process( + *input_pointcloud_msg_ptr, transform_msg, twist_queue_, angular_velocity_queue_); + output_pointcloud_ptr->header.frame_id = base_frame_; + + // Publish + pub_->publish(std::move(output_pointcloud_ptr)); + + if (debug_publisher_) { + const double processing_time_ms = stop_watch_ptr_->toc("processing_time", true); + debug_publisher_->publish<tier4_debug_msgs::msg::Float64Stamped>( + "debug/processing_time_ms", processing_time_ms); + + double now_stamp_seconds = rclcpp::Time(this->get_clock()->now()).seconds(); + double cloud_stamp_seconds = rclcpp::Time(input_pointcloud_msg_ptr->header.stamp).seconds(); + + debug_publisher_->publish<tier4_debug_msgs::msg::Float64Stamped>( + "debug/latency_ms", 1000.f * (now_stamp_seconds - cloud_stamp_seconds)); + } + + // Preallocate for next iteration + cuda_pointcloud_preprocessor_->preallocateOutput(); +} + +} // namespace autoware::cuda_pointcloud_preprocessor + +#include "rclcpp_components/register_node_macro.hpp" +RCLCPP_COMPONENTS_REGISTER_NODE( + autoware::cuda_pointcloud_preprocessor::CudaPointcloudPreprocessorNode) From 774e0990b417ceca330fc0893738c7f43f72874d Mon Sep 17 00:00:00 2001 From: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp> Date: Mon, 25 Nov 2024 18:05:51 +0900 Subject: [PATCH 02/55] chore: fixed incorrect links Signed-off-by: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp> --- .../docs/cuda-pointcloud-preprocessor.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/docs/cuda-pointcloud-preprocessor.md b/sensing/autoware_cuda_pointcloud_preprocessor/docs/cuda-pointcloud-preprocessor.md index 459f15569d7ca..a82d49ba3d6ca 100644 --- a/sensing/autoware_cuda_pointcloud_preprocessor/docs/cuda-pointcloud-preprocessor.md +++ b/sensing/autoware_cuda_pointcloud_preprocessor/docs/cuda-pointcloud-preprocessor.md @@ -11,7 +11,7 @@ In particular, this node implements: ## Inner-workings / Algorithms -As this node reimplements the functionalities of the CPU-version algorithms, please have a look at the documentations of [crop-box](/sensing/autoware_pointcloud_preprocessor/docs/crop-box-filter.md), [distortion correction](/sensing/autoware_pointcloud_preprocessor/docs/distortion-corrector.md), and [ring-based outlier filter](/sensing/autoware_pointcloud_preprocessor/docs/ring-outlier-filter.md) for more information about these algorithms. +As this node reimplements the functionalities of the CPU-version algorithms, please have a look at the documentations of [crop-box](../../autoware_pointcloud_preprocessor/docs/crop-box-filter.md), [distortion correction](../../sensing/autoware_pointcloud_preprocessor/docs/distortion-corrector.md), and [ring-based outlier filter](../../sensing/autoware_pointcloud_preprocessor/docs/ring-outlier-filter.md) for more information about these algorithms. In addition to the individual algorithms previously mentioned, this node uses the `cuda_blackboard`, a cuda transport layer that enables a zero-copy mechanism between GPU and GPU memory for both input and output. From be04f760fe40da54c03ca7b5546bf37f2012ac5d Mon Sep 17 00:00:00 2001 From: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp> Date: Mon, 25 Nov 2024 18:09:23 +0900 Subject: [PATCH 03/55] chore: fixed dead links pt2 Signed-off-by: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp> --- .../docs/cuda-pointcloud-preprocessor.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/docs/cuda-pointcloud-preprocessor.md b/sensing/autoware_cuda_pointcloud_preprocessor/docs/cuda-pointcloud-preprocessor.md index a82d49ba3d6ca..69d800794701b 100644 --- a/sensing/autoware_cuda_pointcloud_preprocessor/docs/cuda-pointcloud-preprocessor.md +++ b/sensing/autoware_cuda_pointcloud_preprocessor/docs/cuda-pointcloud-preprocessor.md @@ -11,7 +11,7 @@ In particular, this node implements: ## Inner-workings / Algorithms -As this node reimplements the functionalities of the CPU-version algorithms, please have a look at the documentations of [crop-box](../../autoware_pointcloud_preprocessor/docs/crop-box-filter.md), [distortion correction](../../sensing/autoware_pointcloud_preprocessor/docs/distortion-corrector.md), and [ring-based outlier filter](../../sensing/autoware_pointcloud_preprocessor/docs/ring-outlier-filter.md) for more information about these algorithms. +As this node reimplements the functionalities of the CPU-version algorithms, please have a look at the documentations of [crop-box](../../autoware_pointcloud_preprocessor/docs/crop-box-filter.md), [distortion correction](../../autoware_pointcloud_preprocessor/docs/distortion-corrector.md), and [ring-based outlier filter](../../autoware_pointcloud_preprocessor/docs/ring-outlier-filter.md) for more information about these algorithms. In addition to the individual algorithms previously mentioned, this node uses the `cuda_blackboard`, a cuda transport layer that enables a zero-copy mechanism between GPU and GPU memory for both input and output. From db02ec78f27b2da0f94b0e06b231cc79881bd25d Mon Sep 17 00:00:00 2001 From: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp> Date: Mon, 25 Nov 2024 18:21:44 +0900 Subject: [PATCH 04/55] chore: fixed spelling errors Signed-off-by: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp> --- .../CMakeLists.txt | 2 +- .../README.md | 2 +- .../cuda_pointcloud_preprocessor.hpp | 12 ++-------- .../cuda_pointcloud_preprocessor_node.hpp | 1 - .../cuda_pointcloud_preprocessor.cu | 22 +++++++++---------- 5 files changed, 15 insertions(+), 24 deletions(-) diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/CMakeLists.txt b/sensing/autoware_cuda_pointcloud_preprocessor/CMakeLists.txt index 06b1f5d67540f..d09f787a3b277 100644 --- a/sensing/autoware_cuda_pointcloud_preprocessor/CMakeLists.txt +++ b/sensing/autoware_cuda_pointcloud_preprocessor/CMakeLists.txt @@ -32,7 +32,7 @@ include_directories( ${CUDA_INCLUDE_DIRS} ) -list(APPEND CUDA_NVCC_FLAGS "--expt-relaxed-constexpr -diag-suppress 20012") +list(APPEND CUDA_NVCC_FLAGS "--expt-relaxed-constexpr -diag-suppress 20012") # cSpell: ignore expt cuda_add_library(cuda_pointcloud_preprocessor_lib SHARED src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/README.md b/sensing/autoware_cuda_pointcloud_preprocessor/README.md index d15082b69a661..25d29a678b1ce 100644 --- a/sensing/autoware_cuda_pointcloud_preprocessor/README.md +++ b/sensing/autoware_cuda_pointcloud_preprocessor/README.md @@ -4,7 +4,7 @@ The pointcloud preprocessing implemented in `autoware_pointcloud_preprocessor` has been thoroughly tested in autoware. However, the latency it introduces does not scale well with modern LiDAR devices due to the high number of points they introduce. -To alleviate this issue, this package reimplements most of the pipeline presented in `autoware_pointcloud_preprocessor` leveraging the use of GPGPUs. In particular, this package makes use of CUDA to provide accelerated versions of the already establishes implementations, while also maintaining compatibility with normal ROS nodes/topics. +To alleviate this issue, this package reimplements most of the pipeline presented in `autoware_pointcloud_preprocessor` leveraging the use of GPGPUs. In particular, this package makes use of CUDA to provide accelerated versions of the already establishes implementations, while also maintaining compatibility with normal ROS nodes/topics. <!-- cSpell: ignore GPGPUs > ## Inner-workings / Algorithms diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.hpp b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.hpp index b4ae4eecc304e..2a22c340a268c 100644 --- a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.hpp +++ b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.hpp @@ -43,23 +43,15 @@ struct TwistStruct2D float cum_sin_theta; uint32_t last_stamp_nsec; // relative to the start of the pointcloud uint32_t stamp_nsec; // relative to the start of the pointcloud - float vx; - float vtheta; + float v_x; + float v_theta; }; struct TwistStruct3D { - // Eigen::Affine3f cum_transform; float cum_transform_buffer[16]; - /* float cum_x; - float cum_y; - float cum_theta; */ - /* float cum_cos_theta; - float cum_sin_theta; */ uint32_t last_stamp_nsec; // relative to the start of the pointcloud uint32_t stamp_nsec; // relative to the start of the pointcloud - /* float vx; - float vtheta; */ float v[3]; float w[3]; }; diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.hpp b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.hpp index 5fdf4258eebcb..a5c5361c92515 100644 --- a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.hpp +++ b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.hpp @@ -91,7 +91,6 @@ class CudaPointcloudPreprocessorNode : public rclcpp::Node tf2_ros::TransformListener tf2_listener_; std::string base_frame_; - bool imu_tranform_valid_{false}; std::deque<geometry_msgs::msg::TwistWithCovarianceStamped> twist_queue_; std::deque<geometry_msgs::msg::Vector3Stamped> angular_velocity_queue_; diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu index ad8910aef5b76..ec2fe8fc03e3b 100644 --- a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu +++ b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu @@ -176,8 +176,8 @@ __global__ void undistort2dKernel( point.time_stamp > twist.last_stamp_nsec ? point.time_stamp - twist.last_stamp_nsec : 0; double dt = 1e-9 * (dt_nsec); - theta += twist.vtheta * dt; - float d = twist.vx * dt; + theta += twist.v_theta * dt; + float d = twist.v_x * dt; x += d * cos(theta); y += d * sin(theta); @@ -400,16 +400,16 @@ void CudaPointcloudPreprocessor::setupTwist2DStructs( for (; twist_index < twist_queue.size() || angular_velocity_index < angular_velocity_queue.size();) { uint64_t twist_stamp, input_twist_global_stamp_nsec, angular_velocity_global_stamp_nsec; - float vx, vtheta; + float v_x, v_theta; if (twist_index < twist_queue.size()) { input_twist_global_stamp_nsec = 1'000'000'000 * static_cast<uint64_t>(twist_queue[twist_index].header.stamp.sec) + static_cast<uint64_t>(twist_queue[twist_index].header.stamp.nanosec); - vx = twist_queue[twist_index].twist.twist.linear.x; + v_x = twist_queue[twist_index].twist.twist.linear.x; } else { input_twist_global_stamp_nsec = std::numeric_limits<uint64_t>::max(); - vx = 0.0; + v_x = 0.0; } if (angular_velocity_index < angular_velocity_queue.size()) { @@ -417,10 +417,10 @@ void CudaPointcloudPreprocessor::setupTwist2DStructs( 1'000'000'000 * static_cast<uint64_t>(angular_velocity_queue[angular_velocity_index].header.stamp.sec) + static_cast<uint64_t>(angular_velocity_queue[angular_velocity_index].header.stamp.nanosec); - vtheta = angular_velocity_queue[angular_velocity_index].vector.z; + v_theta = angular_velocity_queue[angular_velocity_index].vector.z; } else { angular_velocity_global_stamp_nsec = std::numeric_limits<uint64_t>::max(); - vtheta = 0.0; + v_theta = 0.0; } if (input_twist_global_stamp_nsec < angular_velocity_global_stamp_nsec) { @@ -444,15 +444,15 @@ void CudaPointcloudPreprocessor::setupTwist2DStructs( uint32_t twist_from_pointcloud_start_nsec = twist_global_stamp_nsec - pointcloud_stamp_nsec; twist.stamp_nsec = twist_from_pointcloud_start_nsec; - twist.vx = vx; - twist.vtheta = vtheta; + twist.v_x = v_x; + twist.v_theta = v_theta; twist.last_stamp_nsec = last_stamp_nsec; host_twist_structs.push_back(twist); double dt_seconds = 1e-9 * (twist.stamp_nsec - last_stamp_nsec); last_stamp_nsec = twist.stamp_nsec; - cum_theta += vtheta * dt_seconds; - float d = twist.vx * dt_seconds; + cum_theta += v_theta * dt_seconds; + float d = twist.v_x * dt_seconds; cum_x += d * cos(cum_theta); cum_y += d * sin(cum_theta); } From 5218a4aad764a816f4b4a14bd60e656910b444b4 Mon Sep 17 00:00:00 2001 From: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp> Date: Mon, 25 Nov 2024 18:33:54 +0900 Subject: [PATCH 05/55] chore: json schema fixes Signed-off-by: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp> --- .../schema/cuda_pointcloud_preprocessor.schema.json | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/schema/cuda_pointcloud_preprocessor.schema.json b/sensing/autoware_cuda_pointcloud_preprocessor/schema/cuda_pointcloud_preprocessor.schema.json index 196f455ffe75b..cc94da3523512 100644 --- a/sensing/autoware_cuda_pointcloud_preprocessor/schema/cuda_pointcloud_preprocessor.schema.json +++ b/sensing/autoware_cuda_pointcloud_preprocessor/schema/cuda_pointcloud_preprocessor.schema.json @@ -3,7 +3,7 @@ "title": "Parameters for the cuda_pointcloud_preprocessor", "type": "object", "definitions": { - "crop_box_filter": { + "cuda_pointcloud_preprocessor": { "type": "object", "properties": { "base_frame": { @@ -23,19 +23,19 @@ }, "distance_ratio": { "type": "number", - "description": "distance_ratio", + "description": "distance ratio between consecutive points to determine is they belong to the same walk", "default": "1.03", "minimum": 0.0 }, "object_length_threshold": { "type": "number", - "description": "object_length_threshold", + "description": "minimum object length to be considered a valid cluster in meters", "default": "0.1", "minimum": 0.0 }, "num_points_threshold": { "type": "integer", - "description": "num_points_threshold", + "description": "threshold on the number of points in a walk to determine if it corresponds to a cluster", "default": "4", "minimum": 0 }, From 4a9daaf74613d28c91a3fd3e171649069be19258 Mon Sep 17 00:00:00 2001 From: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp> Date: Tue, 26 Nov 2024 13:39:27 +0900 Subject: [PATCH 06/55] chore: removed comments and filled the fields Signed-off-by: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp> --- .../cuda_organized_pointcloud_adapter_node.cpp | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_organized_pointcloud_adapter/cuda_organized_pointcloud_adapter_node.cpp b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_organized_pointcloud_adapter/cuda_organized_pointcloud_adapter_node.cpp index bd5717fbbcb5e..07f83230f21fe 100644 --- a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_organized_pointcloud_adapter/cuda_organized_pointcloud_adapter_node.cpp +++ b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_organized_pointcloud_adapter/cuda_organized_pointcloud_adapter_node.cpp @@ -86,17 +86,9 @@ void CudaOrganizedPointcloudAdapterNode::estimatePointcloudRingInfo( std::fill(next_ring_index_.begin(), next_ring_index_.end(), 0); buffer_.resize(num_rings_ * max_points_per_ring_); - /* if (device_buffer_ != nullptr) { - cudaFree(device_buffer_); - } */ - device_buffer_ = cuda_blackboard::make_unique<std::uint8_t[]>( num_rings_ * max_points_per_ring_ * sizeof(autoware::point_types::PointXYZIRCAEDT)); - /* cudaMalloc( - device_buffer_, - num_rings_ * max_points_per_ring_ * sizeof(autoware::point_types::PointXYZIRCAEDT)); */ - RCLCPP_INFO_STREAM( get_logger(), "Estimated rings: " << num_rings_ << ", max_points_per_ring: " << max_points_per_ring_ @@ -158,6 +150,7 @@ void CudaOrganizedPointcloudAdapterNode::pointcloudCallback( cudaMemcpyHostToDevice); auto cuda_pointcloud_msg_ptr = std::make_unique<cuda_blackboard::CudaPointCloud2>(); + cuda_pointcloud_msg_ptr->fields = input_pointcloud_msg_ptr->fields; cuda_pointcloud_msg_ptr->width = max_points_per_ring_; cuda_pointcloud_msg_ptr->height = num_rings_; cuda_pointcloud_msg_ptr->point_step = sizeof(autoware::point_types::PointXYZIRCAEDT); @@ -186,9 +179,7 @@ void CudaOrganizedPointcloudAdapterNode::pointcloudCallback( // Allocate cuda memory device_buffer_ = cuda_blackboard::make_unique<std::uint8_t[]>( num_rings_ * max_points_per_ring_ * sizeof(autoware::point_types::PointXYZIRCAEDT)); - /* cudaMalloc( - &device_buffer_, - num_rings_ * max_points_per_ring_ * sizeof(autoware::point_types::PointXYZIRCAEDT)); */ + // Clear indexes std::fill(next_ring_index_.begin(), next_ring_index_.end(), 0); From 84a4b9f11c9d562297c1e5445e1b246e5ebdd03d Mon Sep 17 00:00:00 2001 From: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp> Date: Tue, 26 Nov 2024 14:24:58 +0900 Subject: [PATCH 07/55] fix: fixed the adapter for the case when the number of points in the pointcloud changes after the first iteration Signed-off-by: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp> --- .../cuda_organized_pointcloud_adapter_node.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_organized_pointcloud_adapter/cuda_organized_pointcloud_adapter_node.cpp b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_organized_pointcloud_adapter/cuda_organized_pointcloud_adapter_node.cpp index 07f83230f21fe..c5864129d05c9 100644 --- a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_organized_pointcloud_adapter/cuda_organized_pointcloud_adapter_node.cpp +++ b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_organized_pointcloud_adapter/cuda_organized_pointcloud_adapter_node.cpp @@ -119,7 +119,7 @@ bool CudaOrganizedPointcloudAdapterNode::orderPointcloud( point_overflow |= raw_index >= max_points_per_ring_; buffer_[ring * max_points_per_ring_ + index] = point; - next_ring_index_[ring] = (index + 1) % max_points_per_ring_; + next_ring_index_[ring] = raw_index + 1; } return !ring_overflow && !point_overflow; From b3c1d7279e7a26fc7a134567b17c1dec5402d064 Mon Sep 17 00:00:00 2001 From: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp> Date: Mon, 23 Dec 2024 14:04:55 +0900 Subject: [PATCH 08/55] feat: used the cuda host allocators for aster host to device copies Signed-off-by: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp> --- .../cuda_organized_pointcloud_adapter_node.hpp | 3 +-- .../cuda_organized_pointcloud_adapter_node.cpp | 14 ++++++++++---- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_organized_pointcloud_adapter/cuda_organized_pointcloud_adapter_node.hpp b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_organized_pointcloud_adapter/cuda_organized_pointcloud_adapter_node.hpp index 8930a1f87e63b..7a31442f26f1d 100644 --- a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_organized_pointcloud_adapter/cuda_organized_pointcloud_adapter_node.hpp +++ b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_organized_pointcloud_adapter/cuda_organized_pointcloud_adapter_node.hpp @@ -64,8 +64,7 @@ class CudaOrganizedPointcloudAdapterNode : public rclcpp::Node std::size_t max_points_per_ring_{0}; std::vector<std::size_t> next_ring_index_; - std::vector<autoware::point_types::PointXYZIRCAEDT> buffer_; - // autoware::point_types::PointXYZIRCAEDT * device_buffer_; + cuda_blackboard::HostUniquePtr<autoware::point_types::PointXYZIRCAEDT[]> host_buffer_; cuda_blackboard::CudaUniquePtr<std::uint8_t[]> device_buffer_; std::unique_ptr<autoware::universe_utils::StopWatch<std::chrono::milliseconds>> stop_watch_ptr_; diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_organized_pointcloud_adapter/cuda_organized_pointcloud_adapter_node.cpp b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_organized_pointcloud_adapter/cuda_organized_pointcloud_adapter_node.cpp index c5864129d05c9..b7eeccafa2d5a 100644 --- a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_organized_pointcloud_adapter/cuda_organized_pointcloud_adapter_node.cpp +++ b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_organized_pointcloud_adapter/cuda_organized_pointcloud_adapter_node.cpp @@ -84,7 +84,8 @@ void CudaOrganizedPointcloudAdapterNode::estimatePointcloudRingInfo( next_ring_index_.resize(num_rings_); std::fill(next_ring_index_.begin(), next_ring_index_.end(), 0); - buffer_.resize(num_rings_ * max_points_per_ring_); + host_buffer_ = cuda_blackboard::make_host_unique<autoware::point_types::PointXYZIRCAEDT[]>( + num_rings_ * max_points_per_ring_); device_buffer_ = cuda_blackboard::make_unique<std::uint8_t[]>( num_rings_ * max_points_per_ring_ * sizeof(autoware::point_types::PointXYZIRCAEDT)); @@ -106,6 +107,8 @@ bool CudaOrganizedPointcloudAdapterNode::orderPointcloud( bool ring_overflow = false; bool point_overflow = false; + autoware::point_types::PointXYZIRCAEDT * buffer = host_buffer_.get(); + for (std::size_t i = 0; i < input_pointcloud_msg_ptr->width * input_pointcloud_msg_ptr->height; i++) { const autoware::point_types::PointXYZIRCAEDT & point = input_buffer[i]; @@ -118,7 +121,7 @@ bool CudaOrganizedPointcloudAdapterNode::orderPointcloud( ring_overflow |= raw_ring >= num_rings_; point_overflow |= raw_index >= max_points_per_ring_; - buffer_[ring * max_points_per_ring_ + index] = point; + buffer[ring * max_points_per_ring_ + index] = point; next_ring_index_[ring] = raw_index + 1; } @@ -145,7 +148,7 @@ void CudaOrganizedPointcloudAdapterNode::pointcloudCallback( // Copy to cuda memory cudaMemcpy( - device_buffer_.get(), buffer_.data(), + device_buffer_.get(), host_buffer_.get(), num_rings_ * max_points_per_ring_ * sizeof(autoware::point_types::PointXYZIRCAEDT), cudaMemcpyHostToDevice); @@ -184,7 +187,10 @@ void CudaOrganizedPointcloudAdapterNode::pointcloudCallback( std::fill(next_ring_index_.begin(), next_ring_index_.end(), 0); // Clear pointcloud buffer - std::fill(buffer_.begin(), buffer_.end(), autoware::point_types::PointXYZIRCAEDT{}); + auto host_buffer = host_buffer_.get(); + for (std::size_t i = 0; i < num_rings_ * max_points_per_ring_; i++) { + host_buffer[i] = autoware::point_types::PointXYZIRCAEDT{}; + } } } // namespace autoware::cuda_organized_pointcloud_adapter From 887f162beba008759e3b56d30af99005c5448ad5 Mon Sep 17 00:00:00 2001 From: Kenzo Lobos Tsunekawa <kenzo.lobos@tier4.jp> Date: Fri, 10 Jan 2025 18:29:18 +0900 Subject: [PATCH 09/55] Update sensing/autoware_cuda_pointcloud_preprocessor/docs/cuda-pointcloud-preprocessor.md Co-authored-by: Max Schmeller <6088931+mojomex@users.noreply.github.com> --- .../docs/cuda-pointcloud-preprocessor.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/docs/cuda-pointcloud-preprocessor.md b/sensing/autoware_cuda_pointcloud_preprocessor/docs/cuda-pointcloud-preprocessor.md index 69d800794701b..2d360321627d2 100644 --- a/sensing/autoware_cuda_pointcloud_preprocessor/docs/cuda-pointcloud-preprocessor.md +++ b/sensing/autoware_cuda_pointcloud_preprocessor/docs/cuda-pointcloud-preprocessor.md @@ -41,6 +41,6 @@ In addition to the individual algorithms previously mentioned, this node uses th ## Assumptions / Known limits -- The CUDA implementations, while following the original CPU ones, will not offer the same numerical results, and small approximations were needed to maximize the GPU use. +- The CUDA implementations, while following the original CPU ones, will not offer the same numerical results, and small approximations were needed to maximize GPU usage. - This node expects that the input pointcloud follows the `autoware::point_types::PointXYZIRCAEDT` layout and the output pointcloud will use the `autoware::point_types::PointXYZIRC` layout defined in the `autoware_point_types` package. - The input pointcloud must be a 2D tensor where each row represents a different channel/ring with its points in non-decreasing azimuth order. Invalid points should contain 0-values. The `cuda_organized_pointcloud_adapter` provides such a pointcloud. From 0b46fb68140c843b5d03d2df21a76d8eaa6c6bfe Mon Sep 17 00:00:00 2001 From: Kenzo Lobos Tsunekawa <kenzo.lobos@tier4.jp> Date: Fri, 10 Jan 2025 18:31:45 +0900 Subject: [PATCH 10/55] Update sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu Co-authored-by: Manato Hirabayashi <3022416+manato@users.noreply.github.com> --- .../cuda_pointcloud_preprocessor.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu index ec2fe8fc03e3b..d7091f88d3c5a 100644 --- a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu +++ b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu @@ -107,7 +107,7 @@ __global__ void cropBoxKernel( } __global__ void combineMasksKernel( - uint32_t * mask1, uint32_t * mask2, uint32_t * mask3, int num_points, uint32_t * output_mask) + const uint32_t * __restrict__ mask1, const uint32_t * __restrict__ mask2, const uint32_t * __restrict__ mask3, int num_points, uint32_t * output_mask) { int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < num_points) { From 970055e43ec88b8159868f32f169c877fd3311cf Mon Sep 17 00:00:00 2001 From: Kenzo Lobos Tsunekawa <kenzo.lobos@tier4.jp> Date: Fri, 10 Jan 2025 18:32:35 +0900 Subject: [PATCH 11/55] Update sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu Co-authored-by: Manato Hirabayashi <3022416+manato@users.noreply.github.com> --- .../cuda_pointcloud_preprocessor.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu index d7091f88d3c5a..b1793d56e9a8a 100644 --- a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu +++ b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu @@ -71,7 +71,7 @@ __host__ __device__ Eigen::Matrix4f transformationMatrixFromVelocity( } __global__ void transformPointsKernel( - const InputPointType * input_points, InputPointType * output_points, int num_points, + const InputPointType * __restrict__ input_points, InputPointType * output_points, int num_points, TransformStruct transform) { int idx = blockIdx.x * blockDim.x + threadIdx.x; From 71b8a6fbc3d8927240ed05fc75de7f16e6fb9dc0 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 10 Jan 2025 09:35:04 +0000 Subject: [PATCH 12/55] style(pre-commit): autofix --- .../cuda_pointcloud_preprocessor.cu | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu index b1793d56e9a8a..4c769fd24471f 100644 --- a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu +++ b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu @@ -107,7 +107,8 @@ __global__ void cropBoxKernel( } __global__ void combineMasksKernel( - const uint32_t * __restrict__ mask1, const uint32_t * __restrict__ mask2, const uint32_t * __restrict__ mask3, int num_points, uint32_t * output_mask) + const uint32_t * __restrict__ mask1, const uint32_t * __restrict__ mask2, + const uint32_t * __restrict__ mask3, int num_points, uint32_t * output_mask) { int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < num_points) { From d6ee47f7c7fc2c205eb12db7b7d47dfe3cc9b663 Mon Sep 17 00:00:00 2001 From: Kenzo Lobos Tsunekawa <kenzo.lobos@tier4.jp> Date: Fri, 10 Jan 2025 18:39:19 +0900 Subject: [PATCH 13/55] Update sensing/autoware_cuda_pointcloud_preprocessor/docs/cuda-pointcloud-preprocessor.md Co-authored-by: Max Schmeller <6088931+mojomex@users.noreply.github.com> --- .../docs/cuda-pointcloud-preprocessor.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/docs/cuda-pointcloud-preprocessor.md b/sensing/autoware_cuda_pointcloud_preprocessor/docs/cuda-pointcloud-preprocessor.md index 2d360321627d2..4a7e2d070ccb6 100644 --- a/sensing/autoware_cuda_pointcloud_preprocessor/docs/cuda-pointcloud-preprocessor.md +++ b/sensing/autoware_cuda_pointcloud_preprocessor/docs/cuda-pointcloud-preprocessor.md @@ -5,7 +5,7 @@ This node implements all standard pointcloud preprocessing algorithms applied to a single LiDAR's pointcloud in CUDA. In particular, this node implements: -- crop boxing (ego-vehicle and ego-vehicle's mirrors) +- box cropping (ego-vehicle and ego-vehicle's mirrors) - distortion correction - ring-based outlier filtering From 3f6953ff2e3c7db46309aa30b3e05a3bad72a23c Mon Sep 17 00:00:00 2001 From: Kenzo Lobos Tsunekawa <kenzo.lobos@tier4.jp> Date: Fri, 10 Jan 2025 18:46:14 +0900 Subject: [PATCH 14/55] Update sensing/autoware_cuda_pointcloud_preprocessor/README.md Co-authored-by: Max Schmeller <6088931+mojomex@users.noreply.github.com> --- sensing/autoware_cuda_pointcloud_preprocessor/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/README.md b/sensing/autoware_cuda_pointcloud_preprocessor/README.md index 25d29a678b1ce..984da0bcaa518 100644 --- a/sensing/autoware_cuda_pointcloud_preprocessor/README.md +++ b/sensing/autoware_cuda_pointcloud_preprocessor/README.md @@ -4,7 +4,7 @@ The pointcloud preprocessing implemented in `autoware_pointcloud_preprocessor` has been thoroughly tested in autoware. However, the latency it introduces does not scale well with modern LiDAR devices due to the high number of points they introduce. -To alleviate this issue, this package reimplements most of the pipeline presented in `autoware_pointcloud_preprocessor` leveraging the use of GPGPUs. In particular, this package makes use of CUDA to provide accelerated versions of the already establishes implementations, while also maintaining compatibility with normal ROS nodes/topics. <!-- cSpell: ignore GPGPUs > +To alleviate this issue, this package reimplements most of the pipeline presented in `autoware_pointcloud_preprocessor` leveraging the use of GPGPUs. In particular, this package makes use of CUDA to provide accelerated versions of the already established implementations, while also maintaining compatibility with normal ROS nodes/topics. <!-- cSpell: ignore GPGPUs > ## Inner-workings / Algorithms From db99ba64a6528a38a2be98b5ec8cbdfe40e82017 Mon Sep 17 00:00:00 2001 From: Kenzo Lobos Tsunekawa <kenzo.lobos@tier4.jp> Date: Fri, 10 Jan 2025 18:47:54 +0900 Subject: [PATCH 15/55] Update sensing/autoware_cuda_pointcloud_preprocessor/README.md Co-authored-by: Max Schmeller <6088931+mojomex@users.noreply.github.com> --- sensing/autoware_cuda_pointcloud_preprocessor/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/README.md b/sensing/autoware_cuda_pointcloud_preprocessor/README.md index 984da0bcaa518..3c87883971e7e 100644 --- a/sensing/autoware_cuda_pointcloud_preprocessor/README.md +++ b/sensing/autoware_cuda_pointcloud_preprocessor/README.md @@ -13,7 +13,7 @@ A detailed description of each filter's algorithm is available in the following | Filter Name | Description | Detail | | --------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------- | | cuda_organized_pointcloud_adapter | Organizes a pointcloud per ring/channel, so that the memory layout allows parallel processing in cuda | [link](docs/cuda-organized-pointcloud-adapter.md) | -| cuda_pointcloud_preprocessor | Implements the cropping, distortion correction, and outlier filtering (ring-based) of the `autoware_pointcloud_preprocessor`'s cpu versions. | [link](docs/cuda-pointcloud-preprocessor.md) | +| cuda_pointcloud_preprocessor | Implements the cropping, distortion correction, and outlier filtering (ring-based) of the `autoware_pointcloud_preprocessor`'s CPU versions. | [link](docs/cuda-pointcloud-preprocessor.md) | ## (Optional) Future extensions / Unimplemented parts From 9ce2e06ca3dc0e2aa23296cdf10ce528d1eacc2b Mon Sep 17 00:00:00 2001 From: Kenzo Lobos Tsunekawa <kenzo.lobos@tier4.jp> Date: Fri, 10 Jan 2025 19:09:27 +0900 Subject: [PATCH 16/55] Update sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu Co-authored-by: Max Schmeller <6088931+mojomex@users.noreply.github.com> --- .../cuda_pointcloud_preprocessor.cu | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu index 4c769fd24471f..b43d000f82614 100644 --- a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu +++ b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu @@ -98,11 +98,7 @@ __global__ void cropBoxKernel( const float y = d_points[idx].y; const float z = d_points[idx].z; - if (x > min_x && x < max_x && y > min_y && y < max_y && z > min_z && z < max_z) { - output_mask[idx] = 0; - } else { - output_mask[idx] = 1; - } + output_mask[idx] = (x <= min_x || x >= max_x) || (y <= min_y || y >= max_y) || (z <= min_z || z >= max_z); } } From f4749002d9da7753d088845dec1d5a3ac63c596e Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 10 Jan 2025 10:12:18 +0000 Subject: [PATCH 17/55] style(pre-commit): autofix --- .../cuda_pointcloud_preprocessor.cu | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu index b43d000f82614..07450a3b77ecc 100644 --- a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu +++ b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu @@ -98,7 +98,8 @@ __global__ void cropBoxKernel( const float y = d_points[idx].y; const float z = d_points[idx].z; - output_mask[idx] = (x <= min_x || x >= max_x) || (y <= min_y || y >= max_y) || (z <= min_z || z >= max_z); + output_mask[idx] = + (x <= min_x || x >= max_x) || (y <= min_y || y >= max_y) || (z <= min_z || z >= max_z); } } From 594143567849e13a2fc91496b2fa74b95cb031e9 Mon Sep 17 00:00:00 2001 From: Kenzo Lobos Tsunekawa <kenzo.lobos@tier4.jp> Date: Tue, 21 Jan 2025 09:16:00 +0900 Subject: [PATCH 18/55] Update sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu Co-authored-by: Manato Hirabayashi <3022416+manato@users.noreply.github.com> --- .../cuda_pointcloud_preprocessor.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu index 07450a3b77ecc..a37d05ff91313 100644 --- a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu +++ b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu @@ -89,7 +89,7 @@ __global__ void transformPointsKernel( } __global__ void cropBoxKernel( - InputPointType * d_points, uint32_t * output_mask, int num_points, float min_x, float min_y, + InputPointType * __restrict__ d_points, uint32_t * output_mask, int num_points, float min_x, float min_y, float min_z, float max_x, float max_y, float max_z) { int idx = blockIdx.x * blockDim.x + threadIdx.x; From 90ad0b23c606764ffa9c3f4089bfb38f258ed84a Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 21 Jan 2025 00:18:40 +0000 Subject: [PATCH 19/55] style(pre-commit): autofix --- .../cuda_pointcloud_preprocessor.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu index a37d05ff91313..e78b65d53d4c4 100644 --- a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu +++ b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu @@ -89,8 +89,8 @@ __global__ void transformPointsKernel( } __global__ void cropBoxKernel( - InputPointType * __restrict__ d_points, uint32_t * output_mask, int num_points, float min_x, float min_y, - float min_z, float max_x, float max_y, float max_z) + InputPointType * __restrict__ d_points, uint32_t * output_mask, int num_points, float min_x, + float min_y, float min_z, float max_x, float max_y, float max_z) { int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < num_points) { From 13aff8cdce6b1112e033a4a7444dfcfcb2e45eac Mon Sep 17 00:00:00 2001 From: Kenzo Lobos Tsunekawa <kenzo.lobos@tier4.jp> Date: Tue, 21 Jan 2025 09:27:51 +0900 Subject: [PATCH 20/55] Update sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu Co-authored-by: Manato Hirabayashi <3022416+manato@users.noreply.github.com> --- .../cuda_pointcloud_preprocessor.cu | 36 +++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu index e78b65d53d4c4..e7fee664266a1 100644 --- a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu +++ b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu @@ -341,8 +341,44 @@ CudaPointcloudPreprocessor::CudaPointcloudPreprocessor() point_fields_.push_back(intensity_field); point_fields_.push_back(return_type_field); point_fields_.push_back(channel_field); + cudaMemPoolProps pool_props; + memset(&pool_props, 0, sizeof(cudaMemPoolProps)); + pool_props.allocType = cudaMemAllocationTypePinned; + pool_props.handleTypes = cudaMemHandleTypePosixFileDescriptor; + + pool_props.location.type = cudaMemLocationTypeDevice; + cudaGetDevice(&(pool_props.location.id)); + + // cudaMemPool_t device_memory_pool_ needs to be declared as a member of this class + cudaMemPoolCreate(&device_memory_pool_, &pool_props); + MemoryPoolAllocator<TwistStruct2D> allocator_2d(device_memory_pool_); + MemoryPoolAllocator<TwistStruct3D> allocator_3d(device_memory_pool_); + device_twist_2d_structs_ + = thrust::device_vector<TwistStruct2D, MemoryPoolAllocator<TwistStruct2D>>(allocator_2d); + device_twist_3d_structs_ + = thrust::device_vector<TwistStruct3D, MemoryPoolAllocator<TwistStruct3D>>(allocator_3d); } +template <typename T> +class MemoryPoolAllocator { + public: + using value_type = T; + MemoryPoolAllocator(cudaMemPool_t pool) : m_pool(pool) {} + + T* allocate(std::size_t n) { + void* ptr = nullptr; + cudaMallocFromPoolAsync(&ptr, n * sizeof(T), m_pool, cudaStreamDefault); + return static_cast<T*>(ptr); + } + + void deallocate(T* ptr, std::size_t) { + cudaFreeAsync(ptr, cudaStreamDefault); + } + + protected: + cudaMemPool_t m_pool; +}; // MemoryPoolAllocator + void CudaPointcloudPreprocessor::setCropBoxParameters( const CropBoxParameters & self_crop_box_parameters, const CropBoxParameters & mirror_crop_box_parameters) From f0e4c205ad2de03e282e9f8027b385c4ebee7889 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 21 Jan 2025 00:30:23 +0000 Subject: [PATCH 21/55] style(pre-commit): autofix --- .../cuda_pointcloud_preprocessor.cu | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu index e7fee664266a1..38d6457f10025 100644 --- a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu +++ b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu @@ -353,29 +353,29 @@ CudaPointcloudPreprocessor::CudaPointcloudPreprocessor() cudaMemPoolCreate(&device_memory_pool_, &pool_props); MemoryPoolAllocator<TwistStruct2D> allocator_2d(device_memory_pool_); MemoryPoolAllocator<TwistStruct3D> allocator_3d(device_memory_pool_); - device_twist_2d_structs_ - = thrust::device_vector<TwistStruct2D, MemoryPoolAllocator<TwistStruct2D>>(allocator_2d); - device_twist_3d_structs_ - = thrust::device_vector<TwistStruct3D, MemoryPoolAllocator<TwistStruct3D>>(allocator_3d); + device_twist_2d_structs_ = + thrust::device_vector<TwistStruct2D, MemoryPoolAllocator<TwistStruct2D>>(allocator_2d); + device_twist_3d_structs_ = + thrust::device_vector<TwistStruct3D, MemoryPoolAllocator<TwistStruct3D>>(allocator_3d); } template <typename T> -class MemoryPoolAllocator { - public: +class MemoryPoolAllocator +{ +public: using value_type = T; MemoryPoolAllocator(cudaMemPool_t pool) : m_pool(pool) {} - T* allocate(std::size_t n) { - void* ptr = nullptr; + T * allocate(std::size_t n) + { + void * ptr = nullptr; cudaMallocFromPoolAsync(&ptr, n * sizeof(T), m_pool, cudaStreamDefault); - return static_cast<T*>(ptr); + return static_cast<T *>(ptr); } - void deallocate(T* ptr, std::size_t) { - cudaFreeAsync(ptr, cudaStreamDefault); - } + void deallocate(T * ptr, std::size_t) { cudaFreeAsync(ptr, cudaStreamDefault); } - protected: +protected: cudaMemPool_t m_pool; }; // MemoryPoolAllocator From d05b988ba18630a3831cddf899c657e4680ec9fc Mon Sep 17 00:00:00 2001 From: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp> Date: Tue, 21 Jan 2025 09:52:02 +0900 Subject: [PATCH 22/55] chore: fixed code compilation to reflect Hirabayashi-san's memory pool proposal Signed-off-by: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp> --- .../cuda_pointcloud_preprocessor.hpp | 21 +++++++++++++++++++ .../cuda_pointcloud_preprocessor.cu | 21 ------------------- 2 files changed, 21 insertions(+), 21 deletions(-) diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.hpp b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.hpp index 2a22c340a268c..b53ff31074f24 100644 --- a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.hpp +++ b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.hpp @@ -89,6 +89,26 @@ struct RingOutlierFilterParameters std::size_t num_points_threshold{0}; }; +template <typename T> +class MemoryPoolAllocator +{ +public: + using value_type = T; + MemoryPoolAllocator(cudaMemPool_t pool) : m_pool(pool) {} + + T * allocate(std::size_t n) + { + void * ptr = nullptr; + cudaMallocFromPoolAsync(&ptr, n * sizeof(T), m_pool, cudaStreamDefault); + return static_cast<T *>(ptr); + } + + void deallocate(T * ptr, std::size_t) { cudaFreeAsync(ptr, cudaStreamDefault); } + +protected: + cudaMemPool_t m_pool; +}; + class CudaPointcloudPreprocessor { public: @@ -131,6 +151,7 @@ class CudaPointcloudPreprocessor std::vector<sensor_msgs::msg::PointField> point_fields_{}; std::unique_ptr<cuda_blackboard::CudaPointCloud2> output_pointcloud_ptr_{}; + cudaMemPool_t device_memory_pool_; thrust::device_vector<InputPointType> device_transformed_points_{}; thrust::device_vector<OutputPointType> device_output_points_{}; thrust::device_vector<uint32_t> device_self_crop_mask_{}; diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu index 38d6457f10025..596b1ce01332b 100644 --- a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu +++ b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu @@ -349,7 +349,6 @@ CudaPointcloudPreprocessor::CudaPointcloudPreprocessor() pool_props.location.type = cudaMemLocationTypeDevice; cudaGetDevice(&(pool_props.location.id)); - // cudaMemPool_t device_memory_pool_ needs to be declared as a member of this class cudaMemPoolCreate(&device_memory_pool_, &pool_props); MemoryPoolAllocator<TwistStruct2D> allocator_2d(device_memory_pool_); MemoryPoolAllocator<TwistStruct3D> allocator_3d(device_memory_pool_); @@ -359,26 +358,6 @@ CudaPointcloudPreprocessor::CudaPointcloudPreprocessor() thrust::device_vector<TwistStruct3D, MemoryPoolAllocator<TwistStruct3D>>(allocator_3d); } -template <typename T> -class MemoryPoolAllocator -{ -public: - using value_type = T; - MemoryPoolAllocator(cudaMemPool_t pool) : m_pool(pool) {} - - T * allocate(std::size_t n) - { - void * ptr = nullptr; - cudaMallocFromPoolAsync(&ptr, n * sizeof(T), m_pool, cudaStreamDefault); - return static_cast<T *>(ptr); - } - - void deallocate(T * ptr, std::size_t) { cudaFreeAsync(ptr, cudaStreamDefault); } - -protected: - cudaMemPool_t m_pool; -}; // MemoryPoolAllocator - void CudaPointcloudPreprocessor::setCropBoxParameters( const CropBoxParameters & self_crop_box_parameters, const CropBoxParameters & mirror_crop_box_parameters) From 5e0cc4758ea69a7c2fd710b3b318710d2f80b93e Mon Sep 17 00:00:00 2001 From: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp> Date: Thu, 23 Jan 2025 16:16:46 +0900 Subject: [PATCH 23/55] feat: generalized the number of crop boxes. For two at least, the new approach is actually faster Signed-off-by: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp> --- .../cuda_pointcloud_preprocessor.hpp | 29 ++++--- .../cuda_pointcloud_preprocessor.cu | 86 +++++++++++-------- .../cuda_pointcloud_preprocessor_node.cpp | 46 ++++++---- 3 files changed, 94 insertions(+), 67 deletions(-) diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.hpp b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.hpp index b53ff31074f24..94c5915d08619 100644 --- a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.hpp +++ b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.hpp @@ -74,12 +74,12 @@ struct TransformStruct struct CropBoxParameters { - float min_x{std::nanf("")}; - float max_x{std::nanf("")}; - float min_y{std::nanf("")}; - float max_y{std::nanf("")}; - float min_z{std::nanf("")}; - float max_z{std::nanf("")}; + float min_x; + float max_x; + float min_y; + float max_y; + float min_z; + float max_z; }; struct RingOutlierFilterParameters @@ -115,9 +115,7 @@ class CudaPointcloudPreprocessor CudaPointcloudPreprocessor(); ~CudaPointcloudPreprocessor() = default; - void setCropBoxParameters( - const CropBoxParameters & self_crop_box_parameters, - const CropBoxParameters & mirror_crop_box_parameters); + void setCropBoxParameters(const std::vector<CropBoxParameters> & crop_box_parameters); void setRingOutlierFilterParameters(const RingOutlierFilterParameters & ring_outlier_parameters); void set3DUndistortion(bool value); @@ -151,15 +149,20 @@ class CudaPointcloudPreprocessor std::vector<sensor_msgs::msg::PointField> point_fields_{}; std::unique_ptr<cuda_blackboard::CudaPointCloud2> output_pointcloud_ptr_{}; + cudaStream_t stream_; + int max_blocks_per_grid_{}; + const int threads_per_block_{256}; cudaMemPool_t device_memory_pool_; thrust::device_vector<InputPointType> device_transformed_points_{}; thrust::device_vector<OutputPointType> device_output_points_{}; - thrust::device_vector<uint32_t> device_self_crop_mask_{}; - thrust::device_vector<uint32_t> device_mirror_crop_mask_{}; - thrust::device_vector<uint32_t> device_ring_outlier_mask_{}; - thrust::device_vector<uint32_t> device_indices_{}; + thrust::device_vector<std::uint32_t> device_crop_mask_{}; + thrust::device_vector<std::uint32_t> device_ring_outlier_mask_{}; + thrust::device_vector<std::uint32_t> device_indices_{}; thrust::device_vector<TwistStruct2D> device_twist_2d_structs_{}; thrust::device_vector<TwistStruct3D> device_twist_3d_structs_{}; + + thrust::device_vector<CropBoxParameters> host_crop_box_structs_{}; + thrust::device_vector<CropBoxParameters> device_crop_box_structs_{}; }; } // namespace autoware::cuda_pointcloud_preprocessor diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu index 596b1ce01332b..0e2cd45eda0f8 100644 --- a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu +++ b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu @@ -89,27 +89,40 @@ __global__ void transformPointsKernel( } __global__ void cropBoxKernel( - InputPointType * __restrict__ d_points, uint32_t * output_mask, int num_points, float min_x, - float min_y, float min_z, float max_x, float max_y, float max_z) + InputPointType * __restrict__ d_points, std::uint32_t * __restrict__ output_mask, int num_points, + const CropBoxParameters * __restrict__ crop_box_parameters_ptr, int num_crop_boxes) { - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < num_points) { + for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < num_points; + idx += blockDim.x * gridDim.x) { const float x = d_points[idx].x; const float y = d_points[idx].y; const float z = d_points[idx].z; - output_mask[idx] = - (x <= min_x || x >= max_x) || (y <= min_y || y >= max_y) || (z <= min_z || z >= max_z); + std::uint32_t mask = 1; + + for (int i = 0; i < num_crop_boxes; i++) { + const CropBoxParameters & crop_box_parameters = crop_box_parameters_ptr[i]; + const float & min_x = crop_box_parameters.min_x; + const float & min_y = crop_box_parameters.min_y; + const float & min_z = crop_box_parameters.min_z; + const float & max_x = crop_box_parameters.max_x; + const float & max_y = crop_box_parameters.max_y; + const float & max_z = crop_box_parameters.max_z; + mask &= + (x <= min_x || x >= max_x) || (y <= min_y || y >= max_y) || (z <= min_z || z >= max_z); + } + + output_mask[idx] = mask; } } __global__ void combineMasksKernel( - const uint32_t * __restrict__ mask1, const uint32_t * __restrict__ mask2, - const uint32_t * __restrict__ mask3, int num_points, uint32_t * output_mask) + const uint32_t * __restrict__ mask1, const uint32_t * __restrict__ mask2, int num_points, + uint32_t * __restrict__ output_mask) { int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < num_points) { - output_mask[idx] = mask1[idx] & mask2[idx] & mask3[idx]; + output_mask[idx] = mask1[idx] & mask2[idx]; } } @@ -341,6 +354,13 @@ CudaPointcloudPreprocessor::CudaPointcloudPreprocessor() point_fields_.push_back(intensity_field); point_fields_.push_back(return_type_field); point_fields_.push_back(channel_field); + + cudaStreamCreate(&stream_); + + int num_sm; + cudaDeviceGetAttribute(&num_sm, cudaDevAttrMultiProcessorCount, 0); + max_blocks_per_grid_ = 4 * num_sm; // used for strided loops + cudaMemPoolProps pool_props; memset(&pool_props, 0, sizeof(cudaMemPoolProps)); pool_props.allocType = cudaMemAllocationTypePinned; @@ -359,11 +379,10 @@ CudaPointcloudPreprocessor::CudaPointcloudPreprocessor() } void CudaPointcloudPreprocessor::setCropBoxParameters( - const CropBoxParameters & self_crop_box_parameters, - const CropBoxParameters & mirror_crop_box_parameters) + const std::vector<CropBoxParameters> & crop_box_parameters) { - self_crop_box_parameters_ = self_crop_box_parameters; - mirror_crop_box_parameters_ = mirror_crop_box_parameters; + host_crop_box_structs_ = crop_box_parameters; + device_crop_box_structs_ = host_crop_box_structs_; } void CudaPointcloudPreprocessor::setRingOutlierFilterParameters( @@ -584,8 +603,7 @@ std::unique_ptr<cuda_blackboard::CudaPointCloud2> CudaPointcloudPreprocessor::pr max_points_per_ring_ = input_pointcloud_msg.width; device_transformed_points_.resize(num_input_points); - device_self_crop_mask_.resize(num_input_points); - device_mirror_crop_mask_.resize(num_input_points); + device_crop_mask_.resize(num_input_points); device_ring_outlier_mask_.resize(num_input_points); device_indices_.resize(num_input_points); @@ -629,49 +647,41 @@ std::unique_ptr<cuda_blackboard::CudaPointCloud2> CudaPointcloudPreprocessor::pr thrust::raw_pointer_cast(device_twist_3d_structs_.data()); InputPointType * device_transformed_points = thrust::raw_pointer_cast(device_transformed_points_.data()); - uint32_t * device_self_crop_mask = thrust::raw_pointer_cast(device_self_crop_mask_.data()); - uint32_t * device_mirror_crop_mask = thrust::raw_pointer_cast(device_mirror_crop_mask_.data()); + std::uint32_t * device_crop_mask = thrust::raw_pointer_cast(device_crop_mask_.data()); uint32_t * device_ring_outlier_mask = thrust::raw_pointer_cast(device_ring_outlier_mask_.data()); uint32_t * device_indices = thrust::raw_pointer_cast(device_indices_.data()); - const int threadsPerBlock = 256; - const int blocksPerGrid = (num_input_points + threadsPerBlock - 1) / threadsPerBlock; + const int blocks_per_grid = (num_input_points + threads_per_block_ - 1) / threads_per_block_; - transformPointsKernel<<<blocksPerGrid, threadsPerBlock>>>( + transformPointsKernel<<<blocks_per_grid, threads_per_block_, 0, stream_>>>( device_input_points, device_transformed_points, num_input_points, transform_struct); - cropBoxKernel<<<blocksPerGrid, threadsPerBlock>>>( - device_transformed_points, device_self_crop_mask, num_input_points, - self_crop_box_parameters_.min_x, self_crop_box_parameters_.min_y, - self_crop_box_parameters_.min_z, self_crop_box_parameters_.max_x, - self_crop_box_parameters_.max_y, self_crop_box_parameters_.max_z); - - cropBoxKernel<<<blocksPerGrid, threadsPerBlock>>>( - device_transformed_points, device_mirror_crop_mask, num_input_points, - mirror_crop_box_parameters_.min_x, mirror_crop_box_parameters_.min_y, - mirror_crop_box_parameters_.min_z, mirror_crop_box_parameters_.max_x, - mirror_crop_box_parameters_.max_y, mirror_crop_box_parameters_.max_z); + int crop_box_blocks_per_grid = std::min(blocks_per_grid, max_blocks_per_grid_); + cropBoxKernel<<<crop_box_blocks_per_grid, threads_per_block_, 0, stream_>>>( + device_transformed_points, device_crop_mask, num_input_points, + thrust::raw_pointer_cast(device_crop_box_structs_.data()), host_crop_box_structs_.size()); if (use_3d_undistortion_ && device_twist_3d_structs_.size() > 0) { - undistort3dKernel<<<blocksPerGrid, threadsPerBlock>>>( + undistort3dKernel<<<blocks_per_grid, threads_per_block_, 0, stream_>>>( device_transformed_points, num_input_points, device_twist_3d_structs, device_twist_3d_structs_.size()); } else if (!use_3d_undistortion_ && device_twist_2d_structs_.size() > 0) { - undistort2dKernel<<<blocksPerGrid, threadsPerBlock>>>( + undistort2dKernel<<<blocks_per_grid, threads_per_block_, 0, stream_>>>( device_transformed_points, num_input_points, device_twist_2d_structs, device_twist_2d_structs_.size()); } - ringOutlierFilterKernel<<<blocksPerGrid, threadsPerBlock>>>( + ringOutlierFilterKernel<<<blocks_per_grid, threads_per_block_, 0, stream_>>>( device_transformed_points, device_ring_outlier_mask, max_rings_, max_points_per_ring_, ring_outlier_parameters_.distance_ratio, ring_outlier_parameters_.object_length_threshold * ring_outlier_parameters_.object_length_threshold, ring_outlier_parameters_.num_points_threshold); - combineMasksKernel<<<blocksPerGrid, threadsPerBlock>>>( - device_self_crop_mask, device_mirror_crop_mask, device_ring_outlier_mask, num_input_points, - device_ring_outlier_mask); + combineMasksKernel<<<blocks_per_grid, threads_per_block_, 0, stream_>>>( + device_crop_mask, device_ring_outlier_mask, num_input_points, device_ring_outlier_mask); + + cudaStreamSynchronize(stream_); thrust::inclusive_scan( thrust::device, device_ring_outlier_mask, device_ring_outlier_mask + num_input_points, @@ -683,7 +693,7 @@ std::unique_ptr<cuda_blackboard::CudaPointCloud2> CudaPointcloudPreprocessor::pr cudaMemcpyDeviceToHost); if (num_output_points > 0) { - extractInputPointsToOutputPoints_indicesKernel<<<blocksPerGrid, threadsPerBlock>>>( + extractInputPointsToOutputPoints_indicesKernel<<<blocks_per_grid, threads_per_block_>>>( device_transformed_points, device_ring_outlier_mask, device_indices, num_input_points, reinterpret_cast<OutputPointType *>(output_pointcloud_ptr_->data.get())); } diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.cpp b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.cpp index fdfae020f1e19..2f5cb881e97fa 100644 --- a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.cpp +++ b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.cpp @@ -52,20 +52,35 @@ CudaPointcloudPreprocessorNode::CudaPointcloudPreprocessorNode( ring_outlier_filter_parameters.num_points_threshold = declare_parameter<int>("num_points_threshold"); - CropBoxParameters self_crop_box_parameters, mirror_crop_box_parameters; - self_crop_box_parameters.min_x = declare_parameter<float>("self_crop.min_x"); - self_crop_box_parameters.min_y = declare_parameter<float>("self_crop.min_y"); - self_crop_box_parameters.min_z = declare_parameter<float>("self_crop.min_z"); - self_crop_box_parameters.max_x = declare_parameter<float>("self_crop.max_x"); - self_crop_box_parameters.max_y = declare_parameter<float>("self_crop.max_y"); - self_crop_box_parameters.max_z = declare_parameter<float>("self_crop.max_z"); - - mirror_crop_box_parameters.min_x = declare_parameter<float>("mirror_crop.min_x"); - mirror_crop_box_parameters.min_y = declare_parameter<float>("mirror_crop.min_y"); - mirror_crop_box_parameters.min_z = declare_parameter<float>("mirror_crop.min_z"); - mirror_crop_box_parameters.max_x = declare_parameter<float>("mirror_crop.max_x"); - mirror_crop_box_parameters.max_y = declare_parameter<float>("mirror_crop.max_y"); - mirror_crop_box_parameters.max_z = declare_parameter<float>("mirror_crop.max_z"); + const auto crop_box_min_x_vector = declare_parameter<std::vector<double>>("crop_box.min_x"); + const auto crop_box_min_y_vector = declare_parameter<std::vector<double>>("crop_box.min_y"); + const auto crop_box_min_z_vector = declare_parameter<std::vector<double>>("crop_box.min_z"); + + const auto crop_box_max_x_vector = declare_parameter<std::vector<double>>("crop_box.max_x"); + const auto crop_box_max_y_vector = declare_parameter<std::vector<double>>("crop_box.max_y"); + const auto crop_box_max_z_vector = declare_parameter<std::vector<double>>("crop_box.max_z"); + + if ( + crop_box_min_x_vector.size() != crop_box_min_y_vector.size() || + crop_box_min_x_vector.size() != crop_box_min_z_vector.size() || + crop_box_min_x_vector.size() != crop_box_max_x_vector.size() || + crop_box_min_x_vector.size() != crop_box_max_y_vector.size() || + crop_box_min_x_vector.size() != crop_box_max_z_vector.size()) { + throw std::runtime_error("Crop box parameters must have the same size"); + } + + std::vector<CropBoxParameters> crop_box_parameters; + + for (std::size_t i = 0; i < crop_box_min_x_vector.size(); i++) { + CropBoxParameters parameters; + parameters.min_x = crop_box_min_x_vector[i]; + parameters.min_y = crop_box_min_y_vector[i]; + parameters.min_z = crop_box_min_z_vector[i]; + parameters.max_x = crop_box_max_x_vector[i]; + parameters.max_y = crop_box_max_y_vector[i]; + parameters.max_z = crop_box_max_z_vector[i]; + crop_box_parameters.push_back(parameters); + } bool use_3d_undistortion = declare_parameter<bool>("use_3d_distortion_correction"); bool use_imu = declare_parameter<bool>("use_imu"); @@ -92,8 +107,7 @@ CudaPointcloudPreprocessorNode::CudaPointcloudPreprocessorNode( cuda_pointcloud_preprocessor_ = std::make_unique<CudaPointcloudPreprocessor>(); cuda_pointcloud_preprocessor_->setRingOutlierFilterParameters(ring_outlier_filter_parameters); - cuda_pointcloud_preprocessor_->setCropBoxParameters( - self_crop_box_parameters, mirror_crop_box_parameters); + cuda_pointcloud_preprocessor_->setCropBoxParameters(crop_box_parameters); cuda_pointcloud_preprocessor_->set3DUndistortion(use_3d_undistortion); // initialize debug tool From 6baaea2a8da07873a43ab74b1e866576050f3cbb Mon Sep 17 00:00:00 2001 From: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp> Date: Thu, 23 Jan 2025 16:34:21 +0900 Subject: [PATCH 24/55] chore: updated config, schema, and handled the null case in a specialized way Signed-off-by: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp> --- .../cuda_pointcloud_preprocessor.param.yaml | 18 ++-- .../cuda_pointcloud_preprocessor.schema.json | 88 ++++++------------- .../cuda_pointcloud_preprocessor.cu | 10 ++- 3 files changed, 42 insertions(+), 74 deletions(-) diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/config/cuda_pointcloud_preprocessor.param.yaml b/sensing/autoware_cuda_pointcloud_preprocessor/config/cuda_pointcloud_preprocessor.param.yaml index 22358cfadf2a7..66ce309caa127 100644 --- a/sensing/autoware_cuda_pointcloud_preprocessor/config/cuda_pointcloud_preprocessor.param.yaml +++ b/sensing/autoware_cuda_pointcloud_preprocessor/config/cuda_pointcloud_preprocessor.param.yaml @@ -8,15 +8,9 @@ object_length_threshold: 0.1 num_points_threshold: 4 - self_crop.min_x: 1.0 - self_crop.min_y: 1.0 - self_crop.min_z: 1.0 - self_crop.max_x: -1.0 - self_crop.max_y: -1.0 - self_crop.max_z: -1.0 - mirror_crop.min_x: 1.0 - mirror_crop.min_y: 1.0 - mirror_crop.min_z: 1.0 - mirror_crop.max_x: -1.0 - mirror_crop.max_y: -1.0 - mirror_crop.max_z: -1.0 + crop_box.min_x: [1.0] + crop_box.min_y: [1.0] + crop_box.min_z: [1.0] + crop_box.max_x: [-1.0] + crop_box.max_y: [-1.0] + crop_box.max_z: [-1.0] diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/schema/cuda_pointcloud_preprocessor.schema.json b/sensing/autoware_cuda_pointcloud_preprocessor/schema/cuda_pointcloud_preprocessor.schema.json index cc94da3523512..b8e597b9afdb2 100644 --- a/sensing/autoware_cuda_pointcloud_preprocessor/schema/cuda_pointcloud_preprocessor.schema.json +++ b/sensing/autoware_cuda_pointcloud_preprocessor/schema/cuda_pointcloud_preprocessor.schema.json @@ -39,65 +39,35 @@ "default": "4", "minimum": 0 }, - "self_crop.min_x": { - "type": "number", - "description": "minimum x-coordinate value for self cropping in meters", - "default": "-1.0" - }, - "self_crop.min_y": { - "type": "number", - "description": "minimum y-coordinate value for self cropping in meters", - "default": "-1.0" - }, - "self_crop.min_z": { - "type": "number", - "description": "minimum z-coordinate value for self cropping in meters", - "default": "-1.0" - }, - "self_crop.max_x": { - "type": "number", - "description": "maximum x-coordinate value for self cropping in meters", - "default": "1.0" - }, - "self_crop.max_y": { - "type": "number", - "description": "maximum y-coordinate value for self cropping in meters", - "default": "1.0" - }, - "self_crop.max_z": { - "type": "number", - "description": "maximum z-coordinate value for self cropping in meters", - "default": "1.0" - }, - "mirror_crop.min_x": { - "type": "number", - "description": "minimum x-coordinate value for mirror cropping in meters", - "default": "-1.0" - }, - "mirror_crop.min_y": { - "type": "number", - "description": "minimum y-coordinate value for mirror cropping in meters", - "default": "-1.0" - }, - "mirror_crop.min_z": { - "type": "number", - "description": "minimum z-coordinate value for mirror cropping in meters", - "default": "-1.0" - }, - "mirror_crop.max_x": { - "type": "number", - "description": "maximum x-coordinate value for mirror cropping in meters", - "default": "1.0" - }, - "mirror_crop.max_y": { - "type": "number", - "description": "maximum y-coordinate value for mirror cropping in meters", - "default": "1.0" - }, - "mirror_crop.max_z": { - "type": "number", - "description": "maximum z-coordinate value for mirror cropping in meters", - "default": "1.0" + "crop_box.min_x": { + "type": "array", + "description": "An array in which each elemt is the minimum x value in a crop box", + "default": [1.0] + }, + "crop_box.min_y": { + "type": "array", + "description": "An array in which each elemt is the minimum y value in a crop box", + "default": [1.0] + }, + "crop_box.min_z": { + "type": "array", + "description": "An array in which each elemt is the minimum z value in a crop box", + "default": [1.0] + }, + "crop_box.max_x": { + "type": "array", + "description": "An array in which each elemt is the maximum x value in a crop box", + "default": [-1.0] + }, + "crop_box.max_y": { + "type": "array", + "description": "An array in which each elemt is the maximum y value in a crop box", + "default": [-1.0] + }, + "crop_box.max_z": { + "type": "array", + "description": "An array in which each elemt is the maximum z value in a crop box", + "default": [-1.0] } }, "required": [ diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu index 0e2cd45eda0f8..8d4526377ab44 100644 --- a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu +++ b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu @@ -657,9 +657,13 @@ std::unique_ptr<cuda_blackboard::CudaPointCloud2> CudaPointcloudPreprocessor::pr device_input_points, device_transformed_points, num_input_points, transform_struct); int crop_box_blocks_per_grid = std::min(blocks_per_grid, max_blocks_per_grid_); - cropBoxKernel<<<crop_box_blocks_per_grid, threads_per_block_, 0, stream_>>>( - device_transformed_points, device_crop_mask, num_input_points, - thrust::raw_pointer_cast(device_crop_box_structs_.data()), host_crop_box_structs_.size()); + if (host_crop_box_structs_.size() > 0) { + cropBoxKernel<<<crop_box_blocks_per_grid, threads_per_block_, 0, stream_>>>( + device_transformed_points, device_crop_mask, num_input_points, + thrust::raw_pointer_cast(device_crop_box_structs_.data()), host_crop_box_structs_.size()); + } else { + thrust::fill(thrust::device, device_crop_mask, device_crop_mask + num_input_points, 1); + } if (use_3d_undistortion_ && device_twist_3d_structs_.size() > 0) { undistort3dKernel<<<blocks_per_grid, threads_per_block_, 0, stream_>>>( From 59144d8ce1f9e4a6a33e5dfb493ab26fcefdc111 Mon Sep 17 00:00:00 2001 From: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp> Date: Tue, 4 Feb 2025 11:51:13 +0900 Subject: [PATCH 25/55] feat: moving the pointcloud organization into gpu Signed-off-by: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp> --- .../cuda_pointcloud_preprocessor.hpp | 21 + .../cuda_pointcloud_preprocessor_node.hpp | 50 +- .../cuda_pointcloud_preprocessor.cu | 104 +++++ .../cuda_pointcloud_preprocessor_node.cpp | 428 +++++++++++++++++- 4 files changed, 593 insertions(+), 10 deletions(-) diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.hpp b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.hpp index 94c5915d08619..3dbc968d9376a 100644 --- a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.hpp +++ b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.hpp @@ -29,11 +29,32 @@ #include <thrust/device_vector.h> #include <cmath> +#include <cstdint> #include <deque> namespace autoware::cuda_pointcloud_preprocessor { +void organizeLaunch( + const InputPointType * __restrict__ input_points, std::uint32_t * index_tensor, + std::int32_t * ring_indexes, std::int32_t initial_max_rings, std::int32_t * output_max_rings, + std::int32_t initial_max_points_per_ring, std::int32_t * output_max_points_per_ring, + int num_points, cudaStream_t & stream); + +std::size_t querySortWorkspace( + int num_items, int num_segments, int * d_offsets, std::uint32_t * d_keys_in, + std::uint32_t * d_keys_out); + +void sortLaunch( + int num_items, int num_segments, int * d_offsets, std::uint32_t * d_keys_in, + std::uint32_t * d_keys_out, std::uint8_t * d_temp_storage, std::size_t temp_storage_bytes, + cudaStream_t & stream); // e.g., [-, -, -, -, -, -, -] + +void gatherLaunch( + const InputPointType * __restrict__ input_points, const std::uint32_t * __restrict__ index_tensor, + InputPointType * __restrict__ output_points, int num_rings, int max_points_per_ring, + cudaStream_t & stream); + struct TwistStruct2D { float cum_x; diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.hpp b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.hpp index a5c5361c92515..6a45a79b63c65 100644 --- a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.hpp +++ b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.hpp @@ -81,8 +81,17 @@ class CudaPointcloudPreprocessorNode : public rclcpp::Node const std::string & target_frame, const std::string & source_frame, tf2::Transform * tf2_transform_ptr); + void estimatePointcloudRingInfo( + const sensor_msgs::msg::PointCloud2::ConstSharedPtr input_pointcloud_msg_ptr); + + bool orderPointcloud( + const sensor_msgs::msg::PointCloud2::ConstSharedPtr input_pointcloud_msg_ptr); + // Callback - void pointcloudCallback(const std::shared_ptr<const cuda_blackboard::CudaPointCloud2> cuda_msg); + void pointcloudCallback( + const sensor_msgs::msg::PointCloud2::ConstSharedPtr input_pointcloud_msg_ptr); + void cudaPointcloudCallback( + const std::shared_ptr<const cuda_blackboard::CudaPointCloud2> cuda_msg); void twistCallback( const geometry_msgs::msg::TwistWithCovarianceStamped::ConstSharedPtr pointcloud_msg); void imuCallback(const sensor_msgs::msg::Imu::ConstSharedPtr imu_msg); @@ -94,18 +103,55 @@ class CudaPointcloudPreprocessorNode : public rclcpp::Node std::deque<geometry_msgs::msg::TwistWithCovarianceStamped> twist_queue_; std::deque<geometry_msgs::msg::Vector3Stamped> angular_velocity_queue_; + std::size_t num_rings_{0}; + std::size_t max_points_per_ring_{0}; + std::vector<std::size_t> next_ring_index_; + // Subscriber rclcpp::Subscription<geometry_msgs::msg::TwistWithCovarianceStamped>::SharedPtr twist_sub_{}; rclcpp::Subscription<sensor_msgs::msg::Imu>::SharedPtr imu_sub_{}; + rclcpp::Subscription<sensor_msgs::msg::PointCloud2>::SharedPtr pointcloud_sub_{}; + + cuda_blackboard::CudaUniquePtr<InputPointType[]> points_device_; + cuda_blackboard::CudaUniquePtr<InputPointType[]> organized_points_device_; + cuda_blackboard::CudaUniquePtr<std::int32_t[]> ring_index_device_; + cuda_blackboard::CudaUniquePtr<std::uint32_t[]> indexes_tensor_device_; + cuda_blackboard::CudaUniquePtr<std::uint32_t[]> sorted_indexes_tensor_device_; + cuda_blackboard::CudaUniquePtr<std::int32_t[]> segment_offsets_device_; + cuda_blackboard::CudaUniquePtr<std::int32_t> max_rings_device_; + cuda_blackboard::CudaUniquePtr<std::int32_t> max_points_per_ring_device_; + + cuda_blackboard::CudaUniquePtr<std::uint8_t[]> sort_workspace_device_; + std::size_t sort_workspace_bytes_{0}; + + std::int32_t test_num_rings_{0}; + std::int32_t test_max_points_per_ring_{0}; + cudaStream_t stream_; + std::int32_t capacity_{0}; + + cuda_blackboard::HostUniquePtr<autoware::point_types::PointXYZIRCAEDT[]> host_buffer_; + // cuda_blackboard::CudaUniquePtr<std::uint8_t[]> device_buffer_; TODO(knzo25) replaced by a + // CudaPointCloud + std::shared_ptr<cuda_blackboard::CudaPointCloud2> input_pointcloud_ptr_; // CUDA pub & sub std::unique_ptr<cuda_blackboard::CudaBlackboardPublisher<cuda_blackboard::CudaPointCloud2>> pub_; - std::unique_ptr<cuda_blackboard::CudaBlackboardSubscriber<cuda_blackboard::CudaPointCloud2>> sub_; + // std::unique_ptr<cuda_blackboard::CudaBlackboardSubscriber<cuda_blackboard::CudaPointCloud2>> + // sub_; std::unique_ptr<CudaPointcloudPreprocessor> cuda_pointcloud_preprocessor_; std::unique_ptr<autoware::universe_utils::StopWatch<std::chrono::milliseconds>> stop_watch_ptr_; std::unique_ptr<autoware::universe_utils::DebugPublisher> debug_publisher_; + + double organize_time_cuda_ms_{0.0}; + double copy_time_cuda_ms_{0.0}; + + double organize_time_ms_{0.0}; + double copy_time_ms_{0.0}; + double processing_time_ms_{0.0}; + double kernel_time_ms_{0.0}; + int iteration_{0}; }; } // namespace autoware::cuda_pointcloud_preprocessor diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu index 8d4526377ab44..d99d384b0c9ab 100644 --- a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu +++ b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu @@ -18,6 +18,7 @@ #include <Eigen/Core> #include <Eigen/Dense> #include <Eigen/Geometry> +#include <cub/cub.cuh> #include <cuda_runtime.h> #include <tf2/utils.h> @@ -25,6 +26,7 @@ #include <thrust/host_vector.h> #include <thrust/reduce.h> #include <thrust/scan.h> +// #include <cub/device/device_segmented_radix_sort.cuh> namespace autoware::cuda_pointcloud_preprocessor { @@ -70,6 +72,108 @@ __host__ __device__ Eigen::Matrix4f transformationMatrixFromVelocity( return transformation; } +__global__ void organizeKernel( + const InputPointType * __restrict__ input_points, std::uint32_t * index_tensor, + std::int32_t * ring_indexes, std::int32_t initial_max_rings, std::int32_t * output_max_rings, + std::int32_t initial_max_points_per_ring, std::int32_t * output_max_points_per_ring, + int num_points) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= num_points) { + return; + } + + auto ring = input_points[idx].channel; + + if (ring >= initial_max_rings) { + atomicMax(output_max_rings, ring); + return; + } + + int next_offset = atomicAdd(&ring_indexes[ring], 1); + + if (next_offset >= initial_max_points_per_ring) { + atomicMax(output_max_points_per_ring, next_offset); + return; + } + + index_tensor[ring * initial_max_points_per_ring + next_offset] = idx; +} + +void organizeLaunch( + const InputPointType * __restrict__ input_points, std::uint32_t * index_tensor, + std::int32_t * ring_indexes, std::int32_t initial_max_rings, std::int32_t * output_max_rings, + std::int32_t initial_max_points_per_ring, std::int32_t * output_max_points_per_ring, + int num_points, cudaStream_t & stream) +{ + int threads_per_block = 256; + int blocks_per_grid = (num_points + threads_per_block - 1) / threads_per_block; + organizeKernel<<<blocks_per_grid, threads_per_block>>>( + input_points, index_tensor, ring_indexes, initial_max_rings, output_max_rings, + initial_max_points_per_ring, output_max_points_per_ring, num_points); +} + +std::size_t querySortWorkspace( + int num_items, int num_segments, int * d_offsets, std::uint32_t * d_keys_in, + std::uint32_t * d_keys_out) +{ + // Determine temporary device storage requirements + void * d_temp_storage = nullptr; + size_t temp_storage_bytes = 0; + cub::DeviceSegmentedRadixSort::SortKeys( + d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items, num_segments, d_offsets, + d_offsets + 1); + + return temp_storage_bytes; +} + +void sortLaunch( + int num_items, int num_segments, int * d_offsets, std::uint32_t * d_keys_in, + std::uint32_t * d_keys_out, std::uint8_t * d_temp_storage, std::size_t temp_storage_bytes, + cudaStream_t & stream) +{ + // Allocate temporary storage + // cudaMalloc(&d_temp_storage, temp_storage_bytes); + + // Run sorting operation + cub::DeviceSegmentedRadixSort::SortKeys( + reinterpret_cast<void *>(d_temp_storage), temp_storage_bytes, d_keys_in, d_keys_out, num_items, + num_segments, d_offsets, d_offsets + 1, 0, sizeof(std::uint32_t) * 8, stream); +} + +__global__ void gatherKernel( + const InputPointType * __restrict__ input_points, const std::uint32_t * __restrict__ index_tensor, + InputPointType * __restrict__ output_points, int num_rings, int max_points_per_ring) +{ + const int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= num_rings * max_points_per_ring) { + return; + } + + const int ring = idx / max_points_per_ring; + const int point = idx % max_points_per_ring; + + const std::uint32_t input_idx = index_tensor[ring * max_points_per_ring + point]; + + if (input_idx < std::numeric_limits<std::uint32_t>::max()) { + output_points[ring * max_points_per_ring + point] = input_points[input_idx]; + } else { + output_points[ring * max_points_per_ring + point].distance = 0.0f; + } +} + +void gatherLaunch( + const InputPointType * __restrict__ input_points, const std::uint32_t * __restrict__ index_tensor, + InputPointType * __restrict__ output_points, int num_rings, int max_points_per_ring, + cudaStream_t & stream) +{ + int threads_per_block = 256; + int blocks_per_grid = + (num_rings * max_points_per_ring + threads_per_block - 1) / threads_per_block; + gatherKernel<<<blocks_per_grid, threads_per_block, 0, stream>>>( + input_points, index_tensor, output_points, num_rings, max_points_per_ring); +} + __global__ void transformPointsKernel( const InputPointType * __restrict__ input_points, InputPointType * output_points, int num_points, TransformStruct transform) diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.cpp b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.cpp index 2f5cb881e97fa..1aa634d8807dd 100644 --- a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.cpp +++ b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.cpp @@ -27,6 +27,8 @@ #include <pcl/point_types.h> #include <chrono> +#include <chrono> // TODO(knzo25): remove this +#include <limits> // TODO(knzo25): remove this #include <string> #include <vector> @@ -85,24 +87,29 @@ CudaPointcloudPreprocessorNode::CudaPointcloudPreprocessorNode( bool use_3d_undistortion = declare_parameter<bool>("use_3d_distortion_correction"); bool use_imu = declare_parameter<bool>("use_imu"); - // Subscriber - sub_ = - std::make_unique<cuda_blackboard::CudaBlackboardSubscriber<cuda_blackboard::CudaPointCloud2>>( - *this, "~/input/pointcloud", false, - std::bind(&CudaPointcloudPreprocessorNode::pointcloudCallback, this, _1)); - // Publisher pub_ = std::make_unique<cuda_blackboard::CudaBlackboardPublisher<cuda_blackboard::CudaPointCloud2>>( *this, "~/output/pointcloud"); + + // Subscriber + rclcpp::SubscriptionOptions sub_options; + sub_options.use_intra_process_comm = rclcpp::IntraProcessSetting::Enable; + + pointcloud_sub_ = this->create_subscription<sensor_msgs::msg::PointCloud2>( + "~/input/pointcloud", rclcpp::SensorDataQoS{}.keep_last(1), + std::bind(&CudaPointcloudPreprocessorNode::pointcloudCallback, this, std::placeholders::_1), + sub_options); twist_sub_ = this->create_subscription<geometry_msgs::msg::TwistWithCovarianceStamped>( "~/input/twist", 10, - std::bind(&CudaPointcloudPreprocessorNode::twistCallback, this, std::placeholders::_1)); + std::bind(&CudaPointcloudPreprocessorNode::twistCallback, this, std::placeholders::_1), + sub_options); if (use_imu) { imu_sub_ = this->create_subscription<sensor_msgs::msg::Imu>( "~/input/imu", 10, - std::bind(&CudaPointcloudPreprocessorNode::imuCallback, this, std::placeholders::_1)); + std::bind(&CudaPointcloudPreprocessorNode::imuCallback, this, std::placeholders::_1), + sub_options); } cuda_pointcloud_preprocessor_ = std::make_unique<CudaPointcloudPreprocessor>(); @@ -119,6 +126,32 @@ CudaPointcloudPreprocessorNode::CudaPointcloudPreprocessorNode( debug_publisher_ = std::make_unique<DebugPublisher>(this, "cuda_pointcloud_preprocessor"); stop_watch_ptr_->tic("processing_time"); } + + // New organized pointcloud method + test_num_rings_ = 1; + test_max_points_per_ring_ = 1; + ring_index_device_ = cuda_blackboard::make_unique<std::int32_t[]>(test_num_rings_); + indexes_tensor_device_ = + cuda_blackboard::make_unique<std::uint32_t[]>(test_num_rings_ * test_max_points_per_ring_); + sorted_indexes_tensor_device_ = + cuda_blackboard::make_unique<std::uint32_t[]>(test_num_rings_ * test_max_points_per_ring_); + segment_offsets_device_ = cuda_blackboard::make_unique<std::int32_t[]>(test_num_rings_ + 1); + max_rings_device_ = cuda_blackboard::make_unique<std::int32_t>(); + max_points_per_ring_device_ = cuda_blackboard::make_unique<std::int32_t>(); + organized_points_device_ = + cuda_blackboard::make_unique<InputPointType[]>(test_num_rings_ * test_max_points_per_ring_); + + cudaStreamCreate(&stream_); + cudaMemsetAsync(max_rings_device_.get(), 0, sizeof(std::int32_t), stream_); + cudaMemsetAsync(max_points_per_ring_device_.get(), 0, sizeof(std::int32_t), stream_); + + sort_workspace_bytes_ = querySortWorkspace( + test_num_rings_ * test_max_points_per_ring_, test_num_rings_, segment_offsets_device_.get(), + indexes_tensor_device_.get(), sorted_indexes_tensor_device_.get()); + sort_workspace_device_ = cuda_blackboard::make_unique<std::uint8_t[]>(sort_workspace_bytes_); + + capacity_ = 8192; + points_device_ = cuda_blackboard::make_unique<InputPointType[]>(capacity_); } bool CudaPointcloudPreprocessorNode::getTransform( @@ -198,9 +231,356 @@ void CudaPointcloudPreprocessorNode::imuCallback( } } +void CudaPointcloudPreprocessorNode::estimatePointcloudRingInfo( + const sensor_msgs::msg::PointCloud2::ConstSharedPtr input_pointcloud_msg_ptr) +{ + const autoware::point_types::PointXYZIRCAEDT * input_buffer = + reinterpret_cast<const autoware::point_types::PointXYZIRCAEDT *>( + input_pointcloud_msg_ptr->data.data()); + + std::size_t max_ring = 0; + + for (std::size_t i = 0; i < input_pointcloud_msg_ptr->width * input_pointcloud_msg_ptr->height; + i++) { + const autoware::point_types::PointXYZIRCAEDT & point = input_buffer[i]; + const std::size_t ring = static_cast<std::size_t>(point.channel); + max_ring = std::max(max_ring, ring); + } + + // Set max rings to the next power of two + num_rings_ = std::pow(2, std::ceil(std::log2(max_ring + 1))); + num_rings_ = std::max(num_rings_, static_cast<std::size_t>(16)); + std::vector<std::size_t> ring_points(num_rings_, 0); + + for (std::size_t i = 0; i < input_pointcloud_msg_ptr->width * input_pointcloud_msg_ptr->height; + i++) { + const autoware::point_types::PointXYZIRCAEDT & point = input_buffer[i]; + const std::size_t ring = point.channel; + ring_points[ring]++; + } + + // Set max points per ring to the next multiple of 512 + max_points_per_ring_ = *std::max_element(ring_points.begin(), ring_points.end()); + max_points_per_ring_ = std::max(max_points_per_ring_, static_cast<std::size_t>(512)); + max_points_per_ring_ = (max_points_per_ring_ + 511) / 512 * 512; + + next_ring_index_.resize(num_rings_); + std::fill(next_ring_index_.begin(), next_ring_index_.end(), 0); + host_buffer_ = cuda_blackboard::make_host_unique<autoware::point_types::PointXYZIRCAEDT[]>( + num_rings_ * max_points_per_ring_); + + input_pointcloud_ptr_ = std::make_shared<cuda_blackboard::CudaPointCloud2>(); + input_pointcloud_ptr_->data = cuda_blackboard::make_unique<std::uint8_t[]>( + num_rings_ * max_points_per_ring_ * sizeof(autoware::point_types::PointXYZIRCAEDT)); + input_pointcloud_ptr_->width = max_points_per_ring_; + input_pointcloud_ptr_->height = num_rings_; + input_pointcloud_ptr_->point_step = sizeof(autoware::point_types::PointXYZIRCAEDT); + input_pointcloud_ptr_->row_step = + max_points_per_ring_ * sizeof(autoware::point_types::PointXYZIRCAEDT); + input_pointcloud_ptr_->is_dense = input_pointcloud_msg_ptr->is_dense; + input_pointcloud_ptr_->fields = input_pointcloud_msg_ptr->fields; + input_pointcloud_ptr_->header = input_pointcloud_msg_ptr->header; + + RCLCPP_INFO_STREAM( + get_logger(), "Estimated rings: " << num_rings_ + << ", max_points_per_ring: " << max_points_per_ring_ + << ". This should only be done during the first iterations. " + "Otherwise, performance will be affected."); +} + +bool CudaPointcloudPreprocessorNode::orderPointcloud( + const sensor_msgs::msg::PointCloud2::ConstSharedPtr input_pointcloud_msg_ptr) +{ + const autoware::point_types::PointXYZIRCAEDT * input_buffer = + reinterpret_cast<const autoware::point_types::PointXYZIRCAEDT *>( + input_pointcloud_msg_ptr->data.data()); + + bool ring_overflow = false; + bool point_overflow = false; + + autoware::point_types::PointXYZIRCAEDT * buffer = host_buffer_.get(); + + for (std::size_t i = 0; i < input_pointcloud_msg_ptr->width * input_pointcloud_msg_ptr->height; + i++) { + const autoware::point_types::PointXYZIRCAEDT & point = input_buffer[i]; + const std::size_t raw_ring = point.channel; + const std::size_t ring = raw_ring % num_rings_; + + const std::size_t raw_index = next_ring_index_[ring]; + const std::size_t index = raw_index % max_points_per_ring_; + + ring_overflow |= raw_ring >= num_rings_; + point_overflow |= raw_index >= max_points_per_ring_; + + buffer[ring * max_points_per_ring_ + index] = point; + next_ring_index_[ring] = raw_index + 1; + } + + return !ring_overflow && !point_overflow; +} + void CudaPointcloudPreprocessorNode::pointcloudCallback( + const sensor_msgs::msg::PointCloud2::ConstSharedPtr input_pointcloud_msg_ptr) +{ + // NEW Cuda version + if (input_pointcloud_msg_ptr->width * input_pointcloud_msg_ptr->height > capacity_) { + capacity_ = input_pointcloud_msg_ptr->width * input_pointcloud_msg_ptr->height; + capacity_ = (capacity_ + 1024) / 1024 * 1024; + points_device_ = cuda_blackboard::make_unique<InputPointType[]>(capacity_); + } + + double cuda_t0 = 1e-6 * std::chrono::duration_cast<std::chrono::nanoseconds>( + std::chrono::high_resolution_clock::now().time_since_epoch()) + .count(); + + cudaMemcpyAsync( + points_device_.get(), input_pointcloud_msg_ptr->data.data(), + input_pointcloud_msg_ptr->width * input_pointcloud_msg_ptr->height * sizeof(InputPointType), + cudaMemcpyHostToDevice, stream_); + cudaStreamSynchronize(stream_); + + double cuda_t1 = 1e-6 * std::chrono::duration_cast<std::chrono::nanoseconds>( + std::chrono::high_resolution_clock::now().time_since_epoch()) + .count(); + + // Estimate the number of rings and max points per ring + cudaMemsetAsync(ring_index_device_.get(), 0, test_num_rings_ * sizeof(std::int32_t), stream_); + cudaMemsetAsync( + indexes_tensor_device_.get(), 0xFF, + test_num_rings_ * test_max_points_per_ring_ * sizeof(std::uint32_t), stream_); + organizeLaunch( + points_device_.get(), indexes_tensor_device_.get(), ring_index_device_.get(), test_num_rings_, + max_rings_device_.get(), test_max_points_per_ring_, max_points_per_ring_device_.get(), + input_pointcloud_msg_ptr->width * input_pointcloud_msg_ptr->height, stream_); + cudaStreamSynchronize(stream_); + + std::int32_t max_ring_value; + std::int32_t max_points_per_ring; + cudaMemcpyAsync( + &max_ring_value, max_rings_device_.get(), sizeof(std::int32_t), cudaMemcpyDeviceToHost, + stream_); + cudaMemcpyAsync( + &max_points_per_ring, max_points_per_ring_device_.get(), sizeof(std::int32_t), + cudaMemcpyDeviceToHost, stream_); + cudaStreamSynchronize(stream_); + + if (max_ring_value >= test_num_rings_ || max_points_per_ring > test_max_points_per_ring_) { + RCLCPP_WARN( + get_logger(), "Current ring num: %d, Current Max points per ring: %d", test_num_rings_, + test_max_points_per_ring_); + test_num_rings_ = max_ring_value + 1; + test_max_points_per_ring_ = + std::max((max_points_per_ring + 511) / 512 * 512, 512); // Re think this equation + ring_index_device_ = cuda_blackboard::make_unique<std::int32_t[]>(test_num_rings_); + indexes_tensor_device_ = + cuda_blackboard::make_unique<std::uint32_t[]>(test_num_rings_ * test_max_points_per_ring_); + sorted_indexes_tensor_device_ = + cuda_blackboard::make_unique<std::uint32_t[]>(test_num_rings_ * test_max_points_per_ring_); + segment_offsets_device_ = cuda_blackboard::make_unique<std::int32_t[]>(test_num_rings_ + 1); + organized_points_device_ = + cuda_blackboard::make_unique<InputPointType[]>(test_num_rings_ * test_max_points_per_ring_); + RCLCPP_WARN( + get_logger(), "NEW Max ring value: %d, Max points per ring: %d", max_ring_value, + test_max_points_per_ring_); + + std::vector<std::int32_t> segment_offsets_host(test_num_rings_ + 1); + for (std::size_t i = 0; i < test_num_rings_ + 1; i++) { + segment_offsets_host[i] = i * test_max_points_per_ring_; + } + cudaMemcpyAsync( + segment_offsets_device_.get(), segment_offsets_host.data(), + (test_num_rings_ + 1) * sizeof(std::int32_t), cudaMemcpyHostToDevice, stream_); + + cudaMemsetAsync(ring_index_device_.get(), 0, test_num_rings_ * sizeof(std::int32_t), stream_); + cudaMemsetAsync( + indexes_tensor_device_.get(), 0xFF, + test_num_rings_ * test_max_points_per_ring_ * sizeof(std::int32_t), stream_); + + sort_workspace_bytes_ = querySortWorkspace( + test_num_rings_ * test_max_points_per_ring_, test_num_rings_, segment_offsets_device_.get(), + indexes_tensor_device_.get(), sorted_indexes_tensor_device_.get()); + sort_workspace_device_ = cuda_blackboard::make_unique<std::uint8_t[]>(sort_workspace_bytes_); + + organizeLaunch( + points_device_.get(), indexes_tensor_device_.get(), ring_index_device_.get(), test_num_rings_, + max_rings_device_.get(), test_max_points_per_ring_, max_points_per_ring_device_.get(), + input_pointcloud_msg_ptr->width * input_pointcloud_msg_ptr->height, stream_); + // cudaStreamSynchronize(stream_); + } + + /* std::vector<std::int32_t> ring_index_host(test_num_rings_); + cudaMemcpyAsync( + ring_index_host.data(), ring_index_device_.get(), test_num_rings_ * sizeof(std::int32_t), + cudaMemcpyDeviceToHost, stream_); cudaStreamSynchronize(stream_); + + for (std::size_t i = 0; i < test_num_rings_; i++) { + if (ring_index_host[i] > 0) { + std::cout << "Ring id=" << i << " has " << ring_index_host[i] << " points" << std::endl; + } + } */ + + sortLaunch( + test_num_rings_ * test_max_points_per_ring_, test_num_rings_, segment_offsets_device_.get(), + indexes_tensor_device_.get(), sorted_indexes_tensor_device_.get(), sort_workspace_device_.get(), + sort_workspace_bytes_, stream_); + cudaStreamSynchronize(stream_); + + /* std::vector<std::uint32_t> indexes_tensor_host(test_num_rings_ * test_max_points_per_ring_); + cudaMemcpyAsync( + indexes_tensor_host.data(), sorted_indexes_tensor_device_.get(), + test_num_rings_ * test_max_points_per_ring_ * sizeof(std::uint32_t), cudaMemcpyDeviceToHost, + stream_); cudaStreamSynchronize(stream_); + + if (test_num_rings_ < 64) { + for (std::size_t i = 0; i < test_num_rings_; i++) { + std::cout << "Ring id=" << i << "\n\t"; + for (std::size_t j = 0; j < test_max_points_per_ring_; j++) { + if (indexes_tensor_host[i * test_max_points_per_ring_ + j] < + std::numeric_limits<std::uint32_t>::max()) { std::cout << indexes_tensor_host[i * + test_max_points_per_ring_ + j] << " "; + } + } + std::cout << std::endl << std::flush; + } + } */ + + gatherLaunch( + points_device_.get(), sorted_indexes_tensor_device_.get(), organized_points_device_.get(), + test_num_rings_, test_max_points_per_ring_, stream_); + cudaStreamSynchronize( + stream_); // this will not be needed in the final implementation. for now we benchmark + + double cuda_t2 = 1e-6 * std::chrono::duration_cast<std::chrono::nanoseconds>( + std::chrono::high_resolution_clock::now().time_since_epoch()) + .count(); + + if (iteration_ >= 0) { + organize_time_cuda_ms_ += (cuda_t1 - cuda_t0); + copy_time_cuda_ms_ += (cuda_t2 - cuda_t1); + } + + stop_watch_ptr_->toc("processing_time", true); + + // TODO(knzo25): check the pointcloud layout at least once + + assert(input_pointcloud_msg_ptr->point_step == sizeof(autoware::point_types::PointXYZIRCAEDT)); + + if (num_rings_ == 0 || max_points_per_ring_ == 0) { + estimatePointcloudRingInfo(input_pointcloud_msg_ptr); + } + + double t1 = 1e-6 * std::chrono::duration_cast<std::chrono::nanoseconds>( + std::chrono::high_resolution_clock::now().time_since_epoch()) + .count(); + if (!orderPointcloud(input_pointcloud_msg_ptr)) { + estimatePointcloudRingInfo(input_pointcloud_msg_ptr); + orderPointcloud(input_pointcloud_msg_ptr); + } + double t2 = 1e-6 * std::chrono::duration_cast<std::chrono::nanoseconds>( + std::chrono::high_resolution_clock::now().time_since_epoch()) + .count(); + + // Copy to cuda memory + cudaMemcpy( + input_pointcloud_ptr_->data.get(), host_buffer_.get(), + num_rings_ * max_points_per_ring_ * sizeof(autoware::point_types::PointXYZIRCAEDT), + cudaMemcpyHostToDevice); + + double t3 = 1e-6 * std::chrono::duration_cast<std::chrono::nanoseconds>( + std::chrono::high_resolution_clock::now().time_since_epoch()) + .count(); + + if (iteration_ >= 0) { + organize_time_ms_ += (t2 - t1); + copy_time_ms_ += (t3 - t2); + } + + // Test -> this would replace the cpu version + cudaMemcpyAsync( + input_pointcloud_ptr_->data.get(), organized_points_device_.get(), + test_num_rings_ * test_max_points_per_ring_ * sizeof(autoware::point_types::PointXYZIRCAEDT), + cudaMemcpyHostToDevice, stream_); + input_pointcloud_ptr_->height = test_num_rings_; + input_pointcloud_ptr_->width = test_max_points_per_ring_; + cudaStreamSynchronize(stream_); + + // TEST CODE + { + std::vector<autoware::point_types::PointXYZIRCAEDT> host_buffer_test( + test_num_rings_ * test_max_points_per_ring_); + cudaMemcpyAsync( + host_buffer_test.data(), organized_points_device_.get(), + test_num_rings_ * test_max_points_per_ring_ * sizeof(autoware::point_types::PointXYZIRCAEDT), + cudaMemcpyDeviceToHost, stream_); + cudaStreamSynchronize(stream_); + + if ( + test_num_rings_ > 1 && test_max_points_per_ring_ == max_points_per_ring_ && + test_num_rings_ == num_rings_) { + for (std::size_t i = 0; i < num_rings_; i++) { + for (std::size_t j = 0; j < max_points_per_ring_; j++) { + // Compare the implementations + if ( + (host_buffer_.get()[i * max_points_per_ring_ + j].x != + host_buffer_test[i * max_points_per_ring_ + j].x || + host_buffer_.get()[i * max_points_per_ring_ + j].y != + host_buffer_test[i * max_points_per_ring_ + j].y || + host_buffer_.get()[i * max_points_per_ring_ + j].z != + host_buffer_test[i * max_points_per_ring_ + j].z) && + host_buffer_test[i * max_points_per_ring_ + j].distance > 0) { + RCLCPP_ERROR(get_logger(), "Mismatch at ring=%d, point=%d", i, j); + RCLCPP_ERROR( + get_logger(), "\thost_buffer_test: x=%.2f y=%.2f z=%.2f channel=%d distance=%.2f", + host_buffer_test[i * max_points_per_ring_ + j].x, + host_buffer_test[i * max_points_per_ring_ + j].y, + host_buffer_test[i * max_points_per_ring_ + j].z, + host_buffer_test[i * max_points_per_ring_ + j].channel, + host_buffer_test[i * max_points_per_ring_ + j].distance); + RCLCPP_ERROR( + get_logger(), "\thost_buffer: x=%.2f y=%.2f z=%.2f channel=%d distance=%.2f", + host_buffer_.get()[i * max_points_per_ring_ + j].x, + host_buffer_.get()[i * max_points_per_ring_ + j].y, + host_buffer_.get()[i * max_points_per_ring_ + j].z, + host_buffer_.get()[i * max_points_per_ring_ + j].channel, + host_buffer_.get()[i * max_points_per_ring_ + j].distance); + } + } + } + } + } + + input_pointcloud_ptr_->header = input_pointcloud_msg_ptr->header; + + cudaPointcloudCallback(input_pointcloud_ptr_); + + if (debug_publisher_) { + const double processing_time_ms = stop_watch_ptr_->toc("processing_time", true); + debug_publisher_->publish<tier4_debug_msgs::msg::Float64Stamped>( + "debug/processing_time_ms", processing_time_ms); + + double now_stamp_seconds = rclcpp::Time(this->get_clock()->now()).seconds(); + double cloud_stamp_seconds = rclcpp::Time(input_pointcloud_msg_ptr->header.stamp).seconds(); + + debug_publisher_->publish<tier4_debug_msgs::msg::Float64Stamped>( + "debug/latency_ms", 1000.f * (now_stamp_seconds - cloud_stamp_seconds)); + } + + // Clear indexes + std::fill(next_ring_index_.begin(), next_ring_index_.end(), 0); + + // Clear pointcloud buffer + auto host_buffer = host_buffer_.get(); + for (std::size_t i = 0; i < num_rings_ * max_points_per_ring_; i++) { + host_buffer[i] = autoware::point_types::PointXYZIRCAEDT{}; + } +} + +void CudaPointcloudPreprocessorNode::cudaPointcloudCallback( const std::shared_ptr<const cuda_blackboard::CudaPointCloud2> input_pointcloud_msg_ptr) { + double t1 = 1e-6 * std::chrono::duration_cast<std::chrono::nanoseconds>( + std::chrono::high_resolution_clock::now().time_since_epoch()) + .count(); static_assert( sizeof(InputPointType) == sizeof(autoware::point_types::PointXYZIRCAEDT), "PointStruct and PointXYZIRCAEDT must have the same size"); @@ -237,10 +617,42 @@ void CudaPointcloudPreprocessorNode::pointcloudCallback( return; } + double t2 = 1e-6 * std::chrono::duration_cast<std::chrono::nanoseconds>( + std::chrono::high_resolution_clock::now().time_since_epoch()) + .count(); + auto output_pointcloud_ptr = cuda_pointcloud_preprocessor_->process( *input_pointcloud_msg_ptr, transform_msg, twist_queue_, angular_velocity_queue_); output_pointcloud_ptr->header.frame_id = base_frame_; + double t3 = 1e-6 * std::chrono::duration_cast<std::chrono::nanoseconds>( + std::chrono::high_resolution_clock::now().time_since_epoch()) + .count(); + + if (iteration_ >= 0) { + processing_time_ms_ += (t2 - t1); + kernel_time_ms_ += (t3 - t2); + } + iteration_++; + + if (num_rings_ > 64) { + RCLCPP_INFO( + get_logger(), + "Organize time: %f ms, Copy time: %f ms, Process time: %f ms, Kernel time: %f ms Total time: " + "%f ms", + organize_time_ms_ / iteration_, copy_time_ms_ / iteration_, processing_time_ms_ / iteration_, + kernel_time_ms_ / iteration_, + (organize_time_ms_ + copy_time_ms_ + processing_time_ms_ + kernel_time_ms_) / iteration_); + RCLCPP_INFO( + get_logger(), + "\t Cuda Organize time: %f ms, Cuda Copy time: %f ms, Cuda Process time: %f ms, Cuda Kernel " + "time: %f ms Total time: %f ms", + organize_time_cuda_ms_ / iteration_, copy_time_cuda_ms_ / iteration_, + processing_time_ms_ / iteration_, kernel_time_ms_ / iteration_, + (organize_time_cuda_ms_ + copy_time_cuda_ms_ + processing_time_ms_ + kernel_time_ms_) / + iteration_); + } + // Publish pub_->publish(std::move(output_pointcloud_ptr)); From 5482f9c4cd7d9764e80eb937ddf73e2ca702d76e Mon Sep 17 00:00:00 2001 From: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp> Date: Wed, 5 Feb 2025 15:47:33 +0900 Subject: [PATCH 26/55] feat: reimplemented the organized pointcloud adapter in cuda. the only bottleneck is the H->D copy Signed-off-by: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp> --- .../CMakeLists.txt | 13 - ...cuda_organized_pointcloud_adapter_node.hpp | 76 --- .../cuda_pointcloud_preprocessor.hpp | 68 +-- .../cuda_pointcloud_preprocessor_node.hpp | 48 +- ...cuda_organized_pointcloud_adapter_node.cpp | 200 -------- .../cuda_pointcloud_preprocessor.cu | 434 ++++++++++++------ .../cuda_pointcloud_preprocessor_node.cpp | 422 +---------------- 7 files changed, 331 insertions(+), 930 deletions(-) delete mode 100644 sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_organized_pointcloud_adapter/cuda_organized_pointcloud_adapter_node.hpp delete mode 100644 sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_organized_pointcloud_adapter/cuda_organized_pointcloud_adapter_node.cpp diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/CMakeLists.txt b/sensing/autoware_cuda_pointcloud_preprocessor/CMakeLists.txt index d09f787a3b277..8abff042d6a3a 100644 --- a/sensing/autoware_cuda_pointcloud_preprocessor/CMakeLists.txt +++ b/sensing/autoware_cuda_pointcloud_preprocessor/CMakeLists.txt @@ -54,14 +54,6 @@ target_include_directories(cuda_pointcloud_preprocessor_lib SYSTEM PRIVATE ) # Targets -ament_auto_add_library(cuda_organized_pointcloud_adapter SHARED - src/cuda_organized_pointcloud_adapter/cuda_organized_pointcloud_adapter_node.cpp -) - -target_link_libraries(cuda_organized_pointcloud_adapter - ${CUDA_LIBRARIES} -) - ament_auto_add_library(cuda_pointcloud_preprocessor SHARED src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.cpp ) @@ -71,11 +63,6 @@ target_link_libraries(cuda_pointcloud_preprocessor cuda_pointcloud_preprocessor_lib ) -rclcpp_components_register_node(cuda_organized_pointcloud_adapter - PLUGIN "autoware::cuda_organized_pointcloud_adapter::CudaOrganizedPointcloudAdapterNode" - EXECUTABLE cuda_organized_pointcloud_adapter_node -) - rclcpp_components_register_node(cuda_pointcloud_preprocessor PLUGIN "autoware::cuda_pointcloud_preprocessor::CudaPointcloudPreprocessorNode" EXECUTABLE cuda_pointcloud_preprocessor_node diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_organized_pointcloud_adapter/cuda_organized_pointcloud_adapter_node.hpp b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_organized_pointcloud_adapter/cuda_organized_pointcloud_adapter_node.hpp deleted file mode 100644 index 7a31442f26f1d..0000000000000 --- a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_organized_pointcloud_adapter/cuda_organized_pointcloud_adapter_node.hpp +++ /dev/null @@ -1,76 +0,0 @@ -// Copyright 2024 TIER IV, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef AUTOWARE__CUDA_ORGANIZED_POINTCLOUD_ADAPTER__CUDA_ORGANIZED_POINTCLOUD_ADAPTER_NODE_HPP_ -#define AUTOWARE__CUDA_ORGANIZED_POINTCLOUD_ADAPTER__CUDA_ORGANIZED_POINTCLOUD_ADAPTER_NODE_HPP_ - -#include <autoware/point_types/types.hpp> -#include <autoware/universe_utils/ros/debug_publisher.hpp> -#include <autoware/universe_utils/system/stop_watch.hpp> -#include <cuda_blackboard/cuda_adaptation.hpp> -#include <cuda_blackboard/cuda_blackboard_publisher.hpp> -#include <cuda_blackboard/cuda_pointcloud2.hpp> -#include <rclcpp/rclcpp.hpp> - -#include <sensor_msgs/msg/point_cloud2.hpp> - -#include <tf2/transform_datatypes.h> -#include <tf2_ros/buffer.h> -#include <tf2_ros/transform_listener.h> - -#include <chrono> -#include <deque> -#include <memory> -#include <vector> - -namespace autoware::cuda_organized_pointcloud_adapter -{ - -class CudaOrganizedPointcloudAdapterNode : public rclcpp::Node -{ -public: - explicit CudaOrganizedPointcloudAdapterNode(const rclcpp::NodeOptions & node_options); - ~CudaOrganizedPointcloudAdapterNode() = default; - -private: - void estimatePointcloudRingInfo( - const sensor_msgs::msg::PointCloud2::ConstSharedPtr input_pointcloud_msg_ptr); - - bool orderPointcloud( - const sensor_msgs::msg::PointCloud2::ConstSharedPtr input_pointcloud_msg_ptr); - - // Callback - void pointcloudCallback( - const sensor_msgs::msg::PointCloud2::ConstSharedPtr input_pointcloud_msg_ptr); - - // Subscriber - rclcpp::Subscription<sensor_msgs::msg::PointCloud2>::SharedPtr pointcloud_sub_{}; - - // CUDA pub - std::unique_ptr<cuda_blackboard::CudaBlackboardPublisher<cuda_blackboard::CudaPointCloud2>> pub_; - - std::size_t num_rings_{0}; - std::size_t max_points_per_ring_{0}; - - std::vector<std::size_t> next_ring_index_; - cuda_blackboard::HostUniquePtr<autoware::point_types::PointXYZIRCAEDT[]> host_buffer_; - cuda_blackboard::CudaUniquePtr<std::uint8_t[]> device_buffer_; - - std::unique_ptr<autoware::universe_utils::StopWatch<std::chrono::milliseconds>> stop_watch_ptr_; - std::unique_ptr<autoware::universe_utils::DebugPublisher> debug_publisher_; -}; - -} // namespace autoware::cuda_organized_pointcloud_adapter - -#endif // AUTOWARE__CUDA_ORGANIZED_POINTCLOUD_ADAPTER__CUDA_ORGANIZED_POINTCLOUD_ADAPTER_NODE_HPP_ diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.hpp b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.hpp index 3dbc968d9376a..7c4c0113d3618 100644 --- a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.hpp +++ b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.hpp @@ -35,26 +35,6 @@ namespace autoware::cuda_pointcloud_preprocessor { -void organizeLaunch( - const InputPointType * __restrict__ input_points, std::uint32_t * index_tensor, - std::int32_t * ring_indexes, std::int32_t initial_max_rings, std::int32_t * output_max_rings, - std::int32_t initial_max_points_per_ring, std::int32_t * output_max_points_per_ring, - int num_points, cudaStream_t & stream); - -std::size_t querySortWorkspace( - int num_items, int num_segments, int * d_offsets, std::uint32_t * d_keys_in, - std::uint32_t * d_keys_out); - -void sortLaunch( - int num_items, int num_segments, int * d_offsets, std::uint32_t * d_keys_in, - std::uint32_t * d_keys_out, std::uint8_t * d_temp_storage, std::size_t temp_storage_bytes, - cudaStream_t & stream); // e.g., [-, -, -, -, -, -, -] - -void gatherLaunch( - const InputPointType * __restrict__ input_points, const std::uint32_t * __restrict__ index_tensor, - InputPointType * __restrict__ output_points, int num_rings, int max_points_per_ring, - cudaStream_t & stream); - struct TwistStruct2D { float cum_x; @@ -62,8 +42,8 @@ struct TwistStruct2D float cum_theta; float cum_cos_theta; float cum_sin_theta; - uint32_t last_stamp_nsec; // relative to the start of the pointcloud - uint32_t stamp_nsec; // relative to the start of the pointcloud + std::uint32_t last_stamp_nsec; // relative to the start of the pointcloud + std::uint32_t stamp_nsec; // relative to the start of the pointcloud float v_x; float v_theta; }; @@ -71,8 +51,8 @@ struct TwistStruct2D struct TwistStruct3D { float cum_transform_buffer[16]; - uint32_t last_stamp_nsec; // relative to the start of the pointcloud - uint32_t stamp_nsec; // relative to the start of the pointcloud + std::uint32_t last_stamp_nsec; // relative to the start of the pointcloud + std::uint32_t stamp_nsec; // relative to the start of the pointcloud float v[3]; float w[3]; }; @@ -143,29 +123,38 @@ class CudaPointcloudPreprocessor void preallocateOutput(); std::unique_ptr<cuda_blackboard::CudaPointCloud2> process( - const cuda_blackboard::CudaPointCloud2 & input_pointcloud_msg, + const sensor_msgs::msg::PointCloud2::ConstSharedPtr input_pointcloud_msg_ptr, const geometry_msgs::msg::TransformStamped & transform_msg, const std::deque<geometry_msgs::msg::TwistWithCovarianceStamped> & twist_queue, - const std::deque<geometry_msgs::msg::Vector3Stamped> & angular_velocity_queue); + const std::deque<geometry_msgs::msg::Vector3Stamped> & angular_velocity_queue, + const std::uint32_t first_point_rel_stamp_nsec); private: void setupTwist2DStructs( - const cuda_blackboard::CudaPointCloud2 & input_pointcloud_msg, const std::deque<geometry_msgs::msg::TwistWithCovarianceStamped> & twist_queue, - const std::deque<geometry_msgs::msg::Vector3Stamped> & angular_velocity_queue); + const std::deque<geometry_msgs::msg::Vector3Stamped> & angular_velocity_queue, + const std::uint64_t pointcloud_stamp_nsec, const std::uint32_t first_point_rel_stamp_nsec); void setupTwist3DStructs( - const cuda_blackboard::CudaPointCloud2 & input_pointcloud_msg, const std::deque<geometry_msgs::msg::TwistWithCovarianceStamped> & twist_queue, - const std::deque<geometry_msgs::msg::Vector3Stamped> & angular_velocity_queue); + const std::deque<geometry_msgs::msg::Vector3Stamped> & angular_velocity_queue, + const std::uint64_t pointcloud_stamp_nsec, const std::uint32_t first_point_rel_stamp_nsec); + + std::size_t querySortWorkspace( + int num_items, int num_segments, int * offsets_device, std::uint32_t * keys_in_device, + std::uint32_t * keys_out_device); + + void organizePointcloud(); CropBoxParameters self_crop_box_parameters_{}; CropBoxParameters mirror_crop_box_parameters_{}; RingOutlierFilterParameters ring_outlier_parameters_{}; bool use_3d_undistortion_{false}; - int max_rings_{}; + int num_rings_{}; int max_points_per_ring_{}; + std::size_t num_raw_points_{}; + std::size_t num_organized_points_{}; std::vector<sensor_msgs::msg::PointField> point_fields_{}; std::unique_ptr<cuda_blackboard::CudaPointCloud2> output_pointcloud_ptr_{}; @@ -174,6 +163,21 @@ class CudaPointcloudPreprocessor int max_blocks_per_grid_{}; const int threads_per_block_{256}; cudaMemPool_t device_memory_pool_; + + // Organizing buffers + thrust::device_vector<InputPointType> device_input_points_; + thrust::device_vector<InputPointType> device_organized_points_; + thrust::device_vector<std::int32_t> device_ring_index_; + thrust::device_vector<std::uint32_t> device_indexes_tensor_; + thrust::device_vector<std::uint32_t> device_sorted_indexes_tensor_; + thrust::device_vector<std::int32_t> device_segment_offsets_; + thrust::device_vector<std::int32_t> device_max_ring_; + thrust::device_vector<std::int32_t> device_max_points_per_ring_; + + thrust::device_vector<std::uint8_t> device_sort_workspace_; + std::size_t sort_workspace_bytes_{0}; + + // Pointcloud preprocessing buffers thrust::device_vector<InputPointType> device_transformed_points_{}; thrust::device_vector<OutputPointType> device_output_points_{}; thrust::device_vector<std::uint32_t> device_crop_mask_{}; @@ -184,6 +188,8 @@ class CudaPointcloudPreprocessor thrust::device_vector<CropBoxParameters> host_crop_box_structs_{}; thrust::device_vector<CropBoxParameters> device_crop_box_structs_{}; + std::vector<TwistStruct2D> host_twist_2d_structs_; + std::vector<TwistStruct3D> host_twist_3d_structs_; }; } // namespace autoware::cuda_pointcloud_preprocessor diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.hpp b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.hpp index 6a45a79b63c65..afc2f6cd74de0 100644 --- a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.hpp +++ b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.hpp @@ -36,7 +36,6 @@ #include <tf2_ros/buffer.h> #include <tf2_ros/transform_listener.h> -#include <chrono> #include <deque> #include <memory> #include <vector> @@ -81,12 +80,6 @@ class CudaPointcloudPreprocessorNode : public rclcpp::Node const std::string & target_frame, const std::string & source_frame, tf2::Transform * tf2_transform_ptr); - void estimatePointcloudRingInfo( - const sensor_msgs::msg::PointCloud2::ConstSharedPtr input_pointcloud_msg_ptr); - - bool orderPointcloud( - const sensor_msgs::msg::PointCloud2::ConstSharedPtr input_pointcloud_msg_ptr); - // Callback void pointcloudCallback( const sensor_msgs::msg::PointCloud2::ConstSharedPtr input_pointcloud_msg_ptr); @@ -103,55 +96,18 @@ class CudaPointcloudPreprocessorNode : public rclcpp::Node std::deque<geometry_msgs::msg::TwistWithCovarianceStamped> twist_queue_; std::deque<geometry_msgs::msg::Vector3Stamped> angular_velocity_queue_; - std::size_t num_rings_{0}; - std::size_t max_points_per_ring_{0}; - std::vector<std::size_t> next_ring_index_; - - // Subscriber + // Subscribers rclcpp::Subscription<geometry_msgs::msg::TwistWithCovarianceStamped>::SharedPtr twist_sub_{}; rclcpp::Subscription<sensor_msgs::msg::Imu>::SharedPtr imu_sub_{}; rclcpp::Subscription<sensor_msgs::msg::PointCloud2>::SharedPtr pointcloud_sub_{}; - cuda_blackboard::CudaUniquePtr<InputPointType[]> points_device_; - cuda_blackboard::CudaUniquePtr<InputPointType[]> organized_points_device_; - cuda_blackboard::CudaUniquePtr<std::int32_t[]> ring_index_device_; - cuda_blackboard::CudaUniquePtr<std::uint32_t[]> indexes_tensor_device_; - cuda_blackboard::CudaUniquePtr<std::uint32_t[]> sorted_indexes_tensor_device_; - cuda_blackboard::CudaUniquePtr<std::int32_t[]> segment_offsets_device_; - cuda_blackboard::CudaUniquePtr<std::int32_t> max_rings_device_; - cuda_blackboard::CudaUniquePtr<std::int32_t> max_points_per_ring_device_; - - cuda_blackboard::CudaUniquePtr<std::uint8_t[]> sort_workspace_device_; - std::size_t sort_workspace_bytes_{0}; - - std::int32_t test_num_rings_{0}; - std::int32_t test_max_points_per_ring_{0}; - cudaStream_t stream_; - std::int32_t capacity_{0}; - - cuda_blackboard::HostUniquePtr<autoware::point_types::PointXYZIRCAEDT[]> host_buffer_; - // cuda_blackboard::CudaUniquePtr<std::uint8_t[]> device_buffer_; TODO(knzo25) replaced by a - // CudaPointCloud - std::shared_ptr<cuda_blackboard::CudaPointCloud2> input_pointcloud_ptr_; - - // CUDA pub & sub + // CUDA pub std::unique_ptr<cuda_blackboard::CudaBlackboardPublisher<cuda_blackboard::CudaPointCloud2>> pub_; - // std::unique_ptr<cuda_blackboard::CudaBlackboardSubscriber<cuda_blackboard::CudaPointCloud2>> - // sub_; std::unique_ptr<CudaPointcloudPreprocessor> cuda_pointcloud_preprocessor_; std::unique_ptr<autoware::universe_utils::StopWatch<std::chrono::milliseconds>> stop_watch_ptr_; std::unique_ptr<autoware::universe_utils::DebugPublisher> debug_publisher_; - - double organize_time_cuda_ms_{0.0}; - double copy_time_cuda_ms_{0.0}; - - double organize_time_ms_{0.0}; - double copy_time_ms_{0.0}; - double processing_time_ms_{0.0}; - double kernel_time_ms_{0.0}; - int iteration_{0}; }; } // namespace autoware::cuda_pointcloud_preprocessor diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_organized_pointcloud_adapter/cuda_organized_pointcloud_adapter_node.cpp b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_organized_pointcloud_adapter/cuda_organized_pointcloud_adapter_node.cpp deleted file mode 100644 index b7eeccafa2d5a..0000000000000 --- a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_organized_pointcloud_adapter/cuda_organized_pointcloud_adapter_node.cpp +++ /dev/null @@ -1,200 +0,0 @@ -// Copyright 2024 TIER IV, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "autoware/cuda_organized_pointcloud_adapter/cuda_organized_pointcloud_adapter_node.hpp" - -#include <cuda_runtime.h> - -#include <vector> - -namespace autoware::cuda_organized_pointcloud_adapter -{ -using sensor_msgs::msg::PointCloud2; - -CudaOrganizedPointcloudAdapterNode::CudaOrganizedPointcloudAdapterNode( - const rclcpp::NodeOptions & node_options) -: Node("cuda_organized_pointcloud_adapter", node_options) -{ - pub_ = - std::make_unique<cuda_blackboard::CudaBlackboardPublisher<cuda_blackboard::CudaPointCloud2>>( - *this, "~/output/pointcloud"); - - rclcpp::SubscriptionOptions sub_options; - sub_options.use_intra_process_comm = rclcpp::IntraProcessSetting::Enable; - - pointcloud_sub_ = this->create_subscription<sensor_msgs::msg::PointCloud2>( - "~/input/pointcloud", rclcpp::SensorDataQoS{}.keep_last(1), - std::bind(&CudaOrganizedPointcloudAdapterNode::pointcloudCallback, this, std::placeholders::_1), - sub_options); - - // initialize debug tool - { - using autoware::universe_utils::DebugPublisher; - using autoware::universe_utils::StopWatch; - - stop_watch_ptr_ = std::make_unique<StopWatch<std::chrono::milliseconds>>(); - debug_publisher_ = std::make_unique<DebugPublisher>(this, "cuda_organized_pointcloud_adapter"); - stop_watch_ptr_->tic("processing_time"); - } -} - -void CudaOrganizedPointcloudAdapterNode::estimatePointcloudRingInfo( - const sensor_msgs::msg::PointCloud2::ConstSharedPtr input_pointcloud_msg_ptr) -{ - const autoware::point_types::PointXYZIRCAEDT * input_buffer = - reinterpret_cast<const autoware::point_types::PointXYZIRCAEDT *>( - input_pointcloud_msg_ptr->data.data()); - - std::size_t max_ring = 0; - - for (std::size_t i = 0; i < input_pointcloud_msg_ptr->width * input_pointcloud_msg_ptr->height; - i++) { - const autoware::point_types::PointXYZIRCAEDT & point = input_buffer[i]; - const std::size_t ring = static_cast<std::size_t>(point.channel); - max_ring = std::max(max_ring, ring); - } - - // Set max rings to the next power of two - num_rings_ = std::pow(2, std::ceil(std::log2(max_ring + 1))); - num_rings_ = std::max(num_rings_, static_cast<std::size_t>(16)); - std::vector<std::size_t> ring_points(num_rings_, 0); - - for (std::size_t i = 0; i < input_pointcloud_msg_ptr->width * input_pointcloud_msg_ptr->height; - i++) { - const autoware::point_types::PointXYZIRCAEDT & point = input_buffer[i]; - const std::size_t ring = point.channel; - ring_points[ring]++; - } - - // Set max points per ring to the next multiple of 512 - max_points_per_ring_ = *std::max_element(ring_points.begin(), ring_points.end()); - max_points_per_ring_ = std::max(max_points_per_ring_, static_cast<std::size_t>(512)); - max_points_per_ring_ = (max_points_per_ring_ + 511) / 512 * 512; - - next_ring_index_.resize(num_rings_); - std::fill(next_ring_index_.begin(), next_ring_index_.end(), 0); - host_buffer_ = cuda_blackboard::make_host_unique<autoware::point_types::PointXYZIRCAEDT[]>( - num_rings_ * max_points_per_ring_); - - device_buffer_ = cuda_blackboard::make_unique<std::uint8_t[]>( - num_rings_ * max_points_per_ring_ * sizeof(autoware::point_types::PointXYZIRCAEDT)); - - RCLCPP_INFO_STREAM( - get_logger(), "Estimated rings: " << num_rings_ - << ", max_points_per_ring: " << max_points_per_ring_ - << ". This should only be done during the first iterations. " - "Otherwise, performance will be affected."); -} - -bool CudaOrganizedPointcloudAdapterNode::orderPointcloud( - const sensor_msgs::msg::PointCloud2::ConstSharedPtr input_pointcloud_msg_ptr) -{ - const autoware::point_types::PointXYZIRCAEDT * input_buffer = - reinterpret_cast<const autoware::point_types::PointXYZIRCAEDT *>( - input_pointcloud_msg_ptr->data.data()); - - bool ring_overflow = false; - bool point_overflow = false; - - autoware::point_types::PointXYZIRCAEDT * buffer = host_buffer_.get(); - - for (std::size_t i = 0; i < input_pointcloud_msg_ptr->width * input_pointcloud_msg_ptr->height; - i++) { - const autoware::point_types::PointXYZIRCAEDT & point = input_buffer[i]; - const std::size_t raw_ring = point.channel; - const std::size_t ring = raw_ring % num_rings_; - - const std::size_t raw_index = next_ring_index_[ring]; - const std::size_t index = raw_index % max_points_per_ring_; - - ring_overflow |= raw_ring >= num_rings_; - point_overflow |= raw_index >= max_points_per_ring_; - - buffer[ring * max_points_per_ring_ + index] = point; - next_ring_index_[ring] = raw_index + 1; - } - - return !ring_overflow && !point_overflow; -} - -void CudaOrganizedPointcloudAdapterNode::pointcloudCallback( - const sensor_msgs::msg::PointCloud2::ConstSharedPtr input_pointcloud_msg_ptr) -{ - stop_watch_ptr_->toc("processing_time", true); - - // TODO(knzo25): check the pointcloud layout at least once - - assert(input_pointcloud_msg_ptr->point_step == sizeof(autoware::point_types::PointXYZIRCAEDT)); - - if (num_rings_ == 0 || max_points_per_ring_ == 0) { - estimatePointcloudRingInfo(input_pointcloud_msg_ptr); - } - - if (!orderPointcloud(input_pointcloud_msg_ptr)) { - estimatePointcloudRingInfo(input_pointcloud_msg_ptr); - orderPointcloud(input_pointcloud_msg_ptr); - } - - // Copy to cuda memory - cudaMemcpy( - device_buffer_.get(), host_buffer_.get(), - num_rings_ * max_points_per_ring_ * sizeof(autoware::point_types::PointXYZIRCAEDT), - cudaMemcpyHostToDevice); - - auto cuda_pointcloud_msg_ptr = std::make_unique<cuda_blackboard::CudaPointCloud2>(); - cuda_pointcloud_msg_ptr->fields = input_pointcloud_msg_ptr->fields; - cuda_pointcloud_msg_ptr->width = max_points_per_ring_; - cuda_pointcloud_msg_ptr->height = num_rings_; - cuda_pointcloud_msg_ptr->point_step = sizeof(autoware::point_types::PointXYZIRCAEDT); - cuda_pointcloud_msg_ptr->row_step = - max_points_per_ring_ * sizeof(autoware::point_types::PointXYZIRCAEDT); - cuda_pointcloud_msg_ptr->data = - std::move(device_buffer_); /*reinterpret_cast<uint8_t *>(device_buffer_)*/ - ; - cuda_pointcloud_msg_ptr->is_dense = input_pointcloud_msg_ptr->is_dense; - cuda_pointcloud_msg_ptr->header = input_pointcloud_msg_ptr->header; - - pub_->publish(std::move(cuda_pointcloud_msg_ptr)); - - if (debug_publisher_) { - const double processing_time_ms = stop_watch_ptr_->toc("processing_time", true); - debug_publisher_->publish<tier4_debug_msgs::msg::Float64Stamped>( - "debug/processing_time_ms", processing_time_ms); - - double now_stamp_seconds = rclcpp::Time(this->get_clock()->now()).seconds(); - double cloud_stamp_seconds = rclcpp::Time(input_pointcloud_msg_ptr->header.stamp).seconds(); - - debug_publisher_->publish<tier4_debug_msgs::msg::Float64Stamped>( - "debug/latency_ms", 1000.f * (now_stamp_seconds - cloud_stamp_seconds)); - } - - // Allocate cuda memory - device_buffer_ = cuda_blackboard::make_unique<std::uint8_t[]>( - num_rings_ * max_points_per_ring_ * sizeof(autoware::point_types::PointXYZIRCAEDT)); - - // Clear indexes - std::fill(next_ring_index_.begin(), next_ring_index_.end(), 0); - - // Clear pointcloud buffer - auto host_buffer = host_buffer_.get(); - for (std::size_t i = 0; i < num_rings_ * max_points_per_ring_; i++) { - host_buffer[i] = autoware::point_types::PointXYZIRCAEDT{}; - } -} - -} // namespace autoware::cuda_organized_pointcloud_adapter - -#include "rclcpp_components/register_node_macro.hpp" -RCLCPP_COMPONENTS_REGISTER_NODE( - autoware::cuda_organized_pointcloud_adapter::CudaOrganizedPointcloudAdapterNode) diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu index d99d384b0c9ab..d49f6eeccce42 100644 --- a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu +++ b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu @@ -23,10 +23,8 @@ #include <cuda_runtime.h> #include <tf2/utils.h> #include <thrust/execution_policy.h> -#include <thrust/host_vector.h> #include <thrust/reduce.h> #include <thrust/scan.h> -// #include <cub/device/device_segmented_radix_sort.cuh> namespace autoware::cuda_pointcloud_preprocessor { @@ -100,47 +98,6 @@ __global__ void organizeKernel( index_tensor[ring * initial_max_points_per_ring + next_offset] = idx; } -void organizeLaunch( - const InputPointType * __restrict__ input_points, std::uint32_t * index_tensor, - std::int32_t * ring_indexes, std::int32_t initial_max_rings, std::int32_t * output_max_rings, - std::int32_t initial_max_points_per_ring, std::int32_t * output_max_points_per_ring, - int num_points, cudaStream_t & stream) -{ - int threads_per_block = 256; - int blocks_per_grid = (num_points + threads_per_block - 1) / threads_per_block; - organizeKernel<<<blocks_per_grid, threads_per_block>>>( - input_points, index_tensor, ring_indexes, initial_max_rings, output_max_rings, - initial_max_points_per_ring, output_max_points_per_ring, num_points); -} - -std::size_t querySortWorkspace( - int num_items, int num_segments, int * d_offsets, std::uint32_t * d_keys_in, - std::uint32_t * d_keys_out) -{ - // Determine temporary device storage requirements - void * d_temp_storage = nullptr; - size_t temp_storage_bytes = 0; - cub::DeviceSegmentedRadixSort::SortKeys( - d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items, num_segments, d_offsets, - d_offsets + 1); - - return temp_storage_bytes; -} - -void sortLaunch( - int num_items, int num_segments, int * d_offsets, std::uint32_t * d_keys_in, - std::uint32_t * d_keys_out, std::uint8_t * d_temp_storage, std::size_t temp_storage_bytes, - cudaStream_t & stream) -{ - // Allocate temporary storage - // cudaMalloc(&d_temp_storage, temp_storage_bytes); - - // Run sorting operation - cub::DeviceSegmentedRadixSort::SortKeys( - reinterpret_cast<void *>(d_temp_storage), temp_storage_bytes, d_keys_in, d_keys_out, num_items, - num_segments, d_offsets, d_offsets + 1, 0, sizeof(std::uint32_t) * 8, stream); -} - __global__ void gatherKernel( const InputPointType * __restrict__ input_points, const std::uint32_t * __restrict__ index_tensor, InputPointType * __restrict__ output_points, int num_rings, int max_points_per_ring) @@ -162,18 +119,6 @@ __global__ void gatherKernel( } } -void gatherLaunch( - const InputPointType * __restrict__ input_points, const std::uint32_t * __restrict__ index_tensor, - InputPointType * __restrict__ output_points, int num_rings, int max_points_per_ring, - cudaStream_t & stream) -{ - int threads_per_block = 256; - int blocks_per_grid = - (num_rings * max_points_per_ring + threads_per_block - 1) / threads_per_block; - gatherKernel<<<blocks_per_grid, threads_per_block, 0, stream>>>( - input_points, index_tensor, output_points, num_rings, max_points_per_ring); -} - __global__ void transformPointsKernel( const InputPointType * __restrict__ input_points, InputPointType * output_points, int num_points, TransformStruct transform) @@ -221,8 +166,8 @@ __global__ void cropBoxKernel( } __global__ void combineMasksKernel( - const uint32_t * __restrict__ mask1, const uint32_t * __restrict__ mask2, int num_points, - uint32_t * __restrict__ output_mask) + const std::uint32_t * __restrict__ mask1, const std::uint32_t * __restrict__ mask2, + int num_points, std::uint32_t * __restrict__ output_mask) { int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < num_points) { @@ -231,7 +176,7 @@ __global__ void combineMasksKernel( } __global__ void extractInputPointIndicesKernel( - InputPointType * input_points, uint32_t * masks, uint32_t * indices, int num_points, + InputPointType * input_points, std::uint32_t * masks, std::uint32_t * indices, int num_points, InputPointType * output_points) { int idx = blockIdx.x * blockDim.x + threadIdx.x; @@ -241,7 +186,7 @@ __global__ void extractInputPointIndicesKernel( } __global__ void extractOutputPointIndicesKernel( - OutputPointType * input_points, uint32_t * masks, uint32_t * indices, int num_points, + OutputPointType * input_points, std::uint32_t * masks, std::uint32_t * indices, int num_points, OutputPointType * output_points) { int idx = blockIdx.x * blockDim.x + threadIdx.x; @@ -251,7 +196,7 @@ __global__ void extractOutputPointIndicesKernel( } __global__ void extractInputPointsToOutputPoints_indicesKernel( - InputPointType * input_points, uint32_t * masks, uint32_t * indices, int num_points, + InputPointType * input_points, std::uint32_t * masks, std::uint32_t * indices, int num_points, OutputPointType * output_points) { int idx = blockIdx.x * blockDim.x + threadIdx.x; @@ -340,8 +285,9 @@ __global__ void undistort3dKernel( } __global__ void ringOutlierFilterKernel( - const InputPointType * d_points, uint32_t * output_mask, int num_rings, int max_points_per_ring, - float distance_ratio, float object_length_threshold_squared, int num_points_threshold) + const InputPointType * d_points, std::uint32_t * output_mask, int num_rings, + int max_points_per_ring, float distance_ratio, float object_length_threshold_squared, + int num_points_threshold) { int idx = blockIdx.x * blockDim.x + threadIdx.x; int j = idx / max_points_per_ring; @@ -476,10 +422,73 @@ CudaPointcloudPreprocessor::CudaPointcloudPreprocessor() cudaMemPoolCreate(&device_memory_pool_, &pool_props); MemoryPoolAllocator<TwistStruct2D> allocator_2d(device_memory_pool_); MemoryPoolAllocator<TwistStruct3D> allocator_3d(device_memory_pool_); + MemoryPoolAllocator<std::int32_t> allocator_int32(device_memory_pool_); + MemoryPoolAllocator<std::uint32_t> allocator_uint32(device_memory_pool_); + MemoryPoolAllocator<std::uint8_t> allocator_uint8(device_memory_pool_); + MemoryPoolAllocator<InputPointType> allocator_points(device_memory_pool_); + device_twist_2d_structs_ = thrust::device_vector<TwistStruct2D, MemoryPoolAllocator<TwistStruct2D>>(allocator_2d); device_twist_3d_structs_ = thrust::device_vector<TwistStruct3D, MemoryPoolAllocator<TwistStruct3D>>(allocator_3d); + + num_rings_ = 1; + max_points_per_ring_ = 1; + num_organized_points_ = num_rings_ * max_points_per_ring_; + device_ring_index_ = + thrust::device_vector<std::int32_t, MemoryPoolAllocator<std::int32_t>>(allocator_int32); + device_ring_index_.resize(num_rings_); + + device_indexes_tensor_ = + thrust::device_vector<std::uint32_t, MemoryPoolAllocator<std::uint32_t>>(allocator_uint32); + device_sorted_indexes_tensor_ = + thrust::device_vector<std::uint32_t, MemoryPoolAllocator<std::uint32_t>>(allocator_uint32); + + device_indexes_tensor_.resize(num_organized_points_); + device_sorted_indexes_tensor_.resize(num_organized_points_); + + device_segment_offsets_ = + thrust::device_vector<std::int32_t, MemoryPoolAllocator<std::int32_t>>(allocator_int32); + device_segment_offsets_.resize(num_rings_ + 1); + device_segment_offsets_[0] = 0; + device_segment_offsets_[1] = 1; + + device_max_ring_ = + thrust::device_vector<std::int32_t, MemoryPoolAllocator<std::int32_t>>(allocator_int32); + device_max_ring_.resize(1); + + device_max_points_per_ring_ = + thrust::device_vector<std::int32_t, MemoryPoolAllocator<std::int32_t>>(allocator_int32); + device_max_points_per_ring_.resize(1); + + device_input_points_ = + thrust::device_vector<InputPointType, MemoryPoolAllocator<InputPointType>>(allocator_points); + device_organized_points_ = + thrust::device_vector<InputPointType, MemoryPoolAllocator<InputPointType>>(allocator_points); + device_organized_points_.resize(num_organized_points_); + + cudaMemsetAsync( + thrust::raw_pointer_cast(device_max_ring_.data()), 0, sizeof(std::int32_t), stream_); + cudaMemsetAsync( + thrust::raw_pointer_cast(device_max_points_per_ring_.data()), 0, sizeof(std::int32_t), stream_); + cudaMemsetAsync( + thrust::raw_pointer_cast(device_indexes_tensor_.data()), 0x255, sizeof(std::uint32_t), stream_); + + sort_workspace_bytes_ = querySortWorkspace( + num_rings_ * max_points_per_ring_, num_rings_, + thrust::raw_pointer_cast(device_segment_offsets_.data()), + thrust::raw_pointer_cast(device_indexes_tensor_.data()), + thrust::raw_pointer_cast(device_sorted_indexes_tensor_.data())); + + device_sort_workspace_ = + thrust::device_vector<std::uint8_t, MemoryPoolAllocator<std::uint8_t>>(allocator_uint8); + + device_transformed_points_.resize(num_organized_points_); + device_crop_mask_.resize(num_organized_points_); + device_ring_outlier_mask_.resize(num_organized_points_); + device_indices_.resize(num_organized_points_); + + preallocateOutput(); } void CudaPointcloudPreprocessor::setCropBoxParameters( @@ -504,58 +513,51 @@ void CudaPointcloudPreprocessor::preallocateOutput() { output_pointcloud_ptr_ = std::make_unique<cuda_blackboard::CudaPointCloud2>(); output_pointcloud_ptr_->data = cuda_blackboard::make_unique<std::uint8_t[]>( - max_rings_ * max_points_per_ring_ * sizeof(OutputPointType)); + num_rings_ * max_points_per_ring_ * sizeof(OutputPointType)); } void CudaPointcloudPreprocessor::setupTwist2DStructs( - const cuda_blackboard::CudaPointCloud2 & input_pointcloud_msg, const std::deque<geometry_msgs::msg::TwistWithCovarianceStamped> & twist_queue, - const std::deque<geometry_msgs::msg::Vector3Stamped> & angular_velocity_queue) + const std::deque<geometry_msgs::msg::Vector3Stamped> & angular_velocity_queue, + const std::uint64_t pointcloud_stamp_nsec, const std::uint32_t first_point_rel_stamp_nsec) { - const InputPointType * device_input_points = - reinterpret_cast<const InputPointType *>(input_pointcloud_msg.data.get()); - InputPointType first_point; - cudaMemcpy(&first_point, &device_input_points[0], sizeof(InputPointType), cudaMemcpyDeviceToHost); + host_twist_2d_structs_.clear(); + host_twist_2d_structs_.reserve(twist_queue.size() + angular_velocity_queue.size()); // Twist preprocessing - - uint64_t pointcloud_stamp_nsec = 1'000'000'000 * input_pointcloud_msg.header.stamp.sec + - input_pointcloud_msg.header.stamp.nanosec; - - thrust::host_vector<TwistStruct2D> host_twist_structs; - float cum_x = 0; float cum_y = 0; float cum_theta = 0; // All time stamps from now on are in nsec from the "beginning of the pointcloud" - uint32_t last_stamp_nsec = first_point.time_stamp; + std::uint32_t last_stamp_nsec = first_point_rel_stamp_nsec; std::size_t twist_index = 0; std::size_t angular_velocity_index = 0; for (; twist_index < twist_queue.size() || angular_velocity_index < angular_velocity_queue.size();) { - uint64_t twist_stamp, input_twist_global_stamp_nsec, angular_velocity_global_stamp_nsec; + std::uint64_t twist_stamp, input_twist_global_stamp_nsec, angular_velocity_global_stamp_nsec; float v_x, v_theta; if (twist_index < twist_queue.size()) { input_twist_global_stamp_nsec = - 1'000'000'000 * static_cast<uint64_t>(twist_queue[twist_index].header.stamp.sec) + - static_cast<uint64_t>(twist_queue[twist_index].header.stamp.nanosec); + 1'000'000'000 * static_cast<std::uint64_t>(twist_queue[twist_index].header.stamp.sec) + + static_cast<std::uint64_t>(twist_queue[twist_index].header.stamp.nanosec); v_x = twist_queue[twist_index].twist.twist.linear.x; } else { - input_twist_global_stamp_nsec = std::numeric_limits<uint64_t>::max(); + input_twist_global_stamp_nsec = std::numeric_limits<std::uint64_t>::max(); v_x = 0.0; } if (angular_velocity_index < angular_velocity_queue.size()) { angular_velocity_global_stamp_nsec = - 1'000'000'000 * - static_cast<uint64_t>(angular_velocity_queue[angular_velocity_index].header.stamp.sec) + - static_cast<uint64_t>(angular_velocity_queue[angular_velocity_index].header.stamp.nanosec); + 1'000'000'000 * static_cast<std::uint64_t>( + angular_velocity_queue[angular_velocity_index].header.stamp.sec) + + static_cast<std::uint64_t>( + angular_velocity_queue[angular_velocity_index].header.stamp.nanosec); v_theta = angular_velocity_queue[angular_velocity_index].vector.z; } else { - angular_velocity_global_stamp_nsec = std::numeric_limits<uint64_t>::max(); + angular_velocity_global_stamp_nsec = std::numeric_limits<std::uint64_t>::max(); v_theta = 0.0; } @@ -575,15 +577,16 @@ void CudaPointcloudPreprocessor::setupTwist2DStructs( twist.cum_y = cum_y; twist.cum_theta = cum_theta; - uint64_t twist_global_stamp_nsec = twist_stamp; + std::uint64_t twist_global_stamp_nsec = twist_stamp; assert(twist_global_stamp_nsec > pointcloud_stamp_nsec); // by construction - uint32_t twist_from_pointcloud_start_nsec = twist_global_stamp_nsec - pointcloud_stamp_nsec; + std::uint32_t twist_from_pointcloud_start_nsec = + twist_global_stamp_nsec - pointcloud_stamp_nsec; twist.stamp_nsec = twist_from_pointcloud_start_nsec; twist.v_x = v_x; twist.v_theta = v_theta; twist.last_stamp_nsec = last_stamp_nsec; - host_twist_structs.push_back(twist); + host_twist_2d_structs_.push_back(twist); double dt_seconds = 1e-9 * (twist.stamp_nsec - last_stamp_nsec); last_stamp_nsec = twist.stamp_nsec; @@ -594,44 +597,39 @@ void CudaPointcloudPreprocessor::setupTwist2DStructs( } // Copy to device - device_twist_2d_structs_ = host_twist_structs; + device_twist_2d_structs_.resize(host_twist_2d_structs_.size()); + cudaMemcpyAsync( + thrust::raw_pointer_cast(device_twist_2d_structs_.data()), host_twist_2d_structs_.data(), + host_twist_2d_structs_.size() * sizeof(TwistStruct2D), cudaMemcpyHostToDevice, stream_); } void CudaPointcloudPreprocessor::setupTwist3DStructs( - const cuda_blackboard::CudaPointCloud2 & input_pointcloud_msg, const std::deque<geometry_msgs::msg::TwistWithCovarianceStamped> & twist_queue, - const std::deque<geometry_msgs::msg::Vector3Stamped> & angular_velocity_queue) + const std::deque<geometry_msgs::msg::Vector3Stamped> & angular_velocity_queue, + const std::uint64_t pointcloud_stamp_nsec, const std::uint32_t first_point_rel_stamp_nsec) { - const InputPointType * device_input_points = - reinterpret_cast<const InputPointType *>(input_pointcloud_msg.data.get()); - InputPointType first_point; - cudaMemcpy(&first_point, &device_input_points[0], sizeof(InputPointType), cudaMemcpyDeviceToHost); - // Twist preprocessing - - uint64_t pointcloud_stamp_nsec = 1'000'000'000 * input_pointcloud_msg.header.stamp.sec + - input_pointcloud_msg.header.stamp.nanosec; - - thrust::host_vector<TwistStruct3D> host_twist_structs; + host_twist_3d_structs_.clear(); + host_twist_3d_structs_.reserve(twist_queue.size() + angular_velocity_queue.size()); Eigen::Matrix4f cum_transform = Eigen::Matrix4f::Identity(); Eigen::Vector3f v = Eigen::Vector3f::Zero(); Eigen::Vector3f w = Eigen::Vector3f::Zero(); // All time stamps from now on are in nsec from the "beginning of the pointcloud" - uint32_t last_stamp_nsec = first_point.time_stamp; + std::uint32_t last_stamp_nsec = first_point_rel_stamp_nsec; std::size_t twist_index = 0; std::size_t angular_velocity_index = 0; for (; twist_index < twist_queue.size() || angular_velocity_index < angular_velocity_queue.size();) { - uint64_t twist_stamp, input_twist_global_stamp_nsec, angular_velocity_global_stamp_nsec; + std::uint64_t twist_stamp, input_twist_global_stamp_nsec, angular_velocity_global_stamp_nsec; if (twist_index < twist_queue.size()) { input_twist_global_stamp_nsec = - 1'000'000'000 * static_cast<uint64_t>(twist_queue[twist_index].header.stamp.sec) + - static_cast<uint64_t>(twist_queue[twist_index].header.stamp.nanosec); + 1'000'000'000 * static_cast<std::uint64_t>(twist_queue[twist_index].header.stamp.sec) + + static_cast<std::uint64_t>(twist_queue[twist_index].header.stamp.nanosec); v.x() = twist_queue[twist_index].twist.twist.linear.x; v.y() = twist_queue[twist_index].twist.twist.linear.y; v.z() = twist_queue[twist_index].twist.twist.linear.z; @@ -642,14 +640,15 @@ void CudaPointcloudPreprocessor::setupTwist3DStructs( if (angular_velocity_index < angular_velocity_queue.size()) { angular_velocity_global_stamp_nsec = - 1'000'000'000 * - static_cast<uint64_t>(angular_velocity_queue[angular_velocity_index].header.stamp.sec) + - static_cast<uint64_t>(angular_velocity_queue[angular_velocity_index].header.stamp.nanosec); + 1'000'000'000 * static_cast<std::uint64_t>( + angular_velocity_queue[angular_velocity_index].header.stamp.sec) + + static_cast<std::uint64_t>( + angular_velocity_queue[angular_velocity_index].header.stamp.nanosec); w.x() = angular_velocity_queue[angular_velocity_index].vector.x; w.y() = angular_velocity_queue[angular_velocity_index].vector.y; w.z() = angular_velocity_queue[angular_velocity_index].vector.z; } else { - angular_velocity_global_stamp_nsec = std::numeric_limits<uint64_t>::max(); + angular_velocity_global_stamp_nsec = std::numeric_limits<std::uint64_t>::max(); w = Eigen::Vector3f::Zero(); } @@ -671,15 +670,16 @@ void CudaPointcloudPreprocessor::setupTwist3DStructs( Eigen::Map<Eigen::Vector3f> w_map(twist.w); cum_transform_buffer_map = cum_transform; - uint64_t twist_global_stamp_nsec = twist_stamp; + std::uint64_t twist_global_stamp_nsec = twist_stamp; assert(twist_global_stamp_nsec > pointcloud_stamp_nsec); // by construction - uint32_t twist_from_pointcloud_start_nsec = twist_global_stamp_nsec - pointcloud_stamp_nsec; + std::uint32_t twist_from_pointcloud_start_nsec = + twist_global_stamp_nsec - pointcloud_stamp_nsec; twist.stamp_nsec = twist_from_pointcloud_start_nsec; v_map = v; w_map = w; twist.last_stamp_nsec = last_stamp_nsec; - host_twist_structs.push_back(twist); + host_twist_3d_structs_.push_back(twist); double dt_seconds = 1e-9 * (twist.stamp_nsec - last_stamp_nsec); last_stamp_nsec = twist.stamp_nsec; @@ -689,30 +689,154 @@ void CudaPointcloudPreprocessor::setupTwist3DStructs( } // Copy to device - device_twist_3d_structs_ = host_twist_structs; + device_twist_3d_structs_.resize(host_twist_3d_structs_.size()); + cudaMemcpyAsync( + thrust::raw_pointer_cast(device_twist_3d_structs_.data()), host_twist_3d_structs_.data(), + host_twist_3d_structs_.size() * sizeof(TwistStruct3D), cudaMemcpyHostToDevice, stream_); +} + +std::size_t CudaPointcloudPreprocessor::querySortWorkspace( + int num_items, int num_segments, int * offsets_device, std::uint32_t * keys_in_device, + std::uint32_t * keys_out_device) +{ + // Determine temporary device storage requirements + void * temp_storage = nullptr; + size_t temp_storage_bytes = 0; + cub::DeviceSegmentedRadixSort::SortKeys( + temp_storage, temp_storage_bytes, keys_in_device, keys_out_device, num_items, num_segments, + offsets_device, offsets_device + 1); + + return temp_storage_bytes; +} + +void CudaPointcloudPreprocessor::organizePointcloud() +{ + cudaMemsetAsync( + thrust::raw_pointer_cast(device_ring_index_.data()), 0, num_rings_ * sizeof(std::int32_t), + stream_); + cudaMemsetAsync( + thrust::raw_pointer_cast(device_indexes_tensor_.data()), 0xFF, + num_organized_points_ * sizeof(std::uint32_t), stream_); + + if (num_raw_points_ == 0) { + return; + } + + const int raw_points_blocks_per_grid = + (num_raw_points_ + threads_per_block_ - 1) / threads_per_block_; + + organizeKernel<<<raw_points_blocks_per_grid, threads_per_block_, 0, stream_>>>( + thrust::raw_pointer_cast(device_input_points_.data()), + thrust::raw_pointer_cast(device_indexes_tensor_.data()), + thrust::raw_pointer_cast(device_ring_index_.data()), num_rings_, + thrust::raw_pointer_cast(device_max_ring_.data()), max_points_per_ring_, + thrust::raw_pointer_cast(device_max_points_per_ring_.data()), num_raw_points_); + + std::int32_t max_ring_value; + std::int32_t max_points_per_ring; + + cudaMemcpyAsync( + &max_ring_value, thrust::raw_pointer_cast(device_max_ring_.data()), sizeof(std::int32_t), + cudaMemcpyDeviceToHost, stream_); + cudaMemcpyAsync( + &max_points_per_ring, thrust::raw_pointer_cast(device_max_points_per_ring_.data()), + sizeof(std::int32_t), cudaMemcpyDeviceToHost, stream_); + cudaStreamSynchronize(stream_); + + if (max_ring_value >= num_rings_ || max_points_per_ring > max_points_per_ring_) { + num_rings_ = max_ring_value + 1; + max_points_per_ring_ = std::max((max_points_per_ring + 511) / 512 * 512, 512); + num_organized_points_ = num_rings_ * max_points_per_ring_; + + device_ring_index_.resize(num_rings_); + device_indexes_tensor_.resize(num_organized_points_); + device_sorted_indexes_tensor_.resize(num_organized_points_); + device_segment_offsets_.resize(num_rings_ + 1); + device_organized_points_.resize(num_organized_points_); + + device_transformed_points_.resize(num_organized_points_); + device_crop_mask_.resize(num_organized_points_); + device_ring_outlier_mask_.resize(num_organized_points_); + device_indices_.resize(num_organized_points_); + + preallocateOutput(); + + std::vector<std::int32_t> segment_offsets_host(num_rings_ + 1); + for (std::size_t i = 0; i < num_rings_ + 1; i++) { + segment_offsets_host[i] = i * max_points_per_ring_; + } + + cudaMemcpyAsync( + thrust::raw_pointer_cast(device_segment_offsets_.data()), segment_offsets_host.data(), + (num_rings_ + 1) * sizeof(std::int32_t), cudaMemcpyHostToDevice, stream_); + + cudaMemsetAsync( + thrust::raw_pointer_cast(device_ring_index_.data()), 0, num_rings_ * sizeof(std::int32_t), + stream_); + cudaMemsetAsync( + thrust::raw_pointer_cast(device_indexes_tensor_.data()), 0xFF, + num_organized_points_ * sizeof(std::int32_t), stream_); + + sort_workspace_bytes_ = querySortWorkspace( + num_organized_points_, num_rings_, thrust::raw_pointer_cast(device_segment_offsets_.data()), + thrust::raw_pointer_cast(device_indexes_tensor_.data()), + thrust::raw_pointer_cast(device_sorted_indexes_tensor_.data())); + device_sort_workspace_.resize(sort_workspace_bytes_); + + organizeKernel<<<raw_points_blocks_per_grid, threads_per_block_, 0, stream_>>>( + thrust::raw_pointer_cast(device_input_points_.data()), + thrust::raw_pointer_cast(device_indexes_tensor_.data()), + thrust::raw_pointer_cast(device_ring_index_.data()), num_rings_, + thrust::raw_pointer_cast(device_max_ring_.data()), max_points_per_ring_, + thrust::raw_pointer_cast(device_max_points_per_ring_.data()), num_raw_points_); + } + + if (num_organized_points_ == num_rings_) { + cudaMemcpyAsync( + thrust::raw_pointer_cast(device_sorted_indexes_tensor_.data()), + thrust::raw_pointer_cast(device_indexes_tensor_.data()), + num_organized_points_ * sizeof(std::uint32_t), cudaMemcpyDeviceToDevice, stream_); + } else { + cub::DeviceSegmentedRadixSort::SortKeys( + reinterpret_cast<void *>(thrust::raw_pointer_cast(device_sort_workspace_.data())), + sort_workspace_bytes_, thrust::raw_pointer_cast(device_indexes_tensor_.data()), + thrust::raw_pointer_cast(device_sorted_indexes_tensor_.data()), num_organized_points_, + num_rings_, thrust::raw_pointer_cast(device_segment_offsets_.data()), + thrust::raw_pointer_cast(device_segment_offsets_.data()) + 1, 0, sizeof(std::uint32_t) * 8, + stream_); + } + + const int organized_points_blocks_per_grid = + (num_organized_points_ + threads_per_block_ - 1) / threads_per_block_; + gatherKernel<<<organized_points_blocks_per_grid, threads_per_block_, 0, stream_>>>( + thrust::raw_pointer_cast(device_input_points_.data()), + thrust::raw_pointer_cast(device_sorted_indexes_tensor_.data()), + thrust::raw_pointer_cast(device_organized_points_.data()), num_rings_, max_points_per_ring_); } std::unique_ptr<cuda_blackboard::CudaPointCloud2> CudaPointcloudPreprocessor::process( - const cuda_blackboard::CudaPointCloud2 & input_pointcloud_msg, + const sensor_msgs::msg::PointCloud2::ConstSharedPtr input_pointcloud_msg_ptr, const geometry_msgs::msg::TransformStamped & transform_msg, const std::deque<geometry_msgs::msg::TwistWithCovarianceStamped> & twist_queue, - const std::deque<geometry_msgs::msg::Vector3Stamped> & angular_velocity_queue) + const std::deque<geometry_msgs::msg::Vector3Stamped> & angular_velocity_queue, + const std::uint32_t first_point_rel_stamp_nsec) { - auto frame_id = input_pointcloud_msg.header.frame_id; + auto frame_id = input_pointcloud_msg_ptr->header.frame_id; + num_raw_points_ = input_pointcloud_msg_ptr->width * input_pointcloud_msg_ptr->height; + num_organized_points_ = num_rings_ * max_points_per_ring_; - auto num_input_points = input_pointcloud_msg.width * input_pointcloud_msg.height; - if ( - input_pointcloud_msg.width * input_pointcloud_msg.height > max_rings_ * max_points_per_ring_) { - max_rings_ = input_pointcloud_msg.height; - max_points_per_ring_ = input_pointcloud_msg.width; + if (num_raw_points_ > device_input_points_.size()) { + std::size_t new_capacity = (num_raw_points_ + 1024) / 1024 * 1024; + device_input_points_.resize(new_capacity); + } - device_transformed_points_.resize(num_input_points); - device_crop_mask_.resize(num_input_points); - device_ring_outlier_mask_.resize(num_input_points); - device_indices_.resize(num_input_points); + cudaMemcpyAsync( + thrust::raw_pointer_cast(device_input_points_.data()), input_pointcloud_msg_ptr->data.data(), + num_raw_points_ * sizeof(InputPointType), cudaMemcpyHostToDevice, stream_); - preallocateOutput(); - } + cudaStreamSynchronize(stream_); + + organizePointcloud(); tf2::Quaternion rotation_quaternion( transform_msg.transform.rotation.x, transform_msg.transform.rotation.y, @@ -735,15 +859,19 @@ std::unique_ptr<cuda_blackboard::CudaPointCloud2> CudaPointcloudPreprocessor::pr transform_struct.m33 = static_cast<float>(rotation_matrix[2][2]); // Twist preprocessing - const InputPointType * device_input_points = - reinterpret_cast<const InputPointType *>(input_pointcloud_msg.data.get()); + std::uint64_t pointcloud_stamp_nsec = 1'000'000'000 * input_pointcloud_msg_ptr->header.stamp.sec + + input_pointcloud_msg_ptr->header.stamp.nanosec; if (use_3d_undistortion_) { - setupTwist3DStructs(input_pointcloud_msg, twist_queue, angular_velocity_queue); + setupTwist3DStructs( + twist_queue, angular_velocity_queue, pointcloud_stamp_nsec, first_point_rel_stamp_nsec); } else { - setupTwist2DStructs(input_pointcloud_msg, twist_queue, angular_velocity_queue); + setupTwist2DStructs( + twist_queue, angular_velocity_queue, pointcloud_stamp_nsec, first_point_rel_stamp_nsec); } + // check_error(frame_id, "After setup"); + // Obtain raw pointers for the kernels TwistStruct2D * device_twist_2d_structs = thrust::raw_pointer_cast(device_twist_2d_structs_.data()); @@ -752,60 +880,64 @@ std::unique_ptr<cuda_blackboard::CudaPointCloud2> CudaPointcloudPreprocessor::pr InputPointType * device_transformed_points = thrust::raw_pointer_cast(device_transformed_points_.data()); std::uint32_t * device_crop_mask = thrust::raw_pointer_cast(device_crop_mask_.data()); - uint32_t * device_ring_outlier_mask = thrust::raw_pointer_cast(device_ring_outlier_mask_.data()); - uint32_t * device_indices = thrust::raw_pointer_cast(device_indices_.data()); + std::uint32_t * device_ring_outlier_mask = + thrust::raw_pointer_cast(device_ring_outlier_mask_.data()); + std::uint32_t * device_indices = thrust::raw_pointer_cast(device_indices_.data()); - const int blocks_per_grid = (num_input_points + threads_per_block_ - 1) / threads_per_block_; + const int blocks_per_grid = (num_organized_points_ + threads_per_block_ - 1) / threads_per_block_; transformPointsKernel<<<blocks_per_grid, threads_per_block_, 0, stream_>>>( - device_input_points, device_transformed_points, num_input_points, transform_struct); + thrust::raw_pointer_cast(device_organized_points_.data()), device_transformed_points, + num_organized_points_, transform_struct); int crop_box_blocks_per_grid = std::min(blocks_per_grid, max_blocks_per_grid_); if (host_crop_box_structs_.size() > 0) { cropBoxKernel<<<crop_box_blocks_per_grid, threads_per_block_, 0, stream_>>>( - device_transformed_points, device_crop_mask, num_input_points, + device_transformed_points, device_crop_mask, num_organized_points_, thrust::raw_pointer_cast(device_crop_box_structs_.data()), host_crop_box_structs_.size()); } else { - thrust::fill(thrust::device, device_crop_mask, device_crop_mask + num_input_points, 1); + thrust::fill(thrust::device, device_crop_mask, device_crop_mask + num_organized_points_, 1); } if (use_3d_undistortion_ && device_twist_3d_structs_.size() > 0) { undistort3dKernel<<<blocks_per_grid, threads_per_block_, 0, stream_>>>( - device_transformed_points, num_input_points, device_twist_3d_structs, + device_transformed_points, num_organized_points_, device_twist_3d_structs, device_twist_3d_structs_.size()); } else if (!use_3d_undistortion_ && device_twist_2d_structs_.size() > 0) { undistort2dKernel<<<blocks_per_grid, threads_per_block_, 0, stream_>>>( - device_transformed_points, num_input_points, device_twist_2d_structs, + device_transformed_points, num_organized_points_, device_twist_2d_structs, device_twist_2d_structs_.size()); } ringOutlierFilterKernel<<<blocks_per_grid, threads_per_block_, 0, stream_>>>( - device_transformed_points, device_ring_outlier_mask, max_rings_, max_points_per_ring_, + device_transformed_points, device_ring_outlier_mask, num_rings_, max_points_per_ring_, ring_outlier_parameters_.distance_ratio, ring_outlier_parameters_.object_length_threshold * ring_outlier_parameters_.object_length_threshold, ring_outlier_parameters_.num_points_threshold); combineMasksKernel<<<blocks_per_grid, threads_per_block_, 0, stream_>>>( - device_crop_mask, device_ring_outlier_mask, num_input_points, device_ring_outlier_mask); - - cudaStreamSynchronize(stream_); + device_crop_mask, device_ring_outlier_mask, num_organized_points_, device_ring_outlier_mask); thrust::inclusive_scan( - thrust::device, device_ring_outlier_mask, device_ring_outlier_mask + num_input_points, + thrust::device, device_ring_outlier_mask, device_ring_outlier_mask + num_organized_points_, device_indices); - uint32_t num_output_points; - cudaMemcpy( - &num_output_points, device_indices + num_input_points - 1, sizeof(uint32_t), - cudaMemcpyDeviceToHost); + std::uint32_t num_output_points; + cudaMemcpyAsync( + &num_output_points, device_indices + num_organized_points_ - 1, sizeof(std::uint32_t), + cudaMemcpyDeviceToHost, stream_); + cudaStreamSynchronize(stream_); if (num_output_points > 0) { - extractInputPointsToOutputPoints_indicesKernel<<<blocks_per_grid, threads_per_block_>>>( - device_transformed_points, device_ring_outlier_mask, device_indices, num_input_points, + extractInputPointsToOutputPoints_indicesKernel<<< + blocks_per_grid, threads_per_block_, 0, stream_>>>( + device_transformed_points, device_ring_outlier_mask, device_indices, num_organized_points_, reinterpret_cast<OutputPointType *>(output_pointcloud_ptr_->data.get())); } + cudaStreamSynchronize(stream_); + // Copy the transformed points back output_pointcloud_ptr_->row_step = num_output_points * sizeof(OutputPointType); output_pointcloud_ptr_->width = num_output_points; @@ -813,9 +945,9 @@ std::unique_ptr<cuda_blackboard::CudaPointCloud2> CudaPointcloudPreprocessor::pr output_pointcloud_ptr_->fields = point_fields_; output_pointcloud_ptr_->is_dense = true; - output_pointcloud_ptr_->is_bigendian = input_pointcloud_msg.is_bigendian; + output_pointcloud_ptr_->is_bigendian = input_pointcloud_msg_ptr->is_bigendian; output_pointcloud_ptr_->point_step = sizeof(OutputPointType); - output_pointcloud_ptr_->header.stamp = input_pointcloud_msg.header.stamp; + output_pointcloud_ptr_->header.stamp = input_pointcloud_msg_ptr->header.stamp; return std::move(output_pointcloud_ptr_); } diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.cpp b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.cpp index 1aa634d8807dd..b2db15491f369 100644 --- a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.cpp +++ b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.cpp @@ -19,6 +19,7 @@ #include <autoware/point_types/types.hpp> #include <geometry_msgs/msg/transform_stamped.hpp> +#include <sensor_msgs/point_cloud2_iterator.hpp> #include <tf2_geometry_msgs/tf2_geometry_msgs.hpp> #include <cuda_runtime.h> @@ -27,8 +28,6 @@ #include <pcl/point_types.h> #include <chrono> -#include <chrono> // TODO(knzo25): remove this -#include <limits> // TODO(knzo25): remove this #include <string> #include <vector> @@ -126,32 +125,6 @@ CudaPointcloudPreprocessorNode::CudaPointcloudPreprocessorNode( debug_publisher_ = std::make_unique<DebugPublisher>(this, "cuda_pointcloud_preprocessor"); stop_watch_ptr_->tic("processing_time"); } - - // New organized pointcloud method - test_num_rings_ = 1; - test_max_points_per_ring_ = 1; - ring_index_device_ = cuda_blackboard::make_unique<std::int32_t[]>(test_num_rings_); - indexes_tensor_device_ = - cuda_blackboard::make_unique<std::uint32_t[]>(test_num_rings_ * test_max_points_per_ring_); - sorted_indexes_tensor_device_ = - cuda_blackboard::make_unique<std::uint32_t[]>(test_num_rings_ * test_max_points_per_ring_); - segment_offsets_device_ = cuda_blackboard::make_unique<std::int32_t[]>(test_num_rings_ + 1); - max_rings_device_ = cuda_blackboard::make_unique<std::int32_t>(); - max_points_per_ring_device_ = cuda_blackboard::make_unique<std::int32_t>(); - organized_points_device_ = - cuda_blackboard::make_unique<InputPointType[]>(test_num_rings_ * test_max_points_per_ring_); - - cudaStreamCreate(&stream_); - cudaMemsetAsync(max_rings_device_.get(), 0, sizeof(std::int32_t), stream_); - cudaMemsetAsync(max_points_per_ring_device_.get(), 0, sizeof(std::int32_t), stream_); - - sort_workspace_bytes_ = querySortWorkspace( - test_num_rings_ * test_max_points_per_ring_, test_num_rings_, segment_offsets_device_.get(), - indexes_tensor_device_.get(), sorted_indexes_tensor_device_.get()); - sort_workspace_device_ = cuda_blackboard::make_unique<std::uint8_t[]>(sort_workspace_bytes_); - - capacity_ = 8192; - points_device_ = cuda_blackboard::make_unique<InputPointType[]>(capacity_); } bool CudaPointcloudPreprocessorNode::getTransform( @@ -231,356 +204,9 @@ void CudaPointcloudPreprocessorNode::imuCallback( } } -void CudaPointcloudPreprocessorNode::estimatePointcloudRingInfo( - const sensor_msgs::msg::PointCloud2::ConstSharedPtr input_pointcloud_msg_ptr) -{ - const autoware::point_types::PointXYZIRCAEDT * input_buffer = - reinterpret_cast<const autoware::point_types::PointXYZIRCAEDT *>( - input_pointcloud_msg_ptr->data.data()); - - std::size_t max_ring = 0; - - for (std::size_t i = 0; i < input_pointcloud_msg_ptr->width * input_pointcloud_msg_ptr->height; - i++) { - const autoware::point_types::PointXYZIRCAEDT & point = input_buffer[i]; - const std::size_t ring = static_cast<std::size_t>(point.channel); - max_ring = std::max(max_ring, ring); - } - - // Set max rings to the next power of two - num_rings_ = std::pow(2, std::ceil(std::log2(max_ring + 1))); - num_rings_ = std::max(num_rings_, static_cast<std::size_t>(16)); - std::vector<std::size_t> ring_points(num_rings_, 0); - - for (std::size_t i = 0; i < input_pointcloud_msg_ptr->width * input_pointcloud_msg_ptr->height; - i++) { - const autoware::point_types::PointXYZIRCAEDT & point = input_buffer[i]; - const std::size_t ring = point.channel; - ring_points[ring]++; - } - - // Set max points per ring to the next multiple of 512 - max_points_per_ring_ = *std::max_element(ring_points.begin(), ring_points.end()); - max_points_per_ring_ = std::max(max_points_per_ring_, static_cast<std::size_t>(512)); - max_points_per_ring_ = (max_points_per_ring_ + 511) / 512 * 512; - - next_ring_index_.resize(num_rings_); - std::fill(next_ring_index_.begin(), next_ring_index_.end(), 0); - host_buffer_ = cuda_blackboard::make_host_unique<autoware::point_types::PointXYZIRCAEDT[]>( - num_rings_ * max_points_per_ring_); - - input_pointcloud_ptr_ = std::make_shared<cuda_blackboard::CudaPointCloud2>(); - input_pointcloud_ptr_->data = cuda_blackboard::make_unique<std::uint8_t[]>( - num_rings_ * max_points_per_ring_ * sizeof(autoware::point_types::PointXYZIRCAEDT)); - input_pointcloud_ptr_->width = max_points_per_ring_; - input_pointcloud_ptr_->height = num_rings_; - input_pointcloud_ptr_->point_step = sizeof(autoware::point_types::PointXYZIRCAEDT); - input_pointcloud_ptr_->row_step = - max_points_per_ring_ * sizeof(autoware::point_types::PointXYZIRCAEDT); - input_pointcloud_ptr_->is_dense = input_pointcloud_msg_ptr->is_dense; - input_pointcloud_ptr_->fields = input_pointcloud_msg_ptr->fields; - input_pointcloud_ptr_->header = input_pointcloud_msg_ptr->header; - - RCLCPP_INFO_STREAM( - get_logger(), "Estimated rings: " << num_rings_ - << ", max_points_per_ring: " << max_points_per_ring_ - << ". This should only be done during the first iterations. " - "Otherwise, performance will be affected."); -} - -bool CudaPointcloudPreprocessorNode::orderPointcloud( - const sensor_msgs::msg::PointCloud2::ConstSharedPtr input_pointcloud_msg_ptr) -{ - const autoware::point_types::PointXYZIRCAEDT * input_buffer = - reinterpret_cast<const autoware::point_types::PointXYZIRCAEDT *>( - input_pointcloud_msg_ptr->data.data()); - - bool ring_overflow = false; - bool point_overflow = false; - - autoware::point_types::PointXYZIRCAEDT * buffer = host_buffer_.get(); - - for (std::size_t i = 0; i < input_pointcloud_msg_ptr->width * input_pointcloud_msg_ptr->height; - i++) { - const autoware::point_types::PointXYZIRCAEDT & point = input_buffer[i]; - const std::size_t raw_ring = point.channel; - const std::size_t ring = raw_ring % num_rings_; - - const std::size_t raw_index = next_ring_index_[ring]; - const std::size_t index = raw_index % max_points_per_ring_; - - ring_overflow |= raw_ring >= num_rings_; - point_overflow |= raw_index >= max_points_per_ring_; - - buffer[ring * max_points_per_ring_ + index] = point; - next_ring_index_[ring] = raw_index + 1; - } - - return !ring_overflow && !point_overflow; -} - void CudaPointcloudPreprocessorNode::pointcloudCallback( const sensor_msgs::msg::PointCloud2::ConstSharedPtr input_pointcloud_msg_ptr) { - // NEW Cuda version - if (input_pointcloud_msg_ptr->width * input_pointcloud_msg_ptr->height > capacity_) { - capacity_ = input_pointcloud_msg_ptr->width * input_pointcloud_msg_ptr->height; - capacity_ = (capacity_ + 1024) / 1024 * 1024; - points_device_ = cuda_blackboard::make_unique<InputPointType[]>(capacity_); - } - - double cuda_t0 = 1e-6 * std::chrono::duration_cast<std::chrono::nanoseconds>( - std::chrono::high_resolution_clock::now().time_since_epoch()) - .count(); - - cudaMemcpyAsync( - points_device_.get(), input_pointcloud_msg_ptr->data.data(), - input_pointcloud_msg_ptr->width * input_pointcloud_msg_ptr->height * sizeof(InputPointType), - cudaMemcpyHostToDevice, stream_); - cudaStreamSynchronize(stream_); - - double cuda_t1 = 1e-6 * std::chrono::duration_cast<std::chrono::nanoseconds>( - std::chrono::high_resolution_clock::now().time_since_epoch()) - .count(); - - // Estimate the number of rings and max points per ring - cudaMemsetAsync(ring_index_device_.get(), 0, test_num_rings_ * sizeof(std::int32_t), stream_); - cudaMemsetAsync( - indexes_tensor_device_.get(), 0xFF, - test_num_rings_ * test_max_points_per_ring_ * sizeof(std::uint32_t), stream_); - organizeLaunch( - points_device_.get(), indexes_tensor_device_.get(), ring_index_device_.get(), test_num_rings_, - max_rings_device_.get(), test_max_points_per_ring_, max_points_per_ring_device_.get(), - input_pointcloud_msg_ptr->width * input_pointcloud_msg_ptr->height, stream_); - cudaStreamSynchronize(stream_); - - std::int32_t max_ring_value; - std::int32_t max_points_per_ring; - cudaMemcpyAsync( - &max_ring_value, max_rings_device_.get(), sizeof(std::int32_t), cudaMemcpyDeviceToHost, - stream_); - cudaMemcpyAsync( - &max_points_per_ring, max_points_per_ring_device_.get(), sizeof(std::int32_t), - cudaMemcpyDeviceToHost, stream_); - cudaStreamSynchronize(stream_); - - if (max_ring_value >= test_num_rings_ || max_points_per_ring > test_max_points_per_ring_) { - RCLCPP_WARN( - get_logger(), "Current ring num: %d, Current Max points per ring: %d", test_num_rings_, - test_max_points_per_ring_); - test_num_rings_ = max_ring_value + 1; - test_max_points_per_ring_ = - std::max((max_points_per_ring + 511) / 512 * 512, 512); // Re think this equation - ring_index_device_ = cuda_blackboard::make_unique<std::int32_t[]>(test_num_rings_); - indexes_tensor_device_ = - cuda_blackboard::make_unique<std::uint32_t[]>(test_num_rings_ * test_max_points_per_ring_); - sorted_indexes_tensor_device_ = - cuda_blackboard::make_unique<std::uint32_t[]>(test_num_rings_ * test_max_points_per_ring_); - segment_offsets_device_ = cuda_blackboard::make_unique<std::int32_t[]>(test_num_rings_ + 1); - organized_points_device_ = - cuda_blackboard::make_unique<InputPointType[]>(test_num_rings_ * test_max_points_per_ring_); - RCLCPP_WARN( - get_logger(), "NEW Max ring value: %d, Max points per ring: %d", max_ring_value, - test_max_points_per_ring_); - - std::vector<std::int32_t> segment_offsets_host(test_num_rings_ + 1); - for (std::size_t i = 0; i < test_num_rings_ + 1; i++) { - segment_offsets_host[i] = i * test_max_points_per_ring_; - } - cudaMemcpyAsync( - segment_offsets_device_.get(), segment_offsets_host.data(), - (test_num_rings_ + 1) * sizeof(std::int32_t), cudaMemcpyHostToDevice, stream_); - - cudaMemsetAsync(ring_index_device_.get(), 0, test_num_rings_ * sizeof(std::int32_t), stream_); - cudaMemsetAsync( - indexes_tensor_device_.get(), 0xFF, - test_num_rings_ * test_max_points_per_ring_ * sizeof(std::int32_t), stream_); - - sort_workspace_bytes_ = querySortWorkspace( - test_num_rings_ * test_max_points_per_ring_, test_num_rings_, segment_offsets_device_.get(), - indexes_tensor_device_.get(), sorted_indexes_tensor_device_.get()); - sort_workspace_device_ = cuda_blackboard::make_unique<std::uint8_t[]>(sort_workspace_bytes_); - - organizeLaunch( - points_device_.get(), indexes_tensor_device_.get(), ring_index_device_.get(), test_num_rings_, - max_rings_device_.get(), test_max_points_per_ring_, max_points_per_ring_device_.get(), - input_pointcloud_msg_ptr->width * input_pointcloud_msg_ptr->height, stream_); - // cudaStreamSynchronize(stream_); - } - - /* std::vector<std::int32_t> ring_index_host(test_num_rings_); - cudaMemcpyAsync( - ring_index_host.data(), ring_index_device_.get(), test_num_rings_ * sizeof(std::int32_t), - cudaMemcpyDeviceToHost, stream_); cudaStreamSynchronize(stream_); - - for (std::size_t i = 0; i < test_num_rings_; i++) { - if (ring_index_host[i] > 0) { - std::cout << "Ring id=" << i << " has " << ring_index_host[i] << " points" << std::endl; - } - } */ - - sortLaunch( - test_num_rings_ * test_max_points_per_ring_, test_num_rings_, segment_offsets_device_.get(), - indexes_tensor_device_.get(), sorted_indexes_tensor_device_.get(), sort_workspace_device_.get(), - sort_workspace_bytes_, stream_); - cudaStreamSynchronize(stream_); - - /* std::vector<std::uint32_t> indexes_tensor_host(test_num_rings_ * test_max_points_per_ring_); - cudaMemcpyAsync( - indexes_tensor_host.data(), sorted_indexes_tensor_device_.get(), - test_num_rings_ * test_max_points_per_ring_ * sizeof(std::uint32_t), cudaMemcpyDeviceToHost, - stream_); cudaStreamSynchronize(stream_); - - if (test_num_rings_ < 64) { - for (std::size_t i = 0; i < test_num_rings_; i++) { - std::cout << "Ring id=" << i << "\n\t"; - for (std::size_t j = 0; j < test_max_points_per_ring_; j++) { - if (indexes_tensor_host[i * test_max_points_per_ring_ + j] < - std::numeric_limits<std::uint32_t>::max()) { std::cout << indexes_tensor_host[i * - test_max_points_per_ring_ + j] << " "; - } - } - std::cout << std::endl << std::flush; - } - } */ - - gatherLaunch( - points_device_.get(), sorted_indexes_tensor_device_.get(), organized_points_device_.get(), - test_num_rings_, test_max_points_per_ring_, stream_); - cudaStreamSynchronize( - stream_); // this will not be needed in the final implementation. for now we benchmark - - double cuda_t2 = 1e-6 * std::chrono::duration_cast<std::chrono::nanoseconds>( - std::chrono::high_resolution_clock::now().time_since_epoch()) - .count(); - - if (iteration_ >= 0) { - organize_time_cuda_ms_ += (cuda_t1 - cuda_t0); - copy_time_cuda_ms_ += (cuda_t2 - cuda_t1); - } - - stop_watch_ptr_->toc("processing_time", true); - - // TODO(knzo25): check the pointcloud layout at least once - - assert(input_pointcloud_msg_ptr->point_step == sizeof(autoware::point_types::PointXYZIRCAEDT)); - - if (num_rings_ == 0 || max_points_per_ring_ == 0) { - estimatePointcloudRingInfo(input_pointcloud_msg_ptr); - } - - double t1 = 1e-6 * std::chrono::duration_cast<std::chrono::nanoseconds>( - std::chrono::high_resolution_clock::now().time_since_epoch()) - .count(); - if (!orderPointcloud(input_pointcloud_msg_ptr)) { - estimatePointcloudRingInfo(input_pointcloud_msg_ptr); - orderPointcloud(input_pointcloud_msg_ptr); - } - double t2 = 1e-6 * std::chrono::duration_cast<std::chrono::nanoseconds>( - std::chrono::high_resolution_clock::now().time_since_epoch()) - .count(); - - // Copy to cuda memory - cudaMemcpy( - input_pointcloud_ptr_->data.get(), host_buffer_.get(), - num_rings_ * max_points_per_ring_ * sizeof(autoware::point_types::PointXYZIRCAEDT), - cudaMemcpyHostToDevice); - - double t3 = 1e-6 * std::chrono::duration_cast<std::chrono::nanoseconds>( - std::chrono::high_resolution_clock::now().time_since_epoch()) - .count(); - - if (iteration_ >= 0) { - organize_time_ms_ += (t2 - t1); - copy_time_ms_ += (t3 - t2); - } - - // Test -> this would replace the cpu version - cudaMemcpyAsync( - input_pointcloud_ptr_->data.get(), organized_points_device_.get(), - test_num_rings_ * test_max_points_per_ring_ * sizeof(autoware::point_types::PointXYZIRCAEDT), - cudaMemcpyHostToDevice, stream_); - input_pointcloud_ptr_->height = test_num_rings_; - input_pointcloud_ptr_->width = test_max_points_per_ring_; - cudaStreamSynchronize(stream_); - - // TEST CODE - { - std::vector<autoware::point_types::PointXYZIRCAEDT> host_buffer_test( - test_num_rings_ * test_max_points_per_ring_); - cudaMemcpyAsync( - host_buffer_test.data(), organized_points_device_.get(), - test_num_rings_ * test_max_points_per_ring_ * sizeof(autoware::point_types::PointXYZIRCAEDT), - cudaMemcpyDeviceToHost, stream_); - cudaStreamSynchronize(stream_); - - if ( - test_num_rings_ > 1 && test_max_points_per_ring_ == max_points_per_ring_ && - test_num_rings_ == num_rings_) { - for (std::size_t i = 0; i < num_rings_; i++) { - for (std::size_t j = 0; j < max_points_per_ring_; j++) { - // Compare the implementations - if ( - (host_buffer_.get()[i * max_points_per_ring_ + j].x != - host_buffer_test[i * max_points_per_ring_ + j].x || - host_buffer_.get()[i * max_points_per_ring_ + j].y != - host_buffer_test[i * max_points_per_ring_ + j].y || - host_buffer_.get()[i * max_points_per_ring_ + j].z != - host_buffer_test[i * max_points_per_ring_ + j].z) && - host_buffer_test[i * max_points_per_ring_ + j].distance > 0) { - RCLCPP_ERROR(get_logger(), "Mismatch at ring=%d, point=%d", i, j); - RCLCPP_ERROR( - get_logger(), "\thost_buffer_test: x=%.2f y=%.2f z=%.2f channel=%d distance=%.2f", - host_buffer_test[i * max_points_per_ring_ + j].x, - host_buffer_test[i * max_points_per_ring_ + j].y, - host_buffer_test[i * max_points_per_ring_ + j].z, - host_buffer_test[i * max_points_per_ring_ + j].channel, - host_buffer_test[i * max_points_per_ring_ + j].distance); - RCLCPP_ERROR( - get_logger(), "\thost_buffer: x=%.2f y=%.2f z=%.2f channel=%d distance=%.2f", - host_buffer_.get()[i * max_points_per_ring_ + j].x, - host_buffer_.get()[i * max_points_per_ring_ + j].y, - host_buffer_.get()[i * max_points_per_ring_ + j].z, - host_buffer_.get()[i * max_points_per_ring_ + j].channel, - host_buffer_.get()[i * max_points_per_ring_ + j].distance); - } - } - } - } - } - - input_pointcloud_ptr_->header = input_pointcloud_msg_ptr->header; - - cudaPointcloudCallback(input_pointcloud_ptr_); - - if (debug_publisher_) { - const double processing_time_ms = stop_watch_ptr_->toc("processing_time", true); - debug_publisher_->publish<tier4_debug_msgs::msg::Float64Stamped>( - "debug/processing_time_ms", processing_time_ms); - - double now_stamp_seconds = rclcpp::Time(this->get_clock()->now()).seconds(); - double cloud_stamp_seconds = rclcpp::Time(input_pointcloud_msg_ptr->header.stamp).seconds(); - - debug_publisher_->publish<tier4_debug_msgs::msg::Float64Stamped>( - "debug/latency_ms", 1000.f * (now_stamp_seconds - cloud_stamp_seconds)); - } - - // Clear indexes - std::fill(next_ring_index_.begin(), next_ring_index_.end(), 0); - - // Clear pointcloud buffer - auto host_buffer = host_buffer_.get(); - for (std::size_t i = 0; i < num_rings_ * max_points_per_ring_; i++) { - host_buffer[i] = autoware::point_types::PointXYZIRCAEDT{}; - } -} - -void CudaPointcloudPreprocessorNode::cudaPointcloudCallback( - const std::shared_ptr<const cuda_blackboard::CudaPointCloud2> input_pointcloud_msg_ptr) -{ - double t1 = 1e-6 * std::chrono::duration_cast<std::chrono::nanoseconds>( - std::chrono::high_resolution_clock::now().time_since_epoch()) - .count(); static_assert( sizeof(InputPointType) == sizeof(autoware::point_types::PointXYZIRCAEDT), "PointStruct and PointXYZIRCAEDT must have the same size"); @@ -588,13 +214,14 @@ void CudaPointcloudPreprocessorNode::cudaPointcloudCallback( stop_watch_ptr_->toc("processing_time", true); // Make sure that the first twist is newer than the first point - InputPointType first_point; - cudaMemcpy( - &first_point, input_pointcloud_msg_ptr->data.get(), sizeof(InputPointType), - cudaMemcpyDeviceToHost); + sensor_msgs::PointCloud2ConstIterator<std::uint32_t> iter_stamp( + *input_pointcloud_msg_ptr, "time_stamp"); + + auto num_points = input_pointcloud_msg_ptr->width * input_pointcloud_msg_ptr->height; + std::uint32_t first_point_rel_stamp = num_points > 0 ? *iter_stamp : 0; double first_point_stamp = input_pointcloud_msg_ptr->header.stamp.sec + input_pointcloud_msg_ptr->header.stamp.nanosec * 1e-9 + - first_point.time_stamp * 1e-9; + first_point_rel_stamp * 1e-9; while (twist_queue_.size() > 1 && rclcpp::Time(twist_queue_.front().header.stamp).seconds() < first_point_stamp) { @@ -617,42 +244,11 @@ void CudaPointcloudPreprocessorNode::cudaPointcloudCallback( return; } - double t2 = 1e-6 * std::chrono::duration_cast<std::chrono::nanoseconds>( - std::chrono::high_resolution_clock::now().time_since_epoch()) - .count(); - auto output_pointcloud_ptr = cuda_pointcloud_preprocessor_->process( - *input_pointcloud_msg_ptr, transform_msg, twist_queue_, angular_velocity_queue_); + input_pointcloud_msg_ptr, transform_msg, twist_queue_, angular_velocity_queue_, + first_point_rel_stamp); output_pointcloud_ptr->header.frame_id = base_frame_; - double t3 = 1e-6 * std::chrono::duration_cast<std::chrono::nanoseconds>( - std::chrono::high_resolution_clock::now().time_since_epoch()) - .count(); - - if (iteration_ >= 0) { - processing_time_ms_ += (t2 - t1); - kernel_time_ms_ += (t3 - t2); - } - iteration_++; - - if (num_rings_ > 64) { - RCLCPP_INFO( - get_logger(), - "Organize time: %f ms, Copy time: %f ms, Process time: %f ms, Kernel time: %f ms Total time: " - "%f ms", - organize_time_ms_ / iteration_, copy_time_ms_ / iteration_, processing_time_ms_ / iteration_, - kernel_time_ms_ / iteration_, - (organize_time_ms_ + copy_time_ms_ + processing_time_ms_ + kernel_time_ms_) / iteration_); - RCLCPP_INFO( - get_logger(), - "\t Cuda Organize time: %f ms, Cuda Copy time: %f ms, Cuda Process time: %f ms, Cuda Kernel " - "time: %f ms Total time: %f ms", - organize_time_cuda_ms_ / iteration_, copy_time_cuda_ms_ / iteration_, - processing_time_ms_ / iteration_, kernel_time_ms_ / iteration_, - (organize_time_cuda_ms_ + copy_time_cuda_ms_ + processing_time_ms_ + kernel_time_ms_) / - iteration_); - } - // Publish pub_->publish(std::move(output_pointcloud_ptr)); From c55429062aca9be4658adfb6977a454693076d29 Mon Sep 17 00:00:00 2001 From: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp> Date: Wed, 5 Feb 2025 16:36:41 +0900 Subject: [PATCH 27/55] chore: removed redundant ternay operator Signed-off-by: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp> --- .../cuda_pointcloud_preprocessor.cu | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu index d49f6eeccce42..ffc2216006d74 100644 --- a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu +++ b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu @@ -334,11 +334,9 @@ __global__ void ringOutlierFilterKernel( const float y = left_point.y - right_point.y; const float z = left_point.z - right_point.z; - output_mask[j * max_points_per_ring + i] = - ((walk_size > num_points_threshold) || - (x * x + y * y + z * z >= object_length_threshold_squared)) - ? 1 - : 0; + output_mask[j * max_points_per_ring + i] = static_cast<std::uint32_t>( + (walk_size > num_points_threshold) || + (x * x + y * y + z * z >= object_length_threshold_squared)); } __global__ void transformPointTypeKernel( From d6ad9db83987f4e7f9cc4283b3787909d237562c Mon Sep 17 00:00:00 2001 From: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp> Date: Wed, 5 Feb 2025 16:52:21 +0900 Subject: [PATCH 28/55] chore: added a temporary memory check. the check will be unified in a later PR Signed-off-by: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp> --- .../cuda_pointcloud_preprocessor/memory.hpp | 91 +++++++++++++++++++ .../cuda_pointcloud_preprocessor_node.cpp | 6 ++ 2 files changed, 97 insertions(+) create mode 100644 sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/memory.hpp diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/memory.hpp b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/memory.hpp new file mode 100644 index 0000000000000..9482f170be550 --- /dev/null +++ b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/memory.hpp @@ -0,0 +1,91 @@ +// Copyright 2024 TIER IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef AUTOWARE__CUDA_POINTCLOUD_PREPROCESSOR__MEMORY_HPP_ +#define AUTOWARE__CUDA_POINTCLOUD_PREPROCESSOR__MEMORY_HPP_ + +#include "autoware/cuda_pointcloud_preprocessor/point_types.hpp" + +#include <sensor_msgs/msg/point_cloud2.hpp> + +#include <vector> + +namespace autoware::cuda_pointcloud_preprocessor +{ + +// TODO(knzo25): this checks already exists within the pointcloud_preprocessor. After this PR is +// merged I will move it somewhere all classes that require checks can access it +bool is_data_layout_compatible_with_point_xyzircaedt( + const std::vector<sensor_msgs::msg::PointField> & fields) +{ + if (fields.size() != 10) { + return false; + } + bool same_layout = true; + const auto & field_x = fields.at(0); + same_layout &= field_x.name == "x"; + same_layout &= field_x.offset == offsetof(InputPointType, x); + same_layout &= field_x.datatype == sensor_msgs::msg::PointField::FLOAT32; + same_layout &= field_x.count == 1; + const auto & field_y = fields.at(1); + same_layout &= field_y.name == "y"; + same_layout &= field_y.offset == offsetof(InputPointType, y); + same_layout &= field_y.datatype == sensor_msgs::msg::PointField::FLOAT32; + same_layout &= field_y.count == 1; + const auto & field_z = fields.at(2); + same_layout &= field_z.name == "z"; + same_layout &= field_z.offset == offsetof(InputPointType, z); + same_layout &= field_z.datatype == sensor_msgs::msg::PointField::FLOAT32; + same_layout &= field_z.count == 1; + const auto & field_intensity = fields.at(3); + same_layout &= field_intensity.name == "intensity"; + same_layout &= field_intensity.offset == offsetof(InputPointType, intensity); + same_layout &= field_intensity.datatype == sensor_msgs::msg::PointField::UINT8; + same_layout &= field_intensity.count == 1; + const auto & field_return_type = fields.at(4); + same_layout &= field_return_type.name == "return_type"; + same_layout &= field_return_type.offset == offsetof(InputPointType, return_type); + same_layout &= field_return_type.datatype == sensor_msgs::msg::PointField::UINT8; + same_layout &= field_return_type.count == 1; + const auto & field_ring = fields.at(5); + same_layout &= field_ring.name == "channel"; + same_layout &= field_ring.offset == offsetof(InputPointType, channel); + same_layout &= field_ring.datatype == sensor_msgs::msg::PointField::UINT16; + same_layout &= field_ring.count == 1; + const auto & field_azimuth = fields.at(6); + same_layout &= field_azimuth.name == "azimuth"; + same_layout &= field_azimuth.offset == offsetof(InputPointType, azimuth); + same_layout &= field_azimuth.datatype == sensor_msgs::msg::PointField::FLOAT32; + same_layout &= field_azimuth.count == 1; + const auto & field_elevation = fields.at(7); + same_layout &= field_elevation.name == "elevation"; + same_layout &= field_elevation.offset == offsetof(InputPointType, elevation); + same_layout &= field_elevation.datatype == sensor_msgs::msg::PointField::FLOAT32; + same_layout &= field_elevation.count == 1; + const auto & field_distance = fields.at(8); + same_layout &= field_distance.name == "distance"; + same_layout &= field_distance.offset == offsetof(InputPointType, distance); + same_layout &= field_distance.datatype == sensor_msgs::msg::PointField::FLOAT32; + same_layout &= field_distance.count == 1; + const auto & field_time_stamp = fields.at(9); + same_layout &= field_time_stamp.name == "time_stamp"; + same_layout &= field_time_stamp.offset == offsetof(InputPointType, time_stamp); + same_layout &= field_time_stamp.datatype == sensor_msgs::msg::PointField::UINT32; + same_layout &= field_time_stamp.count == 1; + return same_layout; +} + +} // namespace autoware::cuda_pointcloud_preprocessor + +#endif // AUTOWARE__CUDA_POINTCLOUD_PREPROCESSOR__MEMORY_HPP_ diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.cpp b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.cpp index b2db15491f369..bf6cc1f79890c 100644 --- a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.cpp +++ b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.cpp @@ -14,6 +14,7 @@ #include "autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.hpp" +#include "autoware/cuda_pointcloud_preprocessor/memory.hpp" #include "autoware/cuda_pointcloud_preprocessor/point_types.hpp" #include <autoware/point_types/types.hpp> @@ -207,6 +208,11 @@ void CudaPointcloudPreprocessorNode::imuCallback( void CudaPointcloudPreprocessorNode::pointcloudCallback( const sensor_msgs::msg::PointCloud2::ConstSharedPtr input_pointcloud_msg_ptr) { + if (!is_data_layout_compatible_with_point_xyzircaedt(input_pointcloud_msg_ptr->fields)) { + RCLCPP_ERROR( + get_logger(), "Input pointcloud data layout is not compatible with PointXYZIRCAEDT"); + } + static_assert( sizeof(InputPointType) == sizeof(autoware::point_types::PointXYZIRCAEDT), "PointStruct and PointXYZIRCAEDT must have the same size"); From 303b9ede65c8a748a1dd495df78ee3eda8d1b066 Mon Sep 17 00:00:00 2001 From: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp> Date: Wed, 5 Feb 2025 18:17:02 +0900 Subject: [PATCH 29/55] chore: refactored the structure to avoid large files Signed-off-by: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp> --- .../CMakeLists.txt | 4 + .../common_kernels.hpp | 45 ++ .../cuda_pointcloud_preprocessor.hpp | 94 +-- .../organize_kernels.hpp | 42 ++ .../outlier_kernels.hpp | 32 + .../cuda_pointcloud_preprocessor/types.hpp | 104 ++++ .../undistort_kernels.hpp | 53 ++ .../common_kernels.cu | 128 ++++ .../cuda_pointcloud_preprocessor.cu | 579 ++---------------- .../organize_kernels.cu | 108 ++++ .../outlier_kernels.cu | 86 +++ .../undistort_kernels.cu | 335 ++++++++++ 12 files changed, 974 insertions(+), 636 deletions(-) create mode 100644 sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/common_kernels.hpp create mode 100644 sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/organize_kernels.hpp create mode 100644 sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/outlier_kernels.hpp create mode 100644 sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/types.hpp create mode 100644 sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/undistort_kernels.hpp create mode 100644 sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/common_kernels.cu create mode 100644 sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/organize_kernels.cu create mode 100644 sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/outlier_kernels.cu create mode 100644 sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/undistort_kernels.cu diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/CMakeLists.txt b/sensing/autoware_cuda_pointcloud_preprocessor/CMakeLists.txt index 8abff042d6a3a..4a4fc505862bb 100644 --- a/sensing/autoware_cuda_pointcloud_preprocessor/CMakeLists.txt +++ b/sensing/autoware_cuda_pointcloud_preprocessor/CMakeLists.txt @@ -36,6 +36,10 @@ list(APPEND CUDA_NVCC_FLAGS "--expt-relaxed-constexpr -diag-suppress 20012") # c cuda_add_library(cuda_pointcloud_preprocessor_lib SHARED src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu + src/cuda_pointcloud_preprocessor/common_kernels.cu + src/cuda_pointcloud_preprocessor/organize_kernels.cu + src/cuda_pointcloud_preprocessor/outlier_kernels.cu + src/cuda_pointcloud_preprocessor/undistort_kernels.cu ) target_link_libraries(cuda_pointcloud_preprocessor_lib diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/common_kernels.hpp b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/common_kernels.hpp new file mode 100644 index 0000000000000..0e21e933569ee --- /dev/null +++ b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/common_kernels.hpp @@ -0,0 +1,45 @@ +// Copyright 2024 TIER IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef AUTOWARE__CUDA_POINTCLOUD_PREPROCESSOR__COMMON_KERNELS_HPP_ +#define AUTOWARE__CUDA_POINTCLOUD_PREPROCESSOR__COMMON_KERNELS_HPP_ + +#include "autoware/cuda_pointcloud_preprocessor/point_types.hpp" +#include "autoware/cuda_pointcloud_preprocessor/types.hpp" + +#include <cuda_runtime.h> + +namespace autoware::cuda_pointcloud_preprocessor +{ +void transformPointsLaunch( + const InputPointType * input_points, InputPointType * output_points, int num_points, + TransformStruct transform, int threads_per_block, int blocks_per_grid, cudaStream_t & stream); + +void cropBoxLaunch( + InputPointType * d_points, std::uint32_t * output_mask, int num_points, + const CropBoxParameters * crop_box_parameters_ptr, int num_crop_boxes, int threads_per_block, + int blocks_per_grid, cudaStream_t & stream); + +void combineMasksLaunch( + const std::uint32_t * mask1, const std::uint32_t * mask2, int num_points, + std::uint32_t * output_mask, int threads_per_block, int blocks_per_grid, cudaStream_t & stream); + +void extractPointsLaunch( + InputPointType * input_points, std::uint32_t * masks, std::uint32_t * indices, int num_points, + OutputPointType * output_points, int threads_per_block, int blocks_per_grid, + cudaStream_t & stream); + +} // namespace autoware::cuda_pointcloud_preprocessor + +#endif // AUTOWARE__CUDA_POINTCLOUD_PREPROCESSOR__COMMON_KERNELS_HPP_ diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.hpp b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.hpp index 7c4c0113d3618..bf19949ddc795 100644 --- a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.hpp +++ b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.hpp @@ -16,6 +16,7 @@ #define AUTOWARE__CUDA_POINTCLOUD_PREPROCESSOR__CUDA_POINTCLOUD_PREPROCESSOR_HPP_ #include "autoware/cuda_pointcloud_preprocessor/point_types.hpp" +#include "autoware/cuda_pointcloud_preprocessor/types.hpp" #include <Eigen/Core> #include <Eigen/Geometry> @@ -28,88 +29,12 @@ #include <thrust/device_vector.h> -#include <cmath> #include <cstdint> #include <deque> namespace autoware::cuda_pointcloud_preprocessor { -struct TwistStruct2D -{ - float cum_x; - float cum_y; - float cum_theta; - float cum_cos_theta; - float cum_sin_theta; - std::uint32_t last_stamp_nsec; // relative to the start of the pointcloud - std::uint32_t stamp_nsec; // relative to the start of the pointcloud - float v_x; - float v_theta; -}; - -struct TwistStruct3D -{ - float cum_transform_buffer[16]; - std::uint32_t last_stamp_nsec; // relative to the start of the pointcloud - std::uint32_t stamp_nsec; // relative to the start of the pointcloud - float v[3]; - float w[3]; -}; - -struct TransformStruct -{ - float x; - float y; - float z; - float m11; - float m12; - float m13; - float m21; - float m22; - float m23; - float m31; - float m32; - float m33; -}; - -struct CropBoxParameters -{ - float min_x; - float max_x; - float min_y; - float max_y; - float min_z; - float max_z; -}; - -struct RingOutlierFilterParameters -{ - float distance_ratio{std::nanf("")}; - float object_length_threshold{std::nanf("")}; - std::size_t num_points_threshold{0}; -}; - -template <typename T> -class MemoryPoolAllocator -{ -public: - using value_type = T; - MemoryPoolAllocator(cudaMemPool_t pool) : m_pool(pool) {} - - T * allocate(std::size_t n) - { - void * ptr = nullptr; - cudaMallocFromPoolAsync(&ptr, n * sizeof(T), m_pool, cudaStreamDefault); - return static_cast<T *>(ptr); - } - - void deallocate(T * ptr, std::size_t) { cudaFreeAsync(ptr, cudaStreamDefault); } - -protected: - cudaMemPool_t m_pool; -}; - class CudaPointcloudPreprocessor { public: @@ -130,20 +55,6 @@ class CudaPointcloudPreprocessor const std::uint32_t first_point_rel_stamp_nsec); private: - void setupTwist2DStructs( - const std::deque<geometry_msgs::msg::TwistWithCovarianceStamped> & twist_queue, - const std::deque<geometry_msgs::msg::Vector3Stamped> & angular_velocity_queue, - const std::uint64_t pointcloud_stamp_nsec, const std::uint32_t first_point_rel_stamp_nsec); - - void setupTwist3DStructs( - const std::deque<geometry_msgs::msg::TwistWithCovarianceStamped> & twist_queue, - const std::deque<geometry_msgs::msg::Vector3Stamped> & angular_velocity_queue, - const std::uint64_t pointcloud_stamp_nsec, const std::uint32_t first_point_rel_stamp_nsec); - - std::size_t querySortWorkspace( - int num_items, int num_segments, int * offsets_device, std::uint32_t * keys_in_device, - std::uint32_t * keys_out_device); - void organizePointcloud(); CropBoxParameters self_crop_box_parameters_{}; @@ -185,11 +96,8 @@ class CudaPointcloudPreprocessor thrust::device_vector<std::uint32_t> device_indices_{}; thrust::device_vector<TwistStruct2D> device_twist_2d_structs_{}; thrust::device_vector<TwistStruct3D> device_twist_3d_structs_{}; - thrust::device_vector<CropBoxParameters> host_crop_box_structs_{}; thrust::device_vector<CropBoxParameters> device_crop_box_structs_{}; - std::vector<TwistStruct2D> host_twist_2d_structs_; - std::vector<TwistStruct3D> host_twist_3d_structs_; }; } // namespace autoware::cuda_pointcloud_preprocessor diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/organize_kernels.hpp b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/organize_kernels.hpp new file mode 100644 index 0000000000000..693a001045d3c --- /dev/null +++ b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/organize_kernels.hpp @@ -0,0 +1,42 @@ +// Copyright 2024 TIER IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef AUTOWARE__CUDA_POINTCLOUD_PREPROCESSOR__ORGANIZE_KERNELS_HPP_ +#define AUTOWARE__CUDA_POINTCLOUD_PREPROCESSOR__ORGANIZE_KERNELS_HPP_ + +#include "autoware/cuda_pointcloud_preprocessor/point_types.hpp" +#include "autoware/cuda_pointcloud_preprocessor/types.hpp" + +#include <cuda_runtime.h> + +namespace autoware::cuda_pointcloud_preprocessor +{ + +std::size_t querySortWorkspace( + int num_items, int num_segments, int * offsets_device, std::uint32_t * keys_in_device, + std::uint32_t * keys_out_device); + +void organizeLaunch( + const InputPointType * input_points, std::uint32_t * index_tensor, std::int32_t * ring_indexes, + std::int32_t initial_max_rings, std::int32_t * output_max_rings, + std::int32_t initial_max_points_per_ring, std::int32_t * output_max_points_per_ring, + int num_points, int threads_per_block, int blocks_per_grid, cudaStream_t & stream); + +void gatherLaunch( + const InputPointType * input_points, const std::uint32_t * index_tensor, + InputPointType * output_points, int num_rings, int max_points_per_ring, int threads_per_block, + int blocks_per_grid, cudaStream_t & stream); +} // namespace autoware::cuda_pointcloud_preprocessor + +#endif // AUTOWARE__CUDA_POINTCLOUD_PREPROCESSOR__ORGANIZE_KERNELS_HPP_ diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/outlier_kernels.hpp b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/outlier_kernels.hpp new file mode 100644 index 0000000000000..2010127afdeb5 --- /dev/null +++ b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/outlier_kernels.hpp @@ -0,0 +1,32 @@ +// Copyright 2024 TIER IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef AUTOWARE__CUDA_POINTCLOUD_PREPROCESSOR__OUTLIER_KERNELS_HPP_ +#define AUTOWARE__CUDA_POINTCLOUD_PREPROCESSOR__OUTLIER_KERNELS_HPP_ + +#include "autoware/cuda_pointcloud_preprocessor/point_types.hpp" +#include "autoware/cuda_pointcloud_preprocessor/types.hpp" + +#include <cuda_runtime.h> + +namespace autoware::cuda_pointcloud_preprocessor +{ +void ringOutlierFilterLaunch( + const InputPointType * points, std::uint32_t * output_mask, int num_rings, + int max_points_per_ring, float distance_ratio, float object_length_threshold_squared, + int num_points_threshold, int threads_per_block, int blocks_per_grid, cudaStream_t & stream); + +} // namespace autoware::cuda_pointcloud_preprocessor + +#endif // AUTOWARE__CUDA_POINTCLOUD_PREPROCESSOR__OUTLIER_KERNELS_HPP_ diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/types.hpp b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/types.hpp new file mode 100644 index 0000000000000..4cb50c8c28e38 --- /dev/null +++ b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/types.hpp @@ -0,0 +1,104 @@ +// Copyright 2024 TIER IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef AUTOWARE__CUDA_POINTCLOUD_PREPROCESSOR__TYPES_HPP_ +#define AUTOWARE__CUDA_POINTCLOUD_PREPROCESSOR__TYPES_HPP_ + +#include <cuda_runtime.h> + +#include <cmath> +#include <cstddef> +#include <cstdint> + +namespace autoware::cuda_pointcloud_preprocessor +{ + +struct TwistStruct2D +{ + float cum_x; + float cum_y; + float cum_theta; + float cum_cos_theta; + float cum_sin_theta; + std::uint32_t last_stamp_nsec; // relative to the start of the pointcloud + std::uint32_t stamp_nsec; // relative to the start of the pointcloud + float v_x; + float v_theta; +}; + +struct TwistStruct3D +{ + float cum_transform_buffer[16]; + std::uint32_t last_stamp_nsec; // relative to the start of the pointcloud + std::uint32_t stamp_nsec; // relative to the start of the pointcloud + float v[3]; + float w[3]; +}; + +struct TransformStruct +{ + float x; + float y; + float z; + float m11; + float m12; + float m13; + float m21; + float m22; + float m23; + float m31; + float m32; + float m33; +}; + +struct CropBoxParameters +{ + float min_x; + float max_x; + float min_y; + float max_y; + float min_z; + float max_z; +}; + +struct RingOutlierFilterParameters +{ + float distance_ratio{std::nanf("")}; + float object_length_threshold{std::nanf("")}; + std::size_t num_points_threshold{0}; +}; + +template <typename T> +class MemoryPoolAllocator +{ +public: + using value_type = T; + MemoryPoolAllocator(cudaMemPool_t pool) : m_pool(pool) {} + + T * allocate(std::size_t n) + { + void * ptr = nullptr; + cudaMallocFromPoolAsync(&ptr, n * sizeof(T), m_pool, cudaStreamDefault); + return static_cast<T *>(ptr); + } + + void deallocate(T * ptr, std::size_t) { cudaFreeAsync(ptr, cudaStreamDefault); } + +protected: + cudaMemPool_t m_pool; +}; + +} // namespace autoware::cuda_pointcloud_preprocessor + +#endif // AUTOWARE__CUDA_POINTCLOUD_PREPROCESSOR__TYPES_HPP_ diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/undistort_kernels.hpp b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/undistort_kernels.hpp new file mode 100644 index 0000000000000..4a2f91af12953 --- /dev/null +++ b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/undistort_kernels.hpp @@ -0,0 +1,53 @@ +// Copyright 2024 TIER IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef AUTOWARE__CUDA_POINTCLOUD_PREPROCESSOR__UNDISTORT_KERNELS_HPP_ +#define AUTOWARE__CUDA_POINTCLOUD_PREPROCESSOR__UNDISTORT_KERNELS_HPP_ + +#include "autoware/cuda_pointcloud_preprocessor/point_types.hpp" +#include "autoware/cuda_pointcloud_preprocessor/types.hpp" + +#include <geometry_msgs/msg/twist_with_covariance_stamped.hpp> +#include <geometry_msgs/msg/vector3_stamped.hpp> + +#include <cuda_runtime.h> +#include <thrust/device_vector.h> + +#include <deque> + +namespace autoware::cuda_pointcloud_preprocessor +{ +void undistort2dLaunch( + InputPointType * input_points, int num_points, TwistStruct2D * twist_structs, int num_twists, + int threads_per_block, int blocks_per_grid, cudaStream_t & stream); + +void undistort3dLaunch( + InputPointType * input_points, int num_points, TwistStruct3D * twist_structs, int num_twists, + int threads_per_block, int blocks_per_grid, cudaStream_t & stream); + +void setupTwist2DStructs( + const std::deque<geometry_msgs::msg::TwistWithCovarianceStamped> & twist_queue, + const std::deque<geometry_msgs::msg::Vector3Stamped> & angular_velocity_queue, + const std::uint64_t pointcloud_stamp_nsec, const std::uint32_t first_point_rel_stamp_nsec, + thrust::device_vector<TwistStruct2D> & device_twist_2d_structs, cudaStream_t & stream); + +void setupTwist3DStructs( + const std::deque<geometry_msgs::msg::TwistWithCovarianceStamped> & twist_queue, + const std::deque<geometry_msgs::msg::Vector3Stamped> & angular_velocity_queue, + const std::uint64_t pointcloud_stamp_nsec, const std::uint32_t first_point_rel_stamp_nsec, + thrust::device_vector<TwistStruct3D> & device_twist_3d_structs, cudaStream_t & stream); + +} // namespace autoware::cuda_pointcloud_preprocessor + +#endif // AUTOWARE__CUDA_POINTCLOUD_PREPROCESSOR__UNDISTORT_KERNELS_HPP_ diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/common_kernels.cu b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/common_kernels.cu new file mode 100644 index 0000000000000..64ed3ca8ab72f --- /dev/null +++ b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/common_kernels.cu @@ -0,0 +1,128 @@ +// Copyright 2024 TIER IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "autoware/cuda_pointcloud_preprocessor/common_kernels.hpp" +#include "autoware/cuda_pointcloud_preprocessor/point_types.hpp" +#include "autoware/cuda_pointcloud_preprocessor/types.hpp" + +namespace autoware::cuda_pointcloud_preprocessor +{ +__global__ void transformPointsKernel( + const InputPointType * __restrict__ input_points, InputPointType * output_points, int num_points, + TransformStruct transform) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < num_points) { + output_points[idx] = input_points[idx]; + + const float x = input_points[idx].x; + const float y = input_points[idx].y; + const float z = input_points[idx].z; + + output_points[idx].x = transform.m11 * x + transform.m12 * y + transform.m13 * z + transform.x; + output_points[idx].y = transform.m21 * x + transform.m22 * y + transform.m23 * z + transform.y; + output_points[idx].z = transform.m31 * x + transform.m32 * y + transform.m33 * z + transform.z; + } +} + +__global__ void cropBoxKernel( + InputPointType * __restrict__ d_points, std::uint32_t * __restrict__ output_mask, int num_points, + const CropBoxParameters * __restrict__ crop_box_parameters_ptr, int num_crop_boxes) +{ + for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < num_points; + idx += blockDim.x * gridDim.x) { + const float x = d_points[idx].x; + const float y = d_points[idx].y; + const float z = d_points[idx].z; + + std::uint32_t mask = 1; + + for (int i = 0; i < num_crop_boxes; i++) { + const CropBoxParameters & crop_box_parameters = crop_box_parameters_ptr[i]; + const float & min_x = crop_box_parameters.min_x; + const float & min_y = crop_box_parameters.min_y; + const float & min_z = crop_box_parameters.min_z; + const float & max_x = crop_box_parameters.max_x; + const float & max_y = crop_box_parameters.max_y; + const float & max_z = crop_box_parameters.max_z; + mask &= + (x <= min_x || x >= max_x) || (y <= min_y || y >= max_y) || (z <= min_z || z >= max_z); + } + + output_mask[idx] = mask; + } +} + +__global__ void combineMasksKernel( + const std::uint32_t * __restrict__ mask1, const std::uint32_t * __restrict__ mask2, + int num_points, std::uint32_t * __restrict__ output_mask) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < num_points) { + output_mask[idx] = mask1[idx] & mask2[idx]; + } +} + +__global__ void extractPointsKernel( + InputPointType * input_points, std::uint32_t * masks, std::uint32_t * indices, int num_points, + OutputPointType * output_points) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < num_points && masks[idx] == 1) { + InputPointType & input_point = input_points[idx]; + OutputPointType & output_point = output_points[indices[idx] - 1]; + output_point.x = input_point.x; + output_point.y = input_point.y; + output_point.z = input_point.z; + output_point.intensity = input_point.intensity; + output_point.return_type = input_point.return_type; + output_point.channel = input_point.channel; + } +} + +void transformPointsLaunch( + const InputPointType * input_points, InputPointType * output_points, int num_points, + TransformStruct transform, int threads_per_block, int blocks_per_grid, cudaStream_t & stream) +{ + transformPointsKernel<<<blocks_per_grid, threads_per_block, 0, stream>>>( + input_points, output_points, num_points, transform); +} + +void cropBoxLaunch( + InputPointType * d_points, std::uint32_t * output_mask, int num_points, + const CropBoxParameters * crop_box_parameters_ptr, int num_crop_boxes, int threads_per_block, + int blocks_per_grid, cudaStream_t & stream) +{ + cropBoxKernel<<<blocks_per_grid, threads_per_block, 0, stream>>>( + d_points, output_mask, num_points, crop_box_parameters_ptr, num_crop_boxes); +} + +void combineMasksLaunch( + const std::uint32_t * mask1, const std::uint32_t * mask2, int num_points, + std::uint32_t * output_mask, int threads_per_block, int blocks_per_grid, cudaStream_t & stream) +{ + combineMasksKernel<<<blocks_per_grid, threads_per_block, 0, stream>>>( + mask1, mask2, num_points, output_mask); +} + +void extractPointsLaunch( + InputPointType * input_points, std::uint32_t * masks, std::uint32_t * indices, int num_points, + OutputPointType * output_points, int threads_per_block, int blocks_per_grid, + cudaStream_t & stream) +{ + extractPointsKernel<<<blocks_per_grid, threads_per_block, 0, stream>>>( + input_points, masks, indices, num_points, output_points); +} + +} // namespace autoware::cuda_pointcloud_preprocessor diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu index ffc2216006d74..aac9f560a78ca 100644 --- a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu +++ b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu @@ -12,8 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "autoware/cuda_pointcloud_preprocessor/common_kernels.hpp" #include "autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.hpp" +#include "autoware/cuda_pointcloud_preprocessor/organize_kernels.hpp" +#include "autoware/cuda_pointcloud_preprocessor/outlier_kernels.hpp" #include "autoware/cuda_pointcloud_preprocessor/point_types.hpp" +#include "autoware/cuda_pointcloud_preprocessor/types.hpp" +#include "autoware/cuda_pointcloud_preprocessor/undistort_kernels.hpp" #include <Eigen/Core> #include <Eigen/Dense> @@ -29,331 +34,6 @@ namespace autoware::cuda_pointcloud_preprocessor { -__host__ __device__ Eigen::Matrix3f skewSymmetric(const Eigen::Vector3f & v) -{ - Eigen::Matrix3f m; - m << 0, -v.z(), v.y(), v.z(), 0, -v.x(), -v.y(), v.x(), 0; - return m; -} - -__host__ __device__ Eigen::Matrix3f leftJacobianSO3(const Eigen::Vector3f & omega) -{ - double theta = omega.norm(); - if (std::abs(theta) < 1e-6) { - return Eigen::Matrix3f::Identity(); - } - - Eigen::Matrix3f Omega = skewSymmetric(omega); - - Eigen::Matrix3f Omega2 = Omega * Omega; - double theta2 = theta * theta; - double theta3 = theta2 * theta; - - // Rodrigues' formula for Jacobian - return Eigen::Matrix3f::Identity() + ((1 - cos(theta)) / theta2) * Omega + - ((theta - sin(theta)) / theta3) * Omega2; -} - -__host__ __device__ Eigen::Matrix4f transformationMatrixFromVelocity( - const Eigen::Vector3f & linear_velocity, const Eigen::Vector3f & angular_velocity, double dt) -{ - Eigen::Matrix3f R = Eigen::AngleAxisf(angular_velocity.norm() * dt, angular_velocity.normalized()) - .toRotationMatrix(); - Eigen::Matrix3f J = leftJacobianSO3(angular_velocity * dt); - - Eigen::Vector3f translation = J * (linear_velocity * dt); - - Eigen::Matrix4f transformation = Eigen::Matrix4f::Identity(); - transformation.block<3, 3>(0, 0) = R; - transformation.block<3, 1>(0, 3) = translation; - - return transformation; -} - -__global__ void organizeKernel( - const InputPointType * __restrict__ input_points, std::uint32_t * index_tensor, - std::int32_t * ring_indexes, std::int32_t initial_max_rings, std::int32_t * output_max_rings, - std::int32_t initial_max_points_per_ring, std::int32_t * output_max_points_per_ring, - int num_points) -{ - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx >= num_points) { - return; - } - - auto ring = input_points[idx].channel; - - if (ring >= initial_max_rings) { - atomicMax(output_max_rings, ring); - return; - } - - int next_offset = atomicAdd(&ring_indexes[ring], 1); - - if (next_offset >= initial_max_points_per_ring) { - atomicMax(output_max_points_per_ring, next_offset); - return; - } - - index_tensor[ring * initial_max_points_per_ring + next_offset] = idx; -} - -__global__ void gatherKernel( - const InputPointType * __restrict__ input_points, const std::uint32_t * __restrict__ index_tensor, - InputPointType * __restrict__ output_points, int num_rings, int max_points_per_ring) -{ - const int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx >= num_rings * max_points_per_ring) { - return; - } - - const int ring = idx / max_points_per_ring; - const int point = idx % max_points_per_ring; - - const std::uint32_t input_idx = index_tensor[ring * max_points_per_ring + point]; - - if (input_idx < std::numeric_limits<std::uint32_t>::max()) { - output_points[ring * max_points_per_ring + point] = input_points[input_idx]; - } else { - output_points[ring * max_points_per_ring + point].distance = 0.0f; - } -} - -__global__ void transformPointsKernel( - const InputPointType * __restrict__ input_points, InputPointType * output_points, int num_points, - TransformStruct transform) -{ - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < num_points) { - output_points[idx] = input_points[idx]; - - const float x = input_points[idx].x; - const float y = input_points[idx].y; - const float z = input_points[idx].z; - - output_points[idx].x = transform.m11 * x + transform.m12 * y + transform.m13 * z + transform.x; - output_points[idx].y = transform.m21 * x + transform.m22 * y + transform.m23 * z + transform.y; - output_points[idx].z = transform.m31 * x + transform.m32 * y + transform.m33 * z + transform.z; - } -} - -__global__ void cropBoxKernel( - InputPointType * __restrict__ d_points, std::uint32_t * __restrict__ output_mask, int num_points, - const CropBoxParameters * __restrict__ crop_box_parameters_ptr, int num_crop_boxes) -{ - for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < num_points; - idx += blockDim.x * gridDim.x) { - const float x = d_points[idx].x; - const float y = d_points[idx].y; - const float z = d_points[idx].z; - - std::uint32_t mask = 1; - - for (int i = 0; i < num_crop_boxes; i++) { - const CropBoxParameters & crop_box_parameters = crop_box_parameters_ptr[i]; - const float & min_x = crop_box_parameters.min_x; - const float & min_y = crop_box_parameters.min_y; - const float & min_z = crop_box_parameters.min_z; - const float & max_x = crop_box_parameters.max_x; - const float & max_y = crop_box_parameters.max_y; - const float & max_z = crop_box_parameters.max_z; - mask &= - (x <= min_x || x >= max_x) || (y <= min_y || y >= max_y) || (z <= min_z || z >= max_z); - } - - output_mask[idx] = mask; - } -} - -__global__ void combineMasksKernel( - const std::uint32_t * __restrict__ mask1, const std::uint32_t * __restrict__ mask2, - int num_points, std::uint32_t * __restrict__ output_mask) -{ - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < num_points) { - output_mask[idx] = mask1[idx] & mask2[idx]; - } -} - -__global__ void extractInputPointIndicesKernel( - InputPointType * input_points, std::uint32_t * masks, std::uint32_t * indices, int num_points, - InputPointType * output_points) -{ - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < num_points && masks[idx] == 1) { - output_points[indices[idx] - 1] = input_points[idx]; - } -} - -__global__ void extractOutputPointIndicesKernel( - OutputPointType * input_points, std::uint32_t * masks, std::uint32_t * indices, int num_points, - OutputPointType * output_points) -{ - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < num_points && masks[idx] == 1) { - output_points[indices[idx] - 1] = input_points[idx]; - } -} - -__global__ void extractInputPointsToOutputPoints_indicesKernel( - InputPointType * input_points, std::uint32_t * masks, std::uint32_t * indices, int num_points, - OutputPointType * output_points) -{ - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < num_points && masks[idx] == 1) { - InputPointType & input_point = input_points[idx]; - OutputPointType & output_point = output_points[indices[idx] - 1]; - output_point.x = input_point.x; - output_point.y = input_point.y; - output_point.z = input_point.z; - output_point.intensity = input_point.intensity; - output_point.return_type = input_point.return_type; - output_point.channel = input_point.channel; - } -} - -__global__ void undistort2dKernel( - InputPointType * input_points, int num_points, TwistStruct2D * twist_structs, int num_twists) -{ - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < num_points) { - InputPointType & point = input_points[idx]; - - // The twist must always be newer than the point ! (or it was the last twist) - int twist_index = 0; - while (twist_index < num_twists && twist_structs[twist_index].stamp_nsec < point.time_stamp) { - twist_index++; - } - - twist_index = min(twist_index, num_twists - 1); - - TwistStruct2D twist = twist_structs[twist_index]; - float x = twist.cum_x; - float y = twist.cum_y; - float theta = twist.cum_theta; - - double dt_nsec = - point.time_stamp > twist.last_stamp_nsec ? point.time_stamp - twist.last_stamp_nsec : 0; - double dt = 1e-9 * (dt_nsec); - - theta += twist.v_theta * dt; - float d = twist.v_x * dt; - x += d * cos(theta); - y += d * sin(theta); - - float distorted_x = point.x; - float distorted_y = point.y; - - point.x = distorted_x * cos(theta) - distorted_y * sin(theta) + x; - point.y = distorted_x * sin(theta) + distorted_y * cos(theta) + y; - } -} - -__global__ void undistort3dKernel( - InputPointType * input_points, int num_points, TwistStruct3D * twist_structs, int num_twists) -{ - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < num_points) { - InputPointType & point = input_points[idx]; - - // The twist must always be newer than the point ! (or it was the last twist) - int twist_index = 0; - while (twist_index < num_twists && twist_structs[twist_index].stamp_nsec < point.time_stamp) { - twist_index++; - } - - twist_index = min(twist_index, num_twists - 1); - - TwistStruct3D twist = twist_structs[twist_index]; - Eigen::Map<Eigen::Matrix4f> cum_transform_buffer_map(twist.cum_transform_buffer); - Eigen::Map<Eigen::Vector3f> v_map(twist.v); - Eigen::Map<Eigen::Vector3f> w_map(twist.w); - - double dt_nsec = - point.time_stamp > twist.last_stamp_nsec ? point.time_stamp - twist.last_stamp_nsec : 0; - double dt = 1e-9 * (dt_nsec); - - Eigen::Matrix4f transform = - cum_transform_buffer_map * transformationMatrixFromVelocity(v_map, w_map, dt); - Eigen::Vector3f p(point.x, point.y, point.z); - p = transform.block<3, 3>(0, 0) * p + transform.block<3, 1>(0, 3); - - point.x = p.x(); - point.y = p.y(); - point.z = p.z(); - } -} - -__global__ void ringOutlierFilterKernel( - const InputPointType * d_points, std::uint32_t * output_mask, int num_rings, - int max_points_per_ring, float distance_ratio, float object_length_threshold_squared, - int num_points_threshold) -{ - int idx = blockIdx.x * blockDim.x + threadIdx.x; - int j = idx / max_points_per_ring; - int i = idx % max_points_per_ring; - - if (j >= num_rings || i >= max_points_per_ring) { - return; - } - - int min_i = max(i - num_points_threshold, 0); - int max_i = min(i + num_points_threshold, max_points_per_ring); - - int walk_size = 1; - int left_idx = min_i; - int right_idx = min_i + 1; - - for (int k = min_i; k < max_i - 1; k++) { - const InputPointType & left_point = d_points[j * max_points_per_ring + k]; - const InputPointType & right_point = d_points[j * max_points_per_ring + k + 1]; - - // Find biggest walk that passes through i - float azimuth_diff = right_point.azimuth - left_point.azimuth; - azimuth_diff = azimuth_diff < 0.f ? azimuth_diff + 2 * M_PI : azimuth_diff; - - if ( - max(left_point.distance, right_point.distance) < - min(left_point.distance, right_point.distance) * distance_ratio && - azimuth_diff < 1.f * M_PI / 180.f) { - // Determined to be included in the same walk - walk_size++; - right_idx++; - } else if (k >= i) { - break; - } else { - walk_size = 1; - left_idx = k + 1; - right_idx = k + 2; // this is safe since we break if k >= i - } - } - - const InputPointType & left_point = d_points[j * max_points_per_ring + left_idx]; - const InputPointType & right_point = d_points[j * max_points_per_ring + right_idx - 1]; - const float x = left_point.x - right_point.x; - const float y = left_point.y - right_point.y; - const float z = left_point.z - right_point.z; - - output_mask[j * max_points_per_ring + i] = static_cast<std::uint32_t>( - (walk_size > num_points_threshold) || - (x * x + y * y + z * z >= object_length_threshold_squared)); -} - -__global__ void transformPointTypeKernel( - const InputPointType * device_input_points, int num_points, OutputPointType * device_ouput_points) -{ - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < num_points) { - const InputPointType & input_point = device_input_points[idx]; - OutputPointType & output_point = device_ouput_points[idx]; - - output_point.x = input_point.x; - output_point.y = input_point.y; - output_point.z = input_point.z; - output_point.intensity = (float)input_point.intensity; - } -} - CudaPointcloudPreprocessor::CudaPointcloudPreprocessor() { sensor_msgs::msg::PointField x_field, y_field, z_field, intensity_field, return_type_field, @@ -514,199 +194,6 @@ void CudaPointcloudPreprocessor::preallocateOutput() num_rings_ * max_points_per_ring_ * sizeof(OutputPointType)); } -void CudaPointcloudPreprocessor::setupTwist2DStructs( - const std::deque<geometry_msgs::msg::TwistWithCovarianceStamped> & twist_queue, - const std::deque<geometry_msgs::msg::Vector3Stamped> & angular_velocity_queue, - const std::uint64_t pointcloud_stamp_nsec, const std::uint32_t first_point_rel_stamp_nsec) -{ - host_twist_2d_structs_.clear(); - host_twist_2d_structs_.reserve(twist_queue.size() + angular_velocity_queue.size()); - - // Twist preprocessing - float cum_x = 0; - float cum_y = 0; - float cum_theta = 0; - // All time stamps from now on are in nsec from the "beginning of the pointcloud" - std::uint32_t last_stamp_nsec = first_point_rel_stamp_nsec; - - std::size_t twist_index = 0; - std::size_t angular_velocity_index = 0; - - for (; twist_index < twist_queue.size() || - angular_velocity_index < angular_velocity_queue.size();) { - std::uint64_t twist_stamp, input_twist_global_stamp_nsec, angular_velocity_global_stamp_nsec; - float v_x, v_theta; - - if (twist_index < twist_queue.size()) { - input_twist_global_stamp_nsec = - 1'000'000'000 * static_cast<std::uint64_t>(twist_queue[twist_index].header.stamp.sec) + - static_cast<std::uint64_t>(twist_queue[twist_index].header.stamp.nanosec); - v_x = twist_queue[twist_index].twist.twist.linear.x; - } else { - input_twist_global_stamp_nsec = std::numeric_limits<std::uint64_t>::max(); - v_x = 0.0; - } - - if (angular_velocity_index < angular_velocity_queue.size()) { - angular_velocity_global_stamp_nsec = - 1'000'000'000 * static_cast<std::uint64_t>( - angular_velocity_queue[angular_velocity_index].header.stamp.sec) + - static_cast<std::uint64_t>( - angular_velocity_queue[angular_velocity_index].header.stamp.nanosec); - v_theta = angular_velocity_queue[angular_velocity_index].vector.z; - } else { - angular_velocity_global_stamp_nsec = std::numeric_limits<std::uint64_t>::max(); - v_theta = 0.0; - } - - if (input_twist_global_stamp_nsec < angular_velocity_global_stamp_nsec) { - twist_stamp = input_twist_global_stamp_nsec; - twist_index++; - } else if (input_twist_global_stamp_nsec > angular_velocity_global_stamp_nsec) { - twist_stamp = angular_velocity_global_stamp_nsec; - angular_velocity_index++; - } else { - twist_index++; - angular_velocity_index++; - } - - TwistStruct2D twist; - twist.cum_x = cum_x; - twist.cum_y = cum_y; - twist.cum_theta = cum_theta; - - std::uint64_t twist_global_stamp_nsec = twist_stamp; - assert(twist_global_stamp_nsec > pointcloud_stamp_nsec); // by construction - std::uint32_t twist_from_pointcloud_start_nsec = - twist_global_stamp_nsec - pointcloud_stamp_nsec; - - twist.stamp_nsec = twist_from_pointcloud_start_nsec; - twist.v_x = v_x; - twist.v_theta = v_theta; - twist.last_stamp_nsec = last_stamp_nsec; - host_twist_2d_structs_.push_back(twist); - - double dt_seconds = 1e-9 * (twist.stamp_nsec - last_stamp_nsec); - last_stamp_nsec = twist.stamp_nsec; - cum_theta += v_theta * dt_seconds; - float d = twist.v_x * dt_seconds; - cum_x += d * cos(cum_theta); - cum_y += d * sin(cum_theta); - } - - // Copy to device - device_twist_2d_structs_.resize(host_twist_2d_structs_.size()); - cudaMemcpyAsync( - thrust::raw_pointer_cast(device_twist_2d_structs_.data()), host_twist_2d_structs_.data(), - host_twist_2d_structs_.size() * sizeof(TwistStruct2D), cudaMemcpyHostToDevice, stream_); -} - -void CudaPointcloudPreprocessor::setupTwist3DStructs( - const std::deque<geometry_msgs::msg::TwistWithCovarianceStamped> & twist_queue, - const std::deque<geometry_msgs::msg::Vector3Stamped> & angular_velocity_queue, - const std::uint64_t pointcloud_stamp_nsec, const std::uint32_t first_point_rel_stamp_nsec) -{ - // Twist preprocessing - host_twist_3d_structs_.clear(); - host_twist_3d_structs_.reserve(twist_queue.size() + angular_velocity_queue.size()); - - Eigen::Matrix4f cum_transform = Eigen::Matrix4f::Identity(); - Eigen::Vector3f v = Eigen::Vector3f::Zero(); - Eigen::Vector3f w = Eigen::Vector3f::Zero(); - - // All time stamps from now on are in nsec from the "beginning of the pointcloud" - std::uint32_t last_stamp_nsec = first_point_rel_stamp_nsec; - - std::size_t twist_index = 0; - std::size_t angular_velocity_index = 0; - - for (; twist_index < twist_queue.size() || - angular_velocity_index < angular_velocity_queue.size();) { - std::uint64_t twist_stamp, input_twist_global_stamp_nsec, angular_velocity_global_stamp_nsec; - - if (twist_index < twist_queue.size()) { - input_twist_global_stamp_nsec = - 1'000'000'000 * static_cast<std::uint64_t>(twist_queue[twist_index].header.stamp.sec) + - static_cast<std::uint64_t>(twist_queue[twist_index].header.stamp.nanosec); - v.x() = twist_queue[twist_index].twist.twist.linear.x; - v.y() = twist_queue[twist_index].twist.twist.linear.y; - v.z() = twist_queue[twist_index].twist.twist.linear.z; - } else { - input_twist_global_stamp_nsec = std::numeric_limits<uint64_t>::max(); - v = Eigen::Vector3f::Zero(); - } - - if (angular_velocity_index < angular_velocity_queue.size()) { - angular_velocity_global_stamp_nsec = - 1'000'000'000 * static_cast<std::uint64_t>( - angular_velocity_queue[angular_velocity_index].header.stamp.sec) + - static_cast<std::uint64_t>( - angular_velocity_queue[angular_velocity_index].header.stamp.nanosec); - w.x() = angular_velocity_queue[angular_velocity_index].vector.x; - w.y() = angular_velocity_queue[angular_velocity_index].vector.y; - w.z() = angular_velocity_queue[angular_velocity_index].vector.z; - } else { - angular_velocity_global_stamp_nsec = std::numeric_limits<std::uint64_t>::max(); - w = Eigen::Vector3f::Zero(); - } - - if (input_twist_global_stamp_nsec < angular_velocity_global_stamp_nsec) { - twist_stamp = input_twist_global_stamp_nsec; - twist_index++; - } else if (input_twist_global_stamp_nsec > angular_velocity_global_stamp_nsec) { - twist_stamp = angular_velocity_global_stamp_nsec; - angular_velocity_index++; - } else { - twist_index++; - angular_velocity_index++; - } - - TwistStruct3D twist; - - Eigen::Map<Eigen::Matrix4f> cum_transform_buffer_map(twist.cum_transform_buffer); - Eigen::Map<Eigen::Vector3f> v_map(twist.v); - Eigen::Map<Eigen::Vector3f> w_map(twist.w); - cum_transform_buffer_map = cum_transform; - - std::uint64_t twist_global_stamp_nsec = twist_stamp; - assert(twist_global_stamp_nsec > pointcloud_stamp_nsec); // by construction - std::uint32_t twist_from_pointcloud_start_nsec = - twist_global_stamp_nsec - pointcloud_stamp_nsec; - - twist.stamp_nsec = twist_from_pointcloud_start_nsec; - v_map = v; - w_map = w; - twist.last_stamp_nsec = last_stamp_nsec; - host_twist_3d_structs_.push_back(twist); - - double dt_seconds = 1e-9 * (twist.stamp_nsec - last_stamp_nsec); - last_stamp_nsec = twist.stamp_nsec; - - auto delta_transform = transformationMatrixFromVelocity(v, w, dt_seconds); - cum_transform = cum_transform * delta_transform; - } - - // Copy to device - device_twist_3d_structs_.resize(host_twist_3d_structs_.size()); - cudaMemcpyAsync( - thrust::raw_pointer_cast(device_twist_3d_structs_.data()), host_twist_3d_structs_.data(), - host_twist_3d_structs_.size() * sizeof(TwistStruct3D), cudaMemcpyHostToDevice, stream_); -} - -std::size_t CudaPointcloudPreprocessor::querySortWorkspace( - int num_items, int num_segments, int * offsets_device, std::uint32_t * keys_in_device, - std::uint32_t * keys_out_device) -{ - // Determine temporary device storage requirements - void * temp_storage = nullptr; - size_t temp_storage_bytes = 0; - cub::DeviceSegmentedRadixSort::SortKeys( - temp_storage, temp_storage_bytes, keys_in_device, keys_out_device, num_items, num_segments, - offsets_device, offsets_device + 1); - - return temp_storage_bytes; -} - void CudaPointcloudPreprocessor::organizePointcloud() { cudaMemsetAsync( @@ -723,12 +210,13 @@ void CudaPointcloudPreprocessor::organizePointcloud() const int raw_points_blocks_per_grid = (num_raw_points_ + threads_per_block_ - 1) / threads_per_block_; - organizeKernel<<<raw_points_blocks_per_grid, threads_per_block_, 0, stream_>>>( + organizeLaunch( thrust::raw_pointer_cast(device_input_points_.data()), thrust::raw_pointer_cast(device_indexes_tensor_.data()), thrust::raw_pointer_cast(device_ring_index_.data()), num_rings_, thrust::raw_pointer_cast(device_max_ring_.data()), max_points_per_ring_, - thrust::raw_pointer_cast(device_max_points_per_ring_.data()), num_raw_points_); + thrust::raw_pointer_cast(device_max_points_per_ring_.data()), num_raw_points_, + threads_per_block_, raw_points_blocks_per_grid, stream_); std::int32_t max_ring_value; std::int32_t max_points_per_ring; @@ -781,12 +269,13 @@ void CudaPointcloudPreprocessor::organizePointcloud() thrust::raw_pointer_cast(device_sorted_indexes_tensor_.data())); device_sort_workspace_.resize(sort_workspace_bytes_); - organizeKernel<<<raw_points_blocks_per_grid, threads_per_block_, 0, stream_>>>( + organizeLaunch( thrust::raw_pointer_cast(device_input_points_.data()), thrust::raw_pointer_cast(device_indexes_tensor_.data()), thrust::raw_pointer_cast(device_ring_index_.data()), num_rings_, thrust::raw_pointer_cast(device_max_ring_.data()), max_points_per_ring_, - thrust::raw_pointer_cast(device_max_points_per_ring_.data()), num_raw_points_); + thrust::raw_pointer_cast(device_max_points_per_ring_.data()), num_raw_points_, + threads_per_block_, raw_points_blocks_per_grid, stream_); } if (num_organized_points_ == num_rings_) { @@ -806,10 +295,12 @@ void CudaPointcloudPreprocessor::organizePointcloud() const int organized_points_blocks_per_grid = (num_organized_points_ + threads_per_block_ - 1) / threads_per_block_; - gatherKernel<<<organized_points_blocks_per_grid, threads_per_block_, 0, stream_>>>( + + gatherLaunch( thrust::raw_pointer_cast(device_input_points_.data()), thrust::raw_pointer_cast(device_sorted_indexes_tensor_.data()), - thrust::raw_pointer_cast(device_organized_points_.data()), num_rings_, max_points_per_ring_); + thrust::raw_pointer_cast(device_organized_points_.data()), num_rings_, max_points_per_ring_, + threads_per_block_, organized_points_blocks_per_grid, stream_); } std::unique_ptr<cuda_blackboard::CudaPointCloud2> CudaPointcloudPreprocessor::process( @@ -862,14 +353,14 @@ std::unique_ptr<cuda_blackboard::CudaPointCloud2> CudaPointcloudPreprocessor::pr if (use_3d_undistortion_) { setupTwist3DStructs( - twist_queue, angular_velocity_queue, pointcloud_stamp_nsec, first_point_rel_stamp_nsec); + twist_queue, angular_velocity_queue, pointcloud_stamp_nsec, first_point_rel_stamp_nsec, + device_twist_3d_structs_, stream_); } else { setupTwist2DStructs( - twist_queue, angular_velocity_queue, pointcloud_stamp_nsec, first_point_rel_stamp_nsec); + twist_queue, angular_velocity_queue, pointcloud_stamp_nsec, first_point_rel_stamp_nsec, + device_twist_2d_structs_, stream_); } - // check_error(frame_id, "After setup"); - // Obtain raw pointers for the kernels TwistStruct2D * device_twist_2d_structs = thrust::raw_pointer_cast(device_twist_2d_structs_.data()); @@ -884,38 +375,40 @@ std::unique_ptr<cuda_blackboard::CudaPointCloud2> CudaPointcloudPreprocessor::pr const int blocks_per_grid = (num_organized_points_ + threads_per_block_ - 1) / threads_per_block_; - transformPointsKernel<<<blocks_per_grid, threads_per_block_, 0, stream_>>>( + transformPointsLaunch( thrust::raw_pointer_cast(device_organized_points_.data()), device_transformed_points, - num_organized_points_, transform_struct); + num_organized_points_, transform_struct, threads_per_block_, blocks_per_grid, stream_); int crop_box_blocks_per_grid = std::min(blocks_per_grid, max_blocks_per_grid_); if (host_crop_box_structs_.size() > 0) { - cropBoxKernel<<<crop_box_blocks_per_grid, threads_per_block_, 0, stream_>>>( + cropBoxLaunch( device_transformed_points, device_crop_mask, num_organized_points_, - thrust::raw_pointer_cast(device_crop_box_structs_.data()), host_crop_box_structs_.size()); + thrust::raw_pointer_cast(device_crop_box_structs_.data()), host_crop_box_structs_.size(), + crop_box_blocks_per_grid, threads_per_block_, stream_); } else { thrust::fill(thrust::device, device_crop_mask, device_crop_mask + num_organized_points_, 1); } if (use_3d_undistortion_ && device_twist_3d_structs_.size() > 0) { - undistort3dKernel<<<blocks_per_grid, threads_per_block_, 0, stream_>>>( + undistort3dLaunch( device_transformed_points, num_organized_points_, device_twist_3d_structs, - device_twist_3d_structs_.size()); + device_twist_3d_structs_.size(), threads_per_block_, blocks_per_grid, stream_); } else if (!use_3d_undistortion_ && device_twist_2d_structs_.size() > 0) { - undistort2dKernel<<<blocks_per_grid, threads_per_block_, 0, stream_>>>( + undistort2dLaunch( device_transformed_points, num_organized_points_, device_twist_2d_structs, - device_twist_2d_structs_.size()); + device_twist_2d_structs_.size(), threads_per_block_, blocks_per_grid, stream_); } - ringOutlierFilterKernel<<<blocks_per_grid, threads_per_block_, 0, stream_>>>( + ringOutlierFilterLaunch( device_transformed_points, device_ring_outlier_mask, num_rings_, max_points_per_ring_, ring_outlier_parameters_.distance_ratio, ring_outlier_parameters_.object_length_threshold * ring_outlier_parameters_.object_length_threshold, - ring_outlier_parameters_.num_points_threshold); + ring_outlier_parameters_.num_points_threshold, threads_per_block_, blocks_per_grid, stream_); - combineMasksKernel<<<blocks_per_grid, threads_per_block_, 0, stream_>>>( - device_crop_mask, device_ring_outlier_mask, num_organized_points_, device_ring_outlier_mask); + combineMasksLaunch( + device_crop_mask, device_ring_outlier_mask, num_organized_points_, device_ring_outlier_mask, + threads_per_block_, blocks_per_grid, stream_); thrust::inclusive_scan( thrust::device, device_ring_outlier_mask, device_ring_outlier_mask + num_organized_points_, @@ -928,10 +421,10 @@ std::unique_ptr<cuda_blackboard::CudaPointCloud2> CudaPointcloudPreprocessor::pr cudaStreamSynchronize(stream_); if (num_output_points > 0) { - extractInputPointsToOutputPoints_indicesKernel<<< - blocks_per_grid, threads_per_block_, 0, stream_>>>( + extractPointsLaunch( device_transformed_points, device_ring_outlier_mask, device_indices, num_organized_points_, - reinterpret_cast<OutputPointType *>(output_pointcloud_ptr_->data.get())); + reinterpret_cast<OutputPointType *>(output_pointcloud_ptr_->data.get()), threads_per_block_, + blocks_per_grid, stream_); } cudaStreamSynchronize(stream_); diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/organize_kernels.cu b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/organize_kernels.cu new file mode 100644 index 0000000000000..00ab2ddc1aec5 --- /dev/null +++ b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/organize_kernels.cu @@ -0,0 +1,108 @@ +// Copyright 2024 TIER IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "autoware/cuda_pointcloud_preprocessor/organize_kernels.hpp" +#include "autoware/cuda_pointcloud_preprocessor/point_types.hpp" + +#include <cub/cub.cuh> + +#include <limits> + +namespace autoware::cuda_pointcloud_preprocessor +{ + +__global__ void organizeKernel( + const InputPointType * __restrict__ input_points, std::uint32_t * index_tensor, + std::int32_t * ring_indexes, std::int32_t initial_max_rings, std::int32_t * output_max_rings, + std::int32_t initial_max_points_per_ring, std::int32_t * output_max_points_per_ring, + int num_points) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= num_points) { + return; + } + + auto ring = input_points[idx].channel; + + if (ring >= initial_max_rings) { + atomicMax(output_max_rings, ring); + return; + } + + int next_offset = atomicAdd(&ring_indexes[ring], 1); + + if (next_offset >= initial_max_points_per_ring) { + atomicMax(output_max_points_per_ring, next_offset); + return; + } + + index_tensor[ring * initial_max_points_per_ring + next_offset] = idx; +} + +__global__ void gatherKernel( + const InputPointType * __restrict__ input_points, const std::uint32_t * __restrict__ index_tensor, + InputPointType * __restrict__ output_points, int num_rings, int max_points_per_ring) +{ + const int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= num_rings * max_points_per_ring) { + return; + } + + const int ring = idx / max_points_per_ring; + const int point = idx % max_points_per_ring; + + const std::uint32_t input_idx = index_tensor[ring * max_points_per_ring + point]; + + if (input_idx < std::numeric_limits<std::uint32_t>::max()) { + output_points[ring * max_points_per_ring + point] = input_points[input_idx]; + } else { + output_points[ring * max_points_per_ring + point].distance = 0.0f; + } +} + +std::size_t querySortWorkspace( + int num_items, int num_segments, int * offsets_device, std::uint32_t * keys_in_device, + std::uint32_t * keys_out_device) +{ + // Determine temporary device storage requirements + void * temp_storage = nullptr; + size_t temp_storage_bytes = 0; + cub::DeviceSegmentedRadixSort::SortKeys( + temp_storage, temp_storage_bytes, keys_in_device, keys_out_device, num_items, num_segments, + offsets_device, offsets_device + 1); + + return temp_storage_bytes; +} + +void organizeLaunch( + const InputPointType * input_points, std::uint32_t * index_tensor, std::int32_t * ring_indexes, + std::int32_t initial_max_rings, std::int32_t * output_max_rings, + std::int32_t initial_max_points_per_ring, std::int32_t * output_max_points_per_ring, + int num_points, int threads_per_block, int blocks_per_grid, cudaStream_t & stream) +{ + organizeKernel<<<blocks_per_grid, threads_per_block, 0, stream>>>( + input_points, index_tensor, ring_indexes, initial_max_rings, output_max_rings, + initial_max_points_per_ring, output_max_points_per_ring, num_points); +} + +void gatherLaunch( + const InputPointType * input_points, const std::uint32_t * index_tensor, + InputPointType * output_points, int num_rings, int max_points_per_ring, int threads_per_block, + int blocks_per_grid, cudaStream_t & stream) +{ + gatherKernel<<<blocks_per_grid, threads_per_block, 0, stream>>>( + input_points, index_tensor, output_points, num_rings, max_points_per_ring); +} + +} // namespace autoware::cuda_pointcloud_preprocessor diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/outlier_kernels.cu b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/outlier_kernels.cu new file mode 100644 index 0000000000000..18b116535a32c --- /dev/null +++ b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/outlier_kernels.cu @@ -0,0 +1,86 @@ +// Copyright 2024 TIER IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "autoware/cuda_pointcloud_preprocessor/outlier_kernels.hpp" +#include "autoware/cuda_pointcloud_preprocessor/point_types.hpp" + +namespace autoware::cuda_pointcloud_preprocessor +{ + +__global__ void ringOutlierFilterKernel( + const InputPointType * points, std::uint32_t * output_mask, int num_rings, + int max_points_per_ring, float distance_ratio, float object_length_threshold_squared, + int num_points_threshold) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + int j = idx / max_points_per_ring; + int i = idx % max_points_per_ring; + + if (j >= num_rings || i >= max_points_per_ring) { + return; + } + + int min_i = max(i - num_points_threshold, 0); + int max_i = min(i + num_points_threshold, max_points_per_ring); + + int walk_size = 1; + int left_idx = min_i; + int right_idx = min_i + 1; + + for (int k = min_i; k < max_i - 1; k++) { + const InputPointType & left_point = points[j * max_points_per_ring + k]; + const InputPointType & right_point = points[j * max_points_per_ring + k + 1]; + + // Find biggest walk that passes through i + float azimuth_diff = right_point.azimuth - left_point.azimuth; + azimuth_diff = azimuth_diff < 0.f ? azimuth_diff + 2 * M_PI : azimuth_diff; + + if ( + max(left_point.distance, right_point.distance) < + min(left_point.distance, right_point.distance) * distance_ratio && + azimuth_diff < 1.f * M_PI / 180.f) { + // Determined to be included in the same walk + walk_size++; + right_idx++; + } else if (k >= i) { + break; + } else { + walk_size = 1; + left_idx = k + 1; + right_idx = k + 2; // this is safe since we break if k >= i + } + } + + const InputPointType & left_point = points[j * max_points_per_ring + left_idx]; + const InputPointType & right_point = points[j * max_points_per_ring + right_idx - 1]; + const float x = left_point.x - right_point.x; + const float y = left_point.y - right_point.y; + const float z = left_point.z - right_point.z; + + output_mask[j * max_points_per_ring + i] = static_cast<std::uint32_t>( + (walk_size > num_points_threshold) || + (x * x + y * y + z * z >= object_length_threshold_squared)); +} + +void ringOutlierFilterLaunch( + const InputPointType * points, std::uint32_t * output_mask, int num_rings, + int max_points_per_ring, float distance_ratio, float object_length_threshold_squared, + int num_points_threshold, int threads_per_block, int blocks_per_grid, cudaStream_t & stream) +{ + ringOutlierFilterKernel<<<blocks_per_grid, threads_per_block, 0, stream>>>( + points, output_mask, num_rings, max_points_per_ring, distance_ratio, + object_length_threshold_squared, num_points_threshold); +} + +} // namespace autoware::cuda_pointcloud_preprocessor diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/undistort_kernels.cu b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/undistort_kernels.cu new file mode 100644 index 0000000000000..6767b03aebf56 --- /dev/null +++ b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/undistort_kernels.cu @@ -0,0 +1,335 @@ +// Copyright 2024 TIER IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "autoware/cuda_pointcloud_preprocessor/point_types.hpp" +#include "autoware/cuda_pointcloud_preprocessor/types.hpp" +#include "autoware/cuda_pointcloud_preprocessor/undistort_kernels.hpp" + +#include <Eigen/Core> +#include <Eigen/Dense> +#include <Eigen/Geometry> + +namespace autoware::cuda_pointcloud_preprocessor +{ + +__host__ __device__ Eigen::Matrix3f skewSymmetric(const Eigen::Vector3f & v) +{ + Eigen::Matrix3f m; + m << 0, -v.z(), v.y(), v.z(), 0, -v.x(), -v.y(), v.x(), 0; + return m; +} + +__host__ __device__ Eigen::Matrix3f leftJacobianSO3(const Eigen::Vector3f & omega) +{ + double theta = omega.norm(); + if (std::abs(theta) < 1e-6) { + return Eigen::Matrix3f::Identity(); + } + + Eigen::Matrix3f Omega = skewSymmetric(omega); + + Eigen::Matrix3f Omega2 = Omega * Omega; + double theta2 = theta * theta; + double theta3 = theta2 * theta; + + // Rodrigues' formula for Jacobian + return Eigen::Matrix3f::Identity() + ((1 - cos(theta)) / theta2) * Omega + + ((theta - sin(theta)) / theta3) * Omega2; +} + +__host__ __device__ Eigen::Matrix4f transformationMatrixFromVelocity( + const Eigen::Vector3f & linear_velocity, const Eigen::Vector3f & angular_velocity, double dt) +{ + Eigen::Matrix3f R = Eigen::AngleAxisf(angular_velocity.norm() * dt, angular_velocity.normalized()) + .toRotationMatrix(); + Eigen::Matrix3f J = leftJacobianSO3(angular_velocity * dt); + + Eigen::Vector3f translation = J * (linear_velocity * dt); + + Eigen::Matrix4f transformation = Eigen::Matrix4f::Identity(); + transformation.block<3, 3>(0, 0) = R; + transformation.block<3, 1>(0, 3) = translation; + + return transformation; +} + +__global__ void undistort2dKernel( + InputPointType * input_points, int num_points, TwistStruct2D * twist_structs, int num_twists) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < num_points) { + InputPointType & point = input_points[idx]; + + // The twist must always be newer than the point ! (or it was the last twist) + int twist_index = 0; + while (twist_index < num_twists && twist_structs[twist_index].stamp_nsec < point.time_stamp) { + twist_index++; + } + + twist_index = min(twist_index, num_twists - 1); + + TwistStruct2D twist = twist_structs[twist_index]; + float x = twist.cum_x; + float y = twist.cum_y; + float theta = twist.cum_theta; + + double dt_nsec = + point.time_stamp > twist.last_stamp_nsec ? point.time_stamp - twist.last_stamp_nsec : 0; + double dt = 1e-9 * (dt_nsec); + + theta += twist.v_theta * dt; + float d = twist.v_x * dt; + x += d * cos(theta); + y += d * sin(theta); + + float distorted_x = point.x; + float distorted_y = point.y; + + point.x = distorted_x * cos(theta) - distorted_y * sin(theta) + x; + point.y = distorted_x * sin(theta) + distorted_y * cos(theta) + y; + } +} + +__global__ void undistort3dKernel( + InputPointType * input_points, int num_points, TwistStruct3D * twist_structs, int num_twists) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < num_points) { + InputPointType & point = input_points[idx]; + + // The twist must always be newer than the point ! (or it was the last twist) + int twist_index = 0; + while (twist_index < num_twists && twist_structs[twist_index].stamp_nsec < point.time_stamp) { + twist_index++; + } + + twist_index = min(twist_index, num_twists - 1); + + TwistStruct3D twist = twist_structs[twist_index]; + Eigen::Map<Eigen::Matrix4f> cum_transform_buffer_map(twist.cum_transform_buffer); + Eigen::Map<Eigen::Vector3f> v_map(twist.v); + Eigen::Map<Eigen::Vector3f> w_map(twist.w); + + double dt_nsec = + point.time_stamp > twist.last_stamp_nsec ? point.time_stamp - twist.last_stamp_nsec : 0; + double dt = 1e-9 * (dt_nsec); + + Eigen::Matrix4f transform = + cum_transform_buffer_map * transformationMatrixFromVelocity(v_map, w_map, dt); + Eigen::Vector3f p(point.x, point.y, point.z); + p = transform.block<3, 3>(0, 0) * p + transform.block<3, 1>(0, 3); + + point.x = p.x(); + point.y = p.y(); + point.z = p.z(); + } +} + +void undistort2dLaunch( + InputPointType * input_points, int num_points, TwistStruct2D * twist_structs, int num_twists, + int threads_per_block, int blocks_per_grid, cudaStream_t & stream) +{ + undistort2dKernel<<<blocks_per_grid, threads_per_block, 0, stream>>>( + input_points, num_points, twist_structs, num_twists); +} + +void undistort3dLaunch( + InputPointType * input_points, int num_points, TwistStruct3D * twist_structs, int num_twists, + int threads_per_block, int blocks_per_grid, cudaStream_t & stream) +{ + undistort3dKernel<<<blocks_per_grid, threads_per_block, 0, stream>>>( + input_points, num_points, twist_structs, num_twists); +} + +void setupTwist2DStructs( + const std::deque<geometry_msgs::msg::TwistWithCovarianceStamped> & twist_queue, + const std::deque<geometry_msgs::msg::Vector3Stamped> & angular_velocity_queue, + const std::uint64_t pointcloud_stamp_nsec, const std::uint32_t first_point_rel_stamp_nsec, + thrust::device_vector<TwistStruct2D> & device_twist_2d_structs, cudaStream_t & stream) +{ + std::vector<TwistStruct2D> host_twist_2d_structs; + host_twist_2d_structs.reserve(twist_queue.size() + angular_velocity_queue.size()); + + // Twist preprocessing + float cum_x = 0; + float cum_y = 0; + float cum_theta = 0; + // All time stamps from now on are in nsec from the "beginning of the pointcloud" + std::uint32_t last_stamp_nsec = first_point_rel_stamp_nsec; + + std::size_t twist_index = 0; + std::size_t angular_velocity_index = 0; + + for (; twist_index < twist_queue.size() || + angular_velocity_index < angular_velocity_queue.size();) { + std::uint64_t twist_stamp, input_twist_global_stamp_nsec, angular_velocity_global_stamp_nsec; + float v_x, v_theta; + + if (twist_index < twist_queue.size()) { + input_twist_global_stamp_nsec = + 1'000'000'000 * static_cast<std::uint64_t>(twist_queue[twist_index].header.stamp.sec) + + static_cast<std::uint64_t>(twist_queue[twist_index].header.stamp.nanosec); + v_x = twist_queue[twist_index].twist.twist.linear.x; + } else { + input_twist_global_stamp_nsec = std::numeric_limits<std::uint64_t>::max(); + v_x = 0.0; + } + + if (angular_velocity_index < angular_velocity_queue.size()) { + angular_velocity_global_stamp_nsec = + 1'000'000'000 * static_cast<std::uint64_t>( + angular_velocity_queue[angular_velocity_index].header.stamp.sec) + + static_cast<std::uint64_t>( + angular_velocity_queue[angular_velocity_index].header.stamp.nanosec); + v_theta = angular_velocity_queue[angular_velocity_index].vector.z; + } else { + angular_velocity_global_stamp_nsec = std::numeric_limits<std::uint64_t>::max(); + v_theta = 0.0; + } + + if (input_twist_global_stamp_nsec < angular_velocity_global_stamp_nsec) { + twist_stamp = input_twist_global_stamp_nsec; + twist_index++; + } else if (input_twist_global_stamp_nsec > angular_velocity_global_stamp_nsec) { + twist_stamp = angular_velocity_global_stamp_nsec; + angular_velocity_index++; + } else { + twist_index++; + angular_velocity_index++; + } + + TwistStruct2D twist; + twist.cum_x = cum_x; + twist.cum_y = cum_y; + twist.cum_theta = cum_theta; + + std::uint64_t twist_global_stamp_nsec = twist_stamp; + assert(twist_global_stamp_nsec > pointcloud_stamp_nsec); // by construction + std::uint32_t twist_from_pointcloud_start_nsec = + twist_global_stamp_nsec - pointcloud_stamp_nsec; + + twist.stamp_nsec = twist_from_pointcloud_start_nsec; + twist.v_x = v_x; + twist.v_theta = v_theta; + twist.last_stamp_nsec = last_stamp_nsec; + host_twist_2d_structs.push_back(twist); + + double dt_seconds = 1e-9 * (twist.stamp_nsec - last_stamp_nsec); + last_stamp_nsec = twist.stamp_nsec; + cum_theta += v_theta * dt_seconds; + float d = twist.v_x * dt_seconds; + cum_x += d * cos(cum_theta); + cum_y += d * sin(cum_theta); + } + + // Copy to device + device_twist_2d_structs.resize(host_twist_2d_structs.size()); + cudaMemcpyAsync( + thrust::raw_pointer_cast(device_twist_2d_structs.data()), host_twist_2d_structs.data(), + host_twist_2d_structs.size() * sizeof(TwistStruct2D), cudaMemcpyHostToDevice, stream); +} + +void setupTwist3DStructs( + const std::deque<geometry_msgs::msg::TwistWithCovarianceStamped> & twist_queue, + const std::deque<geometry_msgs::msg::Vector3Stamped> & angular_velocity_queue, + const std::uint64_t pointcloud_stamp_nsec, const std::uint32_t first_point_rel_stamp_nsec, + thrust::device_vector<TwistStruct3D> & device_twist_3d_structs, cudaStream_t & stream) +{ + std::vector<TwistStruct3D> host_twist_3d_structs; + host_twist_3d_structs.reserve(twist_queue.size() + angular_velocity_queue.size()); + + Eigen::Matrix4f cum_transform = Eigen::Matrix4f::Identity(); + Eigen::Vector3f v = Eigen::Vector3f::Zero(); + Eigen::Vector3f w = Eigen::Vector3f::Zero(); + + // All time stamps from now on are in nsec from the "beginning of the pointcloud" + std::uint32_t last_stamp_nsec = first_point_rel_stamp_nsec; + + std::size_t twist_index = 0; + std::size_t angular_velocity_index = 0; + + for (; twist_index < twist_queue.size() || + angular_velocity_index < angular_velocity_queue.size();) { + std::uint64_t twist_stamp, input_twist_global_stamp_nsec, angular_velocity_global_stamp_nsec; + + if (twist_index < twist_queue.size()) { + input_twist_global_stamp_nsec = + 1'000'000'000 * static_cast<std::uint64_t>(twist_queue[twist_index].header.stamp.sec) + + static_cast<std::uint64_t>(twist_queue[twist_index].header.stamp.nanosec); + v.x() = twist_queue[twist_index].twist.twist.linear.x; + v.y() = twist_queue[twist_index].twist.twist.linear.y; + v.z() = twist_queue[twist_index].twist.twist.linear.z; + } else { + input_twist_global_stamp_nsec = std::numeric_limits<uint64_t>::max(); + v = Eigen::Vector3f::Zero(); + } + + if (angular_velocity_index < angular_velocity_queue.size()) { + angular_velocity_global_stamp_nsec = + 1'000'000'000 * static_cast<std::uint64_t>( + angular_velocity_queue[angular_velocity_index].header.stamp.sec) + + static_cast<std::uint64_t>( + angular_velocity_queue[angular_velocity_index].header.stamp.nanosec); + w.x() = angular_velocity_queue[angular_velocity_index].vector.x; + w.y() = angular_velocity_queue[angular_velocity_index].vector.y; + w.z() = angular_velocity_queue[angular_velocity_index].vector.z; + } else { + angular_velocity_global_stamp_nsec = std::numeric_limits<std::uint64_t>::max(); + w = Eigen::Vector3f::Zero(); + } + + if (input_twist_global_stamp_nsec < angular_velocity_global_stamp_nsec) { + twist_stamp = input_twist_global_stamp_nsec; + twist_index++; + } else if (input_twist_global_stamp_nsec > angular_velocity_global_stamp_nsec) { + twist_stamp = angular_velocity_global_stamp_nsec; + angular_velocity_index++; + } else { + twist_index++; + angular_velocity_index++; + } + + TwistStruct3D twist; + + Eigen::Map<Eigen::Matrix4f> cum_transform_buffer_map(twist.cum_transform_buffer); + Eigen::Map<Eigen::Vector3f> v_map(twist.v); + Eigen::Map<Eigen::Vector3f> w_map(twist.w); + cum_transform_buffer_map = cum_transform; + + std::uint64_t twist_global_stamp_nsec = twist_stamp; + assert(twist_global_stamp_nsec > pointcloud_stamp_nsec); // by construction + std::uint32_t twist_from_pointcloud_start_nsec = + twist_global_stamp_nsec - pointcloud_stamp_nsec; + + twist.stamp_nsec = twist_from_pointcloud_start_nsec; + v_map = v; + w_map = w; + twist.last_stamp_nsec = last_stamp_nsec; + host_twist_3d_structs.push_back(twist); + + double dt_seconds = 1e-9 * (twist.stamp_nsec - last_stamp_nsec); + last_stamp_nsec = twist.stamp_nsec; + + auto delta_transform = transformationMatrixFromVelocity(v, w, dt_seconds); + cum_transform = cum_transform * delta_transform; + } + + // Copy to device + device_twist_3d_structs.resize(host_twist_3d_structs.size()); + cudaMemcpyAsync( + thrust::raw_pointer_cast(device_twist_3d_structs.data()), host_twist_3d_structs.data(), + host_twist_3d_structs.size() * sizeof(TwistStruct3D), cudaMemcpyHostToDevice, stream); +} + +} // namespace autoware::cuda_pointcloud_preprocessor From 17799e5d0dac816cfb89cce5bbcd5701b618a172 Mon Sep 17 00:00:00 2001 From: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp> Date: Wed, 5 Feb 2025 18:19:32 +0900 Subject: [PATCH 30/55] chore: updated the copyright year Signed-off-by: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp> --- .../autoware/cuda_pointcloud_preprocessor/common_kernels.hpp | 2 +- .../cuda_pointcloud_preprocessor.hpp | 2 +- .../cuda_pointcloud_preprocessor_node.hpp | 2 +- .../include/autoware/cuda_pointcloud_preprocessor/memory.hpp | 2 +- .../autoware/cuda_pointcloud_preprocessor/organize_kernels.hpp | 2 +- .../autoware/cuda_pointcloud_preprocessor/outlier_kernels.hpp | 2 +- .../autoware/cuda_pointcloud_preprocessor/point_types.hpp | 2 +- .../include/autoware/cuda_pointcloud_preprocessor/types.hpp | 2 +- .../autoware/cuda_pointcloud_preprocessor/undistort_kernels.hpp | 2 +- .../src/cuda_pointcloud_preprocessor/common_kernels.cu | 2 +- .../cuda_pointcloud_preprocessor.cu | 2 +- .../cuda_pointcloud_preprocessor_node.cpp | 2 +- .../src/cuda_pointcloud_preprocessor/organize_kernels.cu | 2 +- .../src/cuda_pointcloud_preprocessor/outlier_kernels.cu | 2 +- .../src/cuda_pointcloud_preprocessor/undistort_kernels.cu | 2 +- 15 files changed, 15 insertions(+), 15 deletions(-) diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/common_kernels.hpp b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/common_kernels.hpp index 0e21e933569ee..d450a206e9d59 100644 --- a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/common_kernels.hpp +++ b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/common_kernels.hpp @@ -1,4 +1,4 @@ -// Copyright 2024 TIER IV, Inc. +// Copyright 2025 TIER IV, Inc. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.hpp b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.hpp index bf19949ddc795..b68c6c5e6162b 100644 --- a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.hpp +++ b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.hpp @@ -1,4 +1,4 @@ -// Copyright 2024 TIER IV, Inc. +// Copyright 2025 TIER IV, Inc. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.hpp b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.hpp index afc2f6cd74de0..22f248eb70704 100644 --- a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.hpp +++ b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.hpp @@ -1,4 +1,4 @@ -// Copyright 2024 TIER IV, Inc. +// Copyright 2025 TIER IV, Inc. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/memory.hpp b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/memory.hpp index 9482f170be550..9fb24b8a29c3f 100644 --- a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/memory.hpp +++ b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/memory.hpp @@ -1,4 +1,4 @@ -// Copyright 2024 TIER IV, Inc. +// Copyright 2025 TIER IV, Inc. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/organize_kernels.hpp b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/organize_kernels.hpp index 693a001045d3c..82b8a589cc230 100644 --- a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/organize_kernels.hpp +++ b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/organize_kernels.hpp @@ -1,4 +1,4 @@ -// Copyright 2024 TIER IV, Inc. +// Copyright 2025 TIER IV, Inc. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/outlier_kernels.hpp b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/outlier_kernels.hpp index 2010127afdeb5..28ec3770d7fb8 100644 --- a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/outlier_kernels.hpp +++ b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/outlier_kernels.hpp @@ -1,4 +1,4 @@ -// Copyright 2024 TIER IV, Inc. +// Copyright 2025 TIER IV, Inc. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/point_types.hpp b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/point_types.hpp index 50010ac0f2b67..4518c6cd2a908 100644 --- a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/point_types.hpp +++ b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/point_types.hpp @@ -1,5 +1,5 @@ -// Copyright 2024 TIER IV, Inc. +// Copyright 2025 TIER IV, Inc. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/types.hpp b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/types.hpp index 4cb50c8c28e38..aed91d2598193 100644 --- a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/types.hpp +++ b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/types.hpp @@ -1,4 +1,4 @@ -// Copyright 2024 TIER IV, Inc. +// Copyright 2025 TIER IV, Inc. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/undistort_kernels.hpp b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/undistort_kernels.hpp index 4a2f91af12953..29cd073b700ae 100644 --- a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/undistort_kernels.hpp +++ b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/undistort_kernels.hpp @@ -1,4 +1,4 @@ -// Copyright 2024 TIER IV, Inc. +// Copyright 2025 TIER IV, Inc. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/common_kernels.cu b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/common_kernels.cu index 64ed3ca8ab72f..27d1e05765f16 100644 --- a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/common_kernels.cu +++ b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/common_kernels.cu @@ -1,4 +1,4 @@ -// Copyright 2024 TIER IV, Inc. +// Copyright 2025 TIER IV, Inc. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu index aac9f560a78ca..97ccf028cfdfd 100644 --- a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu +++ b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu @@ -1,4 +1,4 @@ -// Copyright 2024 TIER IV, Inc. +// Copyright 2025 TIER IV, Inc. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.cpp b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.cpp index bf6cc1f79890c..2ff9626f4c1a8 100644 --- a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.cpp +++ b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.cpp @@ -1,4 +1,4 @@ -// Copyright 2024 TIER IV, Inc. +// Copyright 2025 TIER IV, Inc. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/organize_kernels.cu b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/organize_kernels.cu index 00ab2ddc1aec5..ea74709cb1b54 100644 --- a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/organize_kernels.cu +++ b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/organize_kernels.cu @@ -1,4 +1,4 @@ -// Copyright 2024 TIER IV, Inc. +// Copyright 2025 TIER IV, Inc. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/outlier_kernels.cu b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/outlier_kernels.cu index 18b116535a32c..8ce0634ed2a41 100644 --- a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/outlier_kernels.cu +++ b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/outlier_kernels.cu @@ -1,4 +1,4 @@ -// Copyright 2024 TIER IV, Inc. +// Copyright 2025 TIER IV, Inc. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/undistort_kernels.cu b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/undistort_kernels.cu index 6767b03aebf56..551ae43fd983b 100644 --- a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/undistort_kernels.cu +++ b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/undistort_kernels.cu @@ -1,4 +1,4 @@ -// Copyright 2024 TIER IV, Inc. +// Copyright 2025 TIER IV, Inc. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. From f10022444e70452b8dbced7990a5351d36d05d71 Mon Sep 17 00:00:00 2001 From: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp> Date: Wed, 5 Feb 2025 20:09:49 +0900 Subject: [PATCH 31/55] fix: fixed a bug in the undistortion kernel setup. validated it comparing it with the baseline Signed-off-by: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp> --- .../src/cuda_pointcloud_preprocessor/undistort_kernels.cu | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/undistort_kernels.cu b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/undistort_kernels.cu index 551ae43fd983b..e9763e990303a 100644 --- a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/undistort_kernels.cu +++ b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/undistort_kernels.cu @@ -170,11 +170,11 @@ void setupTwist2DStructs( std::size_t twist_index = 0; std::size_t angular_velocity_index = 0; + float v_x{0.f}, v_theta{0.f}; for (; twist_index < twist_queue.size() || angular_velocity_index < angular_velocity_queue.size();) { std::uint64_t twist_stamp, input_twist_global_stamp_nsec, angular_velocity_global_stamp_nsec; - float v_x, v_theta; if (twist_index < twist_queue.size()) { input_twist_global_stamp_nsec = @@ -183,7 +183,6 @@ void setupTwist2DStructs( v_x = twist_queue[twist_index].twist.twist.linear.x; } else { input_twist_global_stamp_nsec = std::numeric_limits<std::uint64_t>::max(); - v_x = 0.0; } if (angular_velocity_index < angular_velocity_queue.size()) { @@ -195,7 +194,6 @@ void setupTwist2DStructs( v_theta = angular_velocity_queue[angular_velocity_index].vector.z; } else { angular_velocity_global_stamp_nsec = std::numeric_limits<std::uint64_t>::max(); - v_theta = 0.0; } if (input_twist_global_stamp_nsec < angular_velocity_global_stamp_nsec) { From 6f0fe0c1f1270e1487bcc56f9ff87cdcfd15ac87 Mon Sep 17 00:00:00 2001 From: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp> Date: Thu, 13 Feb 2025 16:42:01 +0900 Subject: [PATCH 32/55] chore: removed unused packages Signed-off-by: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp> --- .../autoware_cuda_pointcloud_preprocessor/CMakeLists.txt | 9 ++++++--- .../autoware_cuda_pointcloud_preprocessor/package.xml | 6 +++--- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/CMakeLists.txt b/sensing/autoware_cuda_pointcloud_preprocessor/CMakeLists.txt index 4a4fc505862bb..e52192cd44b3b 100644 --- a/sensing/autoware_cuda_pointcloud_preprocessor/CMakeLists.txt +++ b/sensing/autoware_cuda_pointcloud_preprocessor/CMakeLists.txt @@ -3,7 +3,6 @@ project(autoware_cuda_pointcloud_preprocessor) find_package(ament_cmake_auto REQUIRED) find_package(CUDA) -find_package(PCL REQUIRED) if(NOT ${CUDA_FOUND}) message(WARNING "cuda was not found, so the autoware_cuda_pointcloud_preprocessor package will not be built.") @@ -43,20 +42,24 @@ cuda_add_library(cuda_pointcloud_preprocessor_lib SHARED ) target_link_libraries(cuda_pointcloud_preprocessor_lib - ${PCL_LIBRARIES} + ${autoware_pointcloud_preprocessor_TARGETS} ) target_include_directories(cuda_pointcloud_preprocessor_lib SYSTEM PRIVATE + ${autoware_pointcloud_preprocessor_INCLUDE_DIRS} ${autoware_point_types_INCLUDE_DIRS} ${cuda_blackboard_INCLUDE_DIRS} + ${diagnostic_msgs_INCLUDE_DIRS} ${geometry_msgs_INCLUDE_DIRS} - ${nebula_common_INCLUDE_DIRS} ${rclcpp_INCLUDE_DIRS} + ${rclcpp_components_INCLUDE_DIRS} ${rcl_interfaces_INCLUDE_DIRS} ${sensor_msgs_INCLUDE_DIRS} ${tf2_INCLUDE_DIRS} + ${tf2_msgs_INCLUDE_DIRS} ) + # Targets ament_auto_add_library(cuda_pointcloud_preprocessor SHARED src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.cpp diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/package.xml b/sensing/autoware_cuda_pointcloud_preprocessor/package.xml index edbd7ff61c874..fecb38c2f48f4 100644 --- a/sensing/autoware_cuda_pointcloud_preprocessor/package.xml +++ b/sensing/autoware_cuda_pointcloud_preprocessor/package.xml @@ -11,15 +11,15 @@ <buildtool_depend>autoware_cmake</buildtool_depend> <depend>autoware_point_types</depend> + <depend>autoware_pointcloud_preprocessor</depend> <depend>autoware_universe_utils</depend> <depend>cuda_blackboard</depend> - <depend>pcl_conversions</depend> - <depend>pcl_ros</depend> + <depend>diagnostic_msgs</depend> <depend>rclcpp</depend> <depend>rclcpp_components</depend> <depend>sensor_msgs</depend> - <depend>sophus</depend> <depend>tf2</depend> + <depend>tf2_msgs</depend> <depend>tier4_debug_msgs</depend> <test_depend>ament_clang_format</test_depend> From 2330cd6f2d8165eb6ec937e49b7b412d67750226 Mon Sep 17 00:00:00 2001 From: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp> Date: Thu, 13 Feb 2025 16:56:33 +0900 Subject: [PATCH 33/55] chore: removed mentions of the removed adapter Signed-off-by: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp> --- .../README.md | 7 ++-- .../docs/cuda-organized-pointcloud-adapter.md | 41 ------------------- .../docs/cuda-pointcloud-preprocessor.md | 1 - 3 files changed, 3 insertions(+), 46 deletions(-) delete mode 100644 sensing/autoware_cuda_pointcloud_preprocessor/docs/cuda-organized-pointcloud-adapter.md diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/README.md b/sensing/autoware_cuda_pointcloud_preprocessor/README.md index 3c87883971e7e..0fa5ab7e7c65a 100644 --- a/sensing/autoware_cuda_pointcloud_preprocessor/README.md +++ b/sensing/autoware_cuda_pointcloud_preprocessor/README.md @@ -10,10 +10,9 @@ To alleviate this issue, this package reimplements most of the pipeline presente A detailed description of each filter's algorithm is available in the following links. -| Filter Name | Description | Detail | -| --------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------- | -| cuda_organized_pointcloud_adapter | Organizes a pointcloud per ring/channel, so that the memory layout allows parallel processing in cuda | [link](docs/cuda-organized-pointcloud-adapter.md) | -| cuda_pointcloud_preprocessor | Implements the cropping, distortion correction, and outlier filtering (ring-based) of the `autoware_pointcloud_preprocessor`'s CPU versions. | [link](docs/cuda-pointcloud-preprocessor.md) | +| Filter Name | Description | Detail | +| ---------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------- | -------------------------------------------- | +| cuda_pointcloud_preprocessor | Implements the cropping, distortion correction, and outlier filtering (ring-based) of the `autoware_pointcloud_preprocessor`'s CPU versions. | [link](docs/cuda-pointcloud-preprocessor.md) | ## (Optional) Future extensions / Unimplemented parts diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/docs/cuda-organized-pointcloud-adapter.md b/sensing/autoware_cuda_pointcloud_preprocessor/docs/cuda-organized-pointcloud-adapter.md deleted file mode 100644 index 634cdfb3e14f0..0000000000000 --- a/sensing/autoware_cuda_pointcloud_preprocessor/docs/cuda-organized-pointcloud-adapter.md +++ /dev/null @@ -1,41 +0,0 @@ -# cuda_organized_pointcloud_adapter - -## Purpose - -The node `cuda_pointcloud_preprocessor` expects a 2D pointcloud where every row represents a single channel/ring and each row's points are in non decreasing azimuth order. - -To utilize the previously mentioned node, this node provides an adapter to convert standard flat pointclouds (`height` equals to 1) to the required 2D tensor. - -In addition, this node uses the `cuda_blackboard`, a cuda transport layer that enables a zero-copy mechanism between GPU and GPU memory for both input and output. - -## Inner-workings / Algorithms - -To create the required 2D tensor, this node iterates the input pointcloud sequentially, filling the output 2D tensor depending on the input point's channel. - -The output tensor's size is also estimated in this node, based on the largest `channel` value and the maximum number of points per channel observed so far. - -## Inputs / Outputs - -### Input - -| Name | Type | Description | -| -------------------- | ------------------------------- | ------------------------- | -| `~/input/pointcloud` | `sensor_msgs::msg::PointCloud2` | Input pointcloud's topic. | - -### Output - -| Name | Type | Description | -| -------------------------- | ------------------------------------------------ | ---------------------------------------- | -| `~/output/pointcloud` | `sensor_msgs::msg::PointCloud2` | Processed pointcloud's topic | -| `~/output/pointcloud/cuda` | `negotiated_interfaces/msg/NegotiatedTopicsInfo` | Processed pointcloud's negotiation topic | - -## Parameters - -### Core Parameters - -{{ json_to_markdown("sensing/autoware_cuda_pointcloud_preprocessor/schema/cuda_pointcloud_preprocessor.schema.schema.json") }} - -## Assumptions / Known limits - -- This algorithm assumes that the input points will be in non-decreasing azimuth order (per ring). -- This node expects that the input pointcloud is flat (`height` equals to 1) and follows the `autoware::point_types::PointXYZIRC` layout defined in the `autoware_point_types` package. diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/docs/cuda-pointcloud-preprocessor.md b/sensing/autoware_cuda_pointcloud_preprocessor/docs/cuda-pointcloud-preprocessor.md index 4a7e2d070ccb6..2ba24e6a8fb52 100644 --- a/sensing/autoware_cuda_pointcloud_preprocessor/docs/cuda-pointcloud-preprocessor.md +++ b/sensing/autoware_cuda_pointcloud_preprocessor/docs/cuda-pointcloud-preprocessor.md @@ -43,4 +43,3 @@ In addition to the individual algorithms previously mentioned, this node uses th - The CUDA implementations, while following the original CPU ones, will not offer the same numerical results, and small approximations were needed to maximize GPU usage. - This node expects that the input pointcloud follows the `autoware::point_types::PointXYZIRCAEDT` layout and the output pointcloud will use the `autoware::point_types::PointXYZIRC` layout defined in the `autoware_point_types` package. -- The input pointcloud must be a 2D tensor where each row represents a different channel/ring with its points in non-decreasing azimuth order. Invalid points should contain 0-values. The `cuda_organized_pointcloud_adapter` provides such a pointcloud. From 48b39cd8ec84df413c8ff51344cc254ad32f17a2 Mon Sep 17 00:00:00 2001 From: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp> Date: Fri, 14 Feb 2025 09:21:22 +0900 Subject: [PATCH 34/55] chore: fixed missing autoware prefix Signed-off-by: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp> --- .../launch/cuda_pointcloud_preprocessor.launch.xml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/launch/cuda_pointcloud_preprocessor.launch.xml b/sensing/autoware_cuda_pointcloud_preprocessor/launch/cuda_pointcloud_preprocessor.launch.xml index e8843bd804bf0..a1da651aa76e2 100644 --- a/sensing/autoware_cuda_pointcloud_preprocessor/launch/cuda_pointcloud_preprocessor.launch.xml +++ b/sensing/autoware_cuda_pointcloud_preprocessor/launch/cuda_pointcloud_preprocessor.launch.xml @@ -4,9 +4,9 @@ <arg name="input/twist" default="/sensing/vehicle_velocity_converter/twist_with_covariance"/> <arg name="output/pointcloud" default="/sensing/lidar/top/test"/> - <arg name="cuda_pointcloud_preprocessor_param_file" default="$(find-pkg-share cuda_pointcloud_preprocessor)/config/cuda_pointcloud_preprocessor.param.yaml"/> + <arg name="cuda_pointcloud_preprocessor_param_file" default="$(find-pkg-share autoware_cuda_pointcloud_preprocessor)/config/cuda_pointcloud_preprocessor.param.yaml"/> - <node pkg="cuda_pointcloud_preprocessor" exec="cuda_pointcloud_preprocessor_node" name="cuda_pointcloud_preprocessor" output="screen"> + <node pkg="autoware_cuda_pointcloud_preprocessor" exec="cuda_pointcloud_preprocessor_node" name="cuda_pointcloud_preprocessor" output="screen"> <remap from="~/input/pointcloud" to="$(var input/pointcloud)"/> <remap from="~/input/imu" to="$(var input/imu)"/> <remap from="~/input/twist" to="$(var input/twist)"/> From fa36260cf79670d2bc222261597ea19cf59247e6 Mon Sep 17 00:00:00 2001 From: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp> Date: Fri, 14 Feb 2025 09:25:16 +0900 Subject: [PATCH 35/55] fix: missing assignment in else branch Signed-off-by: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp> --- .../src/cuda_pointcloud_preprocessor/undistort_kernels.cu | 1 + 1 file changed, 1 insertion(+) diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/undistort_kernels.cu b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/undistort_kernels.cu index e9763e990303a..4dce3345b4807 100644 --- a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/undistort_kernels.cu +++ b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/undistort_kernels.cu @@ -203,6 +203,7 @@ void setupTwist2DStructs( twist_stamp = angular_velocity_global_stamp_nsec; angular_velocity_index++; } else { + twist_stamp = input_twist_global_stamp_nsec; twist_index++; angular_velocity_index++; } From 2b9f654a15aa20dec801b8ab258b6496a2ca6015 Mon Sep 17 00:00:00 2001 From: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp> Date: Fri, 14 Feb 2025 09:29:42 +0900 Subject: [PATCH 36/55] chore: added cuda/nvcc debug flags on debug builds Signed-off-by: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp> --- sensing/autoware_cuda_pointcloud_preprocessor/CMakeLists.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/CMakeLists.txt b/sensing/autoware_cuda_pointcloud_preprocessor/CMakeLists.txt index e52192cd44b3b..0e174ee18bbd2 100644 --- a/sensing/autoware_cuda_pointcloud_preprocessor/CMakeLists.txt +++ b/sensing/autoware_cuda_pointcloud_preprocessor/CMakeLists.txt @@ -7,6 +7,9 @@ find_package(CUDA) if(NOT ${CUDA_FOUND}) message(WARNING "cuda was not found, so the autoware_cuda_pointcloud_preprocessor package will not be built.") return() +elseif(CMAKE_BUILD_TYPE STREQUAL "Debug") + set(CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} "-g -G") + set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-g -G") endif() ament_auto_find_build_dependencies() From 186fff9d1dac57adf966dc264f964e96a118126c Mon Sep 17 00:00:00 2001 From: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp> Date: Fri, 14 Feb 2025 10:03:12 +0900 Subject: [PATCH 37/55] chore: refactored parameters for the undistortion settings Signed-off-by: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp> --- .../cuda_pointcloud_preprocessor.hpp | 6 ++++-- .../cuda_pointcloud_preprocessor.cu | 20 +++++++++++++------ .../cuda_pointcloud_preprocessor_node.cpp | 6 +++++- 3 files changed, 23 insertions(+), 9 deletions(-) diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.hpp b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.hpp index b68c6c5e6162b..d14fc2ec1eb68 100644 --- a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.hpp +++ b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.hpp @@ -38,12 +38,14 @@ namespace autoware::cuda_pointcloud_preprocessor class CudaPointcloudPreprocessor { public: + enum class UndistortionType { Invalid, Undistortion2D, Undistortion3D }; + CudaPointcloudPreprocessor(); ~CudaPointcloudPreprocessor() = default; void setCropBoxParameters(const std::vector<CropBoxParameters> & crop_box_parameters); void setRingOutlierFilterParameters(const RingOutlierFilterParameters & ring_outlier_parameters); - void set3DUndistortion(bool value); + void setUndistortionType(const UndistortionType & undistortion_type); void preallocateOutput(); @@ -60,7 +62,7 @@ class CudaPointcloudPreprocessor CropBoxParameters self_crop_box_parameters_{}; CropBoxParameters mirror_crop_box_parameters_{}; RingOutlierFilterParameters ring_outlier_parameters_{}; - bool use_3d_undistortion_{false}; + UndistortionType undistortion_type_{UndistortionType::Invalid}; int num_rings_{}; int max_points_per_ring_{}; diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu index 97ccf028cfdfd..f9a070ee40173 100644 --- a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu +++ b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu @@ -182,9 +182,13 @@ void CudaPointcloudPreprocessor::setRingOutlierFilterParameters( ring_outlier_parameters_ = ring_outlier_parameters; } -void CudaPointcloudPreprocessor::set3DUndistortion(bool use_3d_undistortion) +void CudaPointcloudPreprocessor::setUndistortionType(const UndistortionType & undistortion_type) { - use_3d_undistortion_ = use_3d_undistortion; + if (undistortion_type == UndistortionType::Invalid) { + throw std::runtime_error("Invalid undistortion type"); + } + + undistortion_type_ = undistortion_type; } void CudaPointcloudPreprocessor::preallocateOutput() @@ -351,14 +355,16 @@ std::unique_ptr<cuda_blackboard::CudaPointCloud2> CudaPointcloudPreprocessor::pr std::uint64_t pointcloud_stamp_nsec = 1'000'000'000 * input_pointcloud_msg_ptr->header.stamp.sec + input_pointcloud_msg_ptr->header.stamp.nanosec; - if (use_3d_undistortion_) { + if (undistortion_type_ == UndistortionType::Undistortion3D) { setupTwist3DStructs( twist_queue, angular_velocity_queue, pointcloud_stamp_nsec, first_point_rel_stamp_nsec, device_twist_3d_structs_, stream_); - } else { + } else if (undistortion_type_ == UndistortionType::Undistortion2D) { setupTwist2DStructs( twist_queue, angular_velocity_queue, pointcloud_stamp_nsec, first_point_rel_stamp_nsec, device_twist_2d_structs_, stream_); + } else { + throw std::runtime_error("Invalid undistortion type"); } // Obtain raw pointers for the kernels @@ -389,11 +395,13 @@ std::unique_ptr<cuda_blackboard::CudaPointCloud2> CudaPointcloudPreprocessor::pr thrust::fill(thrust::device, device_crop_mask, device_crop_mask + num_organized_points_, 1); } - if (use_3d_undistortion_ && device_twist_3d_structs_.size() > 0) { + if ( + undistortion_type_ == UndistortionType::Undistortion3D && device_twist_3d_structs_.size() > 0) { undistort3dLaunch( device_transformed_points, num_organized_points_, device_twist_3d_structs, device_twist_3d_structs_.size(), threads_per_block_, blocks_per_grid, stream_); - } else if (!use_3d_undistortion_ && device_twist_2d_structs_.size() > 0) { + } else if ( + undistortion_type_ == UndistortionType::Undistortion2D && device_twist_2d_structs_.size() > 0) { undistort2dLaunch( device_transformed_points, num_organized_points_, device_twist_2d_structs, device_twist_2d_structs_.size(), threads_per_block_, blocks_per_grid, stream_); diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.cpp b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.cpp index 2ff9626f4c1a8..6c62d9019919f 100644 --- a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.cpp +++ b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.cpp @@ -112,10 +112,14 @@ CudaPointcloudPreprocessorNode::CudaPointcloudPreprocessorNode( sub_options); } + CudaPointcloudPreprocessor::UndistortionType undistortion_type = + use_3d_undistortion ? CudaPointcloudPreprocessor::UndistortionType::Undistortion3D + : CudaPointcloudPreprocessor::UndistortionType::Undistortion2D; + cuda_pointcloud_preprocessor_ = std::make_unique<CudaPointcloudPreprocessor>(); cuda_pointcloud_preprocessor_->setRingOutlierFilterParameters(ring_outlier_filter_parameters); cuda_pointcloud_preprocessor_->setCropBoxParameters(crop_box_parameters); - cuda_pointcloud_preprocessor_->set3DUndistortion(use_3d_undistortion); + cuda_pointcloud_preprocessor_->setUndistortionType(undistortion_type); // initialize debug tool { From d2505ee9d7ccda6bc014747b7b7d1ecc2bfeec8b Mon Sep 17 00:00:00 2001 From: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp> Date: Fri, 14 Feb 2025 10:03:42 +0900 Subject: [PATCH 38/55] chore: removed unused headers Signed-off-by: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp> --- .../cuda_pointcloud_preprocessor_node.hpp | 1 - .../autoware/cuda_pointcloud_preprocessor/organize_kernels.hpp | 1 - .../autoware/cuda_pointcloud_preprocessor/outlier_kernels.hpp | 1 - 3 files changed, 3 deletions(-) diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.hpp b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.hpp index 22f248eb70704..f8b77217319a6 100644 --- a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.hpp +++ b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.hpp @@ -38,7 +38,6 @@ #include <deque> #include <memory> -#include <vector> #define CHECK_OFFSET(structure1, structure2, field) \ static_assert( \ diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/organize_kernels.hpp b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/organize_kernels.hpp index 82b8a589cc230..0fe52bfe76409 100644 --- a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/organize_kernels.hpp +++ b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/organize_kernels.hpp @@ -16,7 +16,6 @@ #define AUTOWARE__CUDA_POINTCLOUD_PREPROCESSOR__ORGANIZE_KERNELS_HPP_ #include "autoware/cuda_pointcloud_preprocessor/point_types.hpp" -#include "autoware/cuda_pointcloud_preprocessor/types.hpp" #include <cuda_runtime.h> diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/outlier_kernels.hpp b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/outlier_kernels.hpp index 28ec3770d7fb8..23d46458edce3 100644 --- a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/outlier_kernels.hpp +++ b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/outlier_kernels.hpp @@ -16,7 +16,6 @@ #define AUTOWARE__CUDA_POINTCLOUD_PREPROCESSOR__OUTLIER_KERNELS_HPP_ #include "autoware/cuda_pointcloud_preprocessor/point_types.hpp" -#include "autoware/cuda_pointcloud_preprocessor/types.hpp" #include <cuda_runtime.h> From bc4d4702761618210ff2c8b6d80c5b23fd7283dc Mon Sep 17 00:00:00 2001 From: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp> Date: Thu, 6 Mar 2025 15:55:52 +0900 Subject: [PATCH 39/55] chore: changed default crop box to no filtering at all Signed-off-by: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp> --- .../config/cuda_pointcloud_preprocessor.param.yaml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/config/cuda_pointcloud_preprocessor.param.yaml b/sensing/autoware_cuda_pointcloud_preprocessor/config/cuda_pointcloud_preprocessor.param.yaml index 66ce309caa127..fdea4bebae796 100644 --- a/sensing/autoware_cuda_pointcloud_preprocessor/config/cuda_pointcloud_preprocessor.param.yaml +++ b/sensing/autoware_cuda_pointcloud_preprocessor/config/cuda_pointcloud_preprocessor.param.yaml @@ -8,9 +8,9 @@ object_length_threshold: 0.1 num_points_threshold: 4 - crop_box.min_x: [1.0] - crop_box.min_y: [1.0] - crop_box.min_z: [1.0] - crop_box.max_x: [-1.0] - crop_box.max_y: [-1.0] - crop_box.max_z: [-1.0] + crop_box.min_x: [] + crop_box.min_y: [] + crop_box.min_z: [] + crop_box.max_x: [] + crop_box.max_y: [] + crop_box.max_z: [] From 68d1e421ec79074c289c3b30e6e68c14342dfddb Mon Sep 17 00:00:00 2001 From: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp> Date: Thu, 6 Mar 2025 16:01:41 +0900 Subject: [PATCH 40/55] feat: added missing restrict keyword Signed-off-by: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp> --- .../src/cuda_pointcloud_preprocessor/common_kernels.cu | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/common_kernels.cu b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/common_kernels.cu index 27d1e05765f16..57792e175ddfe 100644 --- a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/common_kernels.cu +++ b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/common_kernels.cu @@ -75,8 +75,9 @@ __global__ void combineMasksKernel( } __global__ void extractPointsKernel( - InputPointType * input_points, std::uint32_t * masks, std::uint32_t * indices, int num_points, - OutputPointType * output_points) + InputPointType * __restrict__ input_points, std::uint32_t * __restrict__ masks, + std::uint32_t * __restrict__ indices, int num_points, + OutputPointType * __restrict__ output_points) { int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < num_points && masks[idx] == 1) { From 5dba7b307c9140fc44498604733464ebf2e3d9ee Mon Sep 17 00:00:00 2001 From: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp> Date: Thu, 6 Mar 2025 16:03:18 +0900 Subject: [PATCH 41/55] chore: spells Signed-off-by: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp> --- .../schema/cuda_pointcloud_preprocessor.schema.json | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/schema/cuda_pointcloud_preprocessor.schema.json b/sensing/autoware_cuda_pointcloud_preprocessor/schema/cuda_pointcloud_preprocessor.schema.json index b8e597b9afdb2..bbd539a26b8ce 100644 --- a/sensing/autoware_cuda_pointcloud_preprocessor/schema/cuda_pointcloud_preprocessor.schema.json +++ b/sensing/autoware_cuda_pointcloud_preprocessor/schema/cuda_pointcloud_preprocessor.schema.json @@ -41,32 +41,32 @@ }, "crop_box.min_x": { "type": "array", - "description": "An array in which each elemt is the minimum x value in a crop box", + "description": "An array in which each element is the minimum x value in a crop box", "default": [1.0] }, "crop_box.min_y": { "type": "array", - "description": "An array in which each elemt is the minimum y value in a crop box", + "description": "An array in which each element is the minimum y value in a crop box", "default": [1.0] }, "crop_box.min_z": { "type": "array", - "description": "An array in which each elemt is the minimum z value in a crop box", + "description": "An array in which each element is the minimum z value in a crop box", "default": [1.0] }, "crop_box.max_x": { "type": "array", - "description": "An array in which each elemt is the maximum x value in a crop box", + "description": "An array in which each element is the maximum x value in a crop box", "default": [-1.0] }, "crop_box.max_y": { "type": "array", - "description": "An array in which each elemt is the maximum y value in a crop box", + "description": "An array in which each element is the maximum y value in a crop box", "default": [-1.0] }, "crop_box.max_z": { "type": "array", - "description": "An array in which each elemt is the maximum z value in a crop box", + "description": "An array in which each element is the maximum z value in a crop box", "default": [-1.0] } }, From 92b04a37f50d04aaa80161f3946836ec1ea55ace Mon Sep 17 00:00:00 2001 From: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp> Date: Thu, 6 Mar 2025 16:06:31 +0900 Subject: [PATCH 42/55] chore: removed default destructor Signed-off-by: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp> --- .../cuda_pointcloud_preprocessor_node.hpp | 1 - 1 file changed, 1 deletion(-) diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.hpp b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.hpp index f8b77217319a6..152398bd8e0d4 100644 --- a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.hpp +++ b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.hpp @@ -72,7 +72,6 @@ class CudaPointcloudPreprocessorNode : public rclcpp::Node { public: explicit CudaPointcloudPreprocessorNode(const rclcpp::NodeOptions & node_options); - ~CudaPointcloudPreprocessorNode() = default; private: bool getTransform( From 17af8d871a1e8270713209c9bd92874726611102 Mon Sep 17 00:00:00 2001 From: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp> Date: Thu, 6 Mar 2025 16:16:25 +0900 Subject: [PATCH 43/55] chore: ocd activated (spelling) Signed-off-by: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp> --- .../undistort_kernels.hpp | 4 ++-- .../cuda_pointcloud_preprocessor.cu | 4 ++-- .../undistort_kernels.cu | 12 ++++++------ 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/undistort_kernels.hpp b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/undistort_kernels.hpp index 29cd073b700ae..8b791f171285c 100644 --- a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/undistort_kernels.hpp +++ b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/undistort_kernels.hpp @@ -28,11 +28,11 @@ namespace autoware::cuda_pointcloud_preprocessor { -void undistort2dLaunch( +void undistort2DLaunch( InputPointType * input_points, int num_points, TwistStruct2D * twist_structs, int num_twists, int threads_per_block, int blocks_per_grid, cudaStream_t & stream); -void undistort3dLaunch( +void undistort3DLaunch( InputPointType * input_points, int num_points, TwistStruct3D * twist_structs, int num_twists, int threads_per_block, int blocks_per_grid, cudaStream_t & stream); diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu index f9a070ee40173..aa9f19b8824f6 100644 --- a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu +++ b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.cu @@ -397,12 +397,12 @@ std::unique_ptr<cuda_blackboard::CudaPointCloud2> CudaPointcloudPreprocessor::pr if ( undistortion_type_ == UndistortionType::Undistortion3D && device_twist_3d_structs_.size() > 0) { - undistort3dLaunch( + undistort3DLaunch( device_transformed_points, num_organized_points_, device_twist_3d_structs, device_twist_3d_structs_.size(), threads_per_block_, blocks_per_grid, stream_); } else if ( undistortion_type_ == UndistortionType::Undistortion2D && device_twist_2d_structs_.size() > 0) { - undistort2dLaunch( + undistort2DLaunch( device_transformed_points, num_organized_points_, device_twist_2d_structs, device_twist_2d_structs_.size(), threads_per_block_, blocks_per_grid, stream_); } diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/undistort_kernels.cu b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/undistort_kernels.cu index 4dce3345b4807..8457cad79590d 100644 --- a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/undistort_kernels.cu +++ b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/undistort_kernels.cu @@ -64,7 +64,7 @@ __host__ __device__ Eigen::Matrix4f transformationMatrixFromVelocity( return transformation; } -__global__ void undistort2dKernel( +__global__ void undistort2DKernel( InputPointType * input_points, int num_points, TwistStruct2D * twist_structs, int num_twists) { int idx = blockIdx.x * blockDim.x + threadIdx.x; @@ -101,7 +101,7 @@ __global__ void undistort2dKernel( } } -__global__ void undistort3dKernel( +__global__ void undistort3DKernel( InputPointType * input_points, int num_points, TwistStruct3D * twist_structs, int num_twists) { int idx = blockIdx.x * blockDim.x + threadIdx.x; @@ -136,19 +136,19 @@ __global__ void undistort3dKernel( } } -void undistort2dLaunch( +void undistort2DLaunch( InputPointType * input_points, int num_points, TwistStruct2D * twist_structs, int num_twists, int threads_per_block, int blocks_per_grid, cudaStream_t & stream) { - undistort2dKernel<<<blocks_per_grid, threads_per_block, 0, stream>>>( + undistort2DKernel<<<blocks_per_grid, threads_per_block, 0, stream>>>( input_points, num_points, twist_structs, num_twists); } -void undistort3dLaunch( +void undistort3DLaunch( InputPointType * input_points, int num_points, TwistStruct3D * twist_structs, int num_twists, int threads_per_block, int blocks_per_grid, cudaStream_t & stream) { - undistort3dKernel<<<blocks_per_grid, threads_per_block, 0, stream>>>( + undistort3DKernel<<<blocks_per_grid, threads_per_block, 0, stream>>>( input_points, num_points, twist_structs, num_twists); } From 652d16830a5f0ac366fb685784c434959f6fde7a Mon Sep 17 00:00:00 2001 From: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp> Date: Thu, 6 Mar 2025 16:49:07 +0900 Subject: [PATCH 44/55] chore: fixed the schema Signed-off-by: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp> --- .../cuda_pointcloud_preprocessor.schema.json | 30 ++++++++----------- 1 file changed, 12 insertions(+), 18 deletions(-) diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/schema/cuda_pointcloud_preprocessor.schema.json b/sensing/autoware_cuda_pointcloud_preprocessor/schema/cuda_pointcloud_preprocessor.schema.json index bbd539a26b8ce..9d61b4932ad8a 100644 --- a/sensing/autoware_cuda_pointcloud_preprocessor/schema/cuda_pointcloud_preprocessor.schema.json +++ b/sensing/autoware_cuda_pointcloud_preprocessor/schema/cuda_pointcloud_preprocessor.schema.json @@ -42,32 +42,32 @@ "crop_box.min_x": { "type": "array", "description": "An array in which each element is the minimum x value in a crop box", - "default": [1.0] + "default": [] }, "crop_box.min_y": { "type": "array", "description": "An array in which each element is the minimum y value in a crop box", - "default": [1.0] + "default": [] }, "crop_box.min_z": { "type": "array", "description": "An array in which each element is the minimum z value in a crop box", - "default": [1.0] + "default": [] }, "crop_box.max_x": { "type": "array", "description": "An array in which each element is the maximum x value in a crop box", - "default": [-1.0] + "default": [] }, "crop_box.max_y": { "type": "array", "description": "An array in which each element is the maximum y value in a crop box", - "default": [-1.0] + "default": [] }, "crop_box.max_z": { "type": "array", "description": "An array in which each element is the maximum z value in a crop box", - "default": [-1.0] + "default": [] } }, "required": [ @@ -77,18 +77,12 @@ "distance_ratio", "object_length_threshold", "num_points_threshold", - "self_crop.min_x", - "self_crop.min_y", - "self_crop.min_z", - "self_crop.max_x", - "self_crop.max_y", - "self_crop.max_z", - "mirror_crop.min_x", - "mirror_crop.min_y", - "mirror_crop.min_z", - "mirror_crop.max_x", - "mirror_crop.max_y", - "mirror_crop.max_z" + "crop_box.min_x", + "crop_box.min_y", + "crop_box.min_z", + "crop_box.max_x", + "crop_box.max_y", + "crop_box.max_z" ], "additionalProperties": false } From 2449e67b6685056d7c972a244dd8b0cc5a85b075 Mon Sep 17 00:00:00 2001 From: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp> Date: Thu, 6 Mar 2025 17:18:45 +0900 Subject: [PATCH 45/55] chore: improved readibility Signed-off-by: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp> --- .../cuda_pointcloud_preprocessor_node.cpp | 44 +++++++++++-------- 1 file changed, 25 insertions(+), 19 deletions(-) diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.cpp b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.cpp index 6c62d9019919f..98453cb75c8ae 100644 --- a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.cpp +++ b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.cpp @@ -165,47 +165,53 @@ void CudaPointcloudPreprocessorNode::twistCallback( while (!twist_queue_.empty()) { // for replay rosbag - if ( - rclcpp::Time(twist_queue_.front().header.stamp) > rclcpp::Time(twist_msg_ptr->header.stamp)) { - twist_queue_.pop_front(); - } else if ( // NOLINT + bool backwards_time_jump_detected = + rclcpp::Time(twist_queue_.front().header.stamp) > rclcpp::Time(twist_msg_ptr->header.stamp); + bool is_queue_longer_than_1s = rclcpp::Time(twist_queue_.front().header.stamp) < - rclcpp::Time(twist_msg_ptr->header.stamp) - rclcpp::Duration::from_seconds(1.0)) { + rclcpp::Time(twist_msg_ptr->header.stamp) - rclcpp::Duration::from_seconds(1.0); + + if (backwards_time_jump_detected) { + twist_queue_.clear(); + } else if (is_queue_longer_than_1s) { twist_queue_.pop_front(); + } else { + break; } - break; } } void CudaPointcloudPreprocessorNode::imuCallback( const sensor_msgs::msg::Imu::ConstSharedPtr imu_msg) { - tf2::Transform tf2_imu_link_to_base_link{}; - getTransform(base_frame_, imu_msg->header.frame_id, &tf2_imu_link_to_base_link); - geometry_msgs::msg::TransformStamped::SharedPtr tf_base2imu_ptr = - std::make_shared<geometry_msgs::msg::TransformStamped>(); - tf_base2imu_ptr->transform.rotation = tf2::toMsg(tf2_imu_link_to_base_link.getRotation()); + tf2::Transform imu_to_base_tf2{}; + getTransform(base_frame_, imu_msg->header.frame_id, &imu_to_base_tf2); + geometry_msgs::msg::TransformStamped imu_to_base_msg; + imu_to_base_msg.transform.rotation = tf2::toMsg(imu_to_base_tf2.getRotation()); geometry_msgs::msg::Vector3Stamped angular_velocity; angular_velocity.vector = imu_msg->angular_velocity; geometry_msgs::msg::Vector3Stamped transformed_angular_velocity; - tf2::doTransform(angular_velocity, transformed_angular_velocity, *tf_base2imu_ptr); + tf2::doTransform(angular_velocity, transformed_angular_velocity, imu_to_base_msg); transformed_angular_velocity.header = imu_msg->header; angular_velocity_queue_.push_back(transformed_angular_velocity); while (!angular_velocity_queue_.empty()) { // for rosbag replay - if ( - rclcpp::Time(angular_velocity_queue_.front().header.stamp) > - rclcpp::Time(imu_msg->header.stamp)) { - angular_velocity_queue_.pop_front(); - } else if ( // NOLINT + bool backwards_time_jump_detected = rclcpp::Time(angular_velocity_queue_.front().header.stamp) > + rclcpp::Time(imu_msg->header.stamp); + bool is_queue_longer_than_1s = rclcpp::Time(angular_velocity_queue_.front().header.stamp) < - rclcpp::Time(imu_msg->header.stamp) - rclcpp::Duration::from_seconds(1.0)) { + rclcpp::Time(imu_msg->header.stamp) - rclcpp::Duration::from_seconds(1.0); + + if (backwards_time_jump_detected) { + angular_velocity_queue_.clear(); + } else if (is_queue_longer_than_1s) { angular_velocity_queue_.pop_front(); + } else { + break; } - break; } } From 9ededc51329c541ec512a7b1fd5a471d300d3f57 Mon Sep 17 00:00:00 2001 From: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp> Date: Mon, 17 Mar 2025 15:07:55 +0900 Subject: [PATCH 46/55] chore: added dummy crop box Signed-off-by: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp> --- .../config/cuda_pointcloud_preprocessor.param.yaml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/config/cuda_pointcloud_preprocessor.param.yaml b/sensing/autoware_cuda_pointcloud_preprocessor/config/cuda_pointcloud_preprocessor.param.yaml index fdea4bebae796..5a3d138484d0c 100644 --- a/sensing/autoware_cuda_pointcloud_preprocessor/config/cuda_pointcloud_preprocessor.param.yaml +++ b/sensing/autoware_cuda_pointcloud_preprocessor/config/cuda_pointcloud_preprocessor.param.yaml @@ -8,9 +8,9 @@ object_length_threshold: 0.1 num_points_threshold: 4 - crop_box.min_x: [] - crop_box.min_y: [] - crop_box.min_z: [] - crop_box.max_x: [] - crop_box.max_y: [] - crop_box.max_z: [] + crop_box.min_x: [0.0] + crop_box.min_y: [0.0] + crop_box.min_z: [0.0] + crop_box.max_x: [0.0] + crop_box.max_y: [0.0] + crop_box.max_z: [0.0] From 5b5556736e2a32af8d72f42e1fe37ce7f4618f77 Mon Sep 17 00:00:00 2001 From: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp> Date: Tue, 18 Mar 2025 15:59:49 +0900 Subject: [PATCH 47/55] chore: added new repositories to ansible Signed-off-by: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp> --- build_depends_humble.repos | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/build_depends_humble.repos b/build_depends_humble.repos index 0f35bd3d0e357..7d4fa266293d0 100644 --- a/build_depends_humble.repos +++ b/build_depends_humble.repos @@ -46,3 +46,11 @@ repositories: type: git url: https://github.com/tier4/glog.git version: v0.6.0_t4-ros + universe/external/cuda_blackboard: + type: git + url: https://github.com/autowarefoundation/cuda_blackboard.git + version: v0.1.1 + universe/external/negotiated: + type: git + url: https://github.com/osrf/negotiated.git + version: eac198b55dcd052af5988f0f174902913c5f20e7 From 405e5f87062cd538830b8910113602f21736c08f Mon Sep 17 00:00:00 2001 From: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp> Date: Tue, 18 Mar 2025 16:22:01 +0900 Subject: [PATCH 48/55] chore: CI/CD Signed-off-by: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp> --- .../autoware_cuda_pointcloud_preprocessor/CMakeLists.txt | 8 ++++---- .../cuda_pointcloud_preprocessor.hpp | 2 ++ .../cuda_pointcloud_preprocessor_node.hpp | 1 + .../autoware/cuda_pointcloud_preprocessor/types.hpp | 2 +- 4 files changed, 8 insertions(+), 5 deletions(-) diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/CMakeLists.txt b/sensing/autoware_cuda_pointcloud_preprocessor/CMakeLists.txt index 0e174ee18bbd2..3a77c27ce5f8b 100644 --- a/sensing/autoware_cuda_pointcloud_preprocessor/CMakeLists.txt +++ b/sensing/autoware_cuda_pointcloud_preprocessor/CMakeLists.txt @@ -15,13 +15,13 @@ endif() ament_auto_find_build_dependencies() # Default to C++17 -if (NOT CMAKE_CXX_STANDARD) +if(NOT CMAKE_CXX_STANDARD) set(CMAKE_CXX_STANDARD 17) -endif () +endif() -if (CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang") +if(CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang") add_compile_options(-Wall -Wextra -Wpedantic -Wunused-function) -endif () +endif() if(BUILD_TESTING) find_package(ament_lint_auto REQUIRED) diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.hpp b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.hpp index d14fc2ec1eb68..bd4a3a1b20601 100644 --- a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.hpp +++ b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor.hpp @@ -31,6 +31,8 @@ #include <cstdint> #include <deque> +#include <memory> +#include <vector> namespace autoware::cuda_pointcloud_preprocessor { diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.hpp b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.hpp index 152398bd8e0d4..742407e328e23 100644 --- a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.hpp +++ b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.hpp @@ -38,6 +38,7 @@ #include <deque> #include <memory> +#include <string> #define CHECK_OFFSET(structure1, structure2, field) \ static_assert( \ diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/types.hpp b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/types.hpp index aed91d2598193..84a36d4bd8f6c 100644 --- a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/types.hpp +++ b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/types.hpp @@ -84,7 +84,7 @@ class MemoryPoolAllocator { public: using value_type = T; - MemoryPoolAllocator(cudaMemPool_t pool) : m_pool(pool) {} + explicit MemoryPoolAllocator(cudaMemPool_t pool) : m_pool(pool) {} T * allocate(std::size_t n) { From df442d59c53fec0b14e603a56c3d7838ff20c9d5 Mon Sep 17 00:00:00 2001 From: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp> Date: Tue, 18 Mar 2025 16:32:37 +0900 Subject: [PATCH 49/55] chore: more CI/CD Signed-off-by: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp> --- .../autoware/cuda_pointcloud_preprocessor/point_types.hpp | 1 - 1 file changed, 1 deletion(-) diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/point_types.hpp b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/point_types.hpp index 4518c6cd2a908..e92fde50d2782 100644 --- a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/point_types.hpp +++ b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/point_types.hpp @@ -1,4 +1,3 @@ - // Copyright 2025 TIER IV, Inc. // // Licensed under the Apache License, Version 2.0 (the "License"); From f4201b6015f977e9cfbd2f5d51bc0e9a0fedc34f Mon Sep 17 00:00:00 2001 From: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp> Date: Tue, 18 Mar 2025 17:03:22 +0900 Subject: [PATCH 50/55] chore: mode CI/CD. some linters are conflicting Signed-off-by: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp> --- .../cuda_pointcloud_preprocessor_node.hpp | 4 ++-- .../autoware/cuda_pointcloud_preprocessor/types.hpp | 7 ++++--- .../cuda_pointcloud_preprocessor_node.cpp | 11 +++++++---- 3 files changed, 13 insertions(+), 9 deletions(-) diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.hpp b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.hpp index 742407e328e23..161f6900c5234 100644 --- a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.hpp +++ b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.hpp @@ -40,8 +40,8 @@ #include <memory> #include <string> -#define CHECK_OFFSET(structure1, structure2, field) \ - static_assert( \ +#define CHECK_OFFSET(structure1, structure2, field) \ + static_assert( \ offsetof(structure1, field) == offsetof(structure2, field), \ "Offset of " #field " in " #structure1 " does not match expected offset.") diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/types.hpp b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/types.hpp index 84a36d4bd8f6c..188e3a76cf209 100644 --- a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/types.hpp +++ b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/types.hpp @@ -79,12 +79,13 @@ struct RingOutlierFilterParameters std::size_t num_points_threshold{0}; }; -template <typename T> +template<typename T> class MemoryPoolAllocator { public: using value_type = T; - explicit MemoryPoolAllocator(cudaMemPool_t pool) : m_pool(pool) {} + explicit MemoryPoolAllocator(cudaMemPool_t pool) + : m_pool(pool) {} T * allocate(std::size_t n) { @@ -93,7 +94,7 @@ class MemoryPoolAllocator return static_cast<T *>(ptr); } - void deallocate(T * ptr, std::size_t) { cudaFreeAsync(ptr, cudaStreamDefault); } + void deallocate(T * ptr, std::size_t) {cudaFreeAsync(ptr, cudaStreamDefault);} protected: cudaMemPool_t m_pool; diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.cpp b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.cpp index 98453cb75c8ae..2096c10448271 100644 --- a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.cpp +++ b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.cpp @@ -67,7 +67,8 @@ CudaPointcloudPreprocessorNode::CudaPointcloudPreprocessorNode( crop_box_min_x_vector.size() != crop_box_min_z_vector.size() || crop_box_min_x_vector.size() != crop_box_max_x_vector.size() || crop_box_min_x_vector.size() != crop_box_max_y_vector.size() || - crop_box_min_x_vector.size() != crop_box_max_z_vector.size()) { + crop_box_min_x_vector.size() != crop_box_max_z_vector.size()) + { throw std::runtime_error("Crop box parameters must have the same size"); } @@ -90,7 +91,7 @@ CudaPointcloudPreprocessorNode::CudaPointcloudPreprocessorNode( // Publisher pub_ = std::make_unique<cuda_blackboard::CudaBlackboardPublisher<cuda_blackboard::CudaPointCloud2>>( - *this, "~/output/pointcloud"); + *this, "~/output/pointcloud"); // Subscriber rclcpp::SubscriptionOptions sub_options; @@ -240,12 +241,14 @@ void CudaPointcloudPreprocessorNode::pointcloudCallback( first_point_rel_stamp * 1e-9; while (twist_queue_.size() > 1 && - rclcpp::Time(twist_queue_.front().header.stamp).seconds() < first_point_stamp) { + rclcpp::Time(twist_queue_.front().header.stamp).seconds() < first_point_stamp) + { twist_queue_.pop_front(); } while (angular_velocity_queue_.size() > 1 && - rclcpp::Time(angular_velocity_queue_.front().header.stamp).seconds() < first_point_stamp) { + rclcpp::Time(angular_velocity_queue_.front().header.stamp).seconds() < first_point_stamp) + { angular_velocity_queue_.pop_front(); } From 3fcbc42d821ac47187704d9636407de861a6ca76 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 18 Mar 2025 08:09:39 +0000 Subject: [PATCH 51/55] style(pre-commit): autofix --- .../cuda_pointcloud_preprocessor_node.hpp | 4 ++-- .../autoware/cuda_pointcloud_preprocessor/types.hpp | 7 +++---- .../cuda_pointcloud_preprocessor_node.cpp | 11 ++++------- 3 files changed, 9 insertions(+), 13 deletions(-) diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.hpp b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.hpp index 161f6900c5234..742407e328e23 100644 --- a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.hpp +++ b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.hpp @@ -40,8 +40,8 @@ #include <memory> #include <string> -#define CHECK_OFFSET(structure1, structure2, field) \ - static_assert( \ +#define CHECK_OFFSET(structure1, structure2, field) \ + static_assert( \ offsetof(structure1, field) == offsetof(structure2, field), \ "Offset of " #field " in " #structure1 " does not match expected offset.") diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/types.hpp b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/types.hpp index 188e3a76cf209..84a36d4bd8f6c 100644 --- a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/types.hpp +++ b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/types.hpp @@ -79,13 +79,12 @@ struct RingOutlierFilterParameters std::size_t num_points_threshold{0}; }; -template<typename T> +template <typename T> class MemoryPoolAllocator { public: using value_type = T; - explicit MemoryPoolAllocator(cudaMemPool_t pool) - : m_pool(pool) {} + explicit MemoryPoolAllocator(cudaMemPool_t pool) : m_pool(pool) {} T * allocate(std::size_t n) { @@ -94,7 +93,7 @@ class MemoryPoolAllocator return static_cast<T *>(ptr); } - void deallocate(T * ptr, std::size_t) {cudaFreeAsync(ptr, cudaStreamDefault);} + void deallocate(T * ptr, std::size_t) { cudaFreeAsync(ptr, cudaStreamDefault); } protected: cudaMemPool_t m_pool; diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.cpp b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.cpp index 2096c10448271..98453cb75c8ae 100644 --- a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.cpp +++ b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.cpp @@ -67,8 +67,7 @@ CudaPointcloudPreprocessorNode::CudaPointcloudPreprocessorNode( crop_box_min_x_vector.size() != crop_box_min_z_vector.size() || crop_box_min_x_vector.size() != crop_box_max_x_vector.size() || crop_box_min_x_vector.size() != crop_box_max_y_vector.size() || - crop_box_min_x_vector.size() != crop_box_max_z_vector.size()) - { + crop_box_min_x_vector.size() != crop_box_max_z_vector.size()) { throw std::runtime_error("Crop box parameters must have the same size"); } @@ -91,7 +90,7 @@ CudaPointcloudPreprocessorNode::CudaPointcloudPreprocessorNode( // Publisher pub_ = std::make_unique<cuda_blackboard::CudaBlackboardPublisher<cuda_blackboard::CudaPointCloud2>>( - *this, "~/output/pointcloud"); + *this, "~/output/pointcloud"); // Subscriber rclcpp::SubscriptionOptions sub_options; @@ -241,14 +240,12 @@ void CudaPointcloudPreprocessorNode::pointcloudCallback( first_point_rel_stamp * 1e-9; while (twist_queue_.size() > 1 && - rclcpp::Time(twist_queue_.front().header.stamp).seconds() < first_point_stamp) - { + rclcpp::Time(twist_queue_.front().header.stamp).seconds() < first_point_stamp) { twist_queue_.pop_front(); } while (angular_velocity_queue_.size() > 1 && - rclcpp::Time(angular_velocity_queue_.front().header.stamp).seconds() < first_point_stamp) - { + rclcpp::Time(angular_velocity_queue_.front().header.stamp).seconds() < first_point_stamp) { angular_velocity_queue_.pop_front(); } From 47e33bfb6f963b797e6d571a006866a93b9c46d8 Mon Sep 17 00:00:00 2001 From: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp> Date: Tue, 18 Mar 2025 18:07:50 +0900 Subject: [PATCH 52/55] chore: ignoring uncrustify Signed-off-by: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp> --- .../cuda_pointcloud_preprocessor_node.hpp | 2 ++ .../include/autoware/cuda_pointcloud_preprocessor/types.hpp | 2 ++ .../cuda_pointcloud_preprocessor_node.cpp | 6 ++++++ 3 files changed, 10 insertions(+) diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.hpp b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.hpp index 742407e328e23..1c2b15b5de5d3 100644 --- a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.hpp +++ b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.hpp @@ -40,10 +40,12 @@ #include <memory> #include <string> +/* *INDENT-OFF* */ #define CHECK_OFFSET(structure1, structure2, field) \ static_assert( \ offsetof(structure1, field) == offsetof(structure2, field), \ "Offset of " #field " in " #structure1 " does not match expected offset.") +/* *INDENT-ON* */ namespace autoware::cuda_pointcloud_preprocessor { diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/types.hpp b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/types.hpp index 84a36d4bd8f6c..dbe62b4e82f47 100644 --- a/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/types.hpp +++ b/sensing/autoware_cuda_pointcloud_preprocessor/include/autoware/cuda_pointcloud_preprocessor/types.hpp @@ -79,6 +79,7 @@ struct RingOutlierFilterParameters std::size_t num_points_threshold{0}; }; +/* *INDENT-OFF* */ template <typename T> class MemoryPoolAllocator { @@ -98,6 +99,7 @@ class MemoryPoolAllocator protected: cudaMemPool_t m_pool; }; +/* *INDENT-ON* */ } // namespace autoware::cuda_pointcloud_preprocessor diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.cpp b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.cpp index 98453cb75c8ae..5f2a04d4cb01a 100644 --- a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.cpp +++ b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.cpp @@ -88,9 +88,11 @@ CudaPointcloudPreprocessorNode::CudaPointcloudPreprocessorNode( bool use_imu = declare_parameter<bool>("use_imu"); // Publisher + /* *INDENT-OFF* */ pub_ = std::make_unique<cuda_blackboard::CudaBlackboardPublisher<cuda_blackboard::CudaPointCloud2>>( *this, "~/output/pointcloud"); + /* *INDENT-ON* */ // Subscriber rclcpp::SubscriptionOptions sub_options; @@ -112,9 +114,11 @@ CudaPointcloudPreprocessorNode::CudaPointcloudPreprocessorNode( sub_options); } + /* *INDENT-OFF* */ CudaPointcloudPreprocessor::UndistortionType undistortion_type = use_3d_undistortion ? CudaPointcloudPreprocessor::UndistortionType::Undistortion3D : CudaPointcloudPreprocessor::UndistortionType::Undistortion2D; + /* *INDENT-ON* */ cuda_pointcloud_preprocessor_ = std::make_unique<CudaPointcloudPreprocessor>(); cuda_pointcloud_preprocessor_->setRingOutlierFilterParameters(ring_outlier_filter_parameters); @@ -239,6 +243,7 @@ void CudaPointcloudPreprocessorNode::pointcloudCallback( input_pointcloud_msg_ptr->header.stamp.nanosec * 1e-9 + first_point_rel_stamp * 1e-9; + /* *INDENT-ON* */ while (twist_queue_.size() > 1 && rclcpp::Time(twist_queue_.front().header.stamp).seconds() < first_point_stamp) { twist_queue_.pop_front(); @@ -248,6 +253,7 @@ void CudaPointcloudPreprocessorNode::pointcloudCallback( rclcpp::Time(angular_velocity_queue_.front().header.stamp).seconds() < first_point_stamp) { angular_velocity_queue_.pop_front(); } + /* *INDENT-ON* */ // Obtain the base link to input pointcloud transform geometry_msgs::msg::TransformStamped transform_msg; From 8b4700bcd92e08d966cf8fe32340459a324ccb23 Mon Sep 17 00:00:00 2001 From: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp> Date: Tue, 18 Mar 2025 18:23:28 +0900 Subject: [PATCH 53/55] chore: ignoring more uncrustify Signed-off-by: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp> --- .../cuda_pointcloud_preprocessor_node.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.cpp b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.cpp index 5f2a04d4cb01a..520ce8fdf89fe 100644 --- a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.cpp +++ b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.cpp @@ -62,6 +62,7 @@ CudaPointcloudPreprocessorNode::CudaPointcloudPreprocessorNode( const auto crop_box_max_y_vector = declare_parameter<std::vector<double>>("crop_box.max_y"); const auto crop_box_max_z_vector = declare_parameter<std::vector<double>>("crop_box.max_z"); + /* *INDENT-OFF* */ if ( crop_box_min_x_vector.size() != crop_box_min_y_vector.size() || crop_box_min_x_vector.size() != crop_box_min_z_vector.size() || @@ -70,6 +71,7 @@ CudaPointcloudPreprocessorNode::CudaPointcloudPreprocessorNode( crop_box_min_x_vector.size() != crop_box_max_z_vector.size()) { throw std::runtime_error("Crop box parameters must have the same size"); } + /* *INDENT-ON* */ std::vector<CropBoxParameters> crop_box_parameters; @@ -201,6 +203,7 @@ void CudaPointcloudPreprocessorNode::imuCallback( transformed_angular_velocity.header = imu_msg->header; angular_velocity_queue_.push_back(transformed_angular_velocity); + /* *INDENT-OFF* */ while (!angular_velocity_queue_.empty()) { // for rosbag replay bool backwards_time_jump_detected = rclcpp::Time(angular_velocity_queue_.front().header.stamp) > @@ -217,6 +220,7 @@ void CudaPointcloudPreprocessorNode::imuCallback( break; } } + /* *INDENT-ON* */ } void CudaPointcloudPreprocessorNode::pointcloudCallback( @@ -243,7 +247,7 @@ void CudaPointcloudPreprocessorNode::pointcloudCallback( input_pointcloud_msg_ptr->header.stamp.nanosec * 1e-9 + first_point_rel_stamp * 1e-9; - /* *INDENT-ON* */ + /* *INDENT-OFF* */ while (twist_queue_.size() > 1 && rclcpp::Time(twist_queue_.front().header.stamp).seconds() < first_point_stamp) { twist_queue_.pop_front(); From 368009e8f2bc99e92b45cd630b2eb54c12c902c3 Mon Sep 17 00:00:00 2001 From: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp> Date: Tue, 18 Mar 2025 18:37:15 +0900 Subject: [PATCH 54/55] chore: missed one more uncrustify exception Signed-off-by: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp> --- .../cuda_pointcloud_preprocessor_node.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.cpp b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.cpp index 520ce8fdf89fe..b206cddb3e203 100644 --- a/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.cpp +++ b/sensing/autoware_cuda_pointcloud_preprocessor/src/cuda_pointcloud_preprocessor/cuda_pointcloud_preprocessor_node.cpp @@ -241,13 +241,13 @@ void CudaPointcloudPreprocessorNode::pointcloudCallback( sensor_msgs::PointCloud2ConstIterator<std::uint32_t> iter_stamp( *input_pointcloud_msg_ptr, "time_stamp"); + /* *INDENT-OFF* */ auto num_points = input_pointcloud_msg_ptr->width * input_pointcloud_msg_ptr->height; std::uint32_t first_point_rel_stamp = num_points > 0 ? *iter_stamp : 0; double first_point_stamp = input_pointcloud_msg_ptr->header.stamp.sec + input_pointcloud_msg_ptr->header.stamp.nanosec * 1e-9 + first_point_rel_stamp * 1e-9; - /* *INDENT-OFF* */ while (twist_queue_.size() > 1 && rclcpp::Time(twist_queue_.front().header.stamp).seconds() < first_point_stamp) { twist_queue_.pop_front(); From a2713a0c8fe4aa4290ecde0ad142547545c621a1 Mon Sep 17 00:00:00 2001 From: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp> Date: Tue, 18 Mar 2025 19:05:44 +0900 Subject: [PATCH 55/55] chore: added meta dep Signed-off-by: Kenzo Lobos-Tsunekawa <kenzo.lobos@tier4.jp> --- sensing/autoware_cuda_pointcloud_preprocessor/package.xml | 1 + 1 file changed, 1 insertion(+) diff --git a/sensing/autoware_cuda_pointcloud_preprocessor/package.xml b/sensing/autoware_cuda_pointcloud_preprocessor/package.xml index fecb38c2f48f4..abbdb5ecb6b78 100644 --- a/sensing/autoware_cuda_pointcloud_preprocessor/package.xml +++ b/sensing/autoware_cuda_pointcloud_preprocessor/package.xml @@ -10,6 +10,7 @@ <buildtool_depend>ament_cmake_auto</buildtool_depend> <buildtool_depend>autoware_cmake</buildtool_depend> + <depend>autoware_cuda_dependency_meta</depend> <depend>autoware_point_types</depend> <depend>autoware_pointcloud_preprocessor</depend> <depend>autoware_universe_utils</depend>