|
31 | 31 | #include "lidar_transfusion/cuda_utils.hpp"
|
32 | 32 | #include "lidar_transfusion/preprocess/preprocess_kernel.hpp"
|
33 | 33 |
|
| 34 | +#include <cstdint> |
| 35 | + |
34 | 36 | namespace lidar_transfusion
|
35 | 37 | {
|
36 | 38 |
|
@@ -99,9 +101,12 @@ __global__ void generateVoxels_random_kernel(
|
99 | 101 | cudaError_t PreprocessCuda::generateVoxels_random_launch(
|
100 | 102 | float * points, unsigned int points_size, unsigned int * mask, float * voxels)
|
101 | 103 | {
|
102 |
| - int threadNum = config_.threads_for_voxel_; |
103 |
| - dim3 blocks((points_size + threadNum - 1) / threadNum); |
104 |
| - dim3 threads(threadNum); |
| 104 | + if (points_size == 0) { |
| 105 | + return cudaGetLastError(); |
| 106 | + } |
| 107 | + dim3 blocks(divup(points_size, config_.threads_for_voxel_)); |
| 108 | + dim3 threads(config_.threads_for_voxel_); |
| 109 | + |
105 | 110 | generateVoxels_random_kernel<<<blocks, threads, 0, stream_>>>(
|
106 | 111 | points, points_size, config_.min_x_range_, config_.max_x_range_, config_.min_y_range_,
|
107 | 112 | config_.max_y_range_, config_.min_z_range_, config_.max_z_range_, config_.voxel_x_size_,
|
@@ -165,40 +170,48 @@ cudaError_t PreprocessCuda::generateBaseFeatures_launch(
|
165 | 170 | }
|
166 | 171 |
|
167 | 172 | __global__ void generateSweepPoints_kernel(
|
168 |
| - const float * input_points, size_t points_size, int input_point_step, float time_lag, |
| 173 | + const uint8_t * input_data, size_t points_size, int input_point_step, float time_lag, |
169 | 174 | const float * transform_array, int num_features, float * output_points)
|
170 | 175 | {
|
171 | 176 | int point_idx = blockIdx.x * blockDim.x + threadIdx.x;
|
172 | 177 | if (point_idx >= points_size) return;
|
173 | 178 |
|
174 |
| - const float input_x = input_points[point_idx * input_point_step + 0]; |
175 |
| - const float input_y = input_points[point_idx * input_point_step + 1]; |
176 |
| - const float input_z = input_points[point_idx * input_point_step + 2]; |
177 |
| - const float intensity = input_points[point_idx * input_point_step + 3]; |
178 |
| - |
179 |
| - output_points[point_idx * num_features] = transform_array[0] * input_x + |
180 |
| - transform_array[4] * input_y + |
181 |
| - transform_array[8] * input_z + transform_array[12]; |
182 |
| - output_points[point_idx * num_features + 1] = transform_array[1] * input_x + |
183 |
| - transform_array[5] * input_y + |
184 |
| - transform_array[9] * input_z + transform_array[13]; |
185 |
| - output_points[point_idx * num_features + 2] = transform_array[2] * input_x + |
186 |
| - transform_array[6] * input_y + |
187 |
| - transform_array[10] * input_z + transform_array[14]; |
188 |
| - output_points[point_idx * num_features + 3] = intensity; |
| 179 | + union { |
| 180 | + uint32_t raw{0}; |
| 181 | + float value; |
| 182 | + } input_x, input_y, input_z; |
| 183 | + |
| 184 | +#pragma unroll |
| 185 | + for (int i = 0; i < 4; i++) { // 4 bytes for float32 |
| 186 | + input_x.raw |= input_data[point_idx * input_point_step + i] << i * 8; |
| 187 | + input_y.raw |= input_data[point_idx * input_point_step + i + 4] << i * 8; |
| 188 | + input_z.raw |= input_data[point_idx * input_point_step + i + 8] << i * 8; |
| 189 | + } |
| 190 | + |
| 191 | + float input_intensity = static_cast<float>(input_data[point_idx * input_point_step + 12]); |
| 192 | + |
| 193 | + output_points[point_idx * num_features] = |
| 194 | + transform_array[0] * input_x.value + transform_array[4] * input_y.value + |
| 195 | + transform_array[8] * input_z.value + transform_array[12]; |
| 196 | + output_points[point_idx * num_features + 1] = |
| 197 | + transform_array[1] * input_x.value + transform_array[5] * input_y.value + |
| 198 | + transform_array[9] * input_z.value + transform_array[13]; |
| 199 | + output_points[point_idx * num_features + 2] = |
| 200 | + transform_array[2] * input_x.value + transform_array[6] * input_y.value + |
| 201 | + transform_array[10] * input_z.value + transform_array[14]; |
| 202 | + output_points[point_idx * num_features + 3] = input_intensity; |
189 | 203 | output_points[point_idx * num_features + 4] = time_lag;
|
190 | 204 | }
|
191 | 205 |
|
192 | 206 | cudaError_t PreprocessCuda::generateSweepPoints_launch(
|
193 |
| - const float * input_points, size_t points_size, int input_point_step, float time_lag, |
| 207 | + const uint8_t * input_data, size_t points_size, int input_point_step, float time_lag, |
194 | 208 | const float * transform_array, float * output_points)
|
195 | 209 | {
|
196 |
| - int threadNum = config_.threads_for_voxel_; |
197 |
| - dim3 blocks((points_size + threadNum - 1) / threadNum); |
198 |
| - dim3 threads(threadNum); |
| 210 | + dim3 blocks(divup(points_size, config_.threads_for_voxel_)); |
| 211 | + dim3 threads(config_.threads_for_voxel_); |
199 | 212 |
|
200 | 213 | generateSweepPoints_kernel<<<blocks, threads, 0, stream_>>>(
|
201 |
| - input_points, points_size, input_point_step, time_lag, transform_array, |
| 214 | + input_data, points_size, input_point_step, time_lag, transform_array, |
202 | 215 | config_.num_point_feature_size_, output_points);
|
203 | 216 |
|
204 | 217 | cudaError_t err = cudaGetLastError();
|
|
0 commit comments