|
31 | 31 | #include "lidar_transfusion/cuda_utils.hpp"
|
32 | 32 | #include "lidar_transfusion/preprocess/preprocess_kernel.hpp"
|
33 | 33 |
|
34 |
| -#include <cstdint> |
35 |
| - |
36 | 34 | namespace lidar_transfusion
|
37 | 35 | {
|
38 | 36 |
|
@@ -101,12 +99,9 @@ __global__ void generateVoxels_random_kernel(
|
101 | 99 | cudaError_t PreprocessCuda::generateVoxels_random_launch(
|
102 | 100 | float * points, unsigned int points_size, unsigned int * mask, float * voxels)
|
103 | 101 | {
|
104 |
| - if (points_size == 0) { |
105 |
| - return cudaGetLastError(); |
106 |
| - } |
107 |
| - dim3 blocks(divup(points_size, config_.threads_for_voxel_)); |
108 |
| - dim3 threads(config_.threads_for_voxel_); |
109 |
| - |
| 102 | + int threadNum = config_.threads_for_voxel_; |
| 103 | + dim3 blocks((points_size + threadNum - 1) / threadNum); |
| 104 | + dim3 threads(threadNum); |
110 | 105 | generateVoxels_random_kernel<<<blocks, threads, 0, stream_>>>(
|
111 | 106 | points, points_size, config_.min_x_range_, config_.max_x_range_, config_.min_y_range_,
|
112 | 107 | config_.max_y_range_, config_.min_z_range_, config_.max_z_range_, config_.voxel_x_size_,
|
@@ -170,48 +165,40 @@ cudaError_t PreprocessCuda::generateBaseFeatures_launch(
|
170 | 165 | }
|
171 | 166 |
|
172 | 167 | __global__ void generateSweepPoints_kernel(
|
173 |
| - const uint8_t * input_data, size_t points_size, int input_point_step, float time_lag, |
| 168 | + const float * input_points, size_t points_size, int input_point_step, float time_lag, |
174 | 169 | const float * transform_array, int num_features, float * output_points)
|
175 | 170 | {
|
176 | 171 | int point_idx = blockIdx.x * blockDim.x + threadIdx.x;
|
177 | 172 | if (point_idx >= points_size) return;
|
178 | 173 |
|
179 |
| - union { |
180 |
| - uint32_t raw{0}; |
181 |
| - float value; |
182 |
| - } input_x, input_y, input_z; |
183 |
| - |
184 |
| -#pragma unroll |
185 |
| - for (int i = 0; i < 4; i++) { // 4 bytes for float32 |
186 |
| - input_x.raw |= input_data[point_idx * input_point_step + i] << i * 8; |
187 |
| - input_y.raw |= input_data[point_idx * input_point_step + i + 4] << i * 8; |
188 |
| - input_z.raw |= input_data[point_idx * input_point_step + i + 8] << i * 8; |
189 |
| - } |
190 |
| - |
191 |
| - float input_intensity = static_cast<float>(input_data[point_idx * input_point_step + 12]); |
192 |
| - |
193 |
| - output_points[point_idx * num_features] = |
194 |
| - transform_array[0] * input_x.value + transform_array[4] * input_y.value + |
195 |
| - transform_array[8] * input_z.value + transform_array[12]; |
196 |
| - output_points[point_idx * num_features + 1] = |
197 |
| - transform_array[1] * input_x.value + transform_array[5] * input_y.value + |
198 |
| - transform_array[9] * input_z.value + transform_array[13]; |
199 |
| - output_points[point_idx * num_features + 2] = |
200 |
| - transform_array[2] * input_x.value + transform_array[6] * input_y.value + |
201 |
| - transform_array[10] * input_z.value + transform_array[14]; |
202 |
| - output_points[point_idx * num_features + 3] = input_intensity; |
| 174 | + const float input_x = input_points[point_idx * input_point_step + 0]; |
| 175 | + const float input_y = input_points[point_idx * input_point_step + 1]; |
| 176 | + const float input_z = input_points[point_idx * input_point_step + 2]; |
| 177 | + const float intensity = input_points[point_idx * input_point_step + 3]; |
| 178 | + |
| 179 | + output_points[point_idx * num_features] = transform_array[0] * input_x + |
| 180 | + transform_array[4] * input_y + |
| 181 | + transform_array[8] * input_z + transform_array[12]; |
| 182 | + output_points[point_idx * num_features + 1] = transform_array[1] * input_x + |
| 183 | + transform_array[5] * input_y + |
| 184 | + transform_array[9] * input_z + transform_array[13]; |
| 185 | + output_points[point_idx * num_features + 2] = transform_array[2] * input_x + |
| 186 | + transform_array[6] * input_y + |
| 187 | + transform_array[10] * input_z + transform_array[14]; |
| 188 | + output_points[point_idx * num_features + 3] = intensity; |
203 | 189 | output_points[point_idx * num_features + 4] = time_lag;
|
204 | 190 | }
|
205 | 191 |
|
206 | 192 | cudaError_t PreprocessCuda::generateSweepPoints_launch(
|
207 |
| - const uint8_t * input_data, size_t points_size, int input_point_step, float time_lag, |
| 193 | + const float * input_points, size_t points_size, int input_point_step, float time_lag, |
208 | 194 | const float * transform_array, float * output_points)
|
209 | 195 | {
|
210 |
| - dim3 blocks(divup(points_size, config_.threads_for_voxel_)); |
211 |
| - dim3 threads(config_.threads_for_voxel_); |
| 196 | + int threadNum = config_.threads_for_voxel_; |
| 197 | + dim3 blocks((points_size + threadNum - 1) / threadNum); |
| 198 | + dim3 threads(threadNum); |
212 | 199 |
|
213 | 200 | generateSweepPoints_kernel<<<blocks, threads, 0, stream_>>>(
|
214 |
| - input_data, points_size, input_point_step, time_lag, transform_array, |
| 201 | + input_points, points_size, input_point_step, time_lag, transform_array, |
215 | 202 | config_.num_point_feature_size_, output_points);
|
216 | 203 |
|
217 | 204 | cudaError_t err = cudaGetLastError();
|
|
0 commit comments