diff --git a/automotive/3d-object-detection/README.md b/automotive/3d-object-detection/README.md
index f4848647f..80e58a5db 100644
--- a/automotive/3d-object-detection/README.md
+++ b/automotive/3d-object-detection/README.md
@@ -1,14 +1,27 @@
 ## Reference implementation for automotive 3D detection benchmark
-## TODO: Instructions for dataset download after it is uploaded somewhere appropriate
-
-## TODO: Instructions for checkpoints downloads after it is uploaded somewhere appropriate
+## Dataset and model checkpoints
+Contact MLCommons support for access to the Waymo Open Dataset and the model checkpoints for the reference implementation. You will need to accept a license agreement and will then receive directions for downloading the data. Place the kitti_format folder under a directory named waymo. There are four checkpoints in total: two for PyTorch and two for ONNX.
 ## Running with docker
+Build the container and mount the inference repo and the Waymo dataset directory.
 ```
 docker build -t auto_inference -f dockerfile.gpu .
-docker run --gpus=all -it -v /inference/:/inference -v /waymo:/waymo --rm auto_inference
-
+docker run --gpus=all -it -v /inference/:/inference -v /waymo:/waymo --rm auto_inference
+```
+
+### Run with GPU
+```
 cd /inference/automotive/3d-object-detection
 python main.py --dataset waymo --dataset-path /waymo/kitti_format/ --lidar-path /pp_ep36.pth --segmentor-path /best_deeplabv3plus_resnet50_waymo_os16.pth --mlperf_conf /inference/mlperf.conf
+```
+
+### Run with CPU and ONNX
+```
+python main.py --dataset waymo --dataset-path /waymo/kitti_format/ --lidar-path /pp.onnx --segmentor-path /deeplabv3+.onnx --mlperf_conf /inference/mlperf.conf
+```
+
+### Run the accuracy checker
+```
+python accuracy_waymo.py --mlperf-accuracy-file /mlperf_log_accuracy.json --waymo-dir /waymo/kitti_format/
+```
diff --git a/automotive/3d-object-detection/accuracy_waymo.py b/automotive/3d-object-detection/accuracy_waymo.py
index c8b5cb72c..6e4b2f91a 100644
--- a/automotive/3d-object-detection/accuracy_waymo.py
+++ b/automotive/3d-object-detection/accuracy_waymo.py
@@ -95,13 +95,13 @@ def main():
             'bbox': [],
             'score': []
         }
-
-        detections[image_idx]['name'].append(LABEL2CLASSES[label])
-        detections[image_idx]['dimensions'].append(dimension)
-        detections[image_idx]['location'].append(location)
-        detections[image_idx]['rotation_y'].append(rotation_y)
-        detections[image_idx]['bbox'].append(bbox)
-        detections[image_idx]['score'].append(score)
+        if dimension[0] > 0:
+            detections[image_idx]['name'].append(LABEL2CLASSES[label])
+            detections[image_idx]['dimensions'].append(dimension)
+            detections[image_idx]['location'].append(location)
+            detections[image_idx]['rotation_y'].append(rotation_y)
+            detections[image_idx]['bbox'].append(bbox)
+            detections[image_idx]['score'].append(score)
         image_ids.add(image_idx)
     with open(args.output_file, "w") as fp:
@@ -115,6 +115,7 @@ def main():
         val_dataset.data_infos,
         CLASSES,
         cam_sync=False)
+    map_stats['Total'] = np.mean(list(map_stats.values()))
     print(map_stats)
     if args.verbose:
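The `Total` entry added above is simply the unweighted mean of the per-class mAP values that `do_eval` returns. A minimal sketch of that aggregation with made-up numbers (the real keys and values depend on `CLASSES` and the evaluation settings):
```
import numpy as np

# Hypothetical per-class mAP values standing in for do_eval() output.
map_stats = {'Pedestrian': 0.62, 'Cyclist': 0.55, 'Vehicle': 0.71}

# Same aggregation as the patch: unweighted mean over all existing entries.
map_stats['Total'] = np.mean(list(map_stats.values()))
print(map_stats['Total'])  # ~0.6267
```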
diff --git a/automotive/3d-object-detection/backend_deploy.py b/automotive/3d-object-detection/backend_deploy.py
index 1a2f3dee4..2207ea279 100644
--- a/automotive/3d-object-detection/backend_deploy.py
+++ b/automotive/3d-object-detection/backend_deploy.py
@@ -78,13 +78,11 @@ def load(self):
         return self

     def predict(self, inputs):
-        # TODO: implement predict
         dimensions, locations, rotation_y, box2d, class_labels, class_scores, ids = [
         ], [], [], [], [], [], []
         with torch.inference_mode():
             device = torch.device(
                 "cuda:0" if torch.cuda.is_available() else "cpu")
-            format_results = {}
             model_input = inputs[0]
             batched_pts = model_input['pts']
             scores_from_cam = []
@@ -114,7 +112,7 @@ def predict(self, inputs):
                 calib_info = model_input['calib_info']
                 image_info = model_input['image_info']
                 idx = model_input['image_info']['image_idx']
-
+                format_result['idx'] = idx
                 calib_info = change_calib_device(calib_info, False)
                 result_filter = keep_bbox_from_image_range(
                     result, calib_info, 5, image_info, False)
@@ -135,9 +133,6 @@ def predict(self, inputs):
                     format_result['location'].append(camera_bbox[:3])
                     format_result['rotation_y'].append(camera_bbox[6].item())
                     format_result['score'].append(score.item())
-            format_results['idx'] = idx
-
-            # write_label(format_result, os.path.join(saved_submit_path, f'{idx:06d}.txt'))

                 if len(format_result['dimensions']) > 0:
                     format_result['dimensions'] = torch.stack(
                         format_result['dimensions'])
@@ -150,6 +145,5 @@ def predict(self, inputs):
             class_labels.append(format_result['class'])
             class_scores.append(format_result['score'])
             box2d.append(format_result['bbox'])
-            ids.append(format_results['idx'])
-        # return Boxes, Classes, Scores # Change to desired output
+            ids.append(format_result['idx'])
         return dimensions, locations, rotation_y, box2d, class_labels, class_scores, ids
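Both backends derive the KITTI-style observation angle `alpha` from a camera-frame box as `rotation_y - arctan2(x, z)`, i.e. the global yaw minus the azimuth of the viewing ray. A self-contained sketch of that conversion (toy values; the only layout assumed, as in the code above, is location in `camera_bbox[:3]` and yaw in `camera_bbox[6]`):
```
import numpy as np

# Toy camera-frame box: [x, y, z, <dims...>, rotation_y].
camera_bbox = np.array([2.0, 1.5, 10.0, 1.6, 1.8, 4.2, 0.3])

# Observation angle: global yaw minus the azimuth of the viewing ray.
alpha = camera_bbox[6] - np.arctan2(camera_bbox[0], camera_bbox[2])
print(alpha)  # 0.3 - arctan2(2, 10) ~= 0.1026
```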
diff --git a/automotive/3d-object-detection/backend_onnx.py b/automotive/3d-object-detection/backend_onnx.py
new file mode 100644
index 000000000..f26f9ad99
--- /dev/null
+++ b/automotive/3d-object-detection/backend_onnx.py
@@ -0,0 +1,168 @@
+from typing import Optional, List, Union
+import os
+import torch
+import logging
+import backend
+from collections import namedtuple
+from model.painter import Painter
+from model.pointpillars_core import PointPillarsPre, PointPillarsPos
+import numpy as np
+from tools.process import keep_bbox_from_image_range
+from waymo import Waymo
+import onnxruntime as ort
+
+
+logging.basicConfig(level=logging.INFO)
+log = logging.getLogger("backend-onnx")
+
+
+def change_calib_device(calib, cuda):
+    result = {}
+    if cuda:
+        device = 'cuda'
+    else:
+        device = 'cpu'
+    result['R0_rect'] = calib['R0_rect'].to(device=device, dtype=torch.float)
+    for i in range(5):
+        result['P' + str(i)] = calib['P' + str(i)
+                                     ].to(device=device, dtype=torch.float)
+        result['Tr_velo_to_cam_' +
+               str(i)] = calib['Tr_velo_to_cam_' +
+                               str(i)].to(device=device, dtype=torch.float)
+    return result
+
+
+def to_numpy(tensor):
+    return tensor.detach().cpu().numpy() if tensor.requires_grad else tensor.cpu().numpy()
+
+
+class BackendOnnx(backend.Backend):
+    def __init__(
+            self,
+            segmentor_path,
+            lidar_detector_path,
+            data_path
+    ):
+        super(BackendOnnx, self).__init__()
+        self.segmentor_path = segmentor_path
+        self.lidar_detector_path = lidar_detector_path
+        # self.segmentation_classes = 18
+        self.detection_classes = 3
+        self.data_root = data_path
+        CLASSES = Waymo.CLASSES
+        self.LABEL2CLASSES = {v: k for k, v in CLASSES.items()}
+
+    def version(self):
+        return torch.__version__
+
+    def name(self):
+        return "python-SUT"
+
+    def load(self):
+        device = torch.device("cpu")
+        PaintArgs = namedtuple(
+            'PaintArgs', [
+                'training_path', 'model_path', 'cam_sync'])
+        painting_args = PaintArgs(
+            os.path.join(
+                self.data_root,
+                'training'),
+            self.segmentor_path,
+            False)
+        self.painter = Painter(painting_args, onnx=True)
+        self.segmentor = self.painter.model
+        model_pre = PointPillarsPre()
+        model_post = PointPillarsPos(self.detection_classes)
+        model_pre.eval()
+        model_post.eval()
+        ort_sess = ort.InferenceSession(self.lidar_detector_path)
+        self.lidar_detector = ort_sess
+        self.model_pre = model_pre
+        self.model_post = model_post
+        return self
+
+    def predict(self, inputs):
+        dimensions, locations, rotation_y, box2d, class_labels, class_scores, ids = [
+        ], [], [], [], [], [], []
+        with torch.inference_mode():
+            model_input = inputs[0]
+            batched_pts = model_input['pts']
+            scores_from_cam = []
+            for i in range(len(model_input['images'])):
+                input_image_name = self.segmentor.get_inputs()[0].name
+                input_data = {
+                    input_image_name: to_numpy(
+                        model_input['images'][i])}
+                segmentation_score = self.segmentor.run(None, input_data)
+                segmentation_score = [
+                    torch.from_numpy(item) for item in segmentation_score]
+                scores_from_cam.append(
+                    self.painter.get_score(
+                        segmentation_score[0].squeeze(0)).cpu())
+            points = self.painter.augment_lidar_class_scores_both(
+                scores_from_cam, batched_pts, model_input['calib_info'])
+            pillars, coors_batch, npoints_per_pillar = self.model_pre(batched_pts=[
+                points])
+            input_pillars_name = self.lidar_detector.get_inputs()[0].name
+            input_coors_batch_name = self.lidar_detector.get_inputs()[1].name
+            input_npoints_per_pillar_name = self.lidar_detector.get_inputs()[
+                2].name
+            input_data = {input_pillars_name: to_numpy(pillars),
+                          input_coors_batch_name: to_numpy(coors_batch),
+                          input_npoints_per_pillar_name: to_numpy(npoints_per_pillar)}
+            result = self.lidar_detector.run(None, input_data)
+            result = [torch.from_numpy(item) for item in result]
+            batch_results = self.model_post(result)
+            for j, result in enumerate(batch_results):
+                format_result = {
+                    'class': [],
+                    'truncated': [],
+                    'occluded': [],
+                    'alpha': [],
+                    'bbox': [],
+                    'dimensions': [],
+                    'location': [],
+                    'rotation_y': [],
+                    'score': [],
+                    'idx': -1
+                }
+
+                calib_info = model_input['calib_info']
+                image_info = model_input['image_info']
+                idx = model_input['image_info']['image_idx']
+                format_result['idx'] = idx
+                calib_info = change_calib_device(calib_info, False)
+                result_filter = keep_bbox_from_image_range(
+                    result, calib_info, 5, image_info, False)
+
+                lidar_bboxes = result_filter['lidar_bboxes']
+                labels, scores = result_filter['labels'], result_filter['scores']
+                bboxes2d, camera_bboxes = result_filter['bboxes2d'], result_filter['camera_bboxes']
+                for lidar_bbox, label, score, bbox2d, camera_bbox in \
+                        zip(lidar_bboxes, labels, scores, bboxes2d, camera_bboxes):
+                    format_result['class'].append(label.item())
+                    format_result['truncated'].append(0.0)
+                    format_result['occluded'].append(0)
+                    alpha = camera_bbox[6] - \
+                        np.arctan2(camera_bbox[0], camera_bbox[2])
+                    format_result['alpha'].append(alpha.item())
+                    format_result['bbox'].append(bbox2d.tolist())
+                    format_result['dimensions'].append(camera_bbox[3:6])
+                    format_result['location'].append(camera_bbox[:3])
+                    format_result['rotation_y'].append(camera_bbox[6].item())
+                    format_result['score'].append(score.item())
+
+                if len(format_result['dimensions']) > 0:
+                    format_result['dimensions'] = torch.stack(
+                        format_result['dimensions'])
+                    format_result['location'] = torch.stack(
+                        format_result['location'])
+                    dimensions.append(format_result['dimensions'])
+                    locations.append(format_result['location'])
+                    rotation_y.append(format_result['rotation_y'])
+                    class_labels.append(format_result['class'])
+                    class_scores.append(format_result['score'])
+                    box2d.append(format_result['bbox'])
+                    ids.append(format_result['idx'])
+
+        return dimensions, locations, rotation_y, box2d, class_labels, class_scores, ids
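Note how `BackendOnnx` always resolves graph input names from the session instead of hardcoding them. A standalone sketch of that onnxruntime pattern (the model path, shapes, and float32 dtype are placeholders, not the benchmark's real checkpoints):
```
import numpy as np
import onnxruntime as ort

# Placeholder path -- substitute one of the benchmark's .onnx checkpoints.
sess = ort.InferenceSession("model.onnx")

# Build the feed dict from the session's own input metadata, mirroring
# how BackendOnnx calls get_inputs()[i].name before run().
feed = {}
for inp in sess.get_inputs():
    # Fill symbolic/dynamic dimensions with 1; assume float32 inputs here.
    shape = [d if isinstance(d, int) else 1 for d in inp.shape]
    feed[inp.name] = np.zeros(shape, dtype=np.float32)

outputs = sess.run(None, feed)  # None -> return every graph output
```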
diff --git a/automotive/3d-object-detection/dockerfile.gpu b/automotive/3d-object-detection/dockerfile.gpu
index 02acca7a3..ad4237cd0 100644
--- a/automotive/3d-object-detection/dockerfile.gpu
+++ b/automotive/3d-object-detection/dockerfile.gpu
@@ -1,4 +1,4 @@
-ARG FROM_IMAGE_NAME=pytorch/pytorch:2.0.1-cuda11.7-cudnn8-devel
+ARG FROM_IMAGE_NAME=pytorch/pytorch:2.2.2-cuda11.8-cudnn8-devel
 FROM ${FROM_IMAGE_NAME}

 ENV DEBIAN_FRONTEND=noninteractive
@@ -20,12 +20,12 @@ RUN cd /tmp && \
     CFLAGS="-std=c++14" python setup.py install && \
     rm -rf mlperf

-RUN pip install tqdm
-RUN pip install numba
-RUN pip install opencv-python
-RUN pip install open3d
-RUN pip install tensorboard
-RUN pip install scikit-image
-RUN pip install ninja
-RUN pip install visdom
-RUN pip install shapely
\ No newline at end of file
+RUN pip install tqdm==4.65.0
+RUN pip install numba==0.60.0
+RUN pip install opencv-python==4.11.0.86
+RUN pip install open3d==0.19.0
+RUN pip install scikit-image==0.25.0
+RUN pip install ninja==1.11.1
+RUN pip install shapely==2.0.6
+RUN pip install tensorboard==2.18.0
+RUN pip install onnxruntime==1.20.1
\ No newline at end of file
diff --git a/automotive/3d-object-detection/main.py b/automotive/3d-object-detection/main.py
index 04269e428..caafd9e47 100644
--- a/automotive/3d-object-detection/main.py
+++ b/automotive/3d-object-detection/main.py
@@ -167,7 +167,9 @@ def get_backend(backend, **kwargs):
         from backend_deploy import BackendDeploy
         backend = BackendDeploy(**kwargs)
-
+    elif backend == 'onnx':
+        from backend_onnx import BackendOnnx
+        backend = BackendOnnx(**kwargs)
     elif backend == "debug":
         from backend_debug import BackendDebug
@@ -403,7 +405,6 @@ def flush_queries():
     log_settings.log_output = log_output_settings

     settings = lg.TestSettings()
-    settings.FromConfig(mlperf_conf, args.model_name, args.scenario)
     settings.FromConfig(user_conf, args.model_name, args.scenario)
     settings.scenario = scenario
     settings.mode = lg.TestMode.PerformanceOnly
diff --git a/automotive/3d-object-detection/model/painter.py b/automotive/3d-object-detection/model/painter.py
index f0680931f..36bd2aa0b 100644
--- a/automotive/3d-object-detection/model/painter.py
+++ b/automotive/3d-object-detection/model/painter.py
@@ -1,3 +1,4 @@
+import onnxruntime as ort
 import argparse
 import model.segmentation as network
 import os
@@ -34,10 +35,15 @@ def get_calib_from_file(calib_file):
     return data


+def to_numpy(tensor):
+    return tensor.detach().cpu().numpy() if tensor.requires_grad else tensor.cpu().numpy()
+
+
 class Painter:
-    def __init__(self, args):
+    def __init__(self, args, onnx=False):
         self.root_split_path = args.training_path
         self.save_path = os.path.join(args.training_path, "painted_lidar/")
+        self.onnx = onnx

         if not os.path.exists(self.save_path):
             os.mkdir(self.save_path)
@@ -45,13 +51,18 @@ def __init__(self, args, onnx=False):
         self.model = None
         print(f'Using Segmentation Network -- deeplabv3plus')
         checkpoint_file = args.model_path
-        model = network.modeling.__dict__['deeplabv3plus_resnet50'](
-            num_classes=19, output_stride=16)
-        checkpoint = torch.load(checkpoint_file)
-        model.load_state_dict(checkpoint["model_state"])
-        model.eval()
-        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-        model.to(device)
+        if self.onnx:
+            model = ort.InferenceSession(checkpoint_file)
+            self.input_image_name = model.get_inputs()[0].name
+        else:
+            model = network.modeling.__dict__['deeplabv3plus_resnet50'](
+                num_classes=19, output_stride=16)
+            checkpoint = torch.load(checkpoint_file)
+
model.load_state_dict(checkpoint["model_state"]) + model.eval() + device = torch.device( + 'cuda' if torch.cuda.is_available() else 'cpu') + model.to(device) self.model = model self.cam_sync = args.cam_sync diff --git a/automotive/3d-object-detection/model/pointpillars.py b/automotive/3d-object-detection/model/pointpillars.py index 49257fe5a..ded0d1685 100644 --- a/automotive/3d-object-detection/model/pointpillars.py +++ b/automotive/3d-object-detection/model/pointpillars.py @@ -397,7 +397,9 @@ def get_predicted_bboxes_single( # 3.2 nms core keep_inds = ml3d.ops.nms( - cur_bbox_pred2d, cur_bbox_cls_pred, self.nms_thr) + cur_bbox_pred2d.cpu(), + cur_bbox_cls_pred.cpu(), + self.nms_thr) cur_bbox_cls_pred = cur_bbox_cls_pred[keep_inds] cur_bbox_pred = cur_bbox_pred[keep_inds] diff --git a/automotive/3d-object-detection/model/pointpillars_core.py b/automotive/3d-object-detection/model/pointpillars_core.py new file mode 100644 index 000000000..a92f46be1 --- /dev/null +++ b/automotive/3d-object-detection/model/pointpillars_core.py @@ -0,0 +1,547 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from model.anchors import Anchors, anchor_target, anchors2bboxes +from ops import Voxelization +from tools.process import limit_period +import math +import open3d.ml.torch as ml3d + + +class PillarLayer(nn.Module): + def __init__(self, voxel_size, point_cloud_range, + max_num_points, max_voxels): + super().__init__() + self.voxel_layer = Voxelization(voxel_size=voxel_size, + point_cloud_range=point_cloud_range, + max_num_points=max_num_points, + max_voxels=max_voxels) + self.point_cloud_range = point_cloud_range + self.voxel_size = voxel_size + + @torch.no_grad() + def forward(self, batched_pts): + ''' + batched_pts: list[tensor], len(batched_pts) = bs + return: + pillars: (p1 + p2 + ... + pb, num_points, c), + coors_batch: (p1 + p2 + ... + pb, 1 + 3), + num_points_per_pillar: (p1 + p2 + ... + pb, ), (b: batch size) + ''' + pillars, coors, npoints_per_pillar = [], [], [] + for i, pts in enumerate(batched_pts): + voxels_out, coors_out, num_points_per_voxel_out = self.voxel_layer( + pts) + # voxels_out: (max_voxel, num_points, c), coors_out: (max_voxel, 3) + # num_points_per_voxel_out: (max_voxel, ) + pillars.append(voxels_out) + coors.append(coors_out.long()) + npoints_per_pillar.append(num_points_per_voxel_out) + + # (p1 + p2 + ... + pb, num_points, c) + pillars = torch.cat(pillars, dim=0) + npoints_per_pillar = torch.cat( + npoints_per_pillar, + dim=0) # (p1 + p2 + ... + pb, ) + coors_batch = [] + for i, cur_coors in enumerate(coors): + coors_batch.append(F.pad(cur_coors, (1, 0), value=i)) + # (p1 + p2 + ... + pb, 1 + 3) + coors_batch = torch.cat(coors_batch, dim=0) + + return pillars, coors_batch, npoints_per_pillar + + +class PillarEncoder(nn.Module): + def __init__(self, voxel_size, point_cloud_range, in_channel, out_channel): + super().__init__() + self.out_channel = out_channel + self.vx, self.vy = voxel_size[0], voxel_size[1] + self.x_offset = voxel_size[0] / 2 + point_cloud_range[0] + self.y_offset = voxel_size[1] / 2 + point_cloud_range[1] + self.x_l = math.ceil( + (point_cloud_range[3] - + point_cloud_range[0]) / + voxel_size[0]) + self.y_l = math.ceil( + (point_cloud_range[4] - + point_cloud_range[1]) / + voxel_size[1]) + + self.conv = nn.Conv1d(in_channel, out_channel, 1, bias=False) + self.bn = nn.BatchNorm1d(out_channel, eps=1e-3, momentum=0.01) + + def forward(self, pillars, coors_batch, npoints_per_pillar): + ''' + pillars: (p1 + p2 + ... 
+ pb, num_points, c), c = 4
+        coors_batch: (p1 + p2 + ... + pb, 1 + 3)
+        npoints_per_pillar: (p1 + p2 + ... + pb, )
+        return: (bs, out_channel, y_l, x_l)
+        '''
+        device = pillars.device
+        # 1. calculate offset to the points center (in each pillar)
+        offset_pt_center = pillars[:, :, :3] - torch.sum(
+            pillars[:, :, :3], dim=1, keepdim=True) / npoints_per_pillar[:, None, None]  # (p1 + p2 + ... + pb, num_points, 3)
+
+        # 2. calculate offset to the pillar center
+        # (p1 + p2 + ... + pb, num_points, 1)
+        x_offset_pi_center = pillars[:, :, :1] - \
+            (coors_batch[:, None, 1:2] * self.vx + self.x_offset)
+        # (p1 + p2 + ... + pb, num_points, 1)
+        y_offset_pi_center = pillars[:, :, 1:2] - \
+            (coors_batch[:, None, 2:3] * self.vy + self.y_offset)
+
+        # 3. encoder
+        features = torch.cat([pillars,
+                              offset_pt_center,
+                              x_offset_pi_center,
+                              y_offset_pi_center],
+                             dim=-1)  # (p1 + p2 + ... + pb, num_points, 9)
+        features[:, :, 0:1] = x_offset_pi_center  # tmp
+        features[:, :, 1:2] = y_offset_pi_center  # tmp
+        # Consistent with mmdet3d; the reason is discussed in
+        # https://github.com/open-mmlab/mmdetection3d/issues/1150
+
+        # 4. find mask for (0, 0, 0) and update the encoded features
+        # a very beautiful implementation
+        voxel_ids = torch.arange(
+            0, pillars.size(1)).to(device)  # (num_points, )
+        # (num_points, p1 + p2 + ... + pb)
+        mask = voxel_ids[:, None] < npoints_per_pillar[None, :]
+        # (p1 + p2 + ... + pb, num_points)
+        mask = mask.permute(1, 0).contiguous()
+        features *= mask[:, :, None]
+
+        # 5. embedding
+        # (p1 + p2 + ... + pb, 9, num_points)
+        features = features.permute(0, 2, 1).contiguous()
+        # (p1 + p2 + ... + pb, out_channels, num_points)
+        features = F.relu(self.bn(self.conv(features)))
+        # (p1 + p2 + ... + pb, out_channels)
+        pooling_features = torch.max(features, dim=-1)[0]
+
+        # 6. pillar scatter
+        batched_canvas = []
+        bs = coors_batch[-1, 0] + 1
+        for i in range(bs):
+            cur_coors_idx = coors_batch[:, 0] == i
+            cur_coors = coors_batch[cur_coors_idx, :]
+            cur_features = pooling_features[cur_coors_idx]
+
+            canvas = torch.zeros(
+                (self.x_l, self.y_l, self.out_channel),
+                dtype=torch.float32,
+                device=device)
+            canvas[cur_coors[:, 1], cur_coors[:, 2]] = cur_features
+            canvas = canvas.permute(2, 1, 0).contiguous()
+            batched_canvas.append(canvas)
+        # (bs, in_channel, self.y_l, self.x_l)
+        batched_canvas = torch.stack(batched_canvas, dim=0)
+        return batched_canvas
+
+
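The scatter loop above builds the dense pseudo-image by indexing each pillar's pooled feature vector into its (x, y) grid cell. A standalone toy sketch of the same indexing (sizes and features are made up; the real canvas dimensions come from point_cloud_range and voxel_size):
```
import torch

# Toy sizes -- the real model uses the x_l/y_l grid computed in PillarEncoder.
x_l, y_l, out_channel, num_pillars = 8, 8, 4, 5

# Integer (x, y) cell coordinates and pooled features for each pillar.
coors = torch.randint(0, 8, (num_pillars, 2))
pooled = torch.randn(num_pillars, out_channel)

canvas = torch.zeros((x_l, y_l, out_channel))
canvas[coors[:, 0], coors[:, 1]] = pooled            # scatter features into cells
pseudo_image = canvas.permute(2, 1, 0).contiguous()  # (C, y_l, x_l)
print(pseudo_image.shape)  # torch.Size([4, 8, 8])
```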
+class Backbone(nn.Module):
+    def __init__(self, in_channel, out_channels,
+                 layer_nums, layer_strides=[2, 2, 2]):
+        super().__init__()
+        assert len(out_channels) == len(layer_nums)
+        assert len(out_channels) == len(layer_strides)
+
+        self.multi_blocks = nn.ModuleList()
+        for i in range(len(layer_strides)):
+            blocks = []
+            blocks.append(
+                nn.Conv2d(
+                    in_channel,
+                    out_channels[i],
+                    3,
+                    stride=layer_strides[i],
+                    bias=False,
+                    padding=1))
+            blocks.append(
+                nn.BatchNorm2d(
+                    out_channels[i],
+                    eps=1e-3,
+                    momentum=0.01))
+            blocks.append(nn.ReLU(inplace=True))
+
+            for _ in range(layer_nums[i]):
+                blocks.append(
+                    nn.Conv2d(
+                        out_channels[i],
+                        out_channels[i],
+                        3,
+                        bias=False,
+                        padding=1))
+                blocks.append(
+                    nn.BatchNorm2d(
+                        out_channels[i],
+                        eps=1e-3,
+                        momentum=0.01))
+                blocks.append(nn.ReLU(inplace=True))
+
+            in_channel = out_channels[i]
+            self.multi_blocks.append(nn.Sequential(*blocks))
+
+        # consistent with mmdet3d
+        for m in self.modules():
+            if isinstance(m, nn.Conv2d):
+                nn.init.kaiming_normal_(
+                    m.weight, mode='fan_out', nonlinearity='relu')
+
+    def forward(self, x):
+        '''
+        x: (b, c, y_l, x_l). Default: (6, 64, 496, 432)
+        return: list[]. Default: [(6, 64, 248, 216), (6, 128, 124, 108), (6, 256, 62, 54)]
+        '''
+        outs = []
+        for i in range(len(self.multi_blocks)):
+            x = self.multi_blocks[i](x)
+            outs.append(x)
+        return outs
+
+
+class Neck(nn.Module):
+    def __init__(self, in_channels, upsample_strides, out_channels):
+        super().__init__()
+        assert len(in_channels) == len(upsample_strides)
+        assert len(upsample_strides) == len(out_channels)
+
+        self.decoder_blocks = nn.ModuleList()
+        for i in range(len(in_channels)):
+            decoder_block = []
+            decoder_block.append(nn.ConvTranspose2d(in_channels[i],
+                                                    out_channels[i],
+                                                    upsample_strides[i],
+                                                    stride=upsample_strides[i],
+                                                    bias=False))
+            decoder_block.append(
+                nn.BatchNorm2d(
+                    out_channels[i],
+                    eps=1e-3,
+                    momentum=0.01))
+            decoder_block.append(nn.ReLU(inplace=True))
+
+            self.decoder_blocks.append(nn.Sequential(*decoder_block))
+
+        # consistent with mmdet3d
+        for m in self.modules():
+            if isinstance(m, nn.ConvTranspose2d):
+                nn.init.kaiming_normal_(
+                    m.weight, mode='fan_out', nonlinearity='relu')
+
+    def forward(self, x):
+        '''
+        x: [(bs, 64, 248, 216), (bs, 128, 124, 108), (bs, 256, 62, 54)]
+        return: (bs, 384, 248, 216)
+        '''
+        outs = []
+        for i in range(len(self.decoder_blocks)):
+            xi = self.decoder_blocks[i](x[i])  # (bs, 128, 248, 216)
+            outs.append(xi)
+        out = torch.cat(outs, dim=1)
+        return out
+
+
+class Head(nn.Module):
+    def __init__(self, in_channel, n_anchors, n_classes):
+        super().__init__()
+
+        self.conv_cls = nn.Conv2d(in_channel, n_anchors * n_classes, 1)
+        self.conv_reg = nn.Conv2d(in_channel, n_anchors * 7, 1)
+        self.conv_dir_cls = nn.Conv2d(in_channel, n_anchors * 2, 1)
+
+        # consistent with mmdet3d
+        conv_layer_id = 0
+        for m in self.modules():
+            if isinstance(m, nn.Conv2d):
+                nn.init.normal_(m.weight, mean=0,
std=0.01) + if conv_layer_id == 0: + prior_prob = 0.01 + bias_init = float(-math.log((1 - prior_prob) / prior_prob)) + nn.init.constant_(m.bias, bias_init) + else: + nn.init.constant_(m.bias, 0) + conv_layer_id += 1 + + def forward(self, x): + ''' + x: (bs, 384, 248, 216) + return: + bbox_cls_pred: (bs, n_anchors*3, 248, 216) + bbox_pred: (bs, n_anchors*7, 248, 216) + bbox_dir_cls_pred: (bs, n_anchors*2, 248, 216) + ''' + bbox_cls_pred = self.conv_cls(x) + bbox_pred = self.conv_reg(x) + bbox_dir_cls_pred = self.conv_dir_cls(x) + return bbox_cls_pred, bbox_pred, bbox_dir_cls_pred + + +class PointPillarsPre(nn.Module): + def __init__(self, + nclasses=3, + voxel_size=[0.32, 0.32, 6], + point_cloud_range=[-74.88, -74.88, -2, 74.88, 74.88, 4], + max_num_points=20, + max_voxels=(32000, 32000), + painted=False): + super().__init__() + self.pillar_layer = PillarLayer(voxel_size=voxel_size, + point_cloud_range=point_cloud_range, + max_num_points=max_num_points, + max_voxels=max_voxels) + + def forward(self, batched_pts): + pillars, coors_batch, npoints_per_pillar = self.pillar_layer( + batched_pts) + return pillars, coors_batch, npoints_per_pillar + + +class PointPillarsCore(nn.Module): + def __init__(self, + nclasses=3, + voxel_size=[0.32, 0.32, 6], + point_cloud_range=[-74.88, -74.88, -2, 74.88, 74.88, 4], + max_num_points=20, + max_voxels=(32000, 32000), + painted=False): + super().__init__() + self.nclasses = nclasses + if painted: + pillar_channel = 16 + else: + pillar_channel = 10 + self.pillar_encoder = PillarEncoder(voxel_size=voxel_size, + point_cloud_range=point_cloud_range, + in_channel=pillar_channel, + out_channel=64) + self.backbone = Backbone(in_channel=64, + out_channels=[64, 128, 256], + layer_nums=[3, 5, 5], + layer_strides=[1, 2, 2]) + self.neck = Neck(in_channels=[64, 128, 256], + upsample_strides=[1, 2, 4], + out_channels=[128, 128, 128]) + self.head = Head( + in_channel=384, + n_anchors=2 * nclasses, + n_classes=nclasses) + + # anchors + ranges = [[-74.88, -74.88, -0.0345, 74.88, 74.88, -0.0345], + [-74.88, -74.88, 0, 74.88, 74.88, 0], + [-74.88, -74.88, -0.1188, 74.88, 74.88, -0.1188]] + sizes = [[0.84, .91, 1.74], [.84, 1.81, 1.77], [2.08, 4.73, 1.77]] + rotations = [0, 1.57] + self.anchors_generator = Anchors(ranges=ranges, + sizes=sizes, + rotations=rotations) + + # train + self.assigners = [ + {'pos_iou_thr': 0.5, 'neg_iou_thr': 0.3, 'min_iou_thr': 0.3}, + {'pos_iou_thr': 0.5, 'neg_iou_thr': 0.3, 'min_iou_thr': 0.3}, + {'pos_iou_thr': 0.55, 'neg_iou_thr': 0.4, 'min_iou_thr': 0.4}, + ] + + # val and test + self.nms_pre = 4096 + self.nms_thr = 0.25 + self.score_thr = 0.1 + self.max_num = 500 + + def get_predicted_bboxes_single( + self, bbox_cls_pred, bbox_pred, bbox_dir_cls_pred, anchors): + ''' + bbox_cls_pred: (n_anchors*3, 248, 216) + bbox_pred: (n_anchors*7, 248, 216) + bbox_dir_cls_pred: (n_anchors*2, 248, 216) + anchors: (y_l, x_l, 3, 2, 7) + return: + bboxes: (k, 7) + labels: (k, ) + scores: (k, ) + ''' + # 0. pre-process + bbox_cls_pred = bbox_cls_pred.permute( + 1, 2, 0).reshape(-1, self.nclasses) + bbox_pred = bbox_pred.permute(1, 2, 0).reshape(-1, 7) + bbox_dir_cls_pred = bbox_dir_cls_pred.permute(1, 2, 0).reshape(-1, 2) + anchors = anchors.reshape(-1, 7) + + bbox_cls_pred = torch.sigmoid(bbox_cls_pred) + bbox_dir_cls_pred = torch.max(bbox_dir_cls_pred, dim=1)[1] + + # 1. 
obtain self.nms_pre bboxes based on scores + inds = bbox_cls_pred.max(1)[0].topk(self.nms_pre)[1] + bbox_cls_pred = bbox_cls_pred[inds] + bbox_pred = bbox_pred[inds] + bbox_dir_cls_pred = bbox_dir_cls_pred[inds] + anchors = anchors[inds] + + # 2. decode predicted offsets to bboxes + bbox_pred = anchors2bboxes(anchors, bbox_pred) + return torch.cat( + [bbox_pred, bbox_cls_pred, bbox_dir_cls_pred[:, None]], 1) + + def get_predicted_bboxes( + self, bbox_cls_pred, bbox_pred, bbox_dir_cls_pred, batched_anchors): + ''' + bbox_cls_pred: (bs, n_anchors*3, 248, 216) + bbox_pred: (bs, n_anchors*7, 248, 216) + bbox_dir_cls_pred: (bs, n_anchors*2, 248, 216) + batched_anchors: (bs, y_l, x_l, 3, 2, 7) + return: + bboxes: [(k1, 7), (k2, 7), ... ] + labels: [(k1, ), (k2, ), ... ] + scores: [(k1, ), (k2, ), ... ] + ''' + results = [] + bs = bbox_cls_pred.size(0) + for i in range(bs): + result = self.get_predicted_bboxes_single(bbox_cls_pred=bbox_cls_pred[i], + bbox_pred=bbox_pred[i], + bbox_dir_cls_pred=bbox_dir_cls_pred[i], + anchors=batched_anchors[i]) + results.append(result) + return results + + def forward(self, pillars, coors_batch, npoints_per_pillar, + mode='test', batched_gt_bboxes=None, batched_gt_labels=None): + pillar_features = self.pillar_encoder( + pillars, coors_batch, npoints_per_pillar) + + # xs: [(bs, 64, 248, 216), (bs, 128, 124, 108), (bs, 256, 62, 54)] + xs = self.backbone(pillar_features) + + # x: (bs, 384, 248, 216) + x = self.neck(xs) + + # bbox_cls_pred: (bs, n_anchors*3, 248, 216) + # bbox_pred: (bs, n_anchors*7, 248, 216) + # bbox_dir_cls_pred: (bs, n_anchors*2, 248, 216) + bbox_cls_pred, bbox_pred, bbox_dir_cls_pred = self.head(x) + + # anchors + device = bbox_cls_pred.device + feature_map_size = torch.tensor( + list(bbox_cls_pred.size()[-2:]), device=device) + anchors = self.anchors_generator.get_multi_anchors(feature_map_size) + batch_size = pillar_features.shape[0] + batched_anchors = [anchors for _ in range(batch_size)] + + if mode == 'train': + anchor_target_dict = anchor_target(batched_anchors=batched_anchors, + batched_gt_bboxes=batched_gt_bboxes, + batched_gt_labels=batched_gt_labels, + assigners=self.assigners, + nclasses=self.nclasses) + + return bbox_cls_pred, bbox_pred, bbox_dir_cls_pred, anchor_target_dict + elif mode == 'val': + results = self.get_predicted_bboxes(bbox_cls_pred=bbox_cls_pred, + bbox_pred=bbox_pred, + bbox_dir_cls_pred=bbox_dir_cls_pred, + batched_anchors=batched_anchors) + return results + + elif mode == 'test': + results = self.get_predicted_bboxes(bbox_cls_pred=bbox_cls_pred, + bbox_pred=bbox_pred, + bbox_dir_cls_pred=bbox_dir_cls_pred, + batched_anchors=batched_anchors) + return results + else: + raise ValueError + + +class PointPillarsPos(nn.Module): + def __init__(self, nclasses=3): + super().__init__() + self.nclasses = nclasses + self.nms_thr = 0.25 + self.score_thr = 0.1 + self.max_num = 500 + + def nms_filter(self, bbox_pred, bbox_cls_pred, bbox_dir_cls_pred): + # 3. 
nms + bbox_pred2d_xy = bbox_pred[:, [0, 1]] + bbox_pred2d_lw = bbox_pred[:, [3, 4]] + bbox_pred2d = torch.cat([bbox_pred2d_xy - bbox_pred2d_lw / 2, + bbox_pred2d_xy + bbox_pred2d_lw / 2, + bbox_pred[:, 6:]], dim=-1) # (n_anchors, 5) + + ret_bboxes, ret_labels, ret_scores = [], [], [] + for i in range(self.nclasses): + # 3.1 filter bboxes with scores below self.score_thr + cur_bbox_cls_pred = bbox_cls_pred[:, i] + score_inds = cur_bbox_cls_pred > self.score_thr + if score_inds.sum() == 0: + continue + + cur_bbox_cls_pred = cur_bbox_cls_pred[score_inds] + cur_bbox_pred2d = bbox_pred2d[score_inds] + cur_bbox_pred = bbox_pred[score_inds] + cur_bbox_dir_cls_pred = bbox_dir_cls_pred[score_inds] + + # 3.2 nms core + keep_inds = ml3d.ops.nms( + cur_bbox_pred2d.detach().cpu(), + cur_bbox_cls_pred.detach().cpu(), + self.nms_thr) + cur_bbox_cls_pred = cur_bbox_cls_pred[keep_inds] + cur_bbox_pred = cur_bbox_pred[keep_inds] + cur_bbox_dir_cls_pred = cur_bbox_dir_cls_pred[keep_inds] + cur_bbox_pred[:, - + 1] = limit_period(cur_bbox_pred[:, - + 1].detach().cpu(), 1, math.pi).to(cur_bbox_pred) # [-pi, 0] + cur_bbox_pred[:, -1] += (1 - cur_bbox_dir_cls_pred) * math.pi + + ret_bboxes.append(cur_bbox_pred) + ret_labels.append(torch.zeros_like( + cur_bbox_pred[:, 0], dtype=torch.long) + i) + ret_scores.append(cur_bbox_cls_pred) + + # 4. filter some bboxes if bboxes number is above self.max_num + if len(ret_bboxes) == 0: + return { + 'lidar_bboxes': torch.empty((0, 7)).detach().cpu(), + 'labels': torch.empty(0).detach().cpu(), + 'scores': torch.empty(0).detach().cpu() + } + ret_bboxes = torch.cat(ret_bboxes, 0) + ret_labels = torch.cat(ret_labels, 0) + ret_scores = torch.cat(ret_scores, 0) + if ret_bboxes.size(0) > self.max_num: + final_inds = ret_scores.topk(self.max_num)[1] + ret_bboxes = ret_bboxes[final_inds] + ret_labels = ret_labels[final_inds] + ret_scores = ret_scores[final_inds] + result = { + 'lidar_bboxes': ret_bboxes.detach().cpu(), + 'labels': ret_labels.detach().cpu(), + 'scores': ret_scores.detach().cpu() + } + return result + + def forward(self, results): + pos_results = [] + for result in results: + bbox_pred, bbox_cls_pred, bbox_dir_cls_pred = result[:, + :7], result[:, 7:10], result[:, 10] + pos_result = self.nms_filter( + bbox_pred, bbox_cls_pred, bbox_dir_cls_pred) + if pos_result is not None: + pos_results.append(pos_result) + return pos_results diff --git a/automotive/3d-object-detection/tools/evaluate.py b/automotive/3d-object-detection/tools/evaluate.py index 3bc56ee4f..eb3c5d32a 100644 --- a/automotive/3d-object-detection/tools/evaluate.py +++ b/automotive/3d-object-detection/tools/evaluate.py @@ -53,7 +53,6 @@ def do_eval(det_results, gt_results, CLASSES, cam_sync=False): gt_results: dict(id -> det_results) CLASSES: dict ''' - assert len(det_results) == len(gt_results) # 1. 
calculate iou ious = { @@ -66,6 +65,8 @@ def do_eval(det_results, gt_results, CLASSES, cam_sync=False): annos_label = 'annos' for id in range(len(gt_results)): gt_result = gt_results[id][annos_label] + if gt_results[id]['image']['image_idx'] not in det_results: + continue det_result = det_results[gt_results[id]['image']['image_idx']] # 1.2, bev iou diff --git a/automotive/3d-object-detection/waymo.py b/automotive/3d-object-detection/waymo.py index 5818ed01e..71a870f2f 100644 --- a/automotive/3d-object-detection/waymo.py +++ b/automotive/3d-object-detection/waymo.py @@ -78,10 +78,6 @@ def __init__(self, data_root, split, self.painted = painted self.cam_sync = cam_sync self.point_range_filter = [-74.88, -74.88, -2, 74.88, 74.88, 4] - if painted or cam_sync: - info_file = f'painted_waymo_infos_{split}.pkl' - else: - info_file = f'waymo_infos_{split}.pkl' self.data_infos = read_pickle(os.path.join(data_root, info_file)) self.sorted_ids = range(len(self.data_infos)) @@ -202,6 +198,9 @@ def get_image(self, idx, camera): input_batch = input_tensor.unsqueeze(0) return input_batch + def get_item_count(self): + return len(self.data_infos) + class PostProcessWaymo: def __init__( @@ -220,6 +219,23 @@ def __call__(self, results, content_id, inputs, result_dict): for idx in range(len(content_id)): processed_results.append([]) detection_num = len(results[0][idx]) + if detection_num == 0: + processed_results[idx].append([ + -1, + -1, + -1, + -1, + -1, + -1, + -1, + -1, + -1, + -1, + -1, + -1, + -1, + results[6][idx] + ]) for detection in range(0, detection_num): processed_results[idx].append([ results[0][idx][detection][0],
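Taken together, the PostProcessWaymo padding above and the `dimension[0] > 0` guard added to accuracy_waymo.py form one convention: a frame with no detections still emits a single placeholder row of -1s (with the frame id from `results[6]` as the last field), so every query produces a log entry, and the accuracy script drops such rows before evaluation because a real box always has a positive first dimension component. A minimal sketch of that round trip (field layout inferred from the patch; values are made up):
```
# Placeholder row as emitted for an empty frame: 13 payload fields of -1,
# then the frame id.
frame_id = 42
placeholder = [-1] * 13 + [frame_id]

# A real row starts with the three dimension components, which are positive.
real = [1.5, 1.8, 4.2] + [0.0] * 10 + [frame_id]

# Mirrors the `if dimension[0] > 0` guard in accuracy_waymo.py.
kept = [row for row in [placeholder, real] if row[0] > 0]
print(len(kept))  # 1 -- only the real detection survives
```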