diff --git a/automotive/3d-object-detection/README.md b/automotive/3d-object-detection/README.md
index f4848647f..80e58a5db 100644
--- a/automotive/3d-object-detection/README.md
+++ b/automotive/3d-object-detection/README.md
@@ -1,14 +1,27 @@
 ## Reference implementation for automotive 3D detection benchmark
-## TODO: Instructions for dataset download after it is uploaded somewhere appropriate
-
-## TODO: Instructions for checkpoints downloads after it is uploaded somewhere appropriate
+## Dataset and model checkpoints
+Contact MLCommons support for access to the Waymo Open Dataset and the model checkpoints for the reference implementation. You will need to accept a license agreement and will then receive directions for downloading the data. Place the kitti_format folder under a directory named waymo. There are four checkpoints in total: two for PyTorch and two for ONNX.
 ## Running with docker
+Build the container and mount the inference repo and the Waymo dataset directory.
 ```
 docker build -t auto_inference -f dockerfile.gpu .
-docker run --gpus=all -it -v /inference/:/inference -v /waymo:/waymo --rm auto_inference
-
+docker run --gpus=all -it -v /inference/:/inference -v /waymo:/waymo --rm auto_inference
+```
+
+### Run with GPU
+```
 cd /inference/automotive/3d-object-detection
 python main.py --dataset waymo --dataset-path /waymo/kitti_format/ --lidar-path /pp_ep36.pth --segmentor-path /best_deeplabv3plus_resnet50_waymo_os16.pth --mlperf_conf /inference/mlperf.conf
+```
+
+### Run with CPU and ONNX
+```
+python main.py --dataset waymo --dataset-path /waymo/kitti_format/ --lidar-path /pp.onnx --segmentor-path /deeplabv3+.onnx --mlperf_conf /inference/mlperf.conf
+```
+
+### Run the accuracy checker
+```
+python accuracy_waymo.py --mlperf-accuracy-file /mlperf_log_accuracy.json --waymo-dir /waymo/kitti_format/
+```
diff --git a/automotive/3d-object-detection/accuracy_waymo.py b/automotive/3d-object-detection/accuracy_waymo.py
index c8b5cb72c..6e4b2f91a 100644
--- a/automotive/3d-object-detection/accuracy_waymo.py
+++ b/automotive/3d-object-detection/accuracy_waymo.py
@@ -95,13 +95,13 @@ def main():
             'bbox': [],
             'score': []
         }
-
-        detections[image_idx]['name'].append(LABEL2CLASSES[label])
-        detections[image_idx]['dimensions'].append(dimension)
-        detections[image_idx]['location'].append(location)
-        detections[image_idx]['rotation_y'].append(rotation_y)
-        detections[image_idx]['bbox'].append(bbox)
-        detections[image_idx]['score'].append(score)
+        if dimension[0] > 0:
+            detections[image_idx]['name'].append(LABEL2CLASSES[label])
+            detections[image_idx]['dimensions'].append(dimension)
+            detections[image_idx]['location'].append(location)
+            detections[image_idx]['rotation_y'].append(rotation_y)
+            detections[image_idx]['bbox'].append(bbox)
+            detections[image_idx]['score'].append(score)
         image_ids.add(image_idx)
     with open(args.output_file, "w") as fp:
@@ -115,6 +115,7 @@ def main():
         val_dataset.data_infos,
         CLASSES,
         cam_sync=False)
+    map_stats['Total'] = np.mean(list(map_stats.values()))
     print(map_stats)
     if args.verbose:
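The `Total` entry added above is simply the unweighted mean of the per-class mAP values that `do_eval` returns. A minimal sketch of that aggregation with made-up numbers (the real keys and values depend on `CLASSES` and the evaluation settings):
```
import numpy as np

# Hypothetical per-class mAP values standing in for do_eval() output.
map_stats = {'Pedestrian': 0.62, 'Cyclist': 0.55, 'Vehicle': 0.71}

# Same aggregation as the patch: unweighted mean over all existing entries.
map_stats['Total'] = np.mean(list(map_stats.values()))
print(map_stats['Total'])  # ~0.6267
```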
diff --git a/automotive/3d-object-detection/backend_deploy.py b/automotive/3d-object-detection/backend_deploy.py
index 1a2f3dee4..2207ea279 100644
--- a/automotive/3d-object-detection/backend_deploy.py
+++ b/automotive/3d-object-detection/backend_deploy.py
@@ -78,13 +78,11 @@ def load(self):
         return self

     def predict(self, inputs):
-        # TODO: implement predict
         dimensions, locations, rotation_y, box2d, class_labels, class_scores, ids = [
         ], [], [], [], [], [], []
         with torch.inference_mode():
             device = torch.device(
                 "cuda:0" if torch.cuda.is_available() else "cpu")
-            format_results = {}
             model_input = inputs[0]
             batched_pts = model_input['pts']
             scores_from_cam = []
@@ -114,7 +112,7 @@ def predict(self, inputs):
                 calib_info = model_input['calib_info']
                 image_info = model_input['image_info']
                 idx = model_input['image_info']['image_idx']
-
+                format_result['idx'] = idx
                 calib_info = change_calib_device(calib_info, False)
                 result_filter = keep_bbox_from_image_range(
                     result, calib_info, 5, image_info, False)
@@ -135,9 +133,6 @@ def predict(self, inputs):
                     format_result['location'].append(camera_bbox[:3])
                     format_result['rotation_y'].append(camera_bbox[6].item())
                     format_result['score'].append(score.item())
-            format_results['idx'] = idx
-
-            # write_label(format_result, os.path.join(saved_submit_path, f'{idx:06d}.txt'))

                 if len(format_result['dimensions']) > 0:
                     format_result['dimensions'] = torch.stack(
                         format_result['dimensions'])
@@ -150,6 +145,5 @@ def predict(self, inputs):
             class_labels.append(format_result['class'])
             class_scores.append(format_result['score'])
             box2d.append(format_result['bbox'])
-            ids.append(format_results['idx'])
-        # return Boxes, Classes, Scores # Change to desired output
+            ids.append(format_result['idx'])
         return dimensions, locations, rotation_y, box2d, class_labels, class_scores, ids
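Both backends derive the KITTI-style observation angle `alpha` from a camera-frame box as `rotation_y - arctan2(x, z)`, i.e. the global yaw minus the azimuth of the viewing ray. A self-contained sketch of that conversion (toy values; the only layout assumed, as in the code above, is location in `camera_bbox[:3]` and yaw in `camera_bbox[6]`):
```
import numpy as np

# Toy camera-frame box: [x, y, z, <dims...>, rotation_y].
camera_bbox = np.array([2.0, 1.5, 10.0, 1.6, 1.8, 4.2, 0.3])

# Observation angle: global yaw minus the azimuth of the viewing ray.
alpha = camera_bbox[6] - np.arctan2(camera_bbox[0], camera_bbox[2])
print(alpha)  # 0.3 - arctan2(2, 10) ~= 0.1026
```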
diff --git a/automotive/3d-object-detection/backend_onnx.py b/automotive/3d-object-detection/backend_onnx.py
new file mode 100644
index 000000000..f26f9ad99
--- /dev/null
+++ b/automotive/3d-object-detection/backend_onnx.py
@@ -0,0 +1,168 @@
+from typing import Optional, List, Union
+import os
+import torch
+import logging
+import backend
+from collections import namedtuple
+from model.painter import Painter
+from model.pointpillars_core import PointPillarsPre, PointPillarsPos
+import numpy as np
+from tools.process import keep_bbox_from_image_range
+from waymo import Waymo
+import onnxruntime as ort
+
+
+logging.basicConfig(level=logging.INFO)
+log = logging.getLogger("backend-onnx")
+
+
+def change_calib_device(calib, cuda):
+    result = {}
+    if cuda:
+        device = 'cuda'
+    else:
+        device = 'cpu'
+    result['R0_rect'] = calib['R0_rect'].to(device=device, dtype=torch.float)
+    for i in range(5):
+        result['P' + str(i)] = calib['P' + str(i)
+                                     ].to(device=device, dtype=torch.float)
+        result['Tr_velo_to_cam_' +
+               str(i)] = calib['Tr_velo_to_cam_' +
+                               str(i)].to(device=device, dtype=torch.float)
+    return result
+
+
+def to_numpy(tensor):
+    return tensor.detach().cpu().numpy() if tensor.requires_grad else tensor.cpu().numpy()
+
+
+class BackendOnnx(backend.Backend):
+    def __init__(
+            self,
+            segmentor_path,
+            lidar_detector_path,
+            data_path
+    ):
+        super(BackendOnnx, self).__init__()
+        self.segmentor_path = segmentor_path
+        self.lidar_detector_path = lidar_detector_path
+        # self.segmentation_classes = 18
+        self.detection_classes = 3
+        self.data_root = data_path
+        CLASSES = Waymo.CLASSES
+        self.LABEL2CLASSES = {v: k for k, v in CLASSES.items()}
+
+    def version(self):
+        return torch.__version__
+
+    def name(self):
+        return "python-SUT"
+
+    def load(self):
+        device = torch.device("cpu")
+        PaintArgs = namedtuple(
+            'PaintArgs', [
+                'training_path', 'model_path', 'cam_sync'])
+        painting_args = PaintArgs(
+            os.path.join(
+                self.data_root,
+                'training'),
+            self.segmentor_path,
+            False)
+        self.painter = Painter(painting_args, onnx=True)
+        self.segmentor = self.painter.model
+        model_pre = PointPillarsPre()
+        model_post = PointPillarsPos(self.detection_classes)
+        model_pre.eval()
+        model_post.eval()
+        ort_sess = ort.InferenceSession(self.lidar_detector_path)
+        self.lidar_detector = ort_sess
+        self.model_pre = model_pre
+        self.model_post = model_post
+        return self
+
+    def predict(self, inputs):
+        dimensions, locations, rotation_y, box2d, class_labels, class_scores, ids = [
+        ], [], [], [], [], [], []
+        with torch.inference_mode():
+            model_input = inputs[0]
+            batched_pts = model_input['pts']
+            scores_from_cam = []
+            for i in range(len(model_input['images'])):
+                input_image_name = self.segmentor.get_inputs()[0].name
+                input_data = {
+                    input_image_name: to_numpy(
+                        model_input['images'][i])}
+                segmentation_score = self.segmentor.run(None, input_data)
+                segmentation_score = [
+                    torch.from_numpy(item) for item in segmentation_score]
+                scores_from_cam.append(
+                    self.painter.get_score(
+                        segmentation_score[0].squeeze(0)).cpu())
+            points = self.painter.augment_lidar_class_scores_both(
+                scores_from_cam, batched_pts, model_input['calib_info'])
+            pillars, coors_batch, npoints_per_pillar = self.model_pre(batched_pts=[
+                points])
+            input_pillars_name = self.lidar_detector.get_inputs()[0].name
+            input_coors_batch_name = self.lidar_detector.get_inputs()[1].name
+            input_npoints_per_pillar_name = self.lidar_detector.get_inputs()[
+                2].name
+            input_data = {input_pillars_name: to_numpy(pillars),
+                          input_coors_batch_name: to_numpy(coors_batch),
+                          input_npoints_per_pillar_name: to_numpy(npoints_per_pillar)}
+            result = self.lidar_detector.run(None, input_data)
+            result = [torch.from_numpy(item) for item in result]
+            batch_results = self.model_post(result)
+            for j, result in enumerate(batch_results):
+                format_result = {
+                    'class': [],
+                    'truncated': [],
+                    'occluded': [],
+                    'alpha': [],
+                    'bbox': [],
+                    'dimensions': [],
+                    'location': [],
+                    'rotation_y': [],
+                    'score': [],
+                    'idx': -1
+                }
+
+                calib_info = model_input['calib_info']
+                image_info = model_input['image_info']
+                idx = model_input['image_info']['image_idx']
+                format_result['idx'] = idx
+                calib_info = change_calib_device(calib_info, False)
+                result_filter = keep_bbox_from_image_range(
+                    result, calib_info, 5, image_info, False)
+
+                lidar_bboxes = result_filter['lidar_bboxes']
+                labels, scores = result_filter['labels'], result_filter['scores']
+                bboxes2d, camera_bboxes = result_filter['bboxes2d'], result_filter['camera_bboxes']
+                for lidar_bbox, label, score, bbox2d, camera_bbox in \
+                        zip(lidar_bboxes, labels, scores, bboxes2d, camera_bboxes):
+                    format_result['class'].append(label.item())
+                    format_result['truncated'].append(0.0)
+                    format_result['occluded'].append(0)
+                    alpha = camera_bbox[6] - \
+                        np.arctan2(camera_bbox[0], camera_bbox[2])
+                    format_result['alpha'].append(alpha.item())
+                    format_result['bbox'].append(bbox2d.tolist())
+                    format_result['dimensions'].append(camera_bbox[3:6])
+                    format_result['location'].append(camera_bbox[:3])
+                    format_result['rotation_y'].append(camera_bbox[6].item())
+                    format_result['score'].append(score.item())
+
+                if len(format_result['dimensions']) > 0:
+                    format_result['dimensions'] = torch.stack(
+                        format_result['dimensions'])
+                    format_result['location'] = torch.stack(
+                        format_result['location'])
+                    dimensions.append(format_result['dimensions'])
+                    locations.append(format_result['location'])
+                    rotation_y.append(format_result['rotation_y'])
+                    class_labels.append(format_result['class'])
+                    class_scores.append(format_result['score'])
+                    box2d.append(format_result['bbox'])
+                    ids.append(format_result['idx'])
+
+        return dimensions, locations, rotation_y, box2d, class_labels, class_scores, ids
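Note how `BackendOnnx` always resolves graph input names from the session instead of hardcoding them. A standalone sketch of that onnxruntime pattern (the model path, shapes, and float32 dtype are placeholders, not the benchmark's real checkpoints):
```
import numpy as np
import onnxruntime as ort

# Placeholder path -- substitute one of the benchmark's .onnx checkpoints.
sess = ort.InferenceSession("model.onnx")

# Build the feed dict from the session's own input metadata, mirroring
# how BackendOnnx calls get_inputs()[i].name before run().
feed = {}
for inp in sess.get_inputs():
    # Fill symbolic/dynamic dimensions with 1; assume float32 inputs here.
    shape = [d if isinstance(d, int) else 1 for d in inp.shape]
    feed[inp.name] = np.zeros(shape, dtype=np.float32)

outputs = sess.run(None, feed)  # None -> return every graph output
```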
diff --git a/automotive/3d-object-detection/dockerfile.gpu b/automotive/3d-object-detection/dockerfile.gpu
index 02acca7a3..ad4237cd0 100644
--- a/automotive/3d-object-detection/dockerfile.gpu
+++ b/automotive/3d-object-detection/dockerfile.gpu
@@ -1,4 +1,4 @@
-ARG FROM_IMAGE_NAME=pytorch/pytorch:2.0.1-cuda11.7-cudnn8-devel
+ARG FROM_IMAGE_NAME=pytorch/pytorch:2.2.2-cuda11.8-cudnn8-devel
 FROM ${FROM_IMAGE_NAME}

 ENV DEBIAN_FRONTEND=noninteractive
@@ -20,12 +20,12 @@ RUN cd /tmp && \
     CFLAGS="-std=c++14" python setup.py install && \
     rm -rf mlperf

-RUN pip install tqdm
-RUN pip install numba
-RUN pip install opencv-python
-RUN pip install open3d
-RUN pip install tensorboard
-RUN pip install scikit-image
-RUN pip install ninja
-RUN pip install visdom
-RUN pip install shapely
\ No newline at end of file
+RUN pip install tqdm==4.65.0
+RUN pip install numba==0.60.0
+RUN pip install opencv-python==4.11.0.86
+RUN pip install open3d==0.19.0
+RUN pip install scikit-image==0.25.0
+RUN pip install ninja==1.11.1
+RUN pip install shapely==2.0.6
+RUN pip install tensorboard==2.18.0
+RUN pip install onnxruntime==1.20.1
\ No newline at end of file
diff --git a/automotive/3d-object-detection/main.py b/automotive/3d-object-detection/main.py
index 04269e428..caafd9e47 100644
--- a/automotive/3d-object-detection/main.py
+++ b/automotive/3d-object-detection/main.py
@@ -167,7 +167,9 @@ def get_backend(backend, **kwargs):
         from backend_deploy import BackendDeploy
         backend = BackendDeploy(**kwargs)
-
+    elif backend == 'onnx':
+        from backend_onnx import BackendOnnx
+        backend = BackendOnnx(**kwargs)
     elif backend == "debug":
         from backend_debug import BackendDebug
@@ -403,7 +405,6 @@ def flush_queries():
     log_settings.log_output = log_output_settings

     settings = lg.TestSettings()
-    settings.FromConfig(mlperf_conf, args.model_name, args.scenario)
     settings.FromConfig(user_conf, args.model_name, args.scenario)
     settings.scenario = scenario
     settings.mode = lg.TestMode.PerformanceOnly
diff --git a/automotive/3d-object-detection/model/painter.py b/automotive/3d-object-detection/model/painter.py
index f0680931f..36bd2aa0b 100644
--- a/automotive/3d-object-detection/model/painter.py
+++ b/automotive/3d-object-detection/model/painter.py
@@ -1,3 +1,4 @@
+import onnxruntime as ort
 import argparse
 import model.segmentation as network
 import os
@@ -34,10 +35,15 @@ def get_calib_from_file(calib_file):
     return data


+def to_numpy(tensor):
+    return tensor.detach().cpu().numpy() if tensor.requires_grad else tensor.cpu().numpy()
+
+
 class Painter:
-    def __init__(self, args):
+    def __init__(self, args, onnx=False):
         self.root_split_path = args.training_path
         self.save_path = os.path.join(args.training_path, "painted_lidar/")
+        self.onnx = onnx

         if not os.path.exists(self.save_path):
             os.mkdir(self.save_path)
@@ -45,13 +51,18 @@ def __init__(self, args, onnx=False):
         self.model = None
         print(f'Using Segmentation Network -- deeplabv3plus')
         checkpoint_file = args.model_path
-        model = network.modeling.__dict__['deeplabv3plus_resnet50'](
-            num_classes=19, output_stride=16)
-        checkpoint = torch.load(checkpoint_file)
-        model.load_state_dict(checkpoint["model_state"])
-        model.eval()
-        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-        model.to(device)
+        if self.onnx:
+            model = ort.InferenceSession(checkpoint_file)
+            self.input_image_name = model.get_inputs()[0].name
+        else:
+            model = network.modeling.__dict__['deeplabv3plus_resnet50'](
+                num_classes=19, output_stride=16)
+            checkpoint = torch.load(checkpoint_file)
+
model.load_state_dict(checkpoint["model_state"]) + model.eval() + device = torch.device( + 'cuda' if torch.cuda.is_available() else 'cpu') + model.to(device) self.model = model self.cam_sync = args.cam_sync diff --git a/automotive/3d-object-detection/model/pointpillars.py b/automotive/3d-object-detection/model/pointpillars.py index 49257fe5a..ded0d1685 100644 --- a/automotive/3d-object-detection/model/pointpillars.py +++ b/automotive/3d-object-detection/model/pointpillars.py @@ -397,7 +397,9 @@ def get_predicted_bboxes_single( # 3.2 nms core keep_inds = ml3d.ops.nms( - cur_bbox_pred2d, cur_bbox_cls_pred, self.nms_thr) + cur_bbox_pred2d.cpu(), + cur_bbox_cls_pred.cpu(), + self.nms_thr) cur_bbox_cls_pred = cur_bbox_cls_pred[keep_inds] cur_bbox_pred = cur_bbox_pred[keep_inds] diff --git a/automotive/3d-object-detection/model/pointpillars_core.py b/automotive/3d-object-detection/model/pointpillars_core.py new file mode 100644 index 000000000..a92f46be1 --- /dev/null +++ b/automotive/3d-object-detection/model/pointpillars_core.py @@ -0,0 +1,547 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from model.anchors import Anchors, anchor_target, anchors2bboxes +from ops import Voxelization +from tools.process import limit_period +import math +import open3d.ml.torch as ml3d + + +class PillarLayer(nn.Module): + def __init__(self, voxel_size, point_cloud_range, + max_num_points, max_voxels): + super().__init__() + self.voxel_layer = Voxelization(voxel_size=voxel_size, + point_cloud_range=point_cloud_range, + max_num_points=max_num_points, + max_voxels=max_voxels) + self.point_cloud_range = point_cloud_range + self.voxel_size = voxel_size + + @torch.no_grad() + def forward(self, batched_pts): + ''' + batched_pts: list[tensor], len(batched_pts) = bs + return: + pillars: (p1 + p2 + ... + pb, num_points, c), + coors_batch: (p1 + p2 + ... + pb, 1 + 3), + num_points_per_pillar: (p1 + p2 + ... + pb, ), (b: batch size) + ''' + pillars, coors, npoints_per_pillar = [], [], [] + for i, pts in enumerate(batched_pts): + voxels_out, coors_out, num_points_per_voxel_out = self.voxel_layer( + pts) + # voxels_out: (max_voxel, num_points, c), coors_out: (max_voxel, 3) + # num_points_per_voxel_out: (max_voxel, ) + pillars.append(voxels_out) + coors.append(coors_out.long()) + npoints_per_pillar.append(num_points_per_voxel_out) + + # (p1 + p2 + ... + pb, num_points, c) + pillars = torch.cat(pillars, dim=0) + npoints_per_pillar = torch.cat( + npoints_per_pillar, + dim=0) # (p1 + p2 + ... + pb, ) + coors_batch = [] + for i, cur_coors in enumerate(coors): + coors_batch.append(F.pad(cur_coors, (1, 0), value=i)) + # (p1 + p2 + ... + pb, 1 + 3) + coors_batch = torch.cat(coors_batch, dim=0) + + return pillars, coors_batch, npoints_per_pillar + + +class PillarEncoder(nn.Module): + def __init__(self, voxel_size, point_cloud_range, in_channel, out_channel): + super().__init__() + self.out_channel = out_channel + self.vx, self.vy = voxel_size[0], voxel_size[1] + self.x_offset = voxel_size[0] / 2 + point_cloud_range[0] + self.y_offset = voxel_size[1] / 2 + point_cloud_range[1] + self.x_l = math.ceil( + (point_cloud_range[3] - + point_cloud_range[0]) / + voxel_size[0]) + self.y_l = math.ceil( + (point_cloud_range[4] - + point_cloud_range[1]) / + voxel_size[1]) + + self.conv = nn.Conv1d(in_channel, out_channel, 1, bias=False) + self.bn = nn.BatchNorm1d(out_channel, eps=1e-3, momentum=0.01) + + def forward(self, pillars, coors_batch, npoints_per_pillar): + ''' + pillars: (p1 + p2 + ... 
+ pb, num_points, c), c = 4
+        coors_batch: (p1 + p2 + ... + pb, 1 + 3)
+        npoints_per_pillar: (p1 + p2 + ... + pb, )
+        return: (bs, out_channel, y_l, x_l)
+        '''
+        device = pillars.device
+        # 1. calculate offset to the points center (in each pillar)
+        offset_pt_center = pillars[:, :, :3] - torch.sum(
+            pillars[:, :, :3], dim=1, keepdim=True) / npoints_per_pillar[:, None, None]  # (p1 + p2 + ... + pb, num_points, 3)
+
+        # 2. calculate offset to the pillar center
+        # (p1 + p2 + ... + pb, num_points, 1)
+        x_offset_pi_center = pillars[:, :, :1] - \
+            (coors_batch[:, None, 1:2] * self.vx + self.x_offset)
+        # (p1 + p2 + ... + pb, num_points, 1)
+        y_offset_pi_center = pillars[:, :, 1:2] - \
+            (coors_batch[:, None, 2:3] * self.vy + self.y_offset)
+
+        # 3. encoder
+        features = torch.cat([pillars,
+                              offset_pt_center,
+                              x_offset_pi_center,
+                              y_offset_pi_center],
+                             dim=-1)  # (p1 + p2 + ... + pb, num_points, 9)
+        features[:, :, 0:1] = x_offset_pi_center  # tmp
+        features[:, :, 1:2] = y_offset_pi_center  # tmp
+        # Consistent with mmdet3d; the reason is discussed in
+        # https://github.com/open-mmlab/mmdetection3d/issues/1150
+
+        # 4. find mask for (0, 0, 0) and update the encoded features
+        # a very beautiful implementation
+        voxel_ids = torch.arange(
+            0, pillars.size(1)).to(device)  # (num_points, )
+        # (num_points, p1 + p2 + ... + pb)
+        mask = voxel_ids[:, None] < npoints_per_pillar[None, :]
+        # (p1 + p2 + ... + pb, num_points)
+        mask = mask.permute(1, 0).contiguous()
+        features *= mask[:, :, None]
+
+        # 5. embedding
+        # (p1 + p2 + ... + pb, 9, num_points)
+        features = features.permute(0, 2, 1).contiguous()
+        # (p1 + p2 + ... + pb, out_channels, num_points)
+        features = F.relu(self.bn(self.conv(features)))
+        # (p1 + p2 + ... + pb, out_channels)
+        pooling_features = torch.max(features, dim=-1)[0]
+
+        # 6. pillar scatter
+        batched_canvas = []
+        bs = coors_batch[-1, 0] + 1
+        for i in range(bs):
+            cur_coors_idx = coors_batch[:, 0] == i
+            cur_coors = coors_batch[cur_coors_idx, :]
+            cur_features = pooling_features[cur_coors_idx]
+
+            canvas = torch.zeros(
+                (self.x_l, self.y_l, self.out_channel),
+                dtype=torch.float32,
+                device=device)
+            canvas[cur_coors[:, 1], cur_coors[:, 2]] = cur_features
+            canvas = canvas.permute(2, 1, 0).contiguous()
+            batched_canvas.append(canvas)
+        # (bs, in_channel, self.y_l, self.x_l)
+        batched_canvas = torch.stack(batched_canvas, dim=0)
+        return batched_canvas
+
+
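The scatter loop above builds the dense pseudo-image by indexing each pillar's pooled feature vector into its (x, y) grid cell. A standalone toy sketch of the same indexing (sizes and features are made up; the real canvas dimensions come from point_cloud_range and voxel_size):
```
import torch

# Toy sizes -- the real model uses the x_l/y_l grid computed in PillarEncoder.
x_l, y_l, out_channel, num_pillars = 8, 8, 4, 5

# Integer (x, y) cell coordinates and pooled features for each pillar.
coors = torch.randint(0, 8, (num_pillars, 2))
pooled = torch.randn(num_pillars, out_channel)

canvas = torch.zeros((x_l, y_l, out_channel))
canvas[coors[:, 0], coors[:, 1]] = pooled            # scatter features into cells
pseudo_image = canvas.permute(2, 1, 0).contiguous()  # (C, y_l, x_l)
print(pseudo_image.shape)  # torch.Size([4, 8, 8])
```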
+class Backbone(nn.Module):
+    def __init__(self, in_channel, out_channels,
+                 layer_nums, layer_strides=[2, 2, 2]):
+        super().__init__()
+        assert len(out_channels) == len(layer_nums)
+        assert len(out_channels) == len(layer_strides)
+
+        self.multi_blocks = nn.ModuleList()
+        for i in range(len(layer_strides)):
+            blocks = []
+            blocks.append(
+                nn.Conv2d(
+                    in_channel,
+                    out_channels[i],
+                    3,
+                    stride=layer_strides[i],
+                    bias=False,
+                    padding=1))
+            blocks.append(
+                nn.BatchNorm2d(
+                    out_channels[i],
+                    eps=1e-3,
+                    momentum=0.01))
+            blocks.append(nn.ReLU(inplace=True))
+
+            for _ in range(layer_nums[i]):
+                blocks.append(
+                    nn.Conv2d(
+                        out_channels[i],
+                        out_channels[i],
+                        3,
+                        bias=False,
+                        padding=1))
+                blocks.append(
+                    nn.BatchNorm2d(
+                        out_channels[i],
+                        eps=1e-3,
+                        momentum=0.01))
+                blocks.append(nn.ReLU(inplace=True))
+
+            in_channel = out_channels[i]
+            self.multi_blocks.append(nn.Sequential(*blocks))
+
+        # consistent with mmdet3d
+        for m in self.modules():
+            if isinstance(m, nn.Conv2d):
+                nn.init.kaiming_normal_(
+                    m.weight, mode='fan_out', nonlinearity='relu')
+
+    def forward(self, x):
+        '''
+        x: (b, c, y_l, x_l). Default: (6, 64, 496, 432)
+        return: list[]. Default: [(6, 64, 248, 216), (6, 128, 124, 108), (6, 256, 62, 54)]
+        '''
+        outs = []
+        for i in range(len(self.multi_blocks)):
+            x = self.multi_blocks[i](x)
+            outs.append(x)
+        return outs
+
+
+class Neck(nn.Module):
+    def __init__(self, in_channels, upsample_strides, out_channels):
+        super().__init__()
+        assert len(in_channels) == len(upsample_strides)
+        assert len(upsample_strides) == len(out_channels)
+
+        self.decoder_blocks = nn.ModuleList()
+        for i in range(len(in_channels)):
+            decoder_block = []
+            decoder_block.append(nn.ConvTranspose2d(in_channels[i],
+                                                    out_channels[i],
+                                                    upsample_strides[i],
+                                                    stride=upsample_strides[i],
+                                                    bias=False))
+            decoder_block.append(
+                nn.BatchNorm2d(
+                    out_channels[i],
+                    eps=1e-3,
+                    momentum=0.01))
+            decoder_block.append(nn.ReLU(inplace=True))
+
+            self.decoder_blocks.append(nn.Sequential(*decoder_block))
+
+        # consistent with mmdet3d
+        for m in self.modules():
+            if isinstance(m, nn.ConvTranspose2d):
+                nn.init.kaiming_normal_(
+                    m.weight, mode='fan_out', nonlinearity='relu')
+
+    def forward(self, x):
+        '''
+        x: [(bs, 64, 248, 216), (bs, 128, 124, 108), (bs, 256, 62, 54)]
+        return: (bs, 384, 248, 216)
+        '''
+        outs = []
+        for i in range(len(self.decoder_blocks)):
+            xi = self.decoder_blocks[i](x[i])  # (bs, 128, 248, 216)
+            outs.append(xi)
+        out = torch.cat(outs, dim=1)
+        return out
+
+
+class Head(nn.Module):
+    def __init__(self, in_channel, n_anchors, n_classes):
+        super().__init__()
+
+        self.conv_cls = nn.Conv2d(in_channel, n_anchors * n_classes, 1)
+        self.conv_reg = nn.Conv2d(in_channel, n_anchors * 7, 1)
+        self.conv_dir_cls = nn.Conv2d(in_channel, n_anchors * 2, 1)
+
+        # consistent with mmdet3d
+        conv_layer_id = 0
+        for m in self.modules():
+            if isinstance(m, nn.Conv2d):
+                nn.init.normal_(m.weight, mean=0,
std=0.01) + if conv_layer_id == 0: + prior_prob = 0.01 + bias_init = float(-math.log((1 - prior_prob) / prior_prob)) + nn.init.constant_(m.bias, bias_init) + else: + nn.init.constant_(m.bias, 0) + conv_layer_id += 1 + + def forward(self, x): + ''' + x: (bs, 384, 248, 216) + return: + bbox_cls_pred: (bs, n_anchors*3, 248, 216) + bbox_pred: (bs, n_anchors*7, 248, 216) + bbox_dir_cls_pred: (bs, n_anchors*2, 248, 216) + ''' + bbox_cls_pred = self.conv_cls(x) + bbox_pred = self.conv_reg(x) + bbox_dir_cls_pred = self.conv_dir_cls(x) + return bbox_cls_pred, bbox_pred, bbox_dir_cls_pred + + +class PointPillarsPre(nn.Module): + def __init__(self, + nclasses=3, + voxel_size=[0.32, 0.32, 6], + point_cloud_range=[-74.88, -74.88, -2, 74.88, 74.88, 4], + max_num_points=20, + max_voxels=(32000, 32000), + painted=False): + super().__init__() + self.pillar_layer = PillarLayer(voxel_size=voxel_size, + point_cloud_range=point_cloud_range, + max_num_points=max_num_points, + max_voxels=max_voxels) + + def forward(self, batched_pts): + pillars, coors_batch, npoints_per_pillar = self.pillar_layer( + batched_pts) + return pillars, coors_batch, npoints_per_pillar + + +class PointPillarsCore(nn.Module): + def __init__(self, + nclasses=3, + voxel_size=[0.32, 0.32, 6], + point_cloud_range=[-74.88, -74.88, -2, 74.88, 74.88, 4], + max_num_points=20, + max_voxels=(32000, 32000), + painted=False): + super().__init__() + self.nclasses = nclasses + if painted: + pillar_channel = 16 + else: + pillar_channel = 10 + self.pillar_encoder = PillarEncoder(voxel_size=voxel_size, + point_cloud_range=point_cloud_range, + in_channel=pillar_channel, + out_channel=64) + self.backbone = Backbone(in_channel=64, + out_channels=[64, 128, 256], + layer_nums=[3, 5, 5], + layer_strides=[1, 2, 2]) + self.neck = Neck(in_channels=[64, 128, 256], + upsample_strides=[1, 2, 4], + out_channels=[128, 128, 128]) + self.head = Head( + in_channel=384, + n_anchors=2 * nclasses, + n_classes=nclasses) + + # anchors + ranges = [[-74.88, -74.88, -0.0345, 74.88, 74.88, -0.0345], + [-74.88, -74.88, 0, 74.88, 74.88, 0], + [-74.88, -74.88, -0.1188, 74.88, 74.88, -0.1188]] + sizes = [[0.84, .91, 1.74], [.84, 1.81, 1.77], [2.08, 4.73, 1.77]] + rotations = [0, 1.57] + self.anchors_generator = Anchors(ranges=ranges, + sizes=sizes, + rotations=rotations) + + # train + self.assigners = [ + {'pos_iou_thr': 0.5, 'neg_iou_thr': 0.3, 'min_iou_thr': 0.3}, + {'pos_iou_thr': 0.5, 'neg_iou_thr': 0.3, 'min_iou_thr': 0.3}, + {'pos_iou_thr': 0.55, 'neg_iou_thr': 0.4, 'min_iou_thr': 0.4}, + ] + + # val and test + self.nms_pre = 4096 + self.nms_thr = 0.25 + self.score_thr = 0.1 + self.max_num = 500 + + def get_predicted_bboxes_single( + self, bbox_cls_pred, bbox_pred, bbox_dir_cls_pred, anchors): + ''' + bbox_cls_pred: (n_anchors*3, 248, 216) + bbox_pred: (n_anchors*7, 248, 216) + bbox_dir_cls_pred: (n_anchors*2, 248, 216) + anchors: (y_l, x_l, 3, 2, 7) + return: + bboxes: (k, 7) + labels: (k, ) + scores: (k, ) + ''' + # 0. pre-process + bbox_cls_pred = bbox_cls_pred.permute( + 1, 2, 0).reshape(-1, self.nclasses) + bbox_pred = bbox_pred.permute(1, 2, 0).reshape(-1, 7) + bbox_dir_cls_pred = bbox_dir_cls_pred.permute(1, 2, 0).reshape(-1, 2) + anchors = anchors.reshape(-1, 7) + + bbox_cls_pred = torch.sigmoid(bbox_cls_pred) + bbox_dir_cls_pred = torch.max(bbox_dir_cls_pred, dim=1)[1] + + # 1. 
obtain self.nms_pre bboxes based on scores + inds = bbox_cls_pred.max(1)[0].topk(self.nms_pre)[1] + bbox_cls_pred = bbox_cls_pred[inds] + bbox_pred = bbox_pred[inds] + bbox_dir_cls_pred = bbox_dir_cls_pred[inds] + anchors = anchors[inds] + + # 2. decode predicted offsets to bboxes + bbox_pred = anchors2bboxes(anchors, bbox_pred) + return torch.cat( + [bbox_pred, bbox_cls_pred, bbox_dir_cls_pred[:, None]], 1) + + def get_predicted_bboxes( + self, bbox_cls_pred, bbox_pred, bbox_dir_cls_pred, batched_anchors): + ''' + bbox_cls_pred: (bs, n_anchors*3, 248, 216) + bbox_pred: (bs, n_anchors*7, 248, 216) + bbox_dir_cls_pred: (bs, n_anchors*2, 248, 216) + batched_anchors: (bs, y_l, x_l, 3, 2, 7) + return: + bboxes: [(k1, 7), (k2, 7), ... ] + labels: [(k1, ), (k2, ), ... ] + scores: [(k1, ), (k2, ), ... ] + ''' + results = [] + bs = bbox_cls_pred.size(0) + for i in range(bs): + result = self.get_predicted_bboxes_single(bbox_cls_pred=bbox_cls_pred[i], + bbox_pred=bbox_pred[i], + bbox_dir_cls_pred=bbox_dir_cls_pred[i], + anchors=batched_anchors[i]) + results.append(result) + return results + + def forward(self, pillars, coors_batch, npoints_per_pillar, + mode='test', batched_gt_bboxes=None, batched_gt_labels=None): + pillar_features = self.pillar_encoder( + pillars, coors_batch, npoints_per_pillar) + + # xs: [(bs, 64, 248, 216), (bs, 128, 124, 108), (bs, 256, 62, 54)] + xs = self.backbone(pillar_features) + + # x: (bs, 384, 248, 216) + x = self.neck(xs) + + # bbox_cls_pred: (bs, n_anchors*3, 248, 216) + # bbox_pred: (bs, n_anchors*7, 248, 216) + # bbox_dir_cls_pred: (bs, n_anchors*2, 248, 216) + bbox_cls_pred, bbox_pred, bbox_dir_cls_pred = self.head(x) + + # anchors + device = bbox_cls_pred.device + feature_map_size = torch.tensor( + list(bbox_cls_pred.size()[-2:]), device=device) + anchors = self.anchors_generator.get_multi_anchors(feature_map_size) + batch_size = pillar_features.shape[0] + batched_anchors = [anchors for _ in range(batch_size)] + + if mode == 'train': + anchor_target_dict = anchor_target(batched_anchors=batched_anchors, + batched_gt_bboxes=batched_gt_bboxes, + batched_gt_labels=batched_gt_labels, + assigners=self.assigners, + nclasses=self.nclasses) + + return bbox_cls_pred, bbox_pred, bbox_dir_cls_pred, anchor_target_dict + elif mode == 'val': + results = self.get_predicted_bboxes(bbox_cls_pred=bbox_cls_pred, + bbox_pred=bbox_pred, + bbox_dir_cls_pred=bbox_dir_cls_pred, + batched_anchors=batched_anchors) + return results + + elif mode == 'test': + results = self.get_predicted_bboxes(bbox_cls_pred=bbox_cls_pred, + bbox_pred=bbox_pred, + bbox_dir_cls_pred=bbox_dir_cls_pred, + batched_anchors=batched_anchors) + return results + else: + raise ValueError + + +class PointPillarsPos(nn.Module): + def __init__(self, nclasses=3): + super().__init__() + self.nclasses = nclasses + self.nms_thr = 0.25 + self.score_thr = 0.1 + self.max_num = 500 + + def nms_filter(self, bbox_pred, bbox_cls_pred, bbox_dir_cls_pred): + # 3. 
nms + bbox_pred2d_xy = bbox_pred[:, [0, 1]] + bbox_pred2d_lw = bbox_pred[:, [3, 4]] + bbox_pred2d = torch.cat([bbox_pred2d_xy - bbox_pred2d_lw / 2, + bbox_pred2d_xy + bbox_pred2d_lw / 2, + bbox_pred[:, 6:]], dim=-1) # (n_anchors, 5) + + ret_bboxes, ret_labels, ret_scores = [], [], [] + for i in range(self.nclasses): + # 3.1 filter bboxes with scores below self.score_thr + cur_bbox_cls_pred = bbox_cls_pred[:, i] + score_inds = cur_bbox_cls_pred > self.score_thr + if score_inds.sum() == 0: + continue + + cur_bbox_cls_pred = cur_bbox_cls_pred[score_inds] + cur_bbox_pred2d = bbox_pred2d[score_inds] + cur_bbox_pred = bbox_pred[score_inds] + cur_bbox_dir_cls_pred = bbox_dir_cls_pred[score_inds] + + # 3.2 nms core + keep_inds = ml3d.ops.nms( + cur_bbox_pred2d.detach().cpu(), + cur_bbox_cls_pred.detach().cpu(), + self.nms_thr) + cur_bbox_cls_pred = cur_bbox_cls_pred[keep_inds] + cur_bbox_pred = cur_bbox_pred[keep_inds] + cur_bbox_dir_cls_pred = cur_bbox_dir_cls_pred[keep_inds] + cur_bbox_pred[:, - + 1] = limit_period(cur_bbox_pred[:, - + 1].detach().cpu(), 1, math.pi).to(cur_bbox_pred) # [-pi, 0] + cur_bbox_pred[:, -1] += (1 - cur_bbox_dir_cls_pred) * math.pi + + ret_bboxes.append(cur_bbox_pred) + ret_labels.append(torch.zeros_like( + cur_bbox_pred[:, 0], dtype=torch.long) + i) + ret_scores.append(cur_bbox_cls_pred) + + # 4. filter some bboxes if bboxes number is above self.max_num + if len(ret_bboxes) == 0: + return { + 'lidar_bboxes': torch.empty((0, 7)).detach().cpu(), + 'labels': torch.empty(0).detach().cpu(), + 'scores': torch.empty(0).detach().cpu() + } + ret_bboxes = torch.cat(ret_bboxes, 0) + ret_labels = torch.cat(ret_labels, 0) + ret_scores = torch.cat(ret_scores, 0) + if ret_bboxes.size(0) > self.max_num: + final_inds = ret_scores.topk(self.max_num)[1] + ret_bboxes = ret_bboxes[final_inds] + ret_labels = ret_labels[final_inds] + ret_scores = ret_scores[final_inds] + result = { + 'lidar_bboxes': ret_bboxes.detach().cpu(), + 'labels': ret_labels.detach().cpu(), + 'scores': ret_scores.detach().cpu() + } + return result + + def forward(self, results): + pos_results = [] + for result in results: + bbox_pred, bbox_cls_pred, bbox_dir_cls_pred = result[:, + :7], result[:, 7:10], result[:, 10] + pos_result = self.nms_filter( + bbox_pred, bbox_cls_pred, bbox_dir_cls_pred) + if pos_result is not None: + pos_results.append(pos_result) + return pos_results diff --git a/automotive/3d-object-detection/tools/evaluate.py b/automotive/3d-object-detection/tools/evaluate.py index 3bc56ee4f..eb3c5d32a 100644 --- a/automotive/3d-object-detection/tools/evaluate.py +++ b/automotive/3d-object-detection/tools/evaluate.py @@ -53,7 +53,6 @@ def do_eval(det_results, gt_results, CLASSES, cam_sync=False): gt_results: dict(id -> det_results) CLASSES: dict ''' - assert len(det_results) == len(gt_results) # 1. 
calculate iou ious = { @@ -66,6 +65,8 @@ def do_eval(det_results, gt_results, CLASSES, cam_sync=False): annos_label = 'annos' for id in range(len(gt_results)): gt_result = gt_results[id][annos_label] + if gt_results[id]['image']['image_idx'] not in det_results: + continue det_result = det_results[gt_results[id]['image']['image_idx']] # 1.2, bev iou diff --git a/automotive/3d-object-detection/waymo.py b/automotive/3d-object-detection/waymo.py index 5818ed01e..71a870f2f 100644 --- a/automotive/3d-object-detection/waymo.py +++ b/automotive/3d-object-detection/waymo.py @@ -78,10 +78,6 @@ def __init__(self, data_root, split, self.painted = painted self.cam_sync = cam_sync self.point_range_filter = [-74.88, -74.88, -2, 74.88, 74.88, 4] - if painted or cam_sync: - info_file = f'painted_waymo_infos_{split}.pkl' - else: - info_file = f'waymo_infos_{split}.pkl' self.data_infos = read_pickle(os.path.join(data_root, info_file)) self.sorted_ids = range(len(self.data_infos)) @@ -202,6 +198,9 @@ def get_image(self, idx, camera): input_batch = input_tensor.unsqueeze(0) return input_batch + def get_item_count(self): + return len(self.data_infos) + class PostProcessWaymo: def __init__( @@ -220,6 +219,23 @@ def __call__(self, results, content_id, inputs, result_dict): for idx in range(len(content_id)): processed_results.append([]) detection_num = len(results[0][idx]) + if detection_num == 0: + processed_results[idx].append([ + -1, + -1, + -1, + -1, + -1, + -1, + -1, + -1, + -1, + -1, + -1, + -1, + -1, + results[6][idx] + ]) for detection in range(0, detection_num): processed_results[idx].append([ results[0][idx][detection][0],
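Taken together, the PostProcessWaymo padding above and the `dimension[0] > 0` guard added to accuracy_waymo.py form one convention: a frame with no detections still emits a single placeholder row of -1s (with the frame id from `results[6]` as the last field), so every query produces a log entry, and the accuracy script drops such rows before evaluation because a real box always has a positive first dimension component. A minimal sketch of that round trip (field layout inferred from the patch; values are made up):
```
# Placeholder row as emitted for an empty frame: 13 payload fields of -1,
# then the frame id.
frame_id = 42
placeholder = [-1] * 13 + [frame_id]

# A real row starts with the three dimension components, which are positive.
real = [1.5, 1.8, 4.2] + [0.0] * 10 + [frame_id]

# Mirrors the `if dimension[0] > 0` guard in accuracy_waymo.py.
kept = [row for row in [placeholder, real] if row[0] > 0]
print(len(kept))  # 1 -- only the real detection survives
```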