Add training details (#150)

* [Major] Update training details. * [Minor] Update README.md. * [Minor] Remove comment.
2022-09-26 18:24:38 -04:00 · 2022-09-26 18:24:38 -04:00 · f39a4a0752
parent e4d599edd5
commit f39a4a0752
15 changed files with 275 additions and 101 deletions
--- a/README.md
+++ b/README.md
@ -34,8 +34,8 @@ Multi-sensor fusion is essential for an accurate and reliable autonomous driving

 |        Model         | Modality | mAP  | NDS  | Checkpoint  |
 | :------------------: | :------: | :--: | :--: | :---------: |
-|    [BEVFusion](configs/nuscenes/det/transfusion/secfpn/camera+lidar/swint_v0p075/convfuser.yaml)       |   C+L    | 68.39 | 71.32 | [Link](https://bevfusion.mit.edu/files/pretrained/bevfusion-det.pth) |
-| [Camera-Only Baseline](configs/nuscenes/det/centerhead/lssfpn/camera/256x704/swint/default.yaml) |    C     | 33.25 | 40.15 | [Link](https://bevfusion.mit.edu/files/pretrained/camera-only-det.pth) |
+|    [BEVFusion](configs/nuscenes/det/transfusion/secfpn/camera+lidar/swint_v0p075/convfuser.yaml)       |   C+L    | 68.52 | 71.38 | [Link](https://bevfusion.mit.edu/files/pretrained_updated/bevfusion-det.pth) |
+| [Camera-Only Baseline](configs/nuscenes/det/centerhead/lssfpn/camera/256x704/swint/default.yaml) |    C     | 35.56 | 41.21 | [Link](https://bevfusion.mit.edu/files/pretrained_updated/camera-only-det.pth) |
 | [LiDAR-Only Baseline](configs/nuscenes/det/transfusion/secfpn/lidar/voxelnet_0p075.yaml)  |    L     | 64.68 | 69.28 | [Link](https://bevfusion.mit.edu/files/pretrained/lidar-only-det.pth) |

 *Note*: The camera-only object detection baseline is a variant of BEVDet-Tiny with a much heavier view transformer and other differences in hyperparameters. Thanks to our [efficient BEV pooling](mmdet3d/ops/bev_pool) operator, this model runs fast and has higher mAP than BEVDet-Tiny under the same input resolution. Please refer to [BEVDet repo](https://github.com/HuangJunjie2017/BEVDet) for the original BEVDet-Tiny implementation. The LiDAR-only baseline is TransFusion-L.
@ -44,8 +44,8 @@ Multi-sensor fusion is essential for an accurate and reliable autonomous driving

 |        Model         | Modality | mIoU | Checkpoint  |
 | :------------------: | :------: | :--: | :---------: |
-| [BEVFusion](configs/nuscenes/seg/fusion-bev256d2-lss.yaml)       |   C+L    | 62.69 | [Link](https://bevfusion.mit.edu/files/pretrained/bevfusion-seg.pth) |
-| [Camera-Only Baseline](configs/nuscenes/seg/camera-bev256d2.yaml) |    C     | 56.56 | [Link](https://bevfusion.mit.edu/files/pretrained/camera-only-seg.pth) |
+| [BEVFusion](configs/nuscenes/seg/fusion-bev256d2-lss.yaml)       |   C+L    | 62.95 | [Link](https://bevfusion.mit.edu/files/pretrained_updated/bevfusion-seg.pth) |
+| [Camera-Only Baseline](configs/nuscenes/seg/camera-bev256d2.yaml) |    C     | 57.09 | [Link](https://bevfusion.mit.edu/files/pretrained_updated/camera-only-seg.pth) |
 | [LiDAR-Only Baseline](configs/nuscenes/seg/lidar-centerpoint-bev128.yaml)  |    L     | 48.56 | [Link](https://bevfusion.mit.edu/files/pretrained/lidar-only-seg.pth) |

 ## Usage
@ -122,6 +122,34 @@ While for the segmentation variant of BEVFusion, this command will be helpful:
 torchpack dist-run -np 8 python tools/test.py configs/nuscenes/seg/fusion-bev256d2-lss.yaml pretrained/bevfusion-seg.pth --eval map
 ```

+### Training
+
+We provide instructions to reproduce our results on nuScenes.
+
+For example, if you want to train the camera-only variant for object detection, please run:
+
+```bash
+torchpack dist-run -np 8 python tools/train.py configs/nuscenes/det/centerhead/lssfpn/camera/256x704/swint/default.yaml --model.encoders.camera.backbone.init_cfg.checkpoint pretrained/swint-nuimages-pretrained.pth
+```
+
+For camera-only BEV segmentation model, please run:
+
+```bash
+torchpack dist-run -np 8 python tools/train.py configs/nuscenes/seg/camera-bev256d2.yaml --model.encoders.camera.backbone.init_cfg.checkpoint pretrained/swint-nuimages-pretrained.pth
+```
+
+For LiDAR-only detector, please run:
+
+```bash
+torchpack dist-run -np 8 python tools/train.py configs/nuscenes/det/transfusion/secfpn/lidar/voxelnet_0p075.yaml
+```
+
+For LiDAR-only BEV segmentation model, please run:
+
+```bash
+torchpack dist-run -np 8 python tools/train.py configs/nuscenes/seg/lidar-centerpoint-bev128.yaml
+```
+
 ## FAQs

 Q: Can we directly use the info files prepared by mmdetection3d?
--- a/configs/nuscenes/default.yaml
+++ b/configs/nuscenes/default.yaml
@ -11,16 +11,16 @@ voxel_size: [0.1, 0.1, 0.2]
 image_size: [256, 704]

 augment2d:
-  resize: [[0.48, 0.48], [0.48, 0.48]]
-  rotate: [0.0, 0.0]
+  resize: [[0.38, 0.55], [0.48, 0.48]]
+  rotate: [-5.4, 5.4]
  gridmask:
    prob: 0.0
    fixed_prob: true

 augment3d:
-  scale: [1.0, 1.0]
-  rotate: [0.0, 0.0]
-  translate: 0.0
+  scale: [0.9, 1.1]
+  rotate: [-0.78539816, 0.78539816]
+  translate: 0.5

 object_classes:
  - car
@ -121,7 +121,7 @@ train_pipeline:
    resize_lim: ${augment2d.resize[0]}
    bot_pct_lim: [0.0, 0.0]
    rot_lim: ${augment2d.rotate}
-    rand_flip: false
+    rand_flip: true
    is_train: true
  -
    type: GlobalRotScaleTrans
@ -148,9 +148,8 @@ train_pipeline:
    classes: ${object_classes}
  -
    type: ImageNormalize
-    mean: [103.530, 116.280, 123.675]
-    std: [58.395, 57.12, 57.375]
-    to_rgb: true
+    mean: [0.485, 0.456, 0.406]
+    std: [0.229, 0.224, 0.225]
  - 
    type: GridMask
    use_h: true
@ -180,6 +179,7 @@ train_pipeline:
      - camera2ego
      - lidar2ego
      - lidar2camera
+      - camera2lidar
      - lidar2image
      - img_aug_matrix
      - lidar_aug_matrix
@ -234,9 +234,8 @@ test_pipeline:
    point_cloud_range: ${point_cloud_range}
  -
    type: ImageNormalize
-    mean: [103.530, 116.280, 123.675]
-    std: [58.395, 57.12, 57.375]
-    to_rgb: true
+    mean: [0.485, 0.456, 0.406]
+    std: [0.229, 0.224, 0.225]
  -
    type: DefaultFormatBundle3D
    classes: ${object_classes}
@ -253,6 +252,7 @@ test_pipeline:
      - camera2ego
      - lidar2ego
      - lidar2camera
+      - camera2lidar
      - lidar2image
      - img_aug_matrix
      - lidar_aug_matrix
--- a/configs/nuscenes/det/centerhead/lssfpn/camera/256x704/swint/default.yaml
+++ b/configs/nuscenes/det/centerhead/lssfpn/camera/256x704/swint/default.yaml
@ -59,3 +59,27 @@ model:
      out_channels: 256
      scale_factor: 2

+optimizer:
+  paramwise_cfg:
+    custom_keys:
+      absolute_pos_embed:
+        decay_mult: 0
+      relative_position_bias_table:
+        decay_mult: 0
+      encoders.camera.backbone:
+        lr_mult: 0.1
+
+
+lr_config:
+  policy: cyclic
+  target_ratio: 5.0
+  cyclic_times: 1
+  step_ratio_up: 0.4
+
+momentum_config:
+  policy: cyclic
+  cyclic_times: 1
+  step_ratio_up: 0.4
+
+data:
+  samples_per_gpu: 6
--- a/configs/nuscenes/det/centerhead/lssfpn/camera/default.yaml
+++ b/configs/nuscenes/det/centerhead/lssfpn/camera/default.yaml
@ -1,6 +1,6 @@
 augment3d:
-  scale: [1.0, 1.0]
-  rotate: [0.0, 0.0]
+  scale: [0.95, 1.05]
+  rotate: [-0.3925, 0.3925]
  translate: 0

 model:
--- a/configs/nuscenes/det/centerhead/lssfpn/default.yaml
+++ b/configs/nuscenes/det/centerhead/lssfpn/default.yaml
@ -21,3 +21,15 @@ optimizer:
  type: AdamW
  lr: 2.0e-4
  weight_decay: 0.01
+
+optimizer_config:
+  grad_clip:
+    max_norm: 35
+    norm_type: 2
+
+lr_config:
+  policy: CosineAnnealing
+  warmup: linear
+  warmup_iters: 500
+  warmup_ratio: 0.33333333
+  min_lr_ratio: 1.0e-3
--- a/configs/nuscenes/det/transfusion/secfpn/default.yaml
+++ b/configs/nuscenes/det/transfusion/secfpn/default.yaml
@ -39,3 +39,9 @@ optimizer_config:
  grad_clip:
    max_norm: 35
    norm_type: 2
+
+lr_config:
+  policy: cyclic
+
+momentum_config:
+  policy: cyclic
--- a/configs/nuscenes/seg/camera-bev256d2.yaml
+++ b/configs/nuscenes/seg/camera-bev256d2.yaml
@ -18,6 +18,9 @@ model:
        out_indices: [1, 2, 3]
        with_cp: false
        convert_weights: true
+        init_cfg:
+          type: Pretrained
+          checkpoint: https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pth
      neck:
        type: GeneralizedLSSFPN
        in_channels: [192, 384, 768]
@ -63,3 +66,21 @@ model:
 optimizer:
  type: AdamW
  lr: 1.0e-4
+  weight_decay: 0.01
+  paramwise_cfg:
+    custom_keys:
+      absolute_pos_embed:
+        decay_mult: 0
+      relative_position_bias_table:
+        decay_mult: 0
+
+optimizer_config:
+  grad_clip:
+    max_norm: 35
+    norm_type: 2
+
+lr_config:
+  policy: cyclic
+
+momentum_config:
+  policy: cyclic
--- a/configs/nuscenes/seg/lidar-centerpoint-bev128.yaml
+++ b/configs/nuscenes/seg/lidar-centerpoint-bev128.yaml
@ -62,3 +62,14 @@ model:
 optimizer:
  type: AdamW
  lr: 1.0e-4
+
+optimizer_config:
+  grad_clip:
+    max_norm: 35
+    norm_type: 2
+
+lr_config:
+  policy: cyclic
+
+momentum_config:
+  policy: cyclic
--- a/mmdet3d/datasets/nuscenes_dataset.py
+++ b/mmdet3d/datasets/nuscenes_dataset.py
@ -211,7 +211,6 @@ class NuScenesDataset(Custom3DDataset):

        data = dict(
            token=info["token"],
-            sample_idx=info['token'],
            lidar_path=info["lidar_path"],
            sweeps=info["sweeps"],
            timestamp=info["timestamp"],
@ -236,6 +235,7 @@ class NuScenesDataset(Custom3DDataset):
            data["lidar2image"] = []
            data["camera2ego"] = []
            data["camera_intrinsics"] = []
+            data["camera2lidar"] = []

            for _, camera_info in info["cams"].items():
                data["image_paths"].append(camera_info["data_path"])
@ -267,10 +267,14 @@ class NuScenesDataset(Custom3DDataset):
                camera2ego[:3, 3] = camera_info["sensor2ego_translation"]
                data["camera2ego"].append(camera2ego)

-        # TODO (Haotian): test set submission.
+                # camera to lidar transform
+                camera2lidar = np.eye(4).astype(np.float32)
+                camera2lidar[:3, :3] = camera_info["sensor2lidar_rotation"]
+                camera2lidar[:3, 3] = camera_info["sensor2lidar_translation"]
+                data["camera2lidar"].append(camera2lidar)
+
        annos = self.get_ann_info(index)
        data["ann_info"] = annos
-        
        return data

    def get_ann_info(self, index):
--- a/mmdet3d/datasets/pipelines/formating.py
+++ b/mmdet3d/datasets/pipelines/formating.py
@ -7,6 +7,8 @@ from mmdet3d.core.points import BasePoints
 from mmdet.datasets.builder import PIPELINES
 from mmdet.datasets.pipelines import to_tensor

+import torch
+

@PIPELINES.register_module()
 class DefaultFormatBundle3D:
@ -96,14 +98,7 @@ class DefaultFormatBundle3D:
                        dtype=np.int64,
                    )
        if "img" in results:
-            if isinstance(results["img"], list):
-                # process multiple imgs in single frame
-                imgs = [img.transpose(2, 0, 1) for img in results["img"]]
-                imgs = np.ascontiguousarray(np.stack(imgs, axis=0))
-                results["img"] = DC(to_tensor(imgs), stack=True)
-            else:
-                img = np.ascontiguousarray(results["img"].transpose(2, 0, 1))
-                results["img"] = DC(to_tensor(img), stack=True)
+            results["img"] = DC(torch.stack(results["img"]), stack=True)

        for key in [
            "proposals",
--- a/mmdet3d/datasets/pipelines/loading.py
+++ b/mmdet3d/datasets/pipelines/loading.py
@ -5,6 +5,8 @@ import mmcv
 import numpy as np
 from nuscenes.map_expansion.map_api import NuScenesMap
 from nuscenes.map_expansion.map_api import locations as LOCATIONS
+from PIL import Image
+

 from mmdet3d.core.points import BasePoints, get_points_type
 from mmdet.datasets.builder import PIPELINES
@ -49,26 +51,25 @@ class LoadMultiViewImageFromFiles:
        """
        filename = results["image_paths"]
        # img is of shape (h, w, c, num_views)
-        img = np.stack(
-            [mmcv.imread(name, self.color_type) for name in filename], axis=-1
-        )
-        if self.to_float32:
-            img = img.astype(np.float32)
+        # modified for waymo
+        images = []
+        h, w = 0, 0
+        for name in filename:
+            images.append(Image.open(name))
+        
+        #TODO: consider image padding in waymo
+
        results["filename"] = filename
        # unravel to list, see `DefaultFormatBundle` in formating.py
        # which will transpose each image separately and then stack into array
-        results["img"] = [img[..., i] for i in range(img.shape[-1])]
-        results["img_shape"] = img.shape
-        results["ori_shape"] = img.shape
+        results["img"] = images
+        # [1600, 900]
+        results["img_shape"] = images[0].size
+        results["ori_shape"] = images[0].size
        # Set initial values for default meta_keys
-        results["pad_shape"] = img.shape
+        results["pad_shape"] = images[0].size
        results["scale_factor"] = 1.0
-        num_channels = 1 if len(img.shape) < 3 else img.shape[2]
-        results["img_norm_cfg"] = dict(
-            mean=np.zeros(num_channels, dtype=np.float32),
-            std=np.ones(num_channels, dtype=np.float32),
-            to_rgb=False,
-        )
+        
        return results

    def __repr__(self):
--- a/mmdet3d/datasets/pipelines/transforms_3d.py
+++ b/mmdet3d/datasets/pipelines/transforms_3d.py
@ -3,6 +3,7 @@ from typing import Any, Dict
 import mmcv
 import numpy as np
 import torch
+import torchvision
 from mmcv import is_tuple_of
 from mmcv.utils import build_from_cfg
 from numpy import random
@ -34,7 +35,7 @@ class ImageAug3D:
        self.is_train = is_train

    def sample_augmentation(self, results):
-        H, W, _, _ = results["ori_shape"]
+        W, H = results["ori_shape"]
        fH, fW = self.final_dim
        if self.is_train:
            resize = np.random.uniform(*self.resize_lim)
@ -62,13 +63,11 @@ class ImageAug3D:
        self, img, rotation, translation, resize, resize_dims, crop, flip, rotate
    ):
        # adjust image
-        img = Image.fromarray(img.astype(np.uint8))
        img = img.resize(resize_dims)
        img = img.crop(crop)
        if flip:
            img = img.transpose(method=Image.FLIP_LEFT_RIGHT)
        img = img.rotate(rotate)
-        img = np.array(img).astype(np.float32)

        # post-homography transformation
        rotation *= resize
@ -902,17 +901,19 @@ class ImagePad:

@PIPELINES.register_module()
 class ImageNormalize:
-    def __init__(self, mean, std, to_rgb=True):
-        self.mean = np.array(mean, dtype=np.float32)
-        self.std = np.array(std, dtype=np.float32)
-        self.to_rgb = to_rgb
+    def __init__(self, mean, std):
+        self.mean = mean
+        self.std = std
+        self.compose = torchvision.transforms.Compose(
+            [
+                torchvision.transforms.ToTensor(),
+                torchvision.transforms.Normalize(mean=mean, std=std),
+            ]
+        )

    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
-        data["img"] = [
-            mmcv.imnormalize(img, self.mean, self.std, self.to_rgb)
-            for img in data["img"]
-        ]
-        data["img_norm_cfg"] = dict(mean=self.mean, std=self.std, to_rgb=self.to_rgb)
+        data["img"] = [self.compose(img) for img in data["img"]]
+        data["img_norm_cfg"] = dict(mean=self.mean, std=self.std)
        return data


--- a/mmdet3d/models/fusion_models/bevfusion.py
+++ b/mmdet3d/models/fusion_models/bevfusion.py
@ -12,9 +12,10 @@ from mmdet3d.models.builder import (
    build_neck,
    build_vtransform,
 )
-from mmdet3d.ops import Voxelization
+from mmdet3d.ops import Voxelization, DynamicScatter
 from mmdet3d.models import FUSIONMODELS

+
 from .base import Base3DFusionModel

 __all__ = ["BEVFusion"]
@ -42,9 +43,13 @@ class BEVFusion(Base3DFusionModel):
                }
            )
        if encoders.get("lidar") is not None:
+            if encoders["lidar"]["voxelize"].get("max_num_points", -1) > 0:
+                voxelize_module = Voxelization(**encoders["lidar"]["voxelize"])
+            else:
+                voxelize_module = DynamicScatter(**encoders["lidar"]["voxelize"])
            self.encoders["lidar"] = nn.ModuleDict(
                {
-                    "voxelize": Voxelization(**encoders["lidar"]["voxelize"]),
+                    "voxelize": voxelize_module,
                    "backbone": build_backbone(encoders["lidar"]["backbone"]),
                }
            )
@ -89,6 +94,7 @@ class BEVFusion(Base3DFusionModel):
        lidar2camera,
        lidar2image,
        camera_intrinsics,
+        camera2lidar,
        img_aug_matrix,
        lidar_aug_matrix,
        img_metas,
@ -113,6 +119,7 @@ class BEVFusion(Base3DFusionModel):
            lidar2camera,
            lidar2image,
            camera_intrinsics,
+            camera2lidar,
            img_aug_matrix,
            lidar_aug_matrix,
            img_metas,
@ -130,18 +137,28 @@ class BEVFusion(Base3DFusionModel):
    def voxelize(self, points):
        feats, coords, sizes = [], [], []
        for k, res in enumerate(points):
-            f, c, n = self.encoders["lidar"]["voxelize"](res)
+            ret = self.encoders["lidar"]["voxelize"](res)
+            if len(ret) == 3:
+                # hard voxelize
+                f, c, n = ret
+            else:
+                assert len(ret) == 2
+                f, c = ret
+                n = None
            feats.append(f)
            coords.append(F.pad(c, (1, 0), mode="constant", value=k))
-            sizes.append(n)
+            if n is not None:
+                sizes.append(n)

        feats = torch.cat(feats, dim=0)
        coords = torch.cat(coords, dim=0)
-        sizes = torch.cat(sizes, dim=0)
-
-        if self.voxelize_reduce:
-            feats = feats.sum(dim=1, keepdim=False) / sizes.type_as(feats).view(-1, 1)
-            feats = feats.contiguous()
+        if len(sizes) > 0:
+            sizes = torch.cat(sizes, dim=0)
+            if self.voxelize_reduce:
+                feats = feats.sum(dim=1, keepdim=False) / sizes.type_as(feats).view(
+                    -1, 1
+                )
+                feats = feats.contiguous()

        return feats, coords, sizes

@ -155,6 +172,48 @@ class BEVFusion(Base3DFusionModel):
        lidar2camera,
        lidar2image,
        camera_intrinsics,
+        camera2lidar,
+        img_aug_matrix,
+        lidar_aug_matrix,
+        metas,
+        gt_masks_bev=None,
+        gt_bboxes_3d=None,
+        gt_labels_3d=None,
+        **kwargs,
+    ):
+        if isinstance(img, list):
+            raise NotImplementedError
+        else:
+            outputs = self.forward_single(
+                img,
+                points,
+                camera2ego,
+                lidar2ego,
+                lidar2camera,
+                lidar2image,
+                camera_intrinsics,
+                camera2lidar,
+                img_aug_matrix,
+                lidar_aug_matrix,
+                metas,
+                gt_masks_bev,
+                gt_bboxes_3d,
+                gt_labels_3d,
+                **kwargs,
+            )
+            return outputs
+
+    @auto_fp16(apply_to=("img", "points"))
+    def forward_single(
+        self,
+        img,
+        points,
+        camera2ego,
+        lidar2ego,
+        lidar2camera,
+        lidar2image,
+        camera_intrinsics,
+        camera2lidar,
        img_aug_matrix,
        lidar_aug_matrix,
        metas,
@ -164,7 +223,9 @@ class BEVFusion(Base3DFusionModel):
        **kwargs,
    ):
        features = []
-        for sensor in self.encoders:
+        for sensor in (
+            self.encoders if self.training else list(self.encoders.keys())[::-1]
+        ):
            if sensor == "camera":
                feature = self.extract_camera_features(
                    img,
@ -174,6 +235,7 @@ class BEVFusion(Base3DFusionModel):
                    lidar2camera,
                    lidar2image,
                    camera_intrinsics,
+                    camera2lidar,
                    img_aug_matrix,
                    lidar_aug_matrix,
                    metas,
@ -184,6 +246,10 @@ class BEVFusion(Base3DFusionModel):
                raise ValueError(f"unsupported sensor: {sensor}")
            features.append(feature)

+        if not self.training:
+            # avoid OOM
+            features = features[::-1]
+
        if self.fuser is not None:
            x = self.fuser(features)
        else:
--- a/mmdet3d/models/vtransforms/base.py
+++ b/mmdet3d/models/vtransforms/base.py
@ -78,16 +78,15 @@ class BaseTransform(nn.Module):
    @force_fp32()
    def get_geometry(
        self,
-        rots,
-        trans,
+        camera2lidar_rots,
+        camera2lidar_trans,
        intrins,
        post_rots,
        post_trans,
-        lidar2ego_rots,
-        lidar2ego_trans,
        **kwargs,
    ):
-        B, N, _ = trans.shape
+        B, N, _ = camera2lidar_trans.shape
+
        # undo post-transformation
        # B x N x D x H x W x 3
        points = self.frustum - post_trans.view(B, N, 1, 1, 1, 3)
@ -96,7 +95,7 @@ class BaseTransform(nn.Module):
            .view(B, N, 1, 1, 1, 3, 3)
            .matmul(points.unsqueeze(-1))
        )
-        # cam_to_ego
+        # cam_to_lidar
        points = torch.cat(
            (
                points[:, :, :, :, :, :2] * points[:, :, :, :, :, 2:3],
@ -104,17 +103,9 @@ class BaseTransform(nn.Module):
            ),
            5,
        )
-        combine = rots.matmul(torch.inverse(intrins))
+        combine = camera2lidar_rots.matmul(torch.inverse(intrins))
        points = combine.view(B, N, 1, 1, 1, 3, 3).matmul(points).squeeze(-1)
-        points += trans.view(B, N, 1, 1, 1, 3)
-        # ego_to_lidar
-        points -= lidar2ego_trans.view(B, 1, 1, 1, 1, 3)
-        points = (
-            torch.inverse(lidar2ego_rots)
-            .view(B, 1, 1, 1, 1, 3, 3)
-            .matmul(points.unsqueeze(-1))
-            .squeeze(-1)
-        )
+        points += camera2lidar_trans.view(B, N, 1, 1, 1, 3)

        if "extra_rots" in kwargs:
            extra_rots = kwargs["extra_rots"]
@ -181,9 +172,9 @@ class BaseTransform(nn.Module):
        lidar2camera,
        lidar2image,
        camera_intrinsics,
+        camera2lidar,
        img_aug_matrix,
        lidar_aug_matrix,
-        metas=None,
        **kwargs,
    ):
        rots = camera2ego[..., :3, :3]
@ -193,19 +184,20 @@ class BaseTransform(nn.Module):
        post_trans = img_aug_matrix[..., :3, 3]
        lidar2ego_rots = lidar2ego[..., :3, :3]
        lidar2ego_trans = lidar2ego[..., :3, 3]
+        camera2lidar_rots = camera2lidar[..., :3, :3]
+        camera2lidar_trans = camera2lidar[..., :3, 3]
+
        extra_rots = lidar_aug_matrix[..., :3, :3]
        extra_trans = lidar_aug_matrix[..., :3, 3]

        geom = self.get_geometry(
-            rots,
-            trans,
+            camera2lidar_rots,
+            camera2lidar_trans,
            intrins,
            post_rots,
            post_trans,
-            lidar2ego_rots,
-            lidar2ego_trans,
            extra_rots=extra_rots,
-            extra_trans=extra_trans
+            extra_trans=extra_trans,
        )

        x = self.get_cam_feats(img)
@ -224,6 +216,7 @@ class BaseDepthTransform(BaseTransform):
        lidar2camera,
        lidar2image,
        cam_intrinsic,
+        camera2lidar,
        img_aug_matrix,
        lidar_aug_matrix,
        metas,
@ -236,18 +229,27 @@ class BaseDepthTransform(BaseTransform):
        post_trans = img_aug_matrix[..., :3, 3]
        lidar2ego_rots = lidar2ego[..., :3, :3]
        lidar2ego_trans = lidar2ego[..., :3, 3]
-        extra_rots = lidar_aug_matrix[..., :3, :3]
-        extra_trans = lidar_aug_matrix[..., :3, 3]
+        camera2lidar_rots = camera2lidar[..., :3, :3]
+        camera2lidar_trans = camera2lidar[..., :3, 3]
+
+        # print(img.shape, self.image_size, self.feature_size)

        batch_size = len(points)
-        depth = torch.zeros(batch_size, 6, 1, *self.image_size).to(points[0].device)
+        depth = torch.zeros(batch_size, img.shape[1], 1, *self.image_size).to(
+            points[0].device
+        )

        for b in range(batch_size):
-            cur_coords = points[b][:, :3].transpose(1, 0)
+            cur_coords = points[b][:, :3]
            cur_img_aug_matrix = img_aug_matrix[b]
            cur_lidar_aug_matrix = lidar_aug_matrix[b]
            cur_lidar2image = lidar2image[b]

+            # inverse aug
+            cur_coords -= cur_lidar_aug_matrix[:3, 3]
+            cur_coords = torch.inverse(cur_lidar_aug_matrix[:3, :3]).matmul(
+                cur_coords.transpose(1, 0)
+            )
            # lidar2image
            cur_coords = cur_lidar2image[:, :3, :3].matmul(cur_coords)
            cur_coords += cur_lidar2image[:, :3, 3].reshape(-1, 3, 1)
@ -270,19 +272,21 @@ class BaseDepthTransform(BaseTransform):
                & (cur_coords[..., 1] < self.image_size[1])
                & (cur_coords[..., 1] >= 0)
            )
-            for c in range(6):
+            for c in range(on_img.shape[0]):
                masked_coords = cur_coords[c, on_img[c]].long()
                masked_dist = dist[c, on_img[c]]
                depth[b, c, 0, masked_coords[:, 0], masked_coords[:, 1]] = masked_dist

+        extra_rots = lidar_aug_matrix[..., :3, :3]
+        extra_trans = lidar_aug_matrix[..., :3, 3]
        geom = self.get_geometry(
-            rots,
-            trans,
+            camera2lidar_rots,
+            camera2lidar_trans,
            intrins,
            post_rots,
            post_trans,
-            lidar2ego_rots,
-            lidar2ego_trans,
+            extra_rots=extra_rots,
+            extra_trans=extra_trans,
        )

        x = self.get_cam_feats(img, depth)
--- a/tools/download_pretrained.sh
+++ b/tools/download_pretrained.sh
@ -1,9 +1,10 @@
 mkdir pretrained &&
 cd pretrained &&
-wget https://bevfusion.mit.edu/files/pretrained/bevfusion-det.pth &&
-wget https://bevfusion.mit.edu/files/pretrained/bevfusion-seg.pth &&
+wget https://bevfusion.mit.edu/files/pretrained_updated/bevfusion-det.pth &&
+wget https://bevfusion.mit.edu/files/pretrained_updated/bevfusion-seg.pth &&
 wget https://bevfusion.mit.edu/files/pretrained/lidar-only-det.pth &&
 wget https://bevfusion.mit.edu/files/pretrained/lidar-only-seg.pth &&
-wget https://bevfusion.mit.edu/files/pretrained/camera-only-det.pth &&
-wget https://bevfusion.mit.edu/files/pretrained/camera-only-seg.pth 
+wget https://bevfusion.mit.edu/files/pretrained_updated/camera-only-det.pth &&
+wget https://bevfusion.mit.edu/files/pretrained_updated/camera-only-seg.pth &&
+wget https://bevfusion.mit.edu/files/pretrained_updated/swint-nuimages-pretrained.pth