model:
  encoders:
    camera:
      backbone:
        type: ResNet
        depth: 50
        num_stages: 4
        out_indices: [0, 1, 2, 3]
        norm_cfg:
          type: BN2d
          requires_grad: true
        norm_eval: false
        init_cfg:
          type: Pretrained
          checkpoint: torchvision://resnet50
      neck:
        type: SECONDFPN
        in_channels: [256, 512, 1024, 2048]
        out_channels: [128, 128, 128, 128]
        upsample_strides: [0.25, 0.5, 1, 2]    
      vtransform:
        type: LSSTransform
        in_channels: 512
        out_channels: 64
        image_size: ${image_size}
        feature_size: ${[image_size[0] // 16, image_size[1] // 16]}
        xbound: [-51.2, 51.2, 0.8]
        ybound: [-51.2, 51.2, 0.8]
        zbound: [-10.0, 10.0, 20.0]
        dbound: [1.0, 60.0, 1.0]
        downsample: 1
  decoder:
    backbone:
      type: GeneralizedResNet
      in_channels: 64
      blocks:
        - [2, 128, 2]
        - [2, 256, 2]
        - [2, 512, 1]
    neck:
      type: LSSFPN
      in_indices: [-1, 0]
      in_channels: [512, 128]
      out_channels: 256
      scale_factor: 2