# Settings for the camera encoder's view transform
# (model.encoders.camera.vtransform).
model:
  encoders:
    camera:
      vtransform:
        # NOTE(review): presumably the name of a registered view-transform
        # module; confirm against the consuming code.
        type: AwareBEVDepth
        # Same factor (16) as the divisor in `feature_size` below.
        # NOTE(review): presumably the two must be changed together; confirm.
        bevdepth_downsample: 16
        bevdepth_refine: false
        # NOTE(review): presumably a weight applied to a depth loss term;
        # confirm.
        depth_loss_factor: 3.0
        # NOTE(review): presumably input/output feature channel counts;
        # verify against the module that produces the input features.
        in_channels: 512
        out_channels: 64
        # `${...}` is not YAML syntax: YAML reads this value as a plain string,
        # so it has to be evaluated by the config loader. The expression
        # references `image_size`, which is not defined in this section.
        feature_size: ${[image_size[0] // 16, image_size[1] // 16]}
        # NOTE(review): the *bound lists presumably follow
        # [lower, upper, step]; confirm. If so:
        #   xbound / ybound: (51.2 - (-51.2)) / 0.8 = 128 bins each
        #   zbound:          (10.0 - (-10.0)) / 20.0 = 1 bin
        #   dbound:          (60.0 - 1.0) / 1.0 = 59 bins
        xbound: [-51.2, 51.2, 0.8]
        ybound: [-51.2, 51.2, 0.8]
        zbound: [-10.0, 10.0, 20.0]
        dbound: [1.0, 60.0, 1.0]
        # Separate key from `bevdepth_downsample` above.
        # NOTE(review): which stage this factor applies to is not visible
        # here; confirm.
        downsample: 1