---
# Fusion model configuration: per-modality encoders (camera + lidar).
# `${...}` values are resolver/interpolation expressions (OmegaConf-style),
# filled in from top-level keys such as `image_size`, `point_cloud_range`,
# and `voxel_size` at load time.
model:
  encoders:
    # Camera branch: FPN neck over backbone stages, then a depth-aware
    # lift-splat-shoot view transform into BEV space.
    camera:
      neck:
        type: GeneralizedLSSFPN
        in_channels: [512, 1024, 2048]
        out_channels: 256
        start_level: 0
        num_outs: 3
        norm_cfg:
          type: BN2d
          requires_grad: true
        act_cfg:
          type: ReLU
          inplace: true
        upsample_cfg:
          mode: bilinear
          align_corners: false
      vtransform:
        type: DepthLSSTransform
        in_channels: 256
        out_channels: 80
        image_size: '${image_size}'
        # Feature map is the image downsampled 8x by the backbone/neck.
        feature_size: '${[image_size[0] // 8, image_size[1] // 8]}'
        # [min, max, step] bounds for the BEV frustum grid (meters).
        xbound: [-51.2, 51.2, 0.4]
        ybound: [-51.2, 51.2, 0.4]
        zbound: [-10.0, 10.0, 20.0]
        # Depth bins: 1 m to 60 m in 0.5 m steps.
        dbound: [1.0, 60.0, 0.5]
        downsample: 2
    # Lidar branch: voxelization followed by a sparse 3D convolutional encoder.
    lidar:
      voxelize:
        max_num_points: 10
        point_cloud_range: '${point_cloud_range}'
        voxel_size: '${voxel_size}'
        # [training, inference] caps on voxel count.
        max_voxels: [90000, 120000]
      backbone:
        type: SparseEncoder
        in_channels: 5
        sparse_shape: [1024, 1024, 41]
        output_channels: 128
        # Operation order within each sparse conv block.
        order:
          - conv
          - norm
          - act
        encoder_channels:
          - [16, 16, 32]
          - [32, 32, 64]
          - [64, 64, 128]
          - [128, 128]
        encoder_paddings:
          - [0, 0, 1]
          - [0, 0, 1]
          - [0, 0, [1, 1, 0]]
          - [0, 0]
        block_type: basicblock