# BEVFusion multi-task config: detection + segmentation (SwinTransformer version)
#
# Settings not listed here (for example `image_size`, which the `${...}`
# expressions below refer to) are not defined in this file; presumably they
# come from the base config -- TODO confirm.
_base_:
  - ../det/transfusion/secfpn/camera+lidar/swint_v0p075/convfuser.yaml

voxel_size: [0.075, 0.075, 0.2]
point_cloud_range: [-54.0, -54.0, -5.0, 54.0, 54.0, 3.0]

model:
  encoders:
    camera:
      backbone:
        type: SwinTransformer
        embed_dims: 96
        depths: [2, 2, 6, 2]
        num_heads: [3, 6, 12, 24]
        window_size: 7
        mlp_ratio: 4
        qkv_bias: true
        qk_scale: null
        drop_rate: 0.0
        attn_drop_rate: 0.0
        drop_path_rate: 0.2
        patch_norm: true
        out_indices: [1, 2, 3]
        with_cp: false
        convert_weights: true
        init_cfg:
          type: Pretrained
          checkpoint: pretrained/swint-nuimages-pretrained.pth
      neck:
        type: GeneralizedLSSFPN
        in_channels: [192, 384, 768]
        out_channels: 256
        start_level: 0
        num_outs: 3
        norm_cfg:
          type: BN2d
          requires_grad: true
        act_cfg:
          type: ReLU
          inplace: true
        upsample_cfg:
          mode: bilinear
          align_corners: false
      vtransform:
        type: DepthLSSTransform
        in_channels: 256
        out_channels: 80
        image_size: ${image_size}
        feature_size: ${[image_size[0] // 8, image_size[1] // 8]}
        xbound: [-54.0, 54.0, 0.3]
        ybound: [-54.0, 54.0, 0.3]
        zbound: [-10.0, 10.0, 20.0]
        dbound: [1.0, 60.0, 0.5]
        downsample: 2
    lidar:
      voxelize:
        max_num_points: 10
        point_cloud_range: ${point_cloud_range}
        voxel_size: ${voxel_size}
        max_voxels: [120000, 160000]
      backbone:
        type: SparseEncoder
        in_channels: 5
        # 1440 = (54 - (-54)) / 0.075, i.e. the x/y extent of point_cloud_range
        # divided by the x/y voxel_size. Presumably this must stay in sync
        # with those two settings -- TODO confirm.
        sparse_shape: [1440, 1440, 41]
        output_channels: 128
        order:
          - conv
          - norm
          - act
        encoder_channels:
          - [16, 16, 32]
          - [32, 32, 64]
          - [64, 64, 128]
          - [128, 128]
        encoder_paddings:
          - [0, 0, 1]
          - [0, 0, 1]
          - [0, 0, [1, 1, 0]]
          - [0, 0]
        block_type: basicblock

  fuser:
    type: ConvFuser
    # 80 equals the camera vtransform out_channels above; 256 is presumably
    # the lidar branch's BEV channel count -- TODO confirm.
    in_channels: [80, 256]
    out_channels: 256

  decoder:
    backbone:
      type: SECOND
      in_channels: 256
      out_channels: [128, 256]
      layer_nums: [5, 5]
      layer_strides: [1, 2]
      norm_cfg:
        type: BN
        eps: 1.0e-3
        momentum: 0.01
      conv_cfg:
        type: Conv2d
        bias: false
    neck:
      type: SECONDFPN
      in_channels: [128, 256]
      out_channels: [256, 256]
      upsample_strides: [1, 2]
      norm_cfg:
        type: BN
        eps: 1.0e-3
        momentum: 0.01
      upsample_cfg:
        type: deconv
        bias: false
      use_conv_for_no_stride: true

  # NOTE(review): nesting `heads` under `model` is reconstructed from the
  # flattened original -- confirm against the base config.
  # The heads take 512 input channels, which equals the sum of the decoder
  # neck out_channels (256 + 256); presumably the neck outputs are
  # concatenated -- TODO confirm.
  heads:
    # 3D detection head
    object:
      in_channels: 512
      train_cfg:
        grid_size: [1440, 1440, 41]
      test_cfg:
        grid_size: [1440, 1440, 41]
    # BEV segmentation head
    map:
      in_channels: 512
      grid_transform:
        # NOTE(review): the step here is 0.75, while the camera xbound/ybound
        # step (0.3) times downsample (2) gives 0.6 -- confirm which BEV
        # resolution is intended.
        input_scope: [[-54.0, 54.0, 0.75], [-54.0, 54.0, 0.75]]
        output_scope: [[-50, 50, 0.5], [-50, 50, 0.5]]

  # Loss weights
  # NOTE(review): placed under `model` on the assumption that the model
  # consumes the per-head weights -- confirm; the flattened original did not
  # show the nesting level.
  loss_scale:
    object: 1.0
    map: 1.0

# Training hyperparameters
max_epochs: 20

lr_config:
  policy: CosineAnnealing
  warmup: linear
  warmup_iters: 500
  warmup_ratio: 0.33333333
  min_lr_ratio: 1.0e-3

log_config:
  interval: 50
  hooks:
    - type: TextLoggerHook
    # - type: TensorboardLoggerHook  # optional: enable TensorBoard