---
# Three-task configuration: 3D detection + BEV segmentation + vector map.
# Built on the SwinTransformer + ConvFuser architecture.
#
# Values of the form '${...}' are interpolation expressions; they are quoted
# so YAML always reads them as plain strings. They are presumably expanded by
# the project's config loader against keys from the base config — confirm.
_base_: ./default.yaml

# Model configuration
model:
  type: BEVFusion

  # Encoder settings (reused from the already-trained model)
  encoders:
    camera:
      backbone:
        type: SwinTransformer
        embed_dims: 96
        depths: [2, 2, 6, 2]
        num_heads: [3, 6, 12, 24]
        window_size: 7
        mlp_ratio: 4
        qkv_bias: true
        qk_scale: null
        drop_rate: 0.0
        attn_drop_rate: 0.0
        drop_path_rate: 0.2
        patch_norm: true
        out_indices: [1, 2, 3]
        with_cp: false
        convert_weights: true
        init_cfg:
          type: Pretrained
          checkpoint: https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pth
      neck:
        type: GeneralizedLSSFPN
        # One entry per backbone stage selected by out_indices above.
        in_channels: [192, 384, 768]
        out_channels: 256
        start_level: 0
        num_outs: 3
        norm_cfg:
          type: BN2d
          requires_grad: true
        act_cfg:
          type: ReLU
          inplace: true
        upsample_cfg:
          mode: bilinear
          align_corners: false
      vtransform:
        type: DepthLSSTransform
        in_channels: 256
        out_channels: 80
        image_size: '${image_size}'
        # A single interpolation expression that yields a two-element list.
        # It must not be wrapped in an extra [...]: inside a YAML flow
        # sequence the characters { [ ] , are indicators, so the expression
        # would no longer be read as one string.
        feature_size: '${[image_size[0] // 8, image_size[1] // 8]}'
        xbound: [-54.0, 54.0, 0.3]
        ybound: [-54.0, 54.0, 0.3]
        zbound: [-10.0, 10.0, 20.0]
        dbound: [1.0, 60.0, 0.5]
        downsample: 2
    lidar:
      voxelize:
        max_num_points: 10
        point_cloud_range: '${point_cloud_range}'
        voxel_size: '${voxel_size}'
        max_voxels: [120000, 160000]
      backbone:
        type: SparseEncoder
        in_channels: 5
        sparse_shape: [1440, 1440, 41]
        output_channels: 128
        order:
          - conv
          - norm
          - act
        encoder_channels:
          - [16, 16, 32]
          - [32, 32, 64]
          - [64, 64, 128]
          - [128, 128]
        encoder_paddings:
          - [0, 0, 1]
          - [0, 0, 1]
          - [0, 0, [1, 1, 0]]
          - [0, 0]
        block_type: basicblock

  # Fuser configuration
  fuser:
    type: ConvFuser
    # 80 matches the camera vtransform out_channels above.
    in_channels: [80, 256]
    out_channels: 256

  # Decoder configuration
  decoder:
    backbone:
      type: SECOND
      in_channels: 256
      out_channels: [128, 256]
      layer_nums: [5, 5]
      layer_strides: [1, 2]
      norm_cfg:
        type: BN
        eps: 1.0e-3
        momentum: 0.01
      conv_cfg:
        type: Conv2d
        bias: false
    neck:
      type: SECONDFPN
      in_channels: [128, 256]
      out_channels: [256, 256]
      upsample_strides: [1, 2]
      norm_cfg:
        type: BN
        eps: 1.0e-3
        momentum: 0.01
      upsample_cfg:
        type: deconv
        bias: false
      use_conv_for_no_stride: true

  # The three task heads
  heads:
    # Task 1: 3D object detection
    object:
      type: TransFusionHead
      num_proposals: 200
      auxiliary: true
      # 512 = sum of the decoder neck out_channels (256 + 256).
      in_channels: 512
      hidden_channel: 128
      num_classes: 10
      num_decoder_layers: 1
      num_heads: 8
      nms_kernel_size: 3
      bn_momentum: 0.1
      activation: relu
      common_heads:
        center: [2, 2]
        height: [1, 2]
        dim: [3, 2]
        rot: [2, 2]
        vel: [2, 2]
      bbox_coder:
        type: TransFusionBBoxCoder
        pc_range: '${point_cloud_range[:2]}'
        voxel_size: '${voxel_size[:2]}'
        out_size_factor: 8
        post_center_range: [-61.2, -61.2, -10.0, 61.2, 61.2, 10.0]
        score_threshold: 0.0
        code_size: 10
      loss_cls:
        type: FocalLoss
        use_sigmoid: true
        gamma: 2.0
        alpha: 0.25
        reduction: mean
        loss_weight: 1.0
      loss_bbox:
        type: L1Loss
        reduction: mean
        loss_weight: 0.25
      loss_heatmap:
        type: GaussianFocalLoss
        reduction: mean
        loss_weight: 1.0

    # Task 2: BEV map segmentation
    map:
      type: VanillaSegmentationHead
      in_channels: 512
      num_classes: 6
      align_corners: false
      loss_decode:
        type: CrossEntropyLoss
        use_sigmoid: false
        # One weight per class (6 entries, matching num_classes).
        class_weight: [1.0, 2.0, 2.0, 2.0, 2.0, 2.0]
        loss_weight: 1.0

    # Task 3: vector map prediction (new)
    vector_map:
      type: MapTRHead
      in_channels: 512
      num_classes: 3
      num_queries: 50
      num_points: 20
      embed_dims: 256
      num_decoder_layers: 6
      num_heads: 8
      dropout: 0.1
      loss_cls_weight: 2.0
      loss_reg_weight: 5.0
      loss_chamfer_weight: 2.0
      score_threshold: 0.3
      nms_threshold: 0.5

  # Per-task loss weights; the keys mirror the entries under heads.
  # NOTE(review): the nesting of this key was reconstructed from a flattened
  # file; it is assumed to be a model-level option — confirm against the model
  # constructor.
  loss_scale:
    object: 1.0
    map: 1.0
    vector_map: 1.0  # weight of the vector map task

# Training configuration
max_epochs: 20
batch_size_per_gpu: 1
num_workers_per_gpu: 0  # avoids shared-memory issues

# Learning rate (may need slight tuning for the three-task setup)
optimizer:
  type: AdamW
  lr: 1.5e-4  # slightly reduced learning rate
  weight_decay: 0.01

# Evaluation configuration
evaluation:
  interval: 1
  pipeline: '${val_pipeline}'
  metric:
    - bbox  # 3D detection mAP
    - map  # BEV segmentation mIoU
    - vector  # vector map AP (new)

# Checkpointing
checkpoint_config:
  interval: 1
  max_keep_ckpts: 5

# Logging
log_config:
  interval: 50
  hooks:
    - type: TextLoggerHook
    - type: TensorboardLoggerHook

# Runtime
dist_params:
  backend: nccl
log_level: INFO
work_dir: runs/three_tasks
# null means no checkpoint is loaded. The original note indicates this is
# meant to point at the currently trained multi-task model's checkpoint.
load_from: null
resume_from: null
workflow: [['train', 1]]