# bev-project/configs/nuscenes/three_tasks/bevfusion_det_seg_vec.yaml
#
# Three-task configuration: 3D detection + BEV segmentation + vector map.
# Based on the SwinTransformer + ConvFuser architecture.
#
# The path above is kept as a comment on purpose: written as a bare top-level
# scalar it cannot be followed by the mapping below, so the file did not parse.

# NOTE(review): presumably the parent config this file extends. Values written
# as ${...} further down are not defined in this file, so they likely come from
# there — confirm the loader resolves both `_base_` and ${...}.
_base_: ./default.yaml

# Model configuration.
model:
  # Model type name; presumably looked up in the project's model registry.
  type: BEVFusion

  # Encoder configuration (original note: reuses the already-trained encoders).
  encoders:
    camera:
      backbone:
        type: SwinTransformer
        embed_dims: 96
        depths: [2, 2, 6, 2]
        num_heads: [3, 6, 12, 24]
        window_size: 7
        mlp_ratio: 4
        qkv_bias: true
        qk_scale: null
        drop_rate: 0.0
        attn_drop_rate: 0.0
        drop_path_rate: 0.2
        patch_norm: true
        # Three stages are exported; the neck below lists three in_channels.
        out_indices: [1, 2, 3]
        with_cp: false
        convert_weights: true
        init_cfg:
          type: Pretrained
          # Per the file name: Swin-Tiny, patch 4, window 7, 224 px weights.
          checkpoint: https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pth

      neck:
        type: GeneralizedLSSFPN
        # One entry per backbone stage selected by out_indices.
        # NOTE(review): presumably embed_dims * 2^i for stages 1..3 — confirm.
        in_channels: [192, 384, 768]
        out_channels: 256
        start_level: 0
        num_outs: 3
        norm_cfg:
          type: BN2d
          requires_grad: true
        act_cfg:
          type: ReLU
          inplace: true
        upsample_cfg:
          mode: bilinear
          align_corners: false

      vtransform:
        type: DepthLSSTransform
        # Matches the neck's out_channels.
        in_channels: 256
        out_channels: 80
        image_size: ${image_size}
        # Each image dimension floor-divided by 8. This must stay a single
        # ${...} scalar: wrapping it in an extra [ ... ] puts it in YAML flow
        # context, where `{`, `[` and `,` are structural indicators, so the
        # expression never reaches the config loader intact.
        feature_size: ${[image_size[0] // 8, image_size[1] // 8]}
        # NOTE(review): the *bound entries are presumably [min, max, step] —
        # confirm against the view-transform implementation.
        xbound: [-54.0, 54.0, 0.3]
        ybound: [-54.0, 54.0, 0.3]
        zbound: [-10.0, 10.0, 20.0]
        dbound: [1.0, 60.0, 0.5]
        downsample: 2

    lidar:
      voxelize:
        max_num_points: 10
        point_cloud_range: ${point_cloud_range}
        voxel_size: ${voxel_size}
        # NOTE(review): presumably [train, test] voxel caps — confirm.
        max_voxels: [120000, 160000]

      backbone:
        type: SparseEncoder
        in_channels: 5
        # NOTE(review): must agree with point_cloud_range / voxel_size, which
        # are defined outside this file — verify when changing either.
        sparse_shape: [1440, 1440, 41]
        output_channels: 128
        order:
          - conv
          - norm
          - act
        # encoder_channels and encoder_paddings are parallel lists: four
        # stages, with matching entry counts per stage.
        encoder_channels:
          - [16, 16, 32]
          - [32, 32, 64]
          - [64, 64, 128]
          - [128, 128]
        encoder_paddings:
          - [0, 0, 1]
          - [0, 0, 1]
          - [0, 0, [1, 1, 0]]
          - [0, 0]
        block_type: basicblock

  # Fuser configuration.
  # NOTE(review): 80 equals the camera vtransform out_channels; the 256 entry is
  # presumably the lidar branch's BEV channel count — confirm.
  fuser:
    type: ConvFuser
    out_channels: 256
    in_channels:
      - 80
      - 256

  # Decoder configuration: a backbone stage followed by a neck stage.
  decoder:
    backbone:
      type: SECOND
      # Same value as the fuser's out_channels.
      in_channels: 256
      # The three lists below each carry two entries.
      layer_nums:
        - 5
        - 5
      layer_strides:
        - 1
        - 2
      out_channels:
        - 128
        - 256
      conv_cfg:
        type: Conv2d
        bias: false
      norm_cfg:
        type: BN
        eps: 0.001
        momentum: 0.01

    neck:
      type: SECONDFPN
      # Mirrors the backbone's out_channels.
      in_channels:
        - 128
        - 256
      out_channels:
        - 256
        - 256
      upsample_strides:
        - 1
        - 2
      use_conv_for_no_stride: true
      upsample_cfg:
        type: deconv
        bias: false
      norm_cfg:
        type: BN
        eps: 0.001
        momentum: 0.01

  # The three task heads. Every head takes in_channels: 512, which equals the
  # sum of the decoder neck's out_channels ([256, 256]).
  heads:
    # Task 1: 3D object detection.
    object:
      type: TransFusionHead
      in_channels: 512
      hidden_channel: 128
      num_classes: 10
      num_proposals: 200
      num_decoder_layers: 1
      num_heads: 8
      nms_kernel_size: 3
      bn_momentum: 0.1
      activation: relu
      auxiliary: true
      # One two-element entry per regressed quantity.
      common_heads:
        center: [2, 2]
        height: [1, 2]
        dim: [3, 2]
        rot: [2, 2]
        vel: [2, 2]
      bbox_coder:
        type: TransFusionBBoxCoder
        code_size: 10
        out_size_factor: 8
        score_threshold: 0.0
        # Both values are slices of settings defined outside this file.
        pc_range: ${point_cloud_range[:2]}
        voxel_size: ${voxel_size[:2]}
        post_center_range:
          - -61.2
          - -61.2
          - -10.0
          - 61.2
          - 61.2
          - 10.0
      loss_cls:
        type: FocalLoss
        use_sigmoid: true
        alpha: 0.25
        gamma: 2.0
        reduction: mean
        loss_weight: 1.0
      loss_bbox:
        type: L1Loss
        reduction: mean
        loss_weight: 0.25
      loss_heatmap:
        type: GaussianFocalLoss
        reduction: mean
        loss_weight: 1.0

    # Task 2: BEV map segmentation.
    map:
      type: VanillaSegmentationHead
      in_channels: 512
      num_classes: 6
      align_corners: false
      loss_decode:
        type: CrossEntropyLoss
        use_sigmoid: false
        loss_weight: 1.0
        # Six weights, one per class (num_classes: 6).
        class_weight:
          - 1.0
          - 2.0
          - 2.0
          - 2.0
          - 2.0
          - 2.0

    # Task 3: vector-map prediction (new in this configuration).
    vector_map:
      type: MapTRHead
      in_channels: 512
      embed_dims: 256
      num_classes: 3
      num_queries: 50
      num_points: 20
      num_decoder_layers: 6
      num_heads: 8
      dropout: 0.1
      # Loss term weights.
      loss_cls_weight: 2.0
      loss_reg_weight: 5.0
      loss_chamfer_weight: 2.0
      # Thresholds.
      score_threshold: 0.3
      nms_threshold: 0.5

  # Loss weights, one per task; the keys match the entries under `heads`.
  loss_scale:
    object: 1.0
    map: 1.0
    vector_map: 1.0  # weight of the vector-map task

# Training configuration.
max_epochs: 20
batch_size_per_gpu: 1
num_workers_per_gpu: 0  # avoids shared-memory problems

# Learning rate (the three-task setup may need slight adjustment).
optimizer:
  type: AdamW
  lr: 1.5e-4  # slightly reduced learning rate
  weight_decay: 0.01

# Evaluation configuration.
# Metrics, in order:
#   bbox   - 3D detection mAP
#   map    - BEV segmentation mIoU
#   vector - vector-map AP (new in this configuration)
evaluation:
  interval: 1
  pipeline: ${val_pipeline}
  metric: [bbox, map, vector]

# Checkpoint settings.
# NOTE(review): interval is presumably counted in epochs and max_keep_ckpts is
# presumably the number of most recent checkpoints retained — confirm against
# the checkpoint hook in use.
checkpoint_config:
  interval: 1
  max_keep_ckpts: 5

# Logging: two logger hooks are registered, text and TensorBoard.
# NOTE(review): interval is presumably counted in iterations — confirm.
log_config:
  interval: 50
  hooks:
    - type: TextLoggerHook
    - type: TensorboardLoggerHook

# Runtime settings.
dist_params:
  backend: nccl

log_level: INFO
work_dir: runs/three_tasks
# Original note: load from the multi-task model that is currently being trained.
# NOTE(review): the value is null, so no checkpoint path is actually configured
# here — set one if loading those weights is the intent.
load_from: null
resume_from: null
workflow: [['train', 1]]