bev-project/configs/nuscenes/multitask/fusion-det-seg-swint.yaml

# BEVFusion 多任务配置：检测 + 分割（SwinTransformer版本）
_base_:
  - ../det/transfusion/secfpn/camera+lidar/swint_v0p075/convfuser.yaml

voxel_size: [0.075, 0.075, 0.2]
point_cloud_range: [-54.0, -54.0, -5.0, 54.0, 54.0, 3.0]

model:
  encoders:
    camera:
      backbone:
        type: SwinTransformer
        embed_dims: 96
        depths: [2, 2, 6, 2]
        num_heads: [3, 6, 12, 24]
        window_size: 7
        mlp_ratio: 4
        qkv_bias: true
        qk_scale: null
        drop_rate: 0.
        attn_drop_rate: 0.
        drop_path_rate: 0.2
        patch_norm: true
        out_indices: [1, 2, 3]
        with_cp: false
        convert_weights: true
        init_cfg:
          type: Pretrained
          checkpoint: pretrained/swint-nuimages-pretrained.pth
      neck:
        type: GeneralizedLSSFPN
        in_channels: [192, 384, 768]
        out_channels: 256
        start_level: 0
        num_outs: 3
        norm_cfg:
          type: BN2d
          requires_grad: true
        act_cfg:
          type: ReLU
          inplace: true
        upsample_cfg:
          mode: bilinear
          align_corners: false
      vtransform:
        type: DepthLSSTransform
        in_channels: 256
        out_channels: 80
        image_size: ${image_size}
        feature_size: ${[image_size[0] // 8, image_size[1] // 8]}
        xbound: [-54.0, 54.0, 0.3]
        ybound: [-54.0, 54.0, 0.3]
        zbound: [-10.0, 10.0, 20.0]
        dbound: [1.0, 60.0, 0.5]
        downsample: 2

    lidar:
      voxelize:
        max_num_points: 10
        point_cloud_range: ${point_cloud_range}
        voxel_size: ${voxel_size}
        max_voxels: [120000, 160000]
      backbone:
        type: SparseEncoder
        in_channels: 5
        sparse_shape: [1440, 1440, 41]
        output_channels: 128
        order:
          - conv
          - norm
          - act
        encoder_channels:
          - [16, 16, 32]
          - [32, 32, 64]
          - [64, 64, 128]
          - [128, 128]
        encoder_paddings:
          - [0, 0, 1]
          - [0, 0, 1]
          - [0, 0, [1, 1, 0]]
          - [0, 0]
        block_type: basicblock

  fuser:
    type: ConvFuser
    in_channels: [80, 256]
    out_channels: 256

  decoder:
    backbone:
      type: SECOND
      in_channels: 256
      out_channels: [128, 256]
      layer_nums: [5, 5]
      layer_strides: [1, 2]
      norm_cfg:
        type: BN
        eps: 1.0e-3
        momentum: 0.01
      conv_cfg:
        type: Conv2d
        bias: false
    neck:
      type: SECONDFPN
      in_channels: [128, 256]
      out_channels: [256, 256]
      upsample_strides: [1, 2]
      norm_cfg:
        type: BN
        eps: 1.0e-3
        momentum: 0.01
      upsample_cfg:
        type: deconv
        bias: false
      use_conv_for_no_stride: true

  heads:
    # 3D检测头
    object:
      in_channels: 512
      train_cfg:
        grid_size: [1440, 1440, 41]
      test_cfg:
        grid_size: [1440, 1440, 41]

    # BEV分割头
    map:
      in_channels: 512
      grid_transform:
        input_scope: [[-54.0, 54.0, 0.75], [-54.0, 54.0, 0.75]]
        output_scope: [[-50, 50, 0.5], [-50, 50, 0.5]]

  # 损失权重
  loss_scale:
    object: 1.0
    map: 1.0

# 训练超参数
max_epochs: 20

lr_config:
  policy: CosineAnnealing
  warmup: linear
  warmup_iters: 500
  warmup_ratio: 0.33333333
  min_lr_ratio: 1.0e-3

log_config:
  interval: 50
  hooks:
    - type: TextLoggerHook
    # - type: TensorboardLoggerHook  # 可选：启用tensorboard