bev-project/configs/nuscenes/multitask/fusion-det-seg-swint-enhanc...

# Enhanced multi-task config: detection + enhanced segmentation head
#
# Improvements:
# 1. EnhancedBEVSegmentationHead (ASPP + dual attention + deeper decoder)
# 2. Focal loss fix (alpha=0.25)
# 3. Dice loss mixed in (weight=0.5)
# 4. Class-weight balancing (stop_line ×4, divider ×3)
# 5. Deep supervision
# 6. Segmentation loss weight ×5
# 7. Lower learning rate (weights are reused)
#
# Reuses: epoch_19.pth (93.7% of parameters)
# Expected: mIoU 60-65% (2-3 days of training)
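#
# Assumption (not spelled out in this file): the 93.7% of reused parameters are
# the backbones, fuser and detection head loaded from epoch_19.pth, while the
# new EnhancedBEVSegmentationHead is trained from scratch; the checkpoint itself
# is loaded outside this config (training script / base config).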
_base_: ./convfuser.yaml
# Add the enhanced segmentation head and its pipeline on top of the detection base config.
model:
  # Make sure the fuser is configured correctly.
  fuser:
    type: ConvFuser
    in_channels: [80, 256]
    out_channels: 256
  heads:
    # The object head is inherited from convfuser.yaml.
    # Add the enhanced map head.
    map:
      type: EnhancedBEVSegmentationHead  # ← use the enhanced head
      in_channels: 512
      grid_transform:
        input_scope: [[-54.0, 54.0, 0.75], [-54.0, 54.0, 0.75]]
        output_scope: [[-50, 50, 0.5], [-50, 50, 0.5]]
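      # Resulting grid sizes (derived from the scopes above): the incoming BEV
      # feature map covers 108 m at 0.75 m resolution (144x144) and is resampled
      # to a 100 m, 0.5 m output grid (200x200) on which the losses are computed.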
      classes: ${map_classes}
      loss: focal
      # Class weights to counter the nuScenes class imbalance.
      loss_weight:
        drivable_area: 1.0
        ped_crossing: 3.0
        walkway: 1.5
        stop_line: 4.0
        carpark_area: 2.0
        divider: 3.0
      # Focal loss parameters.
      focal_alpha: 0.25
      focal_gamma: 2.0
      # Dice loss configuration.
      use_dice_loss: true
      dice_weight: 0.5
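      # Assumed loss composition inside EnhancedBEVSegmentationHead (sketch, not
      # verified against the head's code): per class c,
      #   L_c = w_c * FocalLoss(alpha=0.25, gamma=2.0) + 0.5 * DiceLoss
      # with w_c taken from loss_weight above.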
      # Deep supervision.
      deep_supervision: true
      # Decoder configuration.
      decoder_channels: [256, 256, 128, 128]
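      # Assumption: decoder_channels lists the widths of four successive decoder
      # stages, and deep_supervision adds auxiliary segmentation losses on the
      # intermediate decoder outputs in addition to the final prediction.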
  # Loss weights: segmentation scaled ×5.
  loss_scale:
    object: 1.0
    map: 5.0
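  # With these scales the total objective should be (assuming loss_scale simply
  # multiplies each head's summed loss): L_total = 1.0 * L_det + 5.0 * L_seg.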
# Training epochs.
max_epochs: 20
# Optimizer: lower learning rate, since most weights are reused from epoch_19.pth.
optimizer:
  lr: 1.0e-4
# Collect3D below is extended to also collect gt_masks_bev for the map head.
train_pipeline:
  -
    type: LoadMultiViewImageFromFiles
    to_float32: true
  -
    type: LoadPointsFromFile
    coord_type: LIDAR
    load_dim: ${load_dim}
    use_dim: ${use_dim}
    reduce_beams: ${reduce_beams}
    load_augmented: ${load_augmented}
  -
    type: LoadPointsFromMultiSweeps
    sweeps_num: 9
    load_dim: ${load_dim}
    use_dim: ${use_dim}
    reduce_beams: ${reduce_beams}
    pad_empty_sweeps: true
    remove_close: true
    load_augmented: ${load_augmented}
  -
    type: LoadAnnotations3D
    with_bbox_3d: true
    with_label_3d: true
    with_attr_label: false
  -
    type: ObjectPaste
    stop_epoch: ${gt_paste_stop_epoch}
    db_sampler:
      dataset_root: ${dataset_root}
      info_path: ${dataset_root + "nuscenes_dbinfos_train.pkl"}
      rate: 1.0
      prepare:
        filter_by_difficulty: [-1]
        filter_by_min_points:
          car: 5
          truck: 5
          bus: 5
          trailer: 5
          construction_vehicle: 5
          traffic_cone: 5
          barrier: 5
          motorcycle: 5
          bicycle: 5
          pedestrian: 5
      classes: ${object_classes}
      sample_groups:
        car: 2
        truck: 3
        construction_vehicle: 7
        bus: 4
        trailer: 6
        barrier: 2
        motorcycle: 6
        bicycle: 6
        pedestrian: 2
        traffic_cone: 2
      points_loader:
        type: LoadPointsFromFile
        coord_type: LIDAR
        load_dim: ${load_dim}
        use_dim: ${use_dim}
        reduce_beams: ${reduce_beams}
  -
    type: ImageAug3D
    final_dim: ${image_size}
    resize_lim: ${augment2d.resize[0]}
    bot_pct_lim: [0.0, 0.0]
    rot_lim: ${augment2d.rotate}
    rand_flip: true
    is_train: true
  -
    type: GlobalRotScaleTrans
    resize_lim: ${augment3d.scale}
    rot_lim: ${augment3d.rotate}
    trans_lim: ${augment3d.translate}
    is_train: true
  -
    type: LoadBEVSegmentation
    dataset_root: ${dataset_root}
    xbound: [-50.0, 50.0, 0.5]
    ybound: [-50.0, 50.0, 0.5]
    classes: ${map_classes}
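    # Note: xbound/ybound (100 m at 0.5 m) and classes match the map head's
    # output_scope and ${map_classes}, so the 200x200 GT masks collected as
    # gt_masks_bev should line up pixel-for-pixel with the head's predictions.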
  -
    type: RandomFlip3D
  -
    type: PointsRangeFilter
    point_cloud_range: ${point_cloud_range}
  -
    type: ObjectRangeFilter
    point_cloud_range: ${point_cloud_range}
  -
    type: ObjectNameFilter
    classes: ${object_classes}
  -
    type: ImageNormalize
    mean: [0.485, 0.456, 0.406]
    std: [0.229, 0.224, 0.225]
  -
    type: GridMask
    use_h: true
    use_w: true
    max_epoch: ${max_epochs}
    rotate: 1
    offset: false
    ratio: 0.5
    mode: 1
    prob: ${augment2d.gridmask.prob}
    fixed_prob: ${augment2d.gridmask.fixed_prob}
  -
    type: PointShuffle
  -
    type: DefaultFormatBundle3D
    classes: ${object_classes}
  -
    type: Collect3D
    keys:
      - img
      - points
      - gt_bboxes_3d
      - gt_labels_3d
      - gt_masks_bev
    meta_keys:
      - camera_intrinsics
      - camera2ego
      - lidar2ego
      - lidar2camera
      - camera2lidar
      - lidar2image
      - img_aug_matrix
      - lidar_aug_matrix
  -
    type: GTDepth
    keyframe_only: true
test_pipeline:
  -
    type: LoadMultiViewImageFromFiles
    to_float32: true
  -
    type: LoadPointsFromFile
    coord_type: LIDAR
    load_dim: ${load_dim}
    use_dim: ${use_dim}
    reduce_beams: ${reduce_beams}
    load_augmented: ${load_augmented}
  -
    type: LoadPointsFromMultiSweeps
    sweeps_num: 9
    load_dim: ${load_dim}
    use_dim: ${use_dim}
    reduce_beams: ${reduce_beams}
    pad_empty_sweeps: true
    remove_close: true
    load_augmented: ${load_augmented}
  -
    type: LoadAnnotations3D
    with_bbox_3d: true
    with_label_3d: true
    with_attr_label: false
  -
    type: ImageAug3D
    final_dim: ${image_size}
    resize_lim: ${augment2d.resize[1]}
    bot_pct_lim: [0.0, 0.0]
    rot_lim: [0.0, 0.0]
    rand_flip: false
    is_train: false
  -
    type: GlobalRotScaleTrans
    resize_lim: [1.0, 1.0]
    rot_lim: [0.0, 0.0]
    trans_lim: 0.0
    is_train: false
  -
    type: LoadBEVSegmentation
    dataset_root: ${dataset_root}
    xbound: [-50.0, 50.0, 0.5]
    ybound: [-50.0, 50.0, 0.5]
    classes: ${map_classes}
  -
    type: PointsRangeFilter
    point_cloud_range: ${point_cloud_range}
  -
    type: ImageNormalize
    mean: [0.485, 0.456, 0.406]
    std: [0.229, 0.224, 0.225]
  -
    type: DefaultFormatBundle3D
    classes: ${object_classes}
  -
    type: Collect3D
    keys:
      - img
      - points
      - gt_bboxes_3d
      - gt_labels_3d
      - gt_masks_bev
    meta_keys:
      - camera_intrinsics
      - camera2ego
      - lidar2ego
      - lidar2camera
      - camera2lidar
      - lidar2image
      - img_aug_matrix
      - lidar_aug_matrix
  -
    type: GTDepth
    keyframe_only: true