# bev-project/configs/nuscenes/three_tasks/bevfusion_det_seg_vec.yaml

# Three-task configuration: 3D detection + BEV segmentation + vector map
# Based on the SwinTransformer + ConvFuser architecture

# Parent config; presumably merged in by the config loader before the
# overrides below are applied -- TODO confirm the loader honours `_base_`.
_base_: ./default.yaml

# Model definition
model:
  type: BEVFusion

  # Encoders (per the original note, these reuse already-trained weights)
  encoders:
    camera:
      backbone:
        type: SwinTransformer
        embed_dims: 96
        # One entry per stage (four stages).
        depths:
          - 2
          - 2
          - 6
          - 2
        num_heads:
          - 3
          - 6
          - 12
          - 24
        window_size: 7
        mlp_ratio: 4
        qkv_bias: true
        qk_scale: null
        drop_rate: 0.0
        attn_drop_rate: 0.0
        drop_path_rate: 0.2
        patch_norm: true
        # Three stage outputs are exported; the camera neck lists three
        # matching input widths.
        out_indices:
          - 1
          - 2
          - 3
        with_cp: false
        convert_weights: true
        # Initial weights come from the public swin_tiny checkpoint below.
        init_cfg:
          type: Pretrained
          checkpoint: 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pth'
      
      # Camera neck: three input levels (192/384/768 channels) reduced to
      # 256 output channels.
      neck:
        type: GeneralizedLSSFPN
        in_channels:
          - 192
          - 384
          - 768
        out_channels: 256
        start_level: 0
        num_outs: 3
        norm_cfg: {type: BN2d, requires_grad: true}
        act_cfg: {type: ReLU, inplace: true}
        upsample_cfg: {mode: bilinear, align_corners: false}
      
      # View-transform settings for the camera branch.
      # in_channels equals the camera neck's out_channels (256); out_channels
      # (80) equals the first entry of the fuser's in_channels.
      vtransform:
        type: DepthLSSTransform
        in_channels: 256
        out_channels: 80
        image_size: ${image_size}
        # Must be one bare ${...} expression: it already yields the two-element
        # list [image_size[0] // 8, image_size[1] // 8]. Wrapping it in an extra
        # YAML flow sequence ([${...}]) is a parse error, because plain scalars
        # inside [...] cannot contain '{', '[' or ','.
        feature_size: ${[image_size[0] // 8, image_size[1] // 8]}
        # Bound triples -- presumably [min, max, step]; TODO confirm against
        # the DepthLSSTransform implementation.
        xbound: [-54.0, 54.0, 0.3]
        ybound: [-54.0, 54.0, 0.3]
        zbound: [-10.0, 10.0, 20.0]
        dbound: [1.0, 60.0, 0.5]
        downsample: 2
    
    # LiDAR branch: voxelisation settings followed by the sparse backbone.
    lidar:
      voxelize:
        max_num_points: 10
        # Both values are resolved from top-level variables defined elsewhere
        # (not in this file).
        point_cloud_range: ${point_cloud_range}
        voxel_size: ${voxel_size}
        max_voxels:
          - 120000
          - 160000

      backbone:
        type: SparseEncoder
        in_channels: 5
        sparse_shape:
          - 1440
          - 1440
          - 41
        output_channels: 128
        order: [conv, norm, act]
        # Four stages; encoder_paddings below mirrors this layout entry for
        # entry.
        encoder_channels:
          - [16, 16, 32]
          - [32, 32, 64]
          - [64, 64, 128]
          - [128, 128]
        encoder_paddings:
          - [0, 0, 1]
          - [0, 0, 1]
          # The last entry of the third stage is itself a 3-element list.
          - [0, 0, [1, 1, 0]]
          - [0, 0]
        block_type: basicblock
  
  # Fuser: two inputs (80 and 256 channels) merged into 256 channels.
  fuser:
    type: ConvFuser
    in_channels:
      - 80
      - 256
    out_channels: 256
  
  # Decoder: SECOND backbone followed by a SECONDFPN neck.
  decoder:
    backbone:
      type: SECOND
      # Matches the fuser's out_channels (256).
      in_channels: 256
      out_channels:
        - 128
        - 256
      layer_nums:
        - 5
        - 5
      layer_strides:
        - 1
        - 2
      norm_cfg: {type: BN, eps: 1.0e-3, momentum: 0.01}
      conv_cfg: {type: Conv2d, bias: false}

    neck:
      type: SECONDFPN
      # Same widths as the decoder backbone's out_channels above.
      in_channels:
        - 128
        - 256
      out_channels:
        - 256
        - 256
      upsample_strides:
        - 1
        - 2
      norm_cfg: {type: BN, eps: 1.0e-3, momentum: 0.01}
      upsample_cfg: {type: deconv, bias: false}
      use_conv_for_no_stride: true
  
  # The three task heads
  heads:
    # Task 1: 3D object detection
    object:
      type: TransFusionHead
      num_proposals: 200
      auxiliary: true
      # 512 = 256 + 256, the two decoder-neck output widths; presumably the
      # neck outputs are concatenated -- TODO confirm against SECONDFPN.
      in_channels: 512
      hidden_channel: 128
      num_classes: 10
      num_decoder_layers: 1
      num_heads: 8
      nms_kernel_size: 3
      bn_momentum: 0.1
      activation: relu
      # One two-element entry per regression branch.
      common_heads:
        center: [2, 2]
        height: [1, 2]
        dim: [3, 2]
        rot: [2, 2]
        vel: [2, 2]
      bbox_coder:
        type: TransFusionBBoxCoder
        # Only the first two entries of each top-level variable are used.
        pc_range: ${point_cloud_range[:2]}
        voxel_size: ${voxel_size[:2]}
        out_size_factor: 8
        post_center_range: [-61.2, -61.2, -10.0, 61.2, 61.2, 10.0]
        score_threshold: 0.0
        code_size: 10
      # Loss terms.
      loss_cls:
        type: FocalLoss
        use_sigmoid: true
        gamma: 2.0
        alpha: 0.25
        reduction: mean
        loss_weight: 1.0
      loss_bbox: {type: L1Loss, reduction: mean, loss_weight: 0.25}
      loss_heatmap: {type: GaussianFocalLoss, reduction: mean, loss_weight: 1.0}
    
    # Task 2: BEV map segmentation
    map:
      type: VanillaSegmentationHead
      in_channels: 512
      num_classes: 6
      align_corners: false
      # NOTE(review): use_sigmoid is false, which presumably selects softmax
      # cross-entropy over mutually exclusive classes -- confirm the six map
      # classes never overlap in the labels.
      loss_decode:
        type: CrossEntropyLoss
        use_sigmoid: false
        # One weight per class (six entries, matching num_classes).
        class_weight:
          - 1.0
          - 2.0
          - 2.0
          - 2.0
          - 2.0
          - 2.0
        loss_weight: 1.0
    
    # Task 3: vector map prediction (new)
    vector_map:
      type: MapTRHead
      # Same input width as the other two heads (512).
      in_channels: 512
      num_classes: 3
      num_queries: 50
      num_points: 20
      embed_dims: 256
      num_decoder_layers: 6
      num_heads: 8
      dropout: 0.1
      # Per-term loss weights (classification / regression / chamfer, going
      # by the key names).
      loss_cls_weight: 2.0
      loss_reg_weight: 5.0
      loss_chamfer_weight: 2.0
      # Thresholds; presumably applied when filtering predictions --
      # TODO confirm in the head implementation.
      score_threshold: 0.3
      nms_threshold: 0.5
  
  # Loss weights; the keys mirror the head names under `heads`.
  loss_scale:
    object: 1.0
    map: 1.0
    vector_map: 1.0  # weight of the vector-map task

# Training settings
max_epochs: 20
batch_size_per_gpu: 1
num_workers_per_gpu: 0  # avoids shared-memory problems

# Learning rate (three tasks may need a slight adjustment)
optimizer:
  type: AdamW
  lr: 1.5e-4  # learning rate lowered slightly
  weight_decay: 0.01

# Evaluation settings
evaluation:
  interval: 1
  pipeline: ${val_pipeline}
  # bbox   -> 3D detection mAP
  # map    -> BEV segmentation mIoU
  # vector -> vector-map AP (new)
  metric: [bbox, map, vector]

# Checkpointing
checkpoint_config:
  # Presumably counted in epochs -- TODO confirm against the checkpoint hook.
  interval: 1
  # Going by the key name, caps how many checkpoints are kept.
  max_keep_ckpts: 5

# Logging
log_config:
  # Presumably counted in iterations -- TODO confirm against the logger hooks.
  interval: 50
  # Two logger hooks: text and TensorBoard.
  hooks:
    - type: TextLoggerHook
    - type: TensorboardLoggerHook

# Runtime settings
dist_params: {backend: nccl}

log_level: INFO
work_dir: runs/three_tasks
# NOTE(review): the original note says to load from the currently trained
# multi-task model, yet the value is null -- set a checkpoint path here if
# that is intended.
load_from: null
resume_from: null
workflow:
  - [train, 1]
-												Complete project state snapshot: Phase 4B RMT-PPAD Integration

🎯 Training Status:
- Current Epoch: 2/10 (13.3% complete)
- Segmentation Dice: 0.9594
- Detection IoU: 0.5742
- Training stable with 8 GPUs

🔧 Technical Achievements:
- ✅ RMT-PPAD Transformer segmentation decoder integrated
- ✅ Task-specific GCA architecture optimized
- ✅ Multi-scale feature fusion (180×180, 360×360, 600×600)
- ✅ Adaptive scale weight learning implemented
- ✅ BEVFusion multi-task framework enhanced

📊 Performance Highlights:
- Divider segmentation: 0.9793 Dice (excellent)
- Pedestrian crossing: 0.9812 Dice (excellent)
- Stop line: 0.9812 Dice (excellent)
- Carpark area: 0.9802 Dice (excellent)
- Walkway: 0.9401 Dice (good)
- Drivable area: 0.8959 Dice (good)

🛠️ Code Changes Included:
- Enhanced BEVFusion model (bevfusion.py)
- RMT-PPAD integration modules (rmtppad_integration.py)
- Transformer segmentation head (enhanced_transformer.py)
- GCA module optimizations (gca.py)
- Configuration updates (Phase 4B configs)
- Training scripts and automation tools
- Comprehensive documentation and analysis reports

📅 Snapshot Date: Fri Nov 14 09:06:09 UTC 2025
📍 Environment: Docker container
🎯 Phase: RMT-PPAD Integration Complete

											
										
										
											2025-11-14 17:06:09 +08:00
+								# 三任务配置: 3D检测 + BEV分割 + 矢量地图
 								# 基于SwinTransformer + ConvFuser架构
 								_base_: ./default.yaml
 								# 模型配置
 								model:
 								  type: BEVFusion
 								  # Encoder配置（复用训练好的）
 								  encoders:
 								    camera:
 								      backbone:
 								        type: SwinTransformer
 								        embed_dims: 96
 								        depths: [2, 2, 6, 2]
 								        num_heads: [3, 6, 12, 24]
 								        window_size: 7
 								        mlp_ratio: 4
 								        qkv_bias: true
 								        qk_scale: null
 								        drop_rate: 0.
 								        attn_drop_rate: 0.
 								        drop_path_rate: 0.2
 								        patch_norm: true
 								        out_indices: [1, 2, 3]
 								        with_cp: false
 								        convert_weights: true
 								        init_cfg:
 								          type: Pretrained
 								          checkpoint: https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pth
 								      neck:
 								        type: GeneralizedLSSFPN
 								        in_channels: [192, 384, 768]
 								        out_channels: 256
 								        start_level: 0
 								        num_outs: 3
 								        norm_cfg:
 								          type: BN2d
 								          requires_grad: true
 								        act_cfg:
 								          type: ReLU
 								          inplace: true
 								        upsample_cfg:
 								          mode: bilinear
 								          align_corners: false
 								      vtransform:
 								        type: DepthLSSTransform
 								        in_channels: 256
 								        out_channels: 80
 								        image_size: ${image_size}
 								        feature_size: [${[image_size[0] // 8, image_size[1] // 8]}]
 								        xbound: [-54.0, 54.0, 0.3]
 								        ybound: [-54.0, 54.0, 0.3]
 								        zbound: [-10.0, 10.0, 20.0]
 								        dbound: [1.0, 60.0, 0.5]
 								        downsample: 2
 								    lidar:
 								      voxelize:
 								        max_num_points: 10
 								        point_cloud_range: ${point_cloud_range}
 								        voxel_size: ${voxel_size}
 								        max_voxels: [120000, 160000]
 								      backbone:
 								        type: SparseEncoder
 								        in_channels: 5
 								        sparse_shape: [1440, 1440, 41]
 								        output_channels: 128
 								        order:
 								          - conv
 								          - norm
 								          - act
 								        encoder_channels:
 								          - [16, 16, 32]
 								          - [32, 32, 64]
 								          - [64, 64, 128]
 								          - [128, 128]
 								        encoder_paddings:
 								          - [0, 0, 1]
 								          - [0, 0, 1]
 								          - [0, 0, [1, 1, 0]]
 								          - [0, 0]
 								        block_type: basicblock
 								  # Fuser配置
 								  fuser:
 								    type: ConvFuser
 								    in_channels: [80, 256]
 								    out_channels: 256
 								  # Decoder配置
 								  decoder:
 								    backbone:
 								      type: SECOND
 								      in_channels: 256
 								      out_channels: [128, 256]
 								      layer_nums: [5, 5]
 								      layer_strides: [1, 2]
 								      norm_cfg:
 								        type: BN
 								        eps: 1.0e-3
 								        momentum: 0.01
 								      conv_cfg:
 								        type: Conv2d
 								        bias: false
 								    neck:
 								      type: SECONDFPN
 								      in_channels: [128, 256]
 								      out_channels: [256, 256]
 								      upsample_strides: [1, 2]
 								      norm_cfg:
 								        type: BN
 								        eps: 1.0e-3
 								        momentum: 0.01
 								      upsample_cfg:
 								        type: deconv
 								        bias: false
 								      use_conv_for_no_stride: true
 								  # 三个任务头
 								  heads:
 								    # 任务1: 3D目标检测
 								    object:
 								      type: TransFusionHead
 								      num_proposals: 200
 								      auxiliary: true
 								      in_channels: 512
 								      hidden_channel: 128
 								      num_classes: 10
 								      num_decoder_layers: 1
 								      num_heads: 8
 								      nms_kernel_size: 3
 								      bn_momentum: 0.1
 								      activation: relu
 								      common_heads:
 								        center: [2, 2]
 								        height: [1, 2]
 								        dim: [3, 2]
 								        rot: [2, 2]
 								        vel: [2, 2]
 								      bbox_coder:
 								        type: TransFusionBBoxCoder
 								        pc_range: ${point_cloud_range[:2]}
 								        voxel_size: ${voxel_size[:2]}
 								        out_size_factor: 8
 								        post_center_range: [-61.2, -61.2, -10.0, 61.2, 61.2, 10.0]
 								        score_threshold: 0.0
 								        code_size: 10
 								      loss_cls:
 								        type: FocalLoss
 								        use_sigmoid: true
 								        gamma: 2.0
 								        alpha: 0.25
 								        reduction: mean
 								        loss_weight: 1.0
 								      loss_bbox:
 								        type: L1Loss
 								        reduction: mean
 								        loss_weight: 0.25
 								      loss_heatmap:
 								        type: GaussianFocalLoss
 								        reduction: mean
 								        loss_weight: 1.0
 								    # 任务2: BEV地图分割
 								    map:
 								      type: VanillaSegmentationHead
 								      in_channels: 512
 								      num_classes: 6
 								      align_corners: false
 								      loss_decode:
 								        type: CrossEntropyLoss
 								        use_sigmoid: false
 								        class_weight: [1.0, 2.0, 2.0, 2.0, 2.0, 2.0]
 								        loss_weight: 1.0
 								    # 任务3: 矢量地图预测 🆕
 								    vector_map:
 								      type: MapTRHead
 								      in_channels: 512
 								      num_classes: 3
 								      num_queries: 50
 								      num_points: 20
 								      embed_dims: 256
 								      num_decoder_layers: 6
 								      num_heads: 8
 								      dropout: 0.1
 								      loss_cls_weight: 2.0
 								      loss_reg_weight: 5.0
 								      loss_chamfer_weight: 2.0
 								      score_threshold: 0.3
 								      nms_threshold: 0.5
 								  # 损失权重
 								  loss_scale:
 								    object: 1.0
 								    map: 1.0
 								    vector_map: 1.0  # 矢量地图任务权重
 								# 训练配置
 								max_epochs: 20
 								batch_size_per_gpu: 1
 								num_workers_per_gpu: 0  # 避免shared memory问题
 								# 学习率（三任务可能需要稍微调整）
 								optimizer:
 								  type: AdamW
 								  lr: 1.5e-4  # 稍微降低学习率
 								  weight_decay: 0.01
 								# 评估配置
 								evaluation:
 								  interval: 1
 								  pipeline: ${val_pipeline}
 								  metric:
 								    - bbox  # 3D检测mAP
 								    - map   # BEV分割mIoU
 								    - vector  # 矢量地图AP 🆕
 								# Checkpoint
 								checkpoint_config:
 								  interval: 1
 								  max_keep_ckpts: 5
 								# 日志
 								log_config:
 								  interval: 50
 								  hooks:
 								    - type: TextLoggerHook
 								    - type: TensorboardLoggerHook
 								# 运行时
 								dist_params:
 								  backend: nccl
 								log_level: INFO
 								work_dir: runs/three_tasks
 								load_from: null  # 从当前训练的多任务模型加载
 								resume_from: null
 								workflow: [['train', 1]]