# File: bev-project/configs/nuscenes/three_tasks/bevfusion_det_seg_vec.yaml
# (Repository viewer metadata: 247 lines, 5.4 KiB, YAML)

# Three-task configuration: 3D detection + BEV segmentation + vector map.
# Built on the SwinTransformer + ConvFuser architecture.
_base_: './default.yaml'
# Model configuration.
# NOTE(review): the nesting below was reconstructed from key semantics because
# the captured file had lost all indentation -- confirm against the repository.
model:
  type: BEVFusion
  # Encoder configuration, reused from the already-trained model.
  encoders:
    camera:
      backbone:
        type: SwinTransformer
        embed_dims: 96
        depths: [2, 2, 6, 2]
        num_heads: [3, 6, 12, 24]
        window_size: 7
        mlp_ratio: 4
        qkv_bias: true
        qk_scale: null
        drop_rate: 0.0
        attn_drop_rate: 0.0
        drop_path_rate: 0.2
        patch_norm: true
        out_indices: [1, 2, 3]
        with_cp: false
        convert_weights: true
        init_cfg:
          type: Pretrained
          checkpoint: https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pth
      neck:
        type: GeneralizedLSSFPN
        in_channels: [192, 384, 768]
        out_channels: 256
        start_level: 0
        num_outs: 3
        norm_cfg:
          type: BN2d
          requires_grad: true
        act_cfg:
          type: ReLU
          inplace: true
        upsample_cfg:
          mode: bilinear
          align_corners: false
      vtransform:
        type: DepthLSSTransform
        in_channels: 256
        out_channels: 80
        image_size: ${image_size}
        # Kept as a single plain scalar: wrapping a ${...} expression in an
        # outer [ ] makes it a flow sequence, where the embedded { [ , ] }
        # characters are not allowed inside a plain scalar.
        feature_size: ${[image_size[0] // 8, image_size[1] // 8]}
        xbound: [-54.0, 54.0, 0.3]
        ybound: [-54.0, 54.0, 0.3]
        zbound: [-10.0, 10.0, 20.0]
        dbound: [1.0, 60.0, 0.5]
        downsample: 2
    lidar:
      voxelize:
        max_num_points: 10
        point_cloud_range: ${point_cloud_range}
        voxel_size: ${voxel_size}
        max_voxels: [120000, 160000]
      backbone:
        type: SparseEncoder
        in_channels: 5
        sparse_shape: [1440, 1440, 41]
        output_channels: 128
        order:
          - conv
          - norm
          - act
        encoder_channels:
          - [16, 16, 32]
          - [32, 32, 64]
          - [64, 64, 128]
          - [128, 128]
        encoder_paddings:
          - [0, 0, 1]
          - [0, 0, 1]
          - [0, 0, [1, 1, 0]]
          - [0, 0]
        block_type: basicblock
  # Fuser configuration.
  fuser:
    type: ConvFuser
    in_channels: [80, 256]
    out_channels: 256
  # Decoder configuration.
  decoder:
    backbone:
      type: SECOND
      in_channels: 256
      out_channels: [128, 256]
      layer_nums: [5, 5]
      layer_strides: [1, 2]
      norm_cfg:
        type: BN
        eps: 1.0e-3
        momentum: 0.01
      conv_cfg:
        type: Conv2d
        bias: false
    neck:
      type: SECONDFPN
      in_channels: [128, 256]
      out_channels: [256, 256]
      upsample_strides: [1, 2]
      norm_cfg:
        type: BN
        eps: 1.0e-3
        momentum: 0.01
      upsample_cfg:
        type: deconv
        bias: false
      use_conv_for_no_stride: true
  # The three task heads.
  heads:
    # Task 1: 3D object detection.
    object:
      type: TransFusionHead
      num_proposals: 200
      auxiliary: true
      in_channels: 512
      hidden_channel: 128
      num_classes: 10
      num_decoder_layers: 1
      num_heads: 8
      nms_kernel_size: 3
      bn_momentum: 0.1
      activation: relu
      common_heads:
        center: [2, 2]
        height: [1, 2]
        dim: [3, 2]
        rot: [2, 2]
        vel: [2, 2]
      bbox_coder:
        type: TransFusionBBoxCoder
        pc_range: ${point_cloud_range[:2]}
        voxel_size: ${voxel_size[:2]}
        out_size_factor: 8
        post_center_range: [-61.2, -61.2, -10.0, 61.2, 61.2, 10.0]
        score_threshold: 0.0
        code_size: 10
      loss_cls:
        type: FocalLoss
        use_sigmoid: true
        gamma: 2.0
        alpha: 0.25
        reduction: mean
        loss_weight: 1.0
      loss_bbox:
        type: L1Loss
        reduction: mean
        loss_weight: 0.25
      loss_heatmap:
        type: GaussianFocalLoss
        reduction: mean
        loss_weight: 1.0
    # Task 2: BEV map segmentation.
    map:
      type: VanillaSegmentationHead
      in_channels: 512
      num_classes: 6
      align_corners: false
      loss_decode:
        type: CrossEntropyLoss
        use_sigmoid: false
        class_weight: [1.0, 2.0, 2.0, 2.0, 2.0, 2.0]
        loss_weight: 1.0
    # Task 3: vector map prediction (newly added).
    vector_map:
      type: MapTRHead
      in_channels: 512
      num_classes: 3
      num_queries: 50
      num_points: 20
      embed_dims: 256
      num_decoder_layers: 6
      num_heads: 8
      dropout: 0.1
      loss_cls_weight: 2.0
      loss_reg_weight: 5.0
      loss_chamfer_weight: 2.0
      score_threshold: 0.3
      nms_threshold: 0.5
  # Per-task loss weights.
  # NOTE(review): placed under `model` on the assumption that the model reads
  # loss_scale as one of its own arguments -- confirm against the model class.
  loss_scale:
    object: 1.0
    map: 1.0
    vector_map: 1.0  # weight of the vector-map task
# Training configuration
max_epochs: 20
batch_size_per_gpu: 1
num_workers_per_gpu: 0 # avoids shared-memory problems
# Learning rate (may need slight tuning for the three-task setup).
optimizer:
  type: AdamW
  lr: 1.5e-4  # slightly lowered learning rate
  weight_decay: 0.01
# Evaluation configuration.
evaluation:
  interval: 1
  pipeline: ${val_pipeline}
  metric:
    - bbox  # 3D detection mAP
    - map  # BEV segmentation mIoU
    - vector  # vector-map AP (newly added)
# Checkpointing.
checkpoint_config:
  interval: 1
  max_keep_ckpts: 5
# Logging.
log_config:
  interval: 50
  hooks:
    - type: TextLoggerHook
    - type: TensorboardLoggerHook
# Runtime settings.
dist_params:
  backend: nccl
log_level: INFO
work_dir: runs/three_tasks
# Currently null, so no checkpoint is loaded. The original note indicates the
# intent is to initialise from the already-trained multi-task model; set the
# checkpoint path here to do so.
load_from: null
resume_from: null
workflow: [['train', 1]]