bev-project/configs/nuscenes/multitask/fusion-det-seg-swint-enhanc...

# Enhanced multi-task config: detection + enhanced segmentation head
#
# Improvements:
# 1. EnhancedBEVSegmentationHead (ASPP + dual attention + deeper decoder)
# 2. Focal loss fix (alpha=0.25)
# 3. Dice loss mixed in (weight=0.5)
# 4. Class-weight balancing (stop_line ×4, divider ×3)
# 5. Deep supervision
# 6. Segmentation loss weight ×5
# 7. Lower learning rate (weights are reused)
#
# Reuses: epoch_19.pth (93.7% of parameters)
# Expected: mIoU 60-65% (2-3 days of training)
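#
# Assumption (not spelled out in this file): the 93.7% of reused parameters are
# the backbones, fuser and detection head loaded from epoch_19.pth, while the
# new EnhancedBEVSegmentationHead is trained from scratch; the checkpoint itself
# is loaded outside this config (training script / base config).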
_base_: ./convfuser.yaml
# Add the enhanced segmentation head and its pipeline on top of the detection base config.
model:
  # Make sure the fuser is configured correctly.
  fuser:
    type: ConvFuser
    in_channels: [80, 256]
    out_channels: 256
  heads:
    # The object head is inherited from convfuser.yaml.
    # Add the enhanced map head.
    map:
      type: EnhancedBEVSegmentationHead  # ← use the enhanced head
      in_channels: 512
      grid_transform:
        input_scope: [[-54.0, 54.0, 0.75], [-54.0, 54.0, 0.75]]
        output_scope: [[-50, 50, 0.5], [-50, 50, 0.5]]
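      # Resulting grid sizes (derived from the scopes above): the incoming BEV
      # feature map covers 108 m at 0.75 m resolution (144x144) and is resampled
      # to a 100 m, 0.5 m output grid (200x200) on which the losses are computed.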
      classes: ${map_classes}
      loss: focal
      # Class weights to counter the nuScenes class imbalance.
      loss_weight:
        drivable_area: 1.0
        ped_crossing: 3.0
        walkway: 1.5
        stop_line: 4.0
        carpark_area: 2.0
        divider: 3.0
      # Focal loss parameters.
      focal_alpha: 0.25
      focal_gamma: 2.0
      # Dice loss configuration.
      use_dice_loss: true
      dice_weight: 0.5
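      # Assumed loss composition inside EnhancedBEVSegmentationHead (sketch, not
      # verified against the head's code): per class c,
      #   L_c = w_c * FocalLoss(alpha=0.25, gamma=2.0) + 0.5 * DiceLoss
      # with w_c taken from loss_weight above.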
      # Deep supervision.
      deep_supervision: true
      # Decoder configuration.
      decoder_channels: [256, 256, 128, 128]
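      # Assumption: decoder_channels lists the widths of four successive decoder
      # stages, and deep_supervision adds auxiliary segmentation losses on the
      # intermediate decoder outputs in addition to the final prediction.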
  # Loss weights: segmentation scaled ×5.
  loss_scale:
    object: 1.0
    map: 5.0
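  # With these scales the total objective should be (assuming loss_scale simply
  # multiplies each head's summed loss): L_total = 1.0 * L_det + 5.0 * L_seg.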
# Training epochs.
max_epochs: 20
# Optimizer: lower learning rate, since most weights are reused from epoch_19.pth.
optimizer:
  lr: 1.0e-4
# Collect3D below is extended to also collect gt_masks_bev for the map head.
train_pipeline:
  -
    type: LoadMultiViewImageFromFiles
    to_float32: true
  -
    type: LoadPointsFromFile
    coord_type: LIDAR
    load_dim: ${load_dim}
    use_dim: ${use_dim}
    reduce_beams: ${reduce_beams}
    load_augmented: ${load_augmented}
  -
    type: LoadPointsFromMultiSweeps
    sweeps_num: 9
    load_dim: ${load_dim}
    use_dim: ${use_dim}
    reduce_beams: ${reduce_beams}
    pad_empty_sweeps: true
    remove_close: true
    load_augmented: ${load_augmented}
  -
    type: LoadAnnotations3D
    with_bbox_3d: true
    with_label_3d: true
    with_attr_label: false
  -
    type: ObjectPaste
    stop_epoch: ${gt_paste_stop_epoch}
    db_sampler:
      dataset_root: ${dataset_root}
      info_path: ${dataset_root + "nuscenes_dbinfos_train.pkl"}
      rate: 1.0
      prepare:
        filter_by_difficulty: [-1]
        filter_by_min_points:
          car: 5
          truck: 5
          bus: 5
          trailer: 5
          construction_vehicle: 5
          traffic_cone: 5
          barrier: 5
          motorcycle: 5
          bicycle: 5
          pedestrian: 5
      classes: ${object_classes}
      sample_groups:
        car: 2
        truck: 3
        construction_vehicle: 7
        bus: 4
        trailer: 6
        barrier: 2
        motorcycle: 6
        bicycle: 6
        pedestrian: 2
        traffic_cone: 2
      points_loader:
        type: LoadPointsFromFile
        coord_type: LIDAR
        load_dim: ${load_dim}
        use_dim: ${use_dim}
        reduce_beams: ${reduce_beams}
  -
    type: ImageAug3D
    final_dim: ${image_size}
    resize_lim: ${augment2d.resize[0]}
    bot_pct_lim: [0.0, 0.0]
    rot_lim: ${augment2d.rotate}
    rand_flip: true
    is_train: true
  -
    type: GlobalRotScaleTrans
    resize_lim: ${augment3d.scale}
    rot_lim: ${augment3d.rotate}
    trans_lim: ${augment3d.translate}
    is_train: true
  -
    type: LoadBEVSegmentation
    dataset_root: ${dataset_root}
    xbound: [-50.0, 50.0, 0.5]
    ybound: [-50.0, 50.0, 0.5]
    classes: ${map_classes}
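    # Note: xbound/ybound (100 m at 0.5 m) and classes match the map head's
    # output_scope and ${map_classes}, so the 200x200 GT masks collected as
    # gt_masks_bev should line up pixel-for-pixel with the head's predictions.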
  -
    type: RandomFlip3D
  -
    type: PointsRangeFilter
    point_cloud_range: ${point_cloud_range}
  -
    type: ObjectRangeFilter
    point_cloud_range: ${point_cloud_range}
  -
    type: ObjectNameFilter
    classes: ${object_classes}
  -
    type: ImageNormalize
    mean: [0.485, 0.456, 0.406]
    std: [0.229, 0.224, 0.225]
  -
    type: GridMask
    use_h: true
    use_w: true
    max_epoch: ${max_epochs}
    rotate: 1
    offset: false
    ratio: 0.5
    mode: 1
    prob: ${augment2d.gridmask.prob}
    fixed_prob: ${augment2d.gridmask.fixed_prob}
  -
    type: PointShuffle
  -
    type: DefaultFormatBundle3D
    classes: ${object_classes}
  -
    type: Collect3D
    keys:
      - img
      - points
      - gt_bboxes_3d
      - gt_labels_3d
      - gt_masks_bev
    meta_keys:
      - camera_intrinsics
      - camera2ego
      - lidar2ego
      - lidar2camera
      - camera2lidar
      - lidar2image
      - img_aug_matrix
      - lidar_aug_matrix
  -
    type: GTDepth
    keyframe_only: true
test_pipeline:
  -
    type: LoadMultiViewImageFromFiles
    to_float32: true
  -
    type: LoadPointsFromFile
    coord_type: LIDAR
    load_dim: ${load_dim}
    use_dim: ${use_dim}
    reduce_beams: ${reduce_beams}
    load_augmented: ${load_augmented}
  -
    type: LoadPointsFromMultiSweeps
    sweeps_num: 9
    load_dim: ${load_dim}
    use_dim: ${use_dim}
    reduce_beams: ${reduce_beams}
    pad_empty_sweeps: true
    remove_close: true
    load_augmented: ${load_augmented}
  -
    type: LoadAnnotations3D
    with_bbox_3d: true
    with_label_3d: true
    with_attr_label: false
  -
    type: ImageAug3D
    final_dim: ${image_size}
    resize_lim: ${augment2d.resize[1]}
    bot_pct_lim: [0.0, 0.0]
    rot_lim: [0.0, 0.0]
    rand_flip: false
    is_train: false
  -
    type: GlobalRotScaleTrans
    resize_lim: [1.0, 1.0]
    rot_lim: [0.0, 0.0]
    trans_lim: 0.0
    is_train: false
  -
    type: LoadBEVSegmentation
    dataset_root: ${dataset_root}
    xbound: [-50.0, 50.0, 0.5]
    ybound: [-50.0, 50.0, 0.5]
    classes: ${map_classes}
  -
    type: PointsRangeFilter
    point_cloud_range: ${point_cloud_range}
  -
    type: ImageNormalize
    mean: [0.485, 0.456, 0.406]
    std: [0.229, 0.224, 0.225]
  -
    type: DefaultFormatBundle3D
    classes: ${object_classes}
  -
    type: Collect3D
    keys:
      - img
      - points
      - gt_bboxes_3d
      - gt_labels_3d
      - gt_masks_bev
    meta_keys:
      - camera_intrinsics
      - camera2ego
      - lidar2ego
      - lidar2camera
      - camera2lidar
      - lidar2image
      - img_aug_matrix
      - lidar_aug_matrix
  -
    type: GTDepth
    keyframe_only: true