# File listing metadata: 154 lines, 3.5 KiB, YAML
# BEVFusion multi-task config: detection + segmentation (SwinTransformer variant).
# NOTE(review): the keys in this file presumably override values inherited from
# the base config listed under `_base_` — confirm against the config loader.
_base_:
  - ../det/transfusion/secfpn/camera+lidar/swint_v0p075/convfuser.yaml

# Voxel edge lengths and the point-cloud bounds; both are referenced further
# down through ${voxel_size} and ${point_cloud_range}.
# NOTE(review): axis order is presumably [x, y, z] for voxel_size and
# [x_min, y_min, z_min, x_max, y_max, z_max] for the range — confirm.
voxel_size: [0.075, 0.075, 0.2]
point_cloud_range: [-54.0, -54.0, -5.0, 54.0, 54.0, 3.0]

# Model settings for the camera + lidar fusion network with two heads
# (`object` and `map`).
# NOTE(review): the nesting below was reconstructed from a listing that had lost
# its indentation; confirm it (in particular the placement of `loss_scale`)
# against the base config.
model:
  encoders:
    # Camera branch: backbone, neck and view-transform settings.
    camera:
      backbone:
        type: SwinTransformer
        embed_dims: 96
        depths: [2, 2, 6, 2]
        num_heads: [3, 6, 12, 24]
        window_size: 7
        mlp_ratio: 4
        qkv_bias: true
        qk_scale: null
        drop_rate: 0.0
        attn_drop_rate: 0.0
        drop_path_rate: 0.2
        patch_norm: true
        out_indices: [1, 2, 3]
        with_cp: false
        convert_weights: true
        # Backbone weights are initialised from a local checkpoint file.
        init_cfg:
          type: Pretrained
          checkpoint: pretrained/swint-nuimages-pretrained.pth
      neck:
        type: GeneralizedLSSFPN
        # Three inputs, matching the three entries of backbone.out_indices.
        in_channels: [192, 384, 768]
        out_channels: 256
        start_level: 0
        num_outs: 3
        norm_cfg:
          type: BN2d
          requires_grad: true
        act_cfg:
          type: ReLU
          inplace: true
        upsample_cfg:
          mode: bilinear
          align_corners: false
      vtransform:
        type: DepthLSSTransform
        # Equals neck.out_channels.
        in_channels: 256
        out_channels: 80
        image_size: ${image_size}
        feature_size: ${[image_size[0] // 8, image_size[1] // 8]}
        # xbound/ybound cover the same [-54, 54] extent as point_cloud_range
        # in x and y.
        # NOTE(review): triples are presumably [min, max, step] — confirm.
        xbound: [-54.0, 54.0, 0.3]
        ybound: [-54.0, 54.0, 0.3]
        zbound: [-10.0, 10.0, 20.0]
        dbound: [1.0, 60.0, 0.5]
        downsample: 2

    # Lidar branch: voxelization followed by a sparse backbone.
    lidar:
      voxelize:
        max_num_points: 10
        point_cloud_range: ${point_cloud_range}
        voxel_size: ${voxel_size}
        max_voxels: [120000, 160000]
      backbone:
        type: SparseEncoder
        in_channels: 5
        # (54 - (-54)) / 0.075 = 1440 cells in x and y;
        # (3 - (-5)) / 0.2 = 40 in z, listed here as 41.
        sparse_shape: [1440, 1440, 41]
        output_channels: 128
        order:
          - conv
          - norm
          - act
        encoder_channels:
          - [16, 16, 32]
          - [32, 32, 64]
          - [64, 64, 128]
          - [128, 128]
        encoder_paddings:
          - [0, 0, 1]
          - [0, 0, 1]
          - [0, 0, [1, 1, 0]]
          - [0, 0]
        block_type: basicblock

  fuser:
    type: ConvFuser
    # 80 equals the camera vtransform out_channels.
    # NOTE(review): 256 is presumably the lidar feature width after the sparse
    # backbone — confirm.
    in_channels: [80, 256]
    out_channels: 256

  decoder:
    backbone:
      type: SECOND
      # Equals fuser.out_channels.
      in_channels: 256
      out_channels: [128, 256]
      layer_nums: [5, 5]
      layer_strides: [1, 2]
      norm_cfg:
        type: BN
        eps: 1.0e-3
        momentum: 0.01
      conv_cfg:
        type: Conv2d
        bias: false
    neck:
      type: SECONDFPN
      # Equals decoder.backbone.out_channels.
      in_channels: [128, 256]
      out_channels: [256, 256]
      upsample_strides: [1, 2]
      norm_cfg:
        type: BN
        eps: 1.0e-3
        momentum: 0.01
      upsample_cfg:
        type: deconv
        bias: false
      use_conv_for_no_stride: true

  heads:
    # 3D detection head.
    object:
      # 512 = 256 + 256, the two decoder.neck.out_channels entries.
      # NOTE(review): presumably the neck outputs are concatenated — confirm.
      in_channels: 512
      # grid_size repeats the lidar backbone sparse_shape.
      train_cfg:
        grid_size: [1440, 1440, 41]
      test_cfg:
        grid_size: [1440, 1440, 41]

    # BEV segmentation head.
    map:
      in_channels: 512
      grid_transform:
        # The step (third entry) was 0.75, which does not match the rest of
        # this file: xbound/ybound use a 0.3 step with downsample 2 (0.6), and
        # a 1440-cell grid over the 108-wide extent reduced 8x also gives 0.6.
        # NOTE(review): the 8x lidar reduction, and whether the head reads
        # this step at all, are assumptions — confirm against the model code.
        input_scope: [[-54.0, 54.0, 0.6], [-54.0, 54.0, 0.6]]
        output_scope: [[-50, 50, 0.5], [-50, 50, 0.5]]

  # Loss weights, one per head.
  # NOTE(review): placed under `model` on the assumption that the model
  # consumes it as a constructor argument — confirm.
  loss_scale:
    object: 1.0
    map: 1.0

# Training hyperparameters.
max_epochs: 20

# Learning-rate schedule: CosineAnnealing policy with a linear warmup.
# NOTE(review): warmup_iters is presumably counted in training iterations, and
# the two ratios are presumably relative to the base learning rate — confirm.
lr_config:
  policy: CosineAnnealing
  warmup: linear
  warmup_iters: 500
  warmup_ratio: 0.33333333
  min_lr_ratio: 1.0e-3

# Logging hooks.
log_config:
  # NOTE(review): presumably the number of iterations between log entries.
  interval: 50
  hooks:
    - type: TextLoggerHook
    # Optional: uncomment the next line to enable TensorBoard logging.
    # - type: TensorboardLoggerHook