---
# Model / optimizer configuration.
# `lidar` and `fuser` are explicitly null, so only the camera encoder is
# configured here.
# NOTE(review): nesting was reconstructed from a whitespace-collapsed source;
# verify the hierarchy against the consuming config loader.

model:
  encoders:
    lidar: null
    camera:
      backbone:
        type: SwinTransformer
        embed_dims: 96
        depths: [2, 2, 6, 2]
        num_heads: [3, 6, 12, 24]
        window_size: 7
        mlp_ratio: 4
        qkv_bias: true
        qk_scale: null
        drop_rate: 0.0
        attn_drop_rate: 0.0
        drop_path_rate: 0.3
        patch_norm: true
        out_indices: [1, 2, 3]
        with_cp: false
        convert_weights: true
        init_cfg:
          type: Pretrained
          # Kept on one line: folding a URL would insert whitespace.
          checkpoint: https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pth
      neck:
        type: GeneralizedLSSFPN
        # presumably the backbone channel widths at out_indices [1, 2, 3]
        # (96 * 2, 96 * 4, 96 * 8) -- TODO confirm against the backbone.
        in_channels: [192, 384, 768]
        out_channels: 256
        start_level: 0
        num_outs: 3
        norm_cfg:
          type: BN2d
          requires_grad: true
        act_cfg:
          type: ReLU
          inplace: true
        upsample_cfg:
          mode: bilinear
          align_corners: false
      vtransform:
        type: LSSTransform
        # Equals the neck's out_channels (256).
        in_channels: 256
        out_channels: 80
        # `${...}` values are left as plain strings; presumably they are
        # interpolated by the config loader -- confirm. `image_size` is not
        # defined in this file.
        image_size: ${image_size}
        feature_size: ${[image_size[0] // 8, image_size[1] // 8]}
        # NOTE(review): each bound looks like [min, max, step] -- confirm.
        xbound: [-51.2, 51.2, 0.4]
        ybound: [-51.2, 51.2, 0.4]
        zbound: [-10.0, 10.0, 20.0]
        dbound: [1.0, 60.0, 0.5]
        downsample: 2
  fuser: null
  decoder:
    backbone:
      type: GeneralizedResNet
      # Equals vtransform.out_channels (80).
      in_channels: 80
      blocks:
        - [2, 160, 2]
        - [2, 320, 2]
        - [2, 640, 1]
    neck:
      type: LSSFPN
      # in_channels [640, 160] equal the middle values of the last and first
      # `blocks` entries, in the order given by in_indices [-1, 0].
      in_indices: [-1, 0]
      in_channels: [640, 160]
      out_channels: 256
      scale_factor: 2

optimizer:
  type: AdamW
  lr: 1.0e-4
  weight_decay: 0.01
  paramwise_cfg:
    custom_keys:
      # decay_mult of 0 for both keys below.
      absolute_pos_embed:
        decay_mult: 0
      relative_position_bias_table:
        decay_mult: 0

optimizer_config:
  grad_clip:
    max_norm: 35
    norm_type: 2

lr_config:
  policy: cyclic

momentum_config:
  policy: cyclic