# Configuration for a camera-branch model: a Swin Transformer image backbone
# with an FPN-style neck and a view transform, followed by a ResNet-style
# decoder. Also sets per-parameter optimizer overrides, cyclic learning-rate
# and momentum schedules, and the per-GPU batch size.
#
# NOTE(review): the block nesting below was reconstructed from a source whose
# line structure had been collapsed; parent/child relationships were inferred
# from key/value adjacency and repeated keys. Confirm the nesting against the
# consuming framework's schema before relying on it.

model:
  encoders:
    camera:
      backbone:
        type: SwinTransformer
        embed_dims: 96
        depths: [2, 2, 6, 2]
        num_heads: [3, 6, 12, 24]
        window_size: 7
        mlp_ratio: 4
        qkv_bias: true
        qk_scale: null
        # Written as 0.0 (canonical float form) rather than the bare `0.`.
        drop_rate: 0.0
        attn_drop_rate: 0.0
        drop_path_rate: 0.2
        patch_norm: true
        out_indices: [1, 2, 3]
        with_cp: false
        convert_weights: true
        init_cfg:
          type: Pretrained
          # Long URL kept on one line: a plain scalar cannot be wrapped
          # without changing its value.
          checkpoint: https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pth
      neck:
        type: GeneralizedLSSFPN
        # Three entries, matching the three backbone out_indices above;
        # presumably the channel widths of those stages (verify).
        in_channels: [192, 384, 768]
        out_channels: 256
        start_level: 0
        num_outs: 3
        norm_cfg:
          type: BN2d
          requires_grad: true
        act_cfg:
          type: ReLU
          inplace: true
        upsample_cfg:
          mode: bilinear
          align_corners: false
      vtransform:
        in_channels: 256
        out_channels: 80
        # Interpolation expression resolved by the config loader, not by YAML.
        # `image_size` is not defined in this section and must be supplied
        # elsewhere. Quoted so the parser always treats it as a plain string.
        feature_size: '${[image_size[0] // 8, image_size[1] // 8]}'
        # Presumably [min, max, step] triples per axis; confirm with consumer.
        xbound: [-51.2, 51.2, 0.4]
        ybound: [-51.2, 51.2, 0.4]
        zbound: [-10.0, 10.0, 20.0]
        dbound: [1.0, 60.0, 0.5]
        downsample: 2
  decoder:
    backbone:
      type: GeneralizedResNet
      # Same value as vtransform.out_channels above (80).
      in_channels: 80
      # One three-element entry per block; the meaning of each position is
      # defined by the consumer (not visible here).
      blocks:
        - [2, 128, 2]
        - [2, 256, 2]
        - [2, 512, 1]
    neck:
      type: LSSFPN
      in_indices: [-1, 0]
      in_channels: [512, 128]
      out_channels: 256
      scale_factor: 2

optimizer:
  paramwise_cfg:
    # Per-parameter overrides keyed by name. How a key is matched against
    # parameter names is decided by the consumer (not visible here).
    custom_keys:
      absolute_pos_embed:
        decay_mult: 0
      relative_position_bias_table:
        decay_mult: 0
      # The dots are part of a single literal key; this is not a nested path
      # as far as YAML is concerned.
      encoders.camera.backbone:
        lr_mult: 0.1

lr_config:
  policy: cyclic
  target_ratio: 5.0
  cyclic_times: 1
  step_ratio_up: 0.4

momentum_config:
  policy: cyclic
  cyclic_times: 1
  step_ratio_up: 0.4

data:
  samples_per_gpu: 4