# bev-project/ANALYZE_NETWORK_CONFIG.py

#!/usr/bin/env python
"""
分析multitask_BEV2X_phase4b_rmtppad_segmentation.yaml配置中的网络结构和特征尺寸
"""
import yaml
import torch
import numpy as np

def _grid_cells(lower, upper, step):
    """Return how many whole cells of size ``step`` fit between ``lower`` and ``upper``."""
    # Truncation (not rounding) is intentional: 100 m / 0.167 m -> 598 cells.
    return int((upper - lower) / step)


def _print_section(title):
    """Print a section title preceded by a blank line and followed by a dashed rule."""
    print(f"\n{title}")
    print("-" * 50)


def analyze_network_config():
    """Print a walkthrough of the BEVFusion Phase 4B network and its feature sizes.

    All numbers come from the ``config_params`` dictionary hard-coded below
    (a hand-copied summary of the YAML config named in the module docstring);
    no file is read. The report covers the inputs, camera encoder, LiDAR
    encoder, fuser, decoder, task-specific GCA, segmentation head, a rough
    float32 memory estimate of the main feature maps, and a one-line summary
    of each component.

    Returns:
        None. The report is written to standard output.
    """
    print("=" * 100)
    print("🎯 BEVFusion Phase 4B 网络结构与特征尺寸分析")
    print("=" * 100)

    # Hand-copied configuration parameters.
    config_params = {
        # Input specification
        'input': {
            'camera': {'views': 6, 'size': [256, 704], 'channels': 3},
            'lidar': {'points': '32线', 'range': [-54, 54], 'voxel_size': [0.075, 0.075, 0.2]}
        },

        # Camera encoder
        'camera_encoder': {
            'backbone': {'type': 'SwinTransformer', 'embed_dims': 96, 'depths': [2, 2, 6, 2], 'num_heads': [3, 6, 12, 24]},
            'neck': {'in_channels': [192, 384, 768], 'out_channels': 256, 'num_outs': 3},
            'vtransform': {'in_channels': 256, 'out_channels': 80, 'image_size': [256, 704], 'feature_size': [32, 88]}
        },

        # LiDAR encoder
        'lidar_encoder': {
            'voxelize': {'max_voxels': [120000, 160000], 'voxel_size': [0.075, 0.075, 0.2]},
            'backbone': {'sparse_shape': [1440, 1440, 41], 'output_channels': 128}
        },

        # Decoder
        'decoder': {
            'backbone': {'in_channels': 256, 'out_channels': [128, 256], 'layer_nums': [5, 5], 'layer_strides': [1, 2]},
            'neck': {'in_channels': [128, 256], 'out_channels': [256, 256], 'upsample_strides': [1, 2]}
        },

        # Segmentation head
        'segmentation_head': {
            'in_channels': 512,
            'transformer_hidden_dim': 256,
            'transformer_C': 64,
            'transformer_num_layers': 2,
            'grid_transform': {
                'input_scope': [[-54.0, 54.0, 0.75], [-54.0, 54.0, 0.75]],
                'output_scope': [[-50, 50, 0.167], [-50, 50, 0.167]]
            }
        }
    }

    input_spec = config_params['input']
    camera_spec = config_params['camera_encoder']
    lidar_spec = config_params['lidar_encoder']
    decoder_spec = config_params['decoder']
    seg_head = config_params['segmentation_head']

    cam_input = input_spec['camera']
    lidar_input = input_spec['lidar']
    sparse_shape = lidar_spec['backbone']['sparse_shape']  # [1440, 1440, 41]

    # 1. Input specification
    _print_section("📥 1. 输入数据规格")

    img_h, img_w = cam_input['size']
    print("相机输入:")
    print(f"├── 视角数量: {cam_input['views']}")
    print(f"├── 图像尺寸: {img_h}×{img_w}")
    print(f"├── 通道数: {cam_input['channels']} (RGB)")
    print(f"└── 总像素: {img_h * img_w * cam_input['views']:,}")

    print("\nLiDAR输入:")
    print(f"├── 激光雷达: {lidar_input['points']}")
    print(f"├── 检测范围: {lidar_input['range'][0]}m ~ {lidar_input['range'][1]}m")
    print(f"├── 体素尺寸: {lidar_input['voxel_size']}m")
    print(f"└── 稀疏形状: {sparse_shape} (基于配置文件)")

    # 2. Camera encoder feature sizes
    _print_section("📷 2. Camera Encoder特征尺寸变化")

    swin = camera_spec['backbone']
    print("SwinTransformer Backbone:")
    print(f"├── 输入: {img_h}×{img_w}×{cam_input['channels']}")
    print(f"├── Patch Embed: {swin['embed_dims']}通道")
    print(f"├── 层数分布: {swin['depths']}")
    print(f"├── 注意力头: {swin['num_heads']}")

    # Stage 1 works on the 4x4 patch grid; every later stage halves the
    # spatial size and doubles the channel count.
    embed_dim = swin['embed_dims']
    current_h, current_w = img_h // 4, img_w // 4
    stage_outputs = []
    for stage_idx, _depth in enumerate(swin['depths'], start=1):
        if stage_idx > 1:
            current_h, current_w = current_h // 2, current_w // 2
            embed_dim *= 2

        stage_outputs.append({
            'stage': stage_idx,
            'channels': embed_dim,
            'height': current_h,
            'width': current_w,
            'tokens': current_h * current_w
        })

        # The printed count is channels * height * width of the stage output
        # (number of feature-map elements).
        print(f"├── Stage {stage_idx}: {embed_dim}ch × {current_h}×{current_w} = {embed_dim * current_h * current_w:,} 参数")

    print("\nGeneralizedLSSFPN Neck:")
    neck_in = camera_spec['neck']['in_channels']
    neck_out = camera_spec['neck']['out_channels']
    print(f"├── 输入通道: {neck_in}")
    print(f"├── 输出通道: {neck_out} (统一)")
    print(f"├── 输出层数: {camera_spec['neck']['num_outs']}")

    # The neck levels are assumed to keep the spatial size of backbone
    # stages 2-4, hence the offset of one into stage_outputs.
    for level, stage in enumerate(stage_outputs[1:1 + len(neck_in)], start=1):
        print(f"├── Level {level}: {neck_out}ch × {stage['height']}×{stage['width']}")

    print("\nDepthLSSTransform (BEV投影):")
    vtrans = camera_spec['vtransform']
    print(f"├── 输入通道: {vtrans['in_channels']}")
    print(f"├── 输出通道: {vtrans['out_channels']}")
    print(f"├── 图像尺寸: {vtrans['image_size']}")
    print(f"├── 特征尺寸: {vtrans['feature_size']}")

    # BEV grid size. The cell count is range / resolution with no "+ 1",
    # matching how sparse_shape relates to the voxel size (108 m / 0.075 m = 1440).
    bev_lower, bev_upper = lidar_input['range']  # -54, 54
    bev_range = bev_upper - bev_lower  # 108
    bev_resolution = 0.2  # per the original note: taken from the xbound config
    bev_pixels = _grid_cells(bev_lower, bev_upper, bev_resolution)
    print(f"├── BEV范围: [-54, 54]m × [-54, 54]m = {bev_range}m × {bev_range}m")
    print(f"├── BEV分辨率: {bev_resolution}m/像素")
    print(f"├── BEV尺寸: {bev_pixels}×{bev_pixels} 像素")
    print(f"└── Camera BEV特征: {vtrans['out_channels']}ch × {bev_pixels}×{bev_pixels}")

    # 3. LiDAR encoder feature sizes
    _print_section("🔍 3. LiDAR Encoder特征尺寸变化")

    print("体素化 (Voxelization):")
    voxelize = lidar_spec['voxelize']
    print(f"├── 最大体素数: {voxelize['max_voxels']}")
    print(f"├── 体素尺寸: {voxelize['voxel_size']}m")
    print(f"└── 稀疏形状: {sparse_shape}")

    print("\nSparse Encoder Backbone:")
    out_channels = lidar_spec['backbone']['output_channels']
    print(f"├── 稀疏形状: {sparse_shape}")
    print(f"├── 输出通道: {out_channels}")

    # Physical extent covered by the voxel grid in x and y.
    actual_size = [cells * size for cells, size in zip(sparse_shape[:2], voxelize['voxel_size'][:2])]
    print(f"├── 空间覆盖: {actual_size[0]:.1f}m × {actual_size[1]:.1f}m")
    # NOTE(review): the LiDAR BEV map is reported at the full sparse_shape
    # resolution; confirm whether the sparse encoder downsamples in x/y.
    print(f"└── LiDAR BEV特征: {out_channels}ch × {sparse_shape[0]}×{sparse_shape[1]}")

    # 4. Fusion layer
    _print_section("🔗 4. 融合层 (Fusion)")

    camera_bev_channels = vtrans['out_channels']  # 80
    lidar_bev_channels = out_channels  # 128
    fused_channels = 256  # per the original note: taken from the fuser config

    print("ConvFuser:")
    print(f"├── Camera BEV: {camera_bev_channels}ch × {bev_pixels}×{bev_pixels}")
    print(f"├── LiDAR BEV: {lidar_bev_channels}ch × {sparse_shape[0]}×{sparse_shape[1]}")
    print(f"├── 融合后: {fused_channels}ch × {sparse_shape[0]}×{sparse_shape[1]}")
    print("└── 融合方式: 通道级拼接 + 1×1卷积")

    # 5. Decoder feature sizes
    _print_section("🔄 5. Decoder特征尺寸变化")

    print("SECOND Backbone:")
    second_cfg = decoder_spec['backbone']
    second_out = second_cfg['out_channels']  # [128, 256]
    layer_strides = second_cfg['layer_strides']  # [1, 2]

    print(f"├── 输入通道: {second_cfg['in_channels']}")
    print(f"├── 输出通道: {second_out}")
    print(f"├── 层数: {second_cfg['layer_nums']}")
    print(f"├── 步长: {layer_strides}")

    # Each SECOND stage divides the running spatial size by its stride.
    second_features = []
    running_size = sparse_shape[0]  # 1440
    for stage_idx, (channels, stride) in enumerate(zip(second_out, layer_strides), start=1):
        running_size //= stride
        second_features.append({
            'stage': stage_idx,
            'channels': channels,
            'size': running_size
        })
        marker = "└──" if stage_idx == len(second_out) else "├──"
        print(f"{marker} Stage {stage_idx}: {channels}ch × {running_size}×{running_size}")

    print("\nSECONDFPN Neck:")
    fpn_cfg = decoder_spec['neck']
    fpn_features = []
    levels = zip(fpn_cfg['in_channels'], fpn_cfg['out_channels'], fpn_cfg['upsample_strides'], second_features)
    for level, (in_ch, out_ch, stride, feat) in enumerate(levels, start=1):
        out_size = feat['size'] * stride  # stride 1 keeps the size, stride 2 upsamples
        fpn_features.append({
            'level': level,
            'channels': out_ch,
            'size': out_size
        })
        print(f"├── Level {level}: {in_ch}ch → {out_ch}ch, {feat['size']}×{feat['size']} → {out_size}×{out_size}")

    # Final BEV neck output: the report uses the last FPN level.
    bev_neck_output = fpn_features[-1]
    print(f"└── BEV特征: {bev_neck_output['channels']}ch × {bev_neck_output['size']}×{bev_neck_output['size']}")

    # 6. Task-specific GCA
    _print_section("🎯 6. Task-specific GCA")

    gca_input_channels = 512  # BEV feature channel count
    gca_reduction = 4

    print("全局上下文聚合 (GCA):")
    print(f"├── 输入通道: {gca_input_channels}")
    print(f"├── 降维比例: {gca_reduction}x")
    print(f"├── 压缩通道: {gca_input_channels // gca_reduction}")
    print(f"├── 检测GCA: {gca_reduction}x降维 → 检测优化特征")
    print(f"└── 分割GCA: {gca_reduction}x降维 → 分割优化特征")

    # 7. Segmentation head feature sizes
    _print_section("🎨 7. BEV分割头特征尺寸")

    print("RMT-PPAD Transformer解码器:")
    print(f"├── 输入通道: {seg_head['in_channels']}")
    print(f"├── Transformer隐藏维: {seg_head['transformer_hidden_dim']}")
    print(f"├── Transformer C: {seg_head['transformer_C']}")
    print(f"├── Transformer层数: {seg_head['transformer_num_layers']}")

    # Grid transform: only the first axis is inspected because both axes
    # share the same scope in the configuration above.
    grid_trans = seg_head['grid_transform']
    input_res = grid_trans['input_scope'][0][2]  # 0.75 m/px
    input_pixels = _grid_cells(*grid_trans['input_scope'][0])  # 144
    output_res = grid_trans['output_scope'][0][2]  # 0.167 m/px
    output_pixels = _grid_cells(*grid_trans['output_scope'][0])  # 598

    print("\nBEV Grid Transform:")
    print(f"├── 输入: {input_pixels}×{input_pixels} ({input_res}m/px)")
    print(f"├── 输出: {output_pixels}×{output_pixels} ({output_res}m/px)")
    print(f"└── 放大倍数: {output_pixels / input_pixels:.1f}x")

    print("\n最终输出:")
    print(f"├── 分割图: 6类别 × {output_pixels}×{output_pixels}")
    print(f"├── 分辨率: {output_res}m/像素")
    print("├── 覆盖范围: -50m ~ 50m")
    print(f"└── 总像素数: {6 * output_pixels ** 2:,}")

    # 8. Memory estimate
    _print_section("💾 8. 内存与计算量估算")

    # Size in bytes of the main feature maps for one sample.
    bytes_per_value = 4  # float32
    bev_cells = sparse_shape[0] * sparse_shape[1]
    memory_usage = {
        'Camera BEV': bev_pixels * bev_pixels * camera_bev_channels * bytes_per_value,
        'LiDAR BEV': bev_cells * lidar_bev_channels * bytes_per_value,
        'Fused BEV': bev_cells * fused_channels * bytes_per_value,
        'BEV Neck': bev_neck_output['size'] * bev_neck_output['size'] * bev_neck_output['channels'] * bytes_per_value,
        'Segmentation': output_pixels ** 2 * 6 * bytes_per_value
    }

    print("主要特征图内存占用 (单batch, float32):")
    bytes_per_mb = 1024 * 1024
    total_memory = 0
    for name, mem_bytes in memory_usage.items():
        mem_mb = mem_bytes / bytes_per_mb
        total_memory += mem_mb
        # Previously this printed the literal text "8.1f"; report the
        # per-feature-map size instead.
        print(f"├── {name}: {mem_mb:8.1f} MB")

    print(f"└── 总计: {total_memory:.1f} MB")

    # 9. Architecture summary
    _print_section("🏗️ 9. 网络架构总结")

    architecture_summary = [
        ("Camera Encoder", "6视角图像 → SwinTransformer → LSS → BEV特征 (80ch)"),
        ("LiDAR Encoder", "点云 → 体素化 → SparseEncoder → BEV特征 (128ch)"),
        ("Fusion", "Camera + LiDAR → ConvFuser → 融合特征 (256ch)"),
        ("BEV Decoder", "SECOND + SECONDFPN → 高分辨率BEV (256ch × 1440×1440)"),
        ("Task GCA", "检测GCA + 分割GCA → 任务特定特征优化"),
        ("Segmentation Head", "RMT-PPAD Transformer → 6类别分割 (598×598)"),
        ("Detection Head", "TransFusion → 10类别3D检测")
    ]

    last_index = len(architecture_summary) - 1
    for i, (component, description) in enumerate(architecture_summary):
        marker = "├──" if i < last_index else "└──"
        print(f"{marker} {component}: {description}")

    print("\n" + "=" * 100)
    print("🏁 网络配置分析完成！Phase 4B架构清晰！")
    print("=" * 100)

# Script entry point: print the report only when executed directly, not on import.
if __name__ == "__main__":
    analyze_network_config()