bev-project/ANALYZE_NETWORK_CONFIG.py

336 lines
14 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python
"""
分析multitask_BEV2X_phase4b_rmtppad_segmentation.yaml配置中的网络结构和特征尺寸
"""
import yaml
import torch
import numpy as np
def _print_section(title: str) -> None:
    """Print a section title followed by a 50-character dashed rule."""
    print(title)
    print("-" * 50)


def _grid_cells(lower: float, upper: float, step: float) -> int:
    """Return how many whole ``step``-sized cells fit between ``lower`` and ``upper``."""
    return int((upper - lower) / step)


def analyze_network_config() -> None:
    """Print a section-by-section report of the BEVFusion Phase 4B network.

    The report is computed from a configuration dictionary hard-coded inside
    this function; no YAML file is read. It walks through the input
    specification, camera encoder, LiDAR encoder, fusion layer, decoder,
    task-specific GCA, segmentation head, a rough float32 memory estimate for
    the main feature maps, and a final architecture summary.

    Returns:
        None. All results are written to standard output.
    """
    banner = "=" * 100
    print(banner)
    print("🎯 BEVFusion Phase 4B 网络结构与特征尺寸分析")
    print(banner)

    # Configuration values transcribed by hand into a nested dict.
    config_params = {
        # Input specification
        'input': {
            'camera': {'views': 6, 'size': [256, 704], 'channels': 3},
            'lidar': {'points': '32线', 'range': [-54, 54], 'voxel_size': [0.075, 0.075, 0.2]}
        },
        # Camera encoder
        'camera_encoder': {
            'backbone': {'type': 'SwinTransformer', 'embed_dims': 96, 'depths': [2, 2, 6, 2], 'num_heads': [3, 6, 12, 24]},
            'neck': {'in_channels': [192, 384, 768], 'out_channels': 256, 'num_outs': 3},
            'vtransform': {'in_channels': 256, 'out_channels': 80, 'image_size': [256, 704], 'feature_size': [32, 88]}
        },
        # LiDAR encoder
        'lidar_encoder': {
            'voxelize': {'max_voxels': [120000, 160000], 'voxel_size': [0.075, 0.075, 0.2]},
            'backbone': {'sparse_shape': [1440, 1440, 41], 'output_channels': 128}
        },
        # Decoder
        'decoder': {
            'backbone': {'in_channels': 256, 'out_channels': [128, 256], 'layer_nums': [5, 5], 'layer_strides': [1, 2]},
            'neck': {'in_channels': [128, 256], 'out_channels': [256, 256], 'upsample_strides': [1, 2]}
        },
        # Segmentation head
        'segmentation_head': {
            'in_channels': 512,
            'transformer_hidden_dim': 256,
            'transformer_C': 64,
            'transformer_num_layers': 2,
            'grid_transform': {
                'input_scope': [[-54.0, 54.0, 0.75], [-54.0, 54.0, 0.75]],
                'output_scope': [[-50, 50, 0.167], [-50, 50, 0.167]]
            }
        }
    }

    camera_in = config_params['input']['camera']
    lidar_in = config_params['input']['lidar']
    camera_spec = config_params['camera_encoder']
    lidar_spec = config_params['lidar_encoder']
    decoder_spec = config_params['decoder']
    seg_head = config_params['segmentation_head']

    img_h, img_w = camera_in['size']
    range_lo, range_hi = lidar_in['range']
    sparse_shape = lidar_spec['backbone']['sparse_shape']

    # ------------------------------------------------------------------
    # 1. Input data specification
    # ------------------------------------------------------------------
    _print_section("\n📥 1. 输入数据规格")
    print("相机输入:")
    print(f"├── 视角数量: {camera_in['views']}")
    print(f"├── 图像尺寸: {img_h}×{img_w}")
    print(f"├── 通道数: {camera_in['channels']} (RGB)")
    print(f"└── 总像素: {img_h * img_w * camera_in['views']:,}")
    print("\nLiDAR输入:")
    print(f"├── 激光雷达: {lidar_in['points']}")
    print(f"├── 检测范围: {range_lo}m ~ {range_hi}m")
    print(f"├── 体素尺寸: {lidar_in['voxel_size']}m")
    print(f"└── 稀疏形状: {sparse_shape} (基于配置文件)")

    # ------------------------------------------------------------------
    # 2. Camera encoder feature sizes
    # ------------------------------------------------------------------
    _print_section("\n📷 2. Camera Encoder特征尺寸变化")
    swin = camera_spec['backbone']
    print("SwinTransformer Backbone:")
    print(f"├── 输入: {img_h}×{img_w}×{camera_in['channels']}")
    print(f"├── Patch Embed: {swin['embed_dims']}通道")
    print(f"├── 层数分布: {swin['depths']}")
    print(f"├── 注意力头: {swin['num_heads']}")

    # Per-stage (height, width) of the backbone output. The initial 4x4 patch
    # embedding divides the image size by 4; every later stage halves the
    # spatial size and doubles the channel count.
    stage_shapes = []
    channels = swin['embed_dims']
    feat_h, feat_w = img_h // 4, img_w // 4
    for stage_no, _depth in enumerate(swin['depths'], start=1):
        if stage_no > 1:
            feat_h, feat_w = feat_h // 2, feat_w // 2
            channels *= 2
        stage_shapes.append((feat_h, feat_w))
        # NOTE(review): the label says "参数" (parameters) but the number is
        # channels × height × width of the feature map — confirm wording.
        print(f"├── Stage {stage_no}: {channels}ch × {feat_h}×{feat_w} = {channels * feat_h * feat_w:,} 参数")

    print("\nGeneralizedLSSFPN Neck:")
    neck_in = camera_spec['neck']['in_channels']
    neck_out = camera_spec['neck']['out_channels']
    print(f"├── 输入通道: {neck_in}")
    print(f"├── 输出通道: {neck_out} (统一)")
    print(f"├── 输出层数: {camera_spec['neck']['num_outs']}")
    # FPN levels are assumed to keep the spatial size of backbone stages
    # 2, 3 and 4 (one level per entry of the neck's in_channels).
    fpn_inputs = stage_shapes[1:1 + len(neck_in)]
    for level, (lvl_h, lvl_w) in enumerate(fpn_inputs, start=1):
        print(f"├── Level {level}: {neck_out}ch × {lvl_h}×{lvl_w}")

    print("\nDepthLSSTransform (BEV投影):")
    vtrans = camera_spec['vtransform']
    print(f"├── 输入通道: {vtrans['in_channels']}")
    print(f"├── 输出通道: {vtrans['out_channels']}")
    print(f"├── 图像尺寸: {vtrans['image_size']}")
    print(f"├── 特征尺寸: {vtrans['feature_size']}")

    # BEV grid size. The resolution is a local constant (the original note
    # says it comes from the xbound setting). The number of cells is
    # range / resolution with no "+1", which is the same convention used for
    # the grid-transform sizes reported in section 7.
    bev_range = range_hi - range_lo
    bev_resolution = 0.2
    bev_pixels = _grid_cells(range_lo, range_hi, bev_resolution)
    print(f"├── BEV范围: [{range_lo}, {range_hi}]m × [{range_lo}, {range_hi}]m = {bev_range}m × {bev_range}m")
    print(f"├── BEV分辨率: {bev_resolution}m/像素")
    print(f"├── BEV尺寸: {bev_pixels}×{bev_pixels} 像素")
    print(f"└── Camera BEV特征: {vtrans['out_channels']}ch × {bev_pixels}×{bev_pixels}")

    # ------------------------------------------------------------------
    # 3. LiDAR encoder feature sizes
    # ------------------------------------------------------------------
    _print_section("\n🔍 3. LiDAR Encoder特征尺寸变化")
    voxelize = lidar_spec['voxelize']
    print("体素化 (Voxelization):")
    print(f"├── 最大体素数: {voxelize['max_voxels']}")
    print(f"├── 体素尺寸: {voxelize['voxel_size']}m")
    print(f"└── 稀疏形状: {sparse_shape}")

    print("\nSparse Encoder Backbone:")
    out_channels = lidar_spec['backbone']['output_channels']
    print(f"├── 稀疏形状: {sparse_shape}")
    print(f"├── 输出通道: {out_channels}")
    # Physical extent covered by the voxel grid in x and y (cells × cell size).
    extent_x, extent_y = (
        cells * size for cells, size in zip(sparse_shape[:2], voxelize['voxel_size'][:2])
    )
    print(f"├── 空间覆盖: {extent_x:.1f}m × {extent_y:.1f}m")
    print(f"└── LiDAR BEV特征: {out_channels}ch × {sparse_shape[0]}×{sparse_shape[1]}")

    # ------------------------------------------------------------------
    # 4. Fusion layer
    # ------------------------------------------------------------------
    _print_section("\n🔗 4. 融合层 (Fusion)")
    camera_bev_channels = vtrans['out_channels']
    lidar_bev_channels = out_channels
    fused_channels = 256  # local constant; original note: from the fuser config
    print("ConvFuser:")
    print(f"├── Camera BEV: {camera_bev_channels}ch × {bev_pixels}×{bev_pixels}")
    print(f"├── LiDAR BEV: {lidar_bev_channels}ch × {sparse_shape[0]}×{sparse_shape[1]}")
    print(f"├── 融合后: {fused_channels}ch × {sparse_shape[0]}×{sparse_shape[1]}")
    print("└── 融合方式: 通道级拼接 + 1×1卷积")

    # ------------------------------------------------------------------
    # 5. Decoder feature sizes
    # ------------------------------------------------------------------
    _print_section("\n🔄 5. Decoder特征尺寸变化")
    second = decoder_spec['backbone']
    second_out = second['out_channels']
    layer_strides = second['layer_strides']
    print("SECOND Backbone:")
    print(f"├── 输入通道: {second['in_channels']}")
    print(f"├── 输出通道: {second_out}")
    print(f"├── 层数: {second['layer_nums']}")
    print(f"├── 步长: {layer_strides}")

    # Each SECOND stage divides the running spatial size by its stride.
    second_sizes = []
    running_size = sparse_shape[0]
    last_stage = len(second_out)
    for stage_no, (stage_ch, stride) in enumerate(zip(second_out, layer_strides), start=1):
        running_size //= stride
        second_sizes.append(running_size)
        marker = "└──" if stage_no == last_stage else "├──"
        print(f"{marker} Stage {stage_no}: {stage_ch}ch × {running_size}×{running_size}")

    print("\nSECONDFPN Neck:")
    second_fpn = decoder_spec['neck']
    fpn_levels = []
    for level, (in_ch, out_ch, stride, in_size) in enumerate(
        zip(second_fpn['in_channels'], second_fpn['out_channels'],
            second_fpn['upsample_strides'], second_sizes),
        start=1,
    ):
        # Upsampling multiplies the spatial size by the stride (1 keeps it).
        out_size = in_size * stride
        fpn_levels.append((out_ch, out_size))
        print(f"├── Level {level}: {in_ch}ch → {out_ch}ch, {in_size}×{in_size} → {out_size}×{out_size}")

    # The last FPN level is reported as the final BEV neck output.
    neck_channels, neck_size = fpn_levels[-1]
    print(f"└── BEV特征: {neck_channels}ch × {neck_size}×{neck_size}")

    # ------------------------------------------------------------------
    # 6. Task-specific GCA
    # ------------------------------------------------------------------
    _print_section("\n🎯 6. Task-specific GCA")
    # Hard-coded; note that this is not the channel count printed for the
    # BEV neck output above.
    gca_input_channels = 512
    gca_reduction = 4
    print("全局上下文聚合 (GCA):")
    print(f"├── 输入通道: {gca_input_channels}")
    print(f"├── 降维比例: {gca_reduction}x")
    print(f"├── 压缩通道: {gca_input_channels // gca_reduction}")
    print(f"├── 检测GCA: {gca_reduction}x降维 → 检测优化特征")
    print(f"└── 分割GCA: {gca_reduction}x降维 → 分割优化特征")

    # ------------------------------------------------------------------
    # 7. Segmentation head feature sizes
    # ------------------------------------------------------------------
    _print_section("\n🎨 7. BEV分割头特征尺寸")
    print("RMT-PPAD Transformer解码器:")
    print(f"├── 输入通道: {seg_head['in_channels']}")
    print(f"├── Transformer隐藏维: {seg_head['transformer_hidden_dim']}")
    print(f"├── Transformer C: {seg_head['transformer_C']}")
    print(f"├── Transformer层数: {seg_head['transformer_num_layers']}")

    # Grid transform: only the first axis of each scope is used, so the
    # grids are reported as square.
    in_lo, in_hi, input_res = seg_head['grid_transform']['input_scope'][0]
    out_lo, out_hi, output_res = seg_head['grid_transform']['output_scope'][0]
    input_cells = _grid_cells(in_lo, in_hi, input_res)
    output_cells = _grid_cells(out_lo, out_hi, output_res)
    num_seg_classes = 6
    print("\nBEV Grid Transform:")
    print(f"├── 输入: {input_cells}×{input_cells} ({input_res}m/px)")
    print(f"├── 输出: {output_cells}×{output_cells} ({output_res}m/px)")
    print(f"└── 放大倍数: {output_cells / input_cells:.1f}x")
    print("\n最终输出:")
    print(f"├── 分割图: {num_seg_classes}类别 × {output_cells}×{output_cells}")
    print(f"├── 分辨率: {output_res}m/像素")
    print(f"├── 覆盖范围: {out_lo}m ~ {out_hi}m")
    print(f"└── 总像素数: {num_seg_classes * output_cells ** 2:,}")

    # ------------------------------------------------------------------
    # 8. Memory estimate
    # ------------------------------------------------------------------
    _print_section("\n💾 8. 内存与计算量估算")
    bytes_per_value = 4  # float32
    lidar_area = sparse_shape[0] * sparse_shape[1]
    memory_usage = {
        'Camera BEV': bev_pixels * bev_pixels * camera_bev_channels * bytes_per_value,
        'LiDAR BEV': lidar_area * lidar_bev_channels * bytes_per_value,
        'Fused BEV': lidar_area * fused_channels * bytes_per_value,
        'BEV Neck': neck_size * neck_size * neck_channels * bytes_per_value,
        'Segmentation': output_cells ** 2 * num_seg_classes * bytes_per_value
    }
    print("主要特征图内存占用 (单batch, float32):")
    total_memory = 0
    for name, mem_bytes in memory_usage.items():
        mem_mb = mem_bytes / (1024 * 1024)
        total_memory += mem_mb
        # One line per feature map, size right-aligned in MB.
        print(f"├── {name}: {mem_mb:8.1f} MB")
    print(f"└── 总计: {total_memory:.1f} MB")

    # ------------------------------------------------------------------
    # 9. Architecture summary
    # ------------------------------------------------------------------
    _print_section("\n🏗️ 9. 网络架构总结")
    architecture_summary = [
        ("Camera Encoder", "6视角图像 → SwinTransformer → LSS → BEV特征 (80ch)"),
        ("LiDAR Encoder", "点云 → 体素化 → SparseEncoder → BEV特征 (128ch)"),
        ("Fusion", "Camera + LiDAR → ConvFuser → 融合特征 (256ch)"),
        ("BEV Decoder", "SECOND + SECONDFPN → 高分辨率BEV (256ch × 1440×1440)"),
        ("Task GCA", "检测GCA + 分割GCA → 任务特定特征优化"),
        ("Segmentation Head", "RMT-PPAD Transformer → 6类别分割 (598×598)"),
        ("Detection Head", "TransFusion → 10类别3D检测")
    ]
    last_index = len(architecture_summary) - 1
    for index, (component, description) in enumerate(architecture_summary):
        marker = "├──" if index < last_index else "└──"
        print(f"{marker} {component}: {description}")

    print("\n" + banner)
    print("🏁 网络配置分析完成Phase 4B架构清晰")
    print(banner)
# Run the report only when this file is executed as a script, not on import.
if __name__ == "__main__":
    analyze_network_config()