217 lines
9.2 KiB
Python
217 lines
9.2 KiB
Python
#!/usr/bin/env python
|
||
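"""
Analyze how BEV feature dimensions change through BEVFusion.

Focuses on the four decoder layers of EnhancedBEVSegmentationHead. These
layers are often described as "progressive upsampling", but the analysis
in this script reports channel changes only: the spatial size stays fixed
after the BEV grid transform.
"""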
|
||
import torch
|
||
import numpy as np
|
||
|
||
def calculate_bev_dimensions():
|
||
"""计算BEV特征在各个阶段的尺寸"""
|
||
print("="*80)
|
||
print("🎯 BEVFusion BEV特征尺寸分析")
|
||
print("="*80)
|
||
|
||
# 1. 输入BEV特征 (来自BEV Neck)
|
||
print("\n📥 1. 输入BEV特征")
|
||
print("-" * 40)
|
||
|
||
# SECONDFPN输出配置
|
||
bev_neck_output = {
|
||
'channels': 512, # SECONDFPN最终输出通道数
|
||
'height': 144, # BEV空间分辨率
|
||
'width': 144, # BEV空间分辨率
|
||
'voxel_size': 0.75, # 米/像素
|
||
'range': [-54, 54] # BEV范围 (米)
|
||
}
|
||
|
||
print("BEV Neck (SECONDFPN) 输出:")
|
||
print(f"├── 通道数: {bev_neck_output['channels']}")
|
||
print(f"├── 空间尺寸: {bev_neck_output['height']} × {bev_neck_output['width']}")
|
||
print(f"├── 分辨率: {bev_neck_output['voxel_size']}m/像素")
|
||
print(f"├── 覆盖范围: {bev_neck_output['range'][0]}m ~ {bev_neck_output['range'][1]}m")
|
||
print(".1f")
|
||
|
||
# 2. BEV Grid Transform
|
||
print("\n🔄 2. BEV Grid Transform")
|
||
print("-" * 40)
|
||
|
||
transform_config = {
|
||
'input_scope': [[-54.0, 54.0, 0.75], [-54.0, 54.0, 0.75]], # 输入坐标系
|
||
'output_scope': [[-50, 50, 0.167], [-50, 50, 0.167]] # 输出坐标系
|
||
}
|
||
|
||
# 计算变换后的尺寸
|
||
input_range = transform_config['input_scope'][0][1] - transform_config['input_scope'][0][0] # 108m
|
||
input_resolution = transform_config['input_scope'][0][2] # 0.75m/像素
|
||
input_pixels = int(input_range / input_resolution) + 1 # 144 + 1 = 145
|
||
|
||
output_range = transform_config['output_scope'][0][1] - transform_config['output_scope'][0][0] # 100m
|
||
output_resolution = transform_config['output_scope'][0][2] # 0.167m/像素
|
||
output_pixels = int(output_range / output_resolution) + 1 # 598 + 1 = 599
|
||
|
||
print("坐标系变换:")
|
||
print(f"├── 输入范围: {transform_config['input_scope'][0][0]}m ~ {transform_config['input_scope'][0][1]}m")
|
||
print(f"├── 输入分辨率: {transform_config['input_scope'][0][2]}m/像素")
|
||
print(f"├── 输入像素: {input_pixels-1} × {input_pixels-1}")
|
||
print("")
|
||
print(f"├── 输出范围: {transform_config['output_scope'][0][0]}m ~ {transform_config['output_scope'][0][1]}m")
|
||
print(f"├── 输出分辨率: {transform_config['output_scope'][0][2]}m/像素")
|
||
print(f"├── 输出像素: {output_pixels-1} × {output_pixels-1} (≈598×598)")
|
||
print(f"└── 空间放大: {(output_pixels-1) / (input_pixels-1):.1f}x")
|
||
|
||
# 3. ASPP多尺度特征
|
||
print("\n🎯 3. ASPP多尺度特征")
|
||
print("-" * 40)
|
||
|
||
aspp_config = {
|
||
'input_channels': bev_neck_output['channels'], # 512
|
||
'output_channels': 256, # decoder_channels[0]
|
||
'dilation_rates': [1, 3, 6, 12] # ASPP膨胀率
|
||
}
|
||
|
||
print("ASPP (Atrous Spatial Pyramid Pooling):")
|
||
print(f"├── 输入通道: {aspp_config['input_channels']}")
|
||
print(f"├── 输出通道: {aspp_config['output_channels']}")
|
||
print(f"├── 膨胀率: {aspp_config['dilation_rates']}")
|
||
print(f"├── 空间尺寸: {output_pixels-1} × {output_pixels-1} (保持不变)")
|
||
print("└── 作用: 捕获多尺度上下文信息")
|
||
|
||
# 4. 注意力机制
|
||
print("\n🎯 4. 注意力机制")
|
||
print("-" * 40)
|
||
|
||
attention_stages = [
|
||
{
|
||
'name': 'Channel Attention',
|
||
'input_channels': aspp_config['output_channels'],
|
||
'operation': '通道重要性加权'
|
||
},
|
||
{
|
||
'name': 'Spatial Attention',
|
||
'input_channels': aspp_config['output_channels'],
|
||
'operation': '空间位置重要性加权'
|
||
},
|
||
{
|
||
'name': 'Global Context Attention (GCA)',
|
||
'input_channels': aspp_config['output_channels'],
|
||
'operation': '全局上下文聚合',
|
||
'reduction': 4
|
||
}
|
||
]
|
||
|
||
for i, attn in enumerate(attention_stages):
|
||
print(f"{i+1}. {attn['name']}:")
|
||
print(f" ├── 输入通道: {attn['input_channels']}")
|
||
print(f" ├── 操作: {attn['operation']}")
|
||
if 'reduction' in attn:
|
||
print(f" ├── 降维比例: {attn['reduction']}x")
|
||
print(f" └── 输出通道: {attn['input_channels']}")
|
||
|
||
# 5. Deep Decoder (4层渐进上采样)
|
||
print("\n🔄 5. Deep Decoder (4层卷积网络)")
|
||
print("-" * 40)
|
||
|
||
# 注意:这里的"上采样"实际上是指通道数的变换,不是空间尺寸的上采样
|
||
decoder_channels = [256, 256, 128, 128] # decoder_channels配置
|
||
|
||
print("解码器架构 (4层卷积):")
|
||
print(f"├── 输入特征: {aspp_config['output_channels']}ch × {output_pixels-1}×{output_pixels-1}")
|
||
print("├── 空间尺寸: 始终保持 598×598 (无上采样!)")
|
||
|
||
print("\n通道变换过程:")
|
||
current_channels = aspp_config['output_channels']
|
||
for i, out_channels in enumerate(decoder_channels):
|
||
print(f"├── Layer {i+1}: {current_channels}ch → {out_channels}ch")
|
||
print("│ ├── Conv2d(3×3, padding=1)")
|
||
print("│ ├── GroupNorm")
|
||
print("│ ├── ReLU")
|
||
print("│ └── Dropout2d(0.1)")
|
||
current_channels = out_channels
|
||
|
||
print(f"└── 最终特征: {decoder_channels[-1]}ch × 598×598")
|
||
|
||
# 6. 分类头
|
||
print("\n🎯 6. 分类头 (Per-class Classification)")
|
||
print("-" * 40)
|
||
|
||
classifier_config = {
|
||
'input_channels': decoder_channels[-1], # 128
|
||
'hidden_channels': decoder_channels[-1] // 2, # 64
|
||
'num_classes': 6, # nuScenes BEV分割类别数
|
||
'classes': ['drivable_area', 'ped_crossing', 'walkway', 'stop_line', 'carpark_area', 'divider']
|
||
}
|
||
|
||
print("每个类别的分类器:")
|
||
print(f"├── 输入通道: {classifier_config['input_channels']}")
|
||
print(f"├── 隐藏通道: {classifier_config['hidden_channels']}")
|
||
print(f"├── 输出通道: 1 (每个类别独立预测)")
|
||
print(f"└── 总类别数: {classifier_config['num_classes']}")
|
||
|
||
print(f"\n类别列表:")
|
||
for i, cls in enumerate(classifier_config['classes']):
|
||
print("2d")
|
||
|
||
# 7. 最终输出
|
||
print("\n📤 7. 最终输出")
|
||
print("-" * 40)
|
||
|
||
final_output = {
|
||
'shape': [classifier_config['num_classes'], output_pixels-1, output_pixels-1],
|
||
'resolution': output_resolution,
|
||
'range': [transform_config['output_scope'][0][0], transform_config['output_scope'][0][1]],
|
||
'total_pixels': classifier_config['num_classes'] * (output_pixels-1) ** 2
|
||
}
|
||
|
||
print("分割预测结果:")
|
||
print(f"├── 张量形状: [{final_output['shape'][0]}, {final_output['shape'][1]}, {final_output['shape'][2]}]")
|
||
print(f"├── 空间分辨率: {final_output['resolution']}m/像素")
|
||
print(f"├── 覆盖范围: {final_output['range'][0]}m ~ {final_output['range'][1]}m")
|
||
print(".1f")
|
||
print(f"└── 总像素数: {final_output['total_pixels']:,}")
|
||
|
||
# 8. 尺寸变化总结
|
||
print("\n📊 8. 尺寸变化总结")
|
||
print("-" * 40)
|
||
|
||
dimension_summary = [
|
||
("BEV Neck输出", f"{bev_neck_output['channels']}ch × {bev_neck_output['height']}×{bev_neck_output['width']}"),
|
||
("Grid Transform", f"{bev_neck_output['channels']}ch × {output_pixels-1}×{output_pixels-1}"),
|
||
("ASPP", f"{aspp_config['output_channels']}ch × {output_pixels-1}×{output_pixels-1}"),
|
||
("Channel Attention", f"{aspp_config['output_channels']}ch × {output_pixels-1}×{output_pixels-1}"),
|
||
("Spatial Attention", f"{aspp_config['output_channels']}ch × {output_pixels-1}×{output_pixels-1}"),
|
||
("Decoder Layer 1", f"{decoder_channels[0]}ch × {output_pixels-1}×{output_pixels-1}"),
|
||
("Decoder Layer 2", f"{decoder_channels[1]}ch × {output_pixels-1}×{output_pixels-1}"),
|
||
("Decoder Layer 3", f"{decoder_channels[2]}ch × {output_pixels-1}×{output_pixels-1}"),
|
||
("Decoder Layer 4", f"{decoder_channels[3]}ch × {output_pixels-1}×{output_pixels-1}"),
|
||
("最终输出", f"{classifier_config['num_classes']}ch × {output_pixels-1}×{output_pixels-1}")
|
||
]
|
||
|
||
print("特征尺寸演变:")
|
||
for i, (stage, size) in enumerate(dimension_summary):
|
||
marker = "├──" if i < len(dimension_summary) - 1 else "└──"
|
||
print(f"{marker} {stage}: {size}")
|
||
|
||
# 9. 关键发现
|
||
print("\n💡 9. 关键发现")
|
||
print("-" * 40)
|
||
|
||
findings = [
|
||
"🎯 空间尺寸保持不变: 整个分割头不进行空间上采样",
|
||
"🔄 坐标系变换: 从144×144 (0.75m/px) → 598×598 (0.167m/px)",
|
||
"📈 分辨率提升: 4.5倍高分辨率输出 (144→598像素)",
|
||
"🏗️ 通道变换: 512→256→256→128→128 (4层渐进压缩)",
|
||
"🎯 最终输出: 6类别 × 598×598 高分辨率分割图",
|
||
"⚡ 计算特点: 空间保持,通道压缩,注意力增强"
|
||
]
|
||
|
||
for finding in findings:
|
||
print(f"├── {finding}")
|
||
|
||
print("\n" + "="*80)
|
||
print("🏁 BEV特征尺寸分析完成!")
|
||
print("="*80)
|
||
|
||
# Script entry point: run the analysis only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    calculate_bev_dimensions()
|