bev-project/ANALYZE_BEV_FEATURE_DIMENSI...

217 lines
9.2 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python
"""
分析BEVFusion中BEV特征尺寸变化
重点分析EnhancedBEVSegmentationHead的4层渐进上采样过程
"""
import torch
import numpy as np
def calculate_bev_dimensions():
    """Print a stage-by-stage breakdown of BEV feature-map dimensions.

    Walks the BEVFusion segmentation path — BEV neck output, grid
    transform, ASPP, attention stages, the 4-layer decoder and the
    per-class classifier — printing the channel count and spatial size
    at every stage. All numbers are derived from the hard-coded config
    dicts below (they mirror the model config, not a live model).

    Returns:
        None. All output is written to stdout.
    """
    print("=" * 80)
    print("🎯 BEVFusion BEV特征尺寸分析")
    print("=" * 80)

    # 1. Input BEV feature (produced by the BEV neck / SECONDFPN)
    print("\n📥 1. 输入BEV特征")
    print("-" * 40)
    bev_neck_output = {
        'channels': 512,     # final SECONDFPN output channels
        'height': 144,       # BEV grid height in pixels
        'width': 144,        # BEV grid width in pixels
        'voxel_size': 0.75,  # metres per pixel
        'range': [-54, 54]   # BEV extent in metres
    }
    print("BEV Neck (SECONDFPN) 输出:")
    print(f"├── 通道数: {bev_neck_output['channels']}")
    print(f"├── 空间尺寸: {bev_neck_output['height']} × {bev_neck_output['width']}")
    print(f"├── 分辨率: {bev_neck_output['voxel_size']}m/像素")
    print(f"├── 覆盖范围: {bev_neck_output['range'][0]}m ~ {bev_neck_output['range'][1]}m")
    # BUGFIX: this line previously printed the bare literal ".1f" — the format
    # spec had been detached from its f-string. Reconstructed as the physical
    # extent covered by the BEV grid.
    neck_span = bev_neck_output['range'][1] - bev_neck_output['range'][0]
    print(f"└── 覆盖宽度: {neck_span:.1f}m × {neck_span:.1f}m")

    # 2. BEV grid transform: resample from the neck's coordinate frame to the
    # segmentation head's higher-resolution frame.
    print("\n🔄 2. BEV Grid Transform")
    print("-" * 40)
    transform_config = {
        # [min, max, step] per axis, in metres
        'input_scope': [[-54.0, 54.0, 0.75], [-54.0, 54.0, 0.75]],
        'output_scope': [[-50, 50, 0.167], [-50, 50, 0.167]]
    }
    # Pixel counts derived from range / resolution. The +1 counts grid edges;
    # everything below consistently uses (pixels - 1) as the cell count.
    input_range = transform_config['input_scope'][0][1] - transform_config['input_scope'][0][0]  # 108 m
    input_resolution = transform_config['input_scope'][0][2]  # 0.75 m/px
    input_pixels = int(input_range / input_resolution) + 1    # 144 + 1 = 145
    output_range = transform_config['output_scope'][0][1] - transform_config['output_scope'][0][0]  # 100 m
    output_resolution = transform_config['output_scope'][0][2]  # 0.167 m/px
    output_pixels = int(output_range / output_resolution) + 1  # 598 + 1 = 599
    print("坐标系变换:")
    print(f"├── 输入范围: {transform_config['input_scope'][0][0]}m ~ {transform_config['input_scope'][0][1]}m")
    print(f"├── 输入分辨率: {transform_config['input_scope'][0][2]}m/像素")
    print(f"├── 输入像素: {input_pixels-1} × {input_pixels-1}")
    print("")
    print(f"├── 输出范围: {transform_config['output_scope'][0][0]}m ~ {transform_config['output_scope'][0][1]}m")
    print(f"├── 输出分辨率: {transform_config['output_scope'][0][2]}m/像素")
    print(f"├── 输出像素: {output_pixels-1} × {output_pixels-1} (≈598×598)")
    print(f"└── 空间放大: {(output_pixels-1) / (input_pixels-1):.1f}x")

    # 3. ASPP multi-scale feature extraction (spatial size unchanged)
    print("\n🎯 3. ASPP多尺度特征")
    print("-" * 40)
    aspp_config = {
        'input_channels': bev_neck_output['channels'],  # 512
        'output_channels': 256,                          # decoder_channels[0]
        'dilation_rates': [1, 3, 6, 12]                  # ASPP dilation rates
    }
    print("ASPP (Atrous Spatial Pyramid Pooling):")
    print(f"├── 输入通道: {aspp_config['input_channels']}")
    print(f"├── 输出通道: {aspp_config['output_channels']}")
    print(f"├── 膨胀率: {aspp_config['dilation_rates']}")
    print(f"├── 空间尺寸: {output_pixels-1} × {output_pixels-1} (保持不变)")
    print("└── 作用: 捕获多尺度上下文信息")

    # 4. Attention stages (channel, spatial, global-context); channels in == out
    print("\n🎯 4. 注意力机制")
    print("-" * 40)
    attention_stages = [
        {
            'name': 'Channel Attention',
            'input_channels': aspp_config['output_channels'],
            'operation': '通道重要性加权'
        },
        {
            'name': 'Spatial Attention',
            'input_channels': aspp_config['output_channels'],
            'operation': '空间位置重要性加权'
        },
        {
            'name': 'Global Context Attention (GCA)',
            'input_channels': aspp_config['output_channels'],
            'operation': '全局上下文聚合',
            'reduction': 4
        }
    ]
    for i, attn in enumerate(attention_stages):
        print(f"{i+1}. {attn['name']}:")
        print(f"   ├── 输入通道: {attn['input_channels']}")
        print(f"   ├── 操作: {attn['operation']}")
        if 'reduction' in attn:
            print(f"   ├── 降维比例: {attn['reduction']}x")
        print(f"   └── 输出通道: {attn['input_channels']}")

    # 5. Deep decoder. NOTE: despite the head's "upsampling" name, these 4
    # layers only transform channels — spatial size stays 598×598 throughout.
    print("\n🔄 5. Deep Decoder (4层卷积网络)")
    print("-" * 40)
    decoder_channels = [256, 256, 128, 128]  # decoder_channels config
    print("解码器架构 (4层卷积):")
    print(f"├── 输入特征: {aspp_config['output_channels']}ch × {output_pixels-1}×{output_pixels-1}")
    print("├── 空间尺寸: 始终保持 598×598 (无上采样!)")
    print("\n通道变换过程:")
    current_channels = aspp_config['output_channels']
    for i, out_channels in enumerate(decoder_channels):
        print(f"├── Layer {i+1}: {current_channels}ch → {out_channels}ch")
        print("│   ├── Conv2d(3×3, padding=1)")
        print("│   ├── GroupNorm")
        print("│   ├── ReLU")
        print("│   └── Dropout2d(0.1)")
        current_channels = out_channels
    print(f"└── 最终特征: {decoder_channels[-1]}ch × 598×598")

    # 6. Per-class classification head
    print("\n🎯 6. 分类头 (Per-class Classification)")
    print("-" * 40)
    classifier_config = {
        'input_channels': decoder_channels[-1],        # 128
        'hidden_channels': decoder_channels[-1] // 2,  # 64
        'num_classes': 6,                              # nuScenes BEV seg classes
        'classes': ['drivable_area', 'ped_crossing', 'walkway', 'stop_line', 'carpark_area', 'divider']
    }
    print("每个类别的分类器:")
    print(f"├── 输入通道: {classifier_config['input_channels']}")
    print(f"├── 隐藏通道: {classifier_config['hidden_channels']}")
    print(f"├── 输出通道: 1 (每个类别独立预测)")
    print(f"└── 总类别数: {classifier_config['num_classes']}")
    print(f"\n类别列表:")
    # BUGFIX: this loop previously printed the bare literal "2d"; reconstructed
    # as an indexed class listing using a 2-digit field width.
    for i, cls in enumerate(classifier_config['classes']):
        marker = "├──" if i < len(classifier_config['classes']) - 1 else "└──"
        print(f"{marker} {i+1:2d}. {cls}")

    # 7. Final output tensor
    print("\n📤 7. 最终输出")
    print("-" * 40)
    final_output = {
        'shape': [classifier_config['num_classes'], output_pixels-1, output_pixels-1],
        'resolution': output_resolution,
        'range': [transform_config['output_scope'][0][0], transform_config['output_scope'][0][1]],
        'total_pixels': classifier_config['num_classes'] * (output_pixels-1) ** 2
    }
    print("分割预测结果:")
    print(f"├── 张量形状: [{final_output['shape'][0]}, {final_output['shape'][1]}, {final_output['shape'][2]}]")
    print(f"├── 空间分辨率: {final_output['resolution']}m/像素")
    print(f"├── 覆盖范围: {final_output['range'][0]}m ~ {final_output['range'][1]}m")
    # BUGFIX: this line also printed the bare literal ".1f"; reconstructed as
    # the covered physical extent, matching the section-1 fix.
    out_span = final_output['range'][1] - final_output['range'][0]
    print(f"├── 覆盖宽度: {out_span:.1f}m × {out_span:.1f}m")
    print(f"└── 总像素数: {final_output['total_pixels']:,}")

    # 8. Summary table of the full dimension evolution
    print("\n📊 8. 尺寸变化总结")
    print("-" * 40)
    dimension_summary = [
        ("BEV Neck输出", f"{bev_neck_output['channels']}ch × {bev_neck_output['height']}×{bev_neck_output['width']}"),
        ("Grid Transform", f"{bev_neck_output['channels']}ch × {output_pixels-1}×{output_pixels-1}"),
        ("ASPP", f"{aspp_config['output_channels']}ch × {output_pixels-1}×{output_pixels-1}"),
        ("Channel Attention", f"{aspp_config['output_channels']}ch × {output_pixels-1}×{output_pixels-1}"),
        ("Spatial Attention", f"{aspp_config['output_channels']}ch × {output_pixels-1}×{output_pixels-1}"),
        ("Decoder Layer 1", f"{decoder_channels[0]}ch × {output_pixels-1}×{output_pixels-1}"),
        ("Decoder Layer 2", f"{decoder_channels[1]}ch × {output_pixels-1}×{output_pixels-1}"),
        ("Decoder Layer 3", f"{decoder_channels[2]}ch × {output_pixels-1}×{output_pixels-1}"),
        ("Decoder Layer 4", f"{decoder_channels[3]}ch × {output_pixels-1}×{output_pixels-1}"),
        ("最终输出", f"{classifier_config['num_classes']}ch × {output_pixels-1}×{output_pixels-1}")
    ]
    print("特征尺寸演变:")
    for i, (stage, size) in enumerate(dimension_summary):
        marker = "├──" if i < len(dimension_summary) - 1 else "└──"
        print(f"{marker} {stage}: {size}")

    # 9. Key findings
    print("\n💡 9. 关键发现")
    print("-" * 40)
    findings = [
        "🎯 空间尺寸保持不变: 整个分割头不进行空间上采样",
        "🔄 坐标系变换: 从144×144 (0.75m/px) → 598×598 (0.167m/px)",
        "📈 分辨率提升: 4.5倍高分辨率输出 (144→598像素)",
        "🏗️ 通道变换: 512→256→256→128→128 (4层渐进压缩)",
        "🎯 最终输出: 6类别 × 598×598 高分辨率分割图",
        "⚡ 计算特点: 空间保持,通道压缩,注意力增强"
    ]
    # CONSISTENCY: close the tree with └── on the last item, matching the
    # dimension-summary loop above (previously every line used ├──).
    for i, finding in enumerate(findings):
        marker = "├──" if i < len(findings) - 1 else "└──"
        print(f"{marker} {finding}")
    print("\n" + "=" * 80)
    print("🏁 BEV特征尺寸分析完成")
    print("=" * 80)
# Script entry point: run the dimension analysis when executed directly.
if __name__ == "__main__":
    calculate_bev_dimensions()