bev-project/ANALYZE_HIGHRES_CONFIG.py

379 lines
15 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python
"""
分析multitask_enhanced_phase1_HIGHRES.yaml配置的网络结构和特征尺寸
Phase 1: 简化版高分辨率分割专注ASPP + 高分辨率输出
"""
import yaml
import torch
import numpy as np
def _print_section(title):
    """Print a section title preceded by a blank line and followed by a rule."""
    print(f"\n{title}")
    print("-" * 60)


def _highres_config_params():
    """Return the hand-transcribed Phase 1 high-resolution configuration values."""
    return {
        'input': {
            'camera': {'views': 6, 'size': [256, 704], 'channels': 3},
            'lidar': {'points': '32线', 'range': [-54, 54], 'voxel_size': [0.075, 0.075, 0.2]},
        },
        'camera_encoder': {
            'backbone': {'type': 'SwinTransformer', 'embed_dims': 96, 'depths': [2, 2, 6, 2], 'num_heads': [3, 6, 12, 24]},
            'neck': {'in_channels': [192, 384, 768], 'out_channels': 256, 'num_outs': 3},
            'vtransform': {'in_channels': 256, 'out_channels': 80, 'image_size': [256, 704], 'feature_size': [32, 88]},
        },
        'lidar_encoder': {
            'voxelize': {'max_voxels': [120000, 160000], 'voxel_size': [0.075, 0.075, 0.2]},
            'backbone': {'sparse_shape': [1440, 1440, 41], 'output_channels': 128},
        },
        'decoder': {
            'backbone': {'in_channels': 256, 'out_channels': [128, 256], 'layer_nums': [5, 5], 'layer_strides': [1, 2]},
            'neck': {'in_channels': [128, 256], 'out_channels': [256, 256], 'upsample_strides': [1, 2]},
        },
        'segmentation_head': {
            'in_channels': 512,
            'decoder_channels': [256, 128],  # simplified Phase 1 variant
            'grid_transform': {
                'input_scope': [[-54.0, 54.0, 0.75], [-54.0, 54.0, 0.75]],
                'output_scope': [[-50, 50, 0.25], [-50, 50, 0.25]],
            },
        },
    }


def _report_inputs(input_spec, sparse_shape):
    """Print section 1: camera and LiDAR input specifications."""
    _print_section("📥 1. 输入数据规格")
    camera = input_spec['camera']
    lidar = input_spec['lidar']
    height, width = camera['size']
    print("相机输入:")
    print(f"├── 视角数量: {camera['views']}")
    print(f"├── 图像尺寸: {height}×{width}")
    print(f"├── 通道数: {camera['channels']} (RGB)")
    print(f"└── 总像素: {height * width * camera['views']:,}")
    print("\nLiDAR输入:")
    print(f"├── 激光雷达: {lidar['points']}")
    print(f"├── 检测范围: {lidar['range'][0]}m ~ {lidar['range'][1]}m")
    print(f"├── 体素尺寸: {lidar['voxel_size']}m")
    print(f"└── 稀疏形状: {sparse_shape}")


def _report_camera_encoder(camera_spec, image_size):
    """Print section 2 and return the side length (pixels) of the camera BEV map."""
    _print_section("📷 2. Camera Encoder特征变化")

    print("SwinTransformer Backbone:")
    channels = camera_spec['backbone']['embed_dims']
    # The initial patch embedding uses 4x4 patches.
    height, width = image_size[0] // 4, image_size[1] // 4
    stages = []
    for number, _depth in enumerate(camera_spec['backbone']['depths'], start=1):
        if number > 1:
            # Every stage after the first halves the resolution and doubles the width.
            height, width = height // 2, width // 2
            channels *= 2
        stages.append((height, width))
        # channels * H * W counts feature-map elements, not learnable parameters.
        print(f"├── Stage {number}: {channels}ch × {height}×{width} = {channels * height * width:,} 特征元素")

    neck = camera_spec['neck']
    print("\nGeneralizedLSSFPN Neck:")
    print(f"├── 输入通道: {neck['in_channels']}")
    print(f"├── 输出通道: {neck['out_channels']} (统一)")
    print(f"└── 输出层数: {neck['num_outs']}")
    # The FPN consumes backbone stages 2, 3 and 4, hence the skipped first stage.
    for level, (in_ch, (feat_h, feat_w)) in enumerate(zip(neck['in_channels'], stages[1:]), start=1):
        print(f"├── Level {level}: {in_ch}ch → {neck['out_channels']}ch, {feat_h}×{feat_w}")

    vtrans = camera_spec['vtransform']
    print("\nDepthLSSTransform (BEV投影):")
    print(f"├── 输入通道: {vtrans['in_channels']}")
    print(f"├── 输出通道: {vtrans['out_channels']}")
    print(f"├── 图像尺寸: {vtrans['image_size']}")
    print(f"├── 特征尺寸: {vtrans['feature_size']}")

    bev_range = 54 - (-54)  # 108 m
    bev_resolution = 0.3  # taken from the xbound setting
    # round() instead of int() so float error in the quotient cannot drop a cell.
    # NOTE(review): the "+ 1" counts grid points rather than cells (361, not 360);
    # kept to stay consistent with the Phase 4B figure of 541 - confirm.
    bev_pixels = round(bev_range / bev_resolution) + 1
    print(f"├── BEV范围: [-54, 54]m × [-54, 54]m = {bev_range}m × {bev_range}m")
    print(f"├── BEV分辨率: {bev_resolution}m/像素")
    print(f"├── BEV尺寸: {bev_pixels}×{bev_pixels} 像素")
    print(f"└── Camera BEV特征: {vtrans['out_channels']}ch × {bev_pixels}×{bev_pixels}")
    return bev_pixels


def _report_lidar_encoder(lidar_spec):
    """Print section 3: voxelization and sparse-encoder settings."""
    _print_section("🔍 3. LiDAR Encoder特征变化")
    voxelize = lidar_spec['voxelize']
    sparse_shape = lidar_spec['backbone']['sparse_shape']
    out_channels = lidar_spec['backbone']['output_channels']
    print("体素化 (Voxelization):")
    print(f"├── 最大体素数: {voxelize['max_voxels']}")
    print(f"├── 体素尺寸: {voxelize['voxel_size']}m")
    print(f"└── 稀疏形状: {sparse_shape}")
    print("\nSparse Encoder Backbone:")
    print(f"├── 稀疏形状: {sparse_shape}")
    print(f"├── 输出通道: {out_channels}")
    print("├── 空间覆盖: 108.0m × 108.0m")
    print(f"└── LiDAR BEV特征: {out_channels}ch × {sparse_shape[0]}×{sparse_shape[1]}")


def _report_fusion(camera_channels, bev_pixels, lidar_channels, sparse_shape, fused_channels):
    """Print section 4: the camera/LiDAR BEV fusion summary."""
    _print_section("🔗 4. 融合层 (Fusion)")
    print("ConvFuser:")
    print(f"├── Camera BEV: {camera_channels}ch × {bev_pixels}×{bev_pixels}")
    print(f"├── LiDAR BEV: {lidar_channels}ch × {sparse_shape[0]}×{sparse_shape[1]}")
    print(f"├── 融合后: {fused_channels}ch × {sparse_shape[0]}×{sparse_shape[1]}")
    print("└── 融合方式: 通道级拼接 + 1×1卷积")


def _report_decoder(decoder_spec, input_size):
    """Print section 5 and return ``(channels, size)`` of the last FPN level."""
    _print_section("🔄 5. Decoder特征变化")

    backbone = decoder_spec['backbone']
    print("SECOND Backbone:")
    print(f"├── 输入通道: {backbone['in_channels']}")
    print(f"├── 输出通道: {backbone['out_channels']}")
    print(f"├── 层数: {backbone['layer_nums']}")
    print(f"├── 步长: {backbone['layer_strides']}")
    stage_specs = list(zip(backbone['out_channels'], backbone['layer_strides']))
    stage_sizes = []
    size = input_size
    for number, (channels, stride) in enumerate(stage_specs, start=1):
        size //= stride  # stride 1 keeps the size, stride 2 halves it
        stage_sizes.append(size)
        branch = "└──" if number == len(stage_specs) else "├──"
        print(f"{branch} Stage {number}: {channels}ch × {size}×{size}")

    neck = decoder_spec['neck']
    print("\nSECONDFPN Neck:")
    levels = []
    level_specs = zip(neck['in_channels'], neck['out_channels'], neck['upsample_strides'], stage_sizes)
    for level, (in_ch, out_ch, stride, in_size) in enumerate(level_specs, start=1):
        out_size = in_size * stride  # upsample stride 1 keeps the size, 2 doubles it
        levels.append((out_ch, out_size))
        print(f"├── Level {level}: {in_ch}ch → {out_ch}ch, {in_size}×{in_size} → {out_size}×{out_size}")

    neck_channels, neck_size = levels[-1]
    print(f"└── BEV特征: {neck_channels}ch × {neck_size}×{neck_size}")
    return neck_channels, neck_size


def _report_segmentation_head(seg_head, neck_channels, neck_size):
    """Print section 6 and return ``(num_classes, output_cells)``."""
    _print_section("🎨 6. EnhancedBEVSegmentationHead (Phase 1)")
    decoder_channels = seg_head['decoder_channels']
    print("Phase 1配置特点:")
    print("├── 简化设计: 只启用ASPP")
    print("├── Deep Supervision: 关闭")
    print("├── Dice Loss: 关闭")
    print(f"├── Decoder: 简化版 {decoder_channels}")
    print("\nEnhancedBEVSegmentationHead结构:")
    print(f"├── 输入通道: {seg_head['in_channels']}")
    print(f"├── Decoder通道: {decoder_channels}")

    # NOTE(review): the head declares in_channels=512 while the flow below starts
    # from the 256 channels of the last FPN level - confirm whether the FPN levels
    # are concatenated before the head.
    print("\n处理流程 (Phase 1):")
    print(f"├── 输入BEV: {neck_channels}ch × {neck_size}×{neck_size}")
    aspp_channels = 256
    print(f"├── ASPP: {neck_channels}ch → {aspp_channels}ch, 尺寸保持 {neck_size}×{neck_size}")
    layer_in = aspp_channels
    for layer, layer_out in enumerate(decoder_channels, start=1):
        print(f"├── Decoder Layer {layer}: {layer_in}ch → {layer_out}ch, {neck_size}×{neck_size} (尺寸保持)")
        layer_in = layer_out

    num_classes = 6  # number of nuScenes BEV segmentation classes
    print(f"└── 分类器: {decoder_channels[-1]}ch → {num_classes}ch (每个类别独立预测)")

    in_lo, in_hi, in_res = seg_head['grid_transform']['input_scope'][0]
    out_lo, out_hi, out_res = seg_head['grid_transform']['output_scope'][0]
    input_cells = round((in_hi - in_lo) / in_res)  # 108 m / 0.75 m = 144
    output_cells = round((out_hi - out_lo) / out_res)  # 100 m / 0.25 m = 400
    print("\nBEV Grid Transform:")
    print(f"├── 输入: {input_cells}×{input_cells} ({in_res}m/px)")
    print(f"├── 输出: {output_cells}×{output_cells} ({out_res}m/px)")
    print(f"├── 放大倍数: {output_cells / input_cells:.1f}x")
    print(f"└── 分辨率提升: {in_res / out_res:.1f}x更精细")

    print("\n最终输出:")
    print(f"├── 分割图: {num_classes}类别 × {output_cells}×{output_cells}")
    print(f"├── 分辨率: {out_res}m/像素")
    print(f"├── 覆盖范围: {out_lo}m ~ {out_hi}m")
    print(f"└── 总像素数: {num_classes * output_cells ** 2:,}")
    return num_classes, output_cells


def _report_memory(phase1_shapes):
    """Print section 7: per-component float32 memory, Phase 1 versus Phase 4B.

    ``phase1_shapes`` maps a component name to ``(channels, height, width)``.
    """
    _print_section("💾 7. Phase 1 vs Phase 4B 内存对比")
    # Phase 4B shapes carried over from an earlier analysis.
    phase4b_shapes = {
        'Camera BEV': (80, 541, 541),
        'LiDAR BEV': (128, 1440, 1440),
        'Fused BEV': (256, 1440, 1440),
        'BEV Neck': (256, 1440, 1440),
        'Segmentation': (6, 598, 598),
    }
    bytes_per_value = 4  # float32
    mib = 1024 * 1024

    def to_mb(shape):
        channels, height, width = shape
        return channels * height * width * bytes_per_value / mib

    print("内存占用对比 (单batch, float32, MB):")
    print("组件".ljust(15), "Phase 1".ljust(10), "Phase 4B".ljust(10), "差异")
    print("-" * 55)
    total_p1 = 0
    total_p4b = 0
    for component, shape in phase1_shapes.items():
        p1_mb = to_mb(shape)
        p4b_mb = to_mb(phase4b_shapes[component])
        diff = p4b_mb - p1_mb
        total_p1 += p1_mb
        total_p4b += p4b_mb
        # Only a genuine Phase 4B surplus gets an explicit "+" sign.
        diff_text = f"{diff:+8.1f}" if diff > 0 else f"{diff:8.1f}"
        print(f"{component:12s} {p1_mb:8.1f} {p4b_mb:8.1f} {diff_text}")
    print("-" * 55)
    print(f"{'总计':12s} {total_p1:8.1f} {total_p4b:8.1f} {total_p4b - total_p1:+8.1f}")


def _report_design_rationale():
    """Print section 8: the stated Phase 1 goals and simplifications."""
    _print_section("🎯 8. Phase 1设计理念")
    phase1_design = {
        "目标": [
            "验证高分辨率分割的可行性",
            "从简单的ASPP开始避免复杂组件干扰",
            "建立分割性能baseline",
        ],
        "简化策略": [
            "只启用ASPP多尺度特征",
            "关闭Deep Supervision减少训练复杂度",
            "关闭Dice Loss使用纯Focal Loss",
            "简化Decoder为2层",
        ],
        "分辨率提升": [
            "BEV输出从180×180提升到400×400",
            "分辨率从0.6m/px提升到0.25m/px",
            "3倍分辨率提升理论上分割精度显著提高",
        ],
        "训练策略": [
            "基于epoch_19.pth继续训练",
            "只训练4个epoch (19→23)",
            "降低学习率避免破坏预训练权重",
            "专注分割性能优化",
        ],
    }
    for category, items in phase1_design.items():
        print(f"\n{category}:")
        for item in items:
            print(f"├── {item}")


def _report_performance_targets():
    """Print section 9: the expected-improvement table."""
    _print_section("📈 9. 预期性能提升")
    performance_targets = [
        ("分辨率提升", "180×180 → 400×400", "3倍像素数量"),
        ("分割精度", "理论上显著提升", "更细粒度特征表示"),
        ("车道线检测", "Divider/Stop Line", "预期IoU提升20-30%"),
        ("内存效率", "相比Phase 4B降低", "更简单的网络结构"),
        ("训练速度", "4个epoch完成", "快速验证高分辨率效果"),
    ]
    print("Phase 1预期效果:")
    for target, value, note in performance_targets:
        print(f"{target:15s} {value:20s} {note:15s}")


def analyze_highres_config():
    """Print a walkthrough of the Phase 1 high-resolution BEV segmentation network.

    The report is derived from configuration values transcribed into this
    module (no file is read). It covers inputs, camera and LiDAR encoders,
    fusion, decoder, segmentation head, a float32 memory comparison against
    Phase 4B, and the design rationale. Everything is written to stdout.

    Returns:
        None
    """
    config = _highres_config_params()
    lidar_backbone = config['lidar_encoder']['backbone']
    sparse_shape = lidar_backbone['sparse_shape']
    lidar_channels = lidar_backbone['output_channels']
    camera_channels = config['camera_encoder']['vtransform']['out_channels']
    fused_channels = 256  # taken from the fuser setting

    banner = "=" * 90
    print(banner)
    print("🎯 Phase 1 高分辨率BEV分割网络分析")
    print(banner)

    _report_inputs(config['input'], sparse_shape)
    bev_pixels = _report_camera_encoder(config['camera_encoder'], config['input']['camera']['size'])
    _report_lidar_encoder(config['lidar_encoder'])
    _report_fusion(camera_channels, bev_pixels, lidar_channels, sparse_shape, fused_channels)
    neck_channels, neck_size = _report_decoder(config['decoder'], sparse_shape[0])
    num_classes, output_cells = _report_segmentation_head(
        config['segmentation_head'], neck_channels, neck_size
    )
    _report_memory({
        'Camera BEV': (camera_channels, bev_pixels, bev_pixels),
        'LiDAR BEV': (lidar_channels, sparse_shape[0], sparse_shape[1]),
        'Fused BEV': (fused_channels, sparse_shape[0], sparse_shape[1]),
        'BEV Neck': (neck_channels, neck_size, neck_size),
        'Segmentation': (num_classes, output_cells, output_cells),
    })
    _report_design_rationale()
    _report_performance_targets()

    print("\n" + banner)
    print("🏁 Phase 1高分辨率配置分析完成简化设计专注验证高分辨率分割效果")
    print(banner)
if __name__ == "__main__":
    # Produce the report only when run as a script, not when imported.
    analyze_highres_config()