379 lines
15 KiB
Python
379 lines
15 KiB
Python
|
|
#!/usr/bin/env python
|
|||
|
|
"""
|
|||
|
|
分析multitask_enhanced_phase1_HIGHRES.yaml配置的网络结构和特征尺寸
|
|||
|
|
Phase 1: 简化版高分辨率分割,专注ASPP + 高分辨率输出
|
|||
|
|
"""
|
|||
|
|
import yaml
|
|||
|
|
import torch
|
|||
|
|
import numpy as np
|
|||
|
|
|
|||
|
|
def analyze_highres_config():
    """Print a stage-by-stage report of the Phase 1 high-resolution BEV config.

    The configuration values are embedded in this function (nothing is read
    from disk).  The report walks through the input spec, camera encoder,
    LiDAR encoder, fusion layer, decoder, segmentation head, a float32
    activation-memory comparison against hard-coded Phase 4B numbers, and the
    Phase 1 design rationale / expected gains.

    Returns:
        None.  All results are written to stdout.
    """
    print("=" * 90)
    print("🎯 Phase 1 高分辨率BEV分割网络分析")
    print("=" * 90)

    # Configuration parameters mirrored from the YAML file.
    config_params = {
        'input': {
            'camera': {'views': 6, 'size': [256, 704], 'channels': 3},
            'lidar': {'points': '32线', 'range': [-54, 54], 'voxel_size': [0.075, 0.075, 0.2]},
        },
        'camera_encoder': {
            'backbone': {'type': 'SwinTransformer', 'embed_dims': 96, 'depths': [2, 2, 6, 2], 'num_heads': [3, 6, 12, 24]},
            'neck': {'in_channels': [192, 384, 768], 'out_channels': 256, 'num_outs': 3},
            'vtransform': {'in_channels': 256, 'out_channels': 80, 'image_size': [256, 704], 'feature_size': [32, 88]},
        },
        'lidar_encoder': {
            'voxelize': {'max_voxels': [120000, 160000], 'voxel_size': [0.075, 0.075, 0.2]},
            'backbone': {'sparse_shape': [1440, 1440, 41], 'output_channels': 128},
        },
        'decoder': {
            'backbone': {'in_channels': 256, 'out_channels': [128, 256], 'layer_nums': [5, 5], 'layer_strides': [1, 2]},
            'neck': {'in_channels': [128, 256], 'out_channels': [256, 256], 'upsample_strides': [1, 2]},
        },
        'segmentation_head': {
            'in_channels': 512,
            'decoder_channels': [256, 128],  # Phase 1 simplified decoder
            'grid_transform': {
                'input_scope': [[-54.0, 54.0, 0.75], [-54.0, 54.0, 0.75]],
                'output_scope': [[-50, 50, 0.25], [-50, 50, 0.25]],
            },
        },
    }

    # ------------------------------------------------------------------
    # 1. Input specification
    # ------------------------------------------------------------------
    print("\n📥 1. 输入数据规格")
    print("-" * 60)

    input_spec = config_params['input']
    camera_in = input_spec['camera']
    lidar_in = input_spec['lidar']
    img_h, img_w = camera_in['size']

    print("相机输入:")
    print(f"├── 视角数量: {camera_in['views']}")
    print(f"├── 图像尺寸: {img_h}×{img_w}")
    print(f"├── 通道数: {camera_in['channels']} (RGB)")
    print(f"└── 总像素: {img_h * img_w * camera_in['views']:,}")

    print("\nLiDAR输入:")
    print(f"├── 激光雷达: {lidar_in['points']}")
    print(f"├── 检测范围: {lidar_in['range'][0]}m ~ {lidar_in['range'][1]}m")
    print(f"├── 体素尺寸: {lidar_in['voxel_size']}m")
    print(f"└── 稀疏形状: {config_params['lidar_encoder']['backbone']['sparse_shape']}")

    # ------------------------------------------------------------------
    # 2. Camera encoder
    # ------------------------------------------------------------------
    print("\n📷 2. Camera Encoder特征变化")
    print("-" * 60)

    camera_spec = config_params['camera_encoder']
    print("SwinTransformer Backbone:")
    embed_dim = camera_spec['backbone']['embed_dims']
    depths = camera_spec['backbone']['depths']

    stage_outputs = []
    # The initial 4x4 patch embedding reduces each spatial dimension by 4.
    current_h, current_w = img_h // 4, img_w // 4

    for stage_idx, _depth in enumerate(depths, start=1):
        if stage_idx > 1:
            # From the second stage on: halve resolution, double channels.
            current_h, current_w = current_h // 2, current_w // 2
            embed_dim *= 2

        stage_outputs.append({
            'stage': stage_idx,
            'channels': embed_dim,
            'height': current_h,
            'width': current_w,
            'tokens': current_h * current_w,
        })

        # NOTE(review): the printed product is channels*H*W (a feature-map
        # element count) although the label says "参数" — confirm wording.
        print(f"├── Stage {stage_idx}: {embed_dim}ch × {current_h}×{current_w} = {embed_dim * current_h * current_w:,} 参数")

    print("\nGeneralizedLSSFPN Neck:")
    neck_in = camera_spec['neck']['in_channels']
    neck_out = camera_spec['neck']['out_channels']
    print(f"├── 输入通道: {neck_in}")
    print(f"├── 输出通道: {neck_out} (统一)")
    print(f"└── 输出层数: {camera_spec['neck']['num_outs']}")

    # The FPN consumes backbone stages 2, 3 and 4 (i.e. skips stage 1).
    for level, (in_ch, stage) in enumerate(zip(neck_in, stage_outputs[1:]), start=1):
        print(f"├── Level {level}: {in_ch}ch → {neck_out}ch, {stage['height']}×{stage['width']}")

    print("\nDepthLSSTransform (BEV投影):")
    vtrans = camera_spec['vtransform']
    print(f"├── 输入通道: {vtrans['in_channels']}")
    print(f"├── 输出通道: {vtrans['out_channels']}")
    print(f"├── 图像尺寸: {vtrans['image_size']}")
    print(f"├── 特征尺寸: {vtrans['feature_size']}")

    # Camera BEV size (Phase 1 configuration).
    lidar_min, lidar_max = lidar_in['range']
    bev_range = lidar_max - lidar_min  # 108 m
    bev_resolution = 0.3  # taken from the xbound setting
    # round() instead of int() so a float quotient such as 359.999... cannot
    # silently truncate to one cell fewer.
    bev_pixels = round(bev_range / bev_resolution) + 1  # 108 / 0.3 + 1 = 361
    print(f"├── BEV范围: [-54, 54]m × [-54, 54]m = {bev_range}m × {bev_range}m")
    print(f"├── BEV分辨率: {bev_resolution}m/像素")
    print(f"├── BEV尺寸: {bev_pixels}×{bev_pixels} 像素")
    print(f"└── Camera BEV特征: {vtrans['out_channels']}ch × {bev_pixels}×{bev_pixels}")

    # ------------------------------------------------------------------
    # 3. LiDAR encoder
    # ------------------------------------------------------------------
    print("\n🔍 3. LiDAR Encoder特征变化")
    print("-" * 60)

    lidar_spec = config_params['lidar_encoder']
    print("体素化 (Voxelization):")
    voxelize = lidar_spec['voxelize']
    print(f"├── 最大体素数: {voxelize['max_voxels']}")
    print(f"├── 体素尺寸: {voxelize['voxel_size']}m")
    print(f"└── 稀疏形状: {lidar_spec['backbone']['sparse_shape']}")

    print("\nSparse Encoder Backbone:")
    backbone = lidar_spec['backbone']
    sparse_shape = backbone['sparse_shape']
    out_channels = backbone['output_channels']
    print(f"├── 稀疏形状: {sparse_shape}")
    print(f"├── 输出通道: {out_channels}")
    print("├── 空间覆盖: 108.0m × 108.0m")
    print(f"└── LiDAR BEV特征: {out_channels}ch × {sparse_shape[0]}×{sparse_shape[1]}")

    # ------------------------------------------------------------------
    # 4. Fusion layer
    # ------------------------------------------------------------------
    print("\n🔗 4. 融合层 (Fusion)")
    print("-" * 60)

    camera_bev_channels = vtrans['out_channels']  # 80
    lidar_bev_channels = out_channels  # 128
    fused_channels = 256  # taken from the fuser setting

    print("ConvFuser:")
    print(f"├── Camera BEV: {camera_bev_channels}ch × {bev_pixels}×{bev_pixels}")
    print(f"├── LiDAR BEV: {lidar_bev_channels}ch × {sparse_shape[0]}×{sparse_shape[1]}")
    print(f"├── 融合后: {fused_channels}ch × {sparse_shape[0]}×{sparse_shape[1]}")
    print("└── 融合方式: 通道级拼接 + 1×1卷积")

    # ------------------------------------------------------------------
    # 5. Decoder
    # ------------------------------------------------------------------
    print("\n🔄 5. Decoder特征变化")
    print("-" * 60)

    decoder_spec = config_params['decoder']

    print("SECOND Backbone:")
    second_in = decoder_spec['backbone']['in_channels']  # 256
    second_out = decoder_spec['backbone']['out_channels']  # [128, 256]
    layer_nums = decoder_spec['backbone']['layer_nums']  # [5, 5]
    layer_strides = decoder_spec['backbone']['layer_strides']  # [1, 2]

    print(f"├── 输入通道: {second_in}")
    print(f"├── 输出通道: {second_out}")
    print(f"├── 层数: {layer_nums}")
    print(f"├── 步长: {layer_strides}")

    input_size = sparse_shape[0]  # 1440

    # Stage 1 uses stride 1 (size kept); stage 2 uses stride 2 (size halved).
    stage1_out, stage1_size = second_out[0], input_size  # 128, 1440
    stage2_out, stage2_size = second_out[1], input_size // 2  # 256, 720
    second_features = [
        {'stage': 1, 'channels': stage1_out, 'size': stage1_size},
        {'stage': 2, 'channels': stage2_out, 'size': stage2_size},
    ]
    print(f"├── Stage 1: {stage1_out}ch × {stage1_size}×{stage1_size}")
    print(f"└── Stage 2: {stage2_out}ch × {stage2_size}×{stage2_size}")

    print("\nSECONDFPN Neck:")
    fpn_in = decoder_spec['neck']['in_channels']  # [128, 256]
    fpn_out = decoder_spec['neck']['out_channels']  # [256, 256]
    upsample_strides = decoder_spec['neck']['upsample_strides']  # [1, 2]

    fpn_features = []
    for level, (in_ch, out_ch, stride, feat) in enumerate(
            zip(fpn_in, fpn_out, upsample_strides, second_features), start=1):
        # Stride 1 keeps the size; any other stride is treated as a 2x upsample.
        out_size = feat['size'] if stride == 1 else feat['size'] * 2
        fpn_features.append({'level': level, 'channels': out_ch, 'size': out_size})
        print(f"├── Level {level}: {in_ch}ch → {out_ch}ch, {feat['size']}×{feat['size']} → {out_size}×{out_size}")

    # NOTE(review): only the last FPN level (256ch) is reported as the neck
    # output, while the segmentation head config lists in_channels=512 —
    # confirm whether the neck concatenates both levels.
    bev_neck_output = fpn_features[-1]  # Level 2: 256ch × 1440×1440
    print(f"└── BEV特征: {bev_neck_output['channels']}ch × {bev_neck_output['size']}×{bev_neck_output['size']}")

    # ------------------------------------------------------------------
    # 6. Segmentation head (Phase 1 simplified)
    # ------------------------------------------------------------------
    print("\n🎨 6. EnhancedBEVSegmentationHead (Phase 1)")
    print("-" * 60)

    seg_head = config_params['segmentation_head']

    print("Phase 1配置特点:")
    print("├── 简化设计: 只启用ASPP")
    print("├── Deep Supervision: 关闭")
    print("├── Dice Loss: 关闭")
    print("├── Decoder: 简化版 [256, 128]")

    print("\nEnhancedBEVSegmentationHead结构:")
    print(f"├── 输入通道: {seg_head['in_channels']}")
    print(f"├── Decoder通道: {seg_head['decoder_channels']}")

    # Phase 1 processing flow.
    print("\n处理流程 (Phase 1):")
    bev_input_size = bev_neck_output['size']  # 1440
    bev_input_channels = bev_neck_output['channels']  # 256

    print(f"├── 输入BEV: {bev_input_channels}ch × {bev_input_size}×{bev_input_size}")

    # ASPP keeps the spatial size unchanged.
    print(f"├── ASPP: {bev_input_channels}ch → 256ch, 尺寸保持 {bev_input_size}×{bev_input_size}")

    # Simplified decoder (Phase 1); every layer keeps the spatial size.
    decoder_channels = seg_head['decoder_channels']  # [256, 128]
    for layer_idx, out_ch in enumerate(decoder_channels, start=1):
        print(f"├── Decoder Layer {layer_idx}: 256ch → {out_ch}ch, {bev_input_size}×{bev_input_size} (尺寸保持)")

    # Classifier.
    final_channels = decoder_channels[-1]  # 128
    num_classes = 6  # number of nuScenes BEV segmentation classes
    print(f"└── 分类器: {final_channels}ch → {num_classes}ch (每个类别独立预测)")

    # Grid transform: number of cells = extent / resolution along one axis.
    grid_trans = seg_head['grid_transform']
    in_lo, in_hi, input_res = grid_trans['input_scope'][0]  # 108 m at 0.75 m/px
    out_lo, out_hi, output_res = grid_trans['output_scope'][0]  # 100 m at 0.25 m/px
    input_cells = round((in_hi - in_lo) / input_res)  # 144
    output_cells = round((out_hi - out_lo) / output_res)  # 400

    print("\nBEV Grid Transform:")
    print(f"├── 输入: {input_cells}×{input_cells} ({input_res}m/px)")
    print(f"├── 输出: {output_cells}×{output_cells} ({output_res}m/px)")
    print(f"├── 放大倍数: {output_cells / input_cells:.1f}x")
    print(f"└── 分辨率提升: {input_res / output_res:.1f}x更精细")

    print("\n最终输出:")
    print(f"├── 分割图: {num_classes}类别 × {output_cells}×{output_cells}")
    print(f"├── 分辨率: {output_res}m/像素")
    print("├── 覆盖范围: -50m ~ 50m")
    print(f"└── 总像素数: {num_classes * output_cells ** 2:,}")

    # ------------------------------------------------------------------
    # 7. Memory comparison
    # ------------------------------------------------------------------
    print("\n💾 7. Phase 1 vs Phase 4B 内存对比")
    print("-" * 60)

    bytes_per_value = 4  # float32

    # Phase 1 activation sizes in bytes, derived from the values above.
    phase1_memory = {
        'Camera BEV': bev_pixels * bev_pixels * camera_bev_channels * bytes_per_value,
        'LiDAR BEV': sparse_shape[0] * sparse_shape[1] * lidar_bev_channels * bytes_per_value,
        'Fused BEV': sparse_shape[0] * sparse_shape[1] * fused_channels * bytes_per_value,
        'BEV Neck': bev_neck_output['size'] * bev_neck_output['size'] * bev_neck_output['channels'] * bytes_per_value,
        'Segmentation': output_cells ** 2 * num_classes * bytes_per_value,
    }

    # Phase 4B activation sizes in bytes (hard-coded from an earlier analysis).
    phase4b_memory = {
        'Camera BEV': 541 * 541 * 80 * 4,
        'LiDAR BEV': 1440 * 1440 * 128 * 4,
        'Fused BEV': 1440 * 1440 * 256 * 4,
        'BEV Neck': 1440 * 1440 * 256 * 4,
        'Segmentation': 598 * 598 * 6 * 4,
    }

    print("内存占用对比 (单batch, float32, MB):")
    print("组件".ljust(15), "Phase 1".ljust(10), "Phase 4B".ljust(10), "差异")
    print("-" * 55)

    mib = 1024 * 1024
    total_p1 = 0
    total_p4b = 0
    for component, p1_bytes in phase1_memory.items():
        p1_mb = p1_bytes / mib
        p4b_mb = phase4b_memory[component] / mib
        diff = p4b_mb - p1_mb  # positive means Phase 4B uses more
        total_p1 += p1_mb
        total_p4b += p4b_mb

        # Fixed: the original printed the bare format specs instead of values.
        print(f"{component:12s} {p1_mb:8.1f} {p4b_mb:8.1f} {diff:+8.1f}")

    print("-" * 55)
    print(f"{'总计':12s} {total_p1:8.1f} {total_p4b:8.1f} {total_p4b - total_p1:+8.1f}")

    # ------------------------------------------------------------------
    # 8. Phase 1 design rationale
    # ------------------------------------------------------------------
    print("\n🎯 8. Phase 1设计理念")
    print("-" * 60)

    phase1_design = {
        "目标": [
            "验证高分辨率分割的可行性",
            "从简单的ASPP开始,避免复杂组件干扰",
            "建立分割性能baseline",
        ],
        "简化策略": [
            "只启用ASPP多尺度特征",
            "关闭Deep Supervision减少训练复杂度",
            "关闭Dice Loss,使用纯Focal Loss",
            "简化Decoder为2层",
        ],
        "分辨率提升": [
            "BEV输出从180×180提升到400×400",
            "分辨率从0.6m/px提升到0.25m/px",
            "3倍分辨率提升,理论上分割精度显著提高",
        ],
        "训练策略": [
            "基于epoch_19.pth继续训练",
            "只训练4个epoch (19→23)",
            "降低学习率避免破坏预训练权重",
            "专注分割性能优化",
        ],
    }

    for category, items in phase1_design.items():
        print(f"\n{category}:")
        for item in items:
            print(f"├── {item}")

    # ------------------------------------------------------------------
    # 9. Expected performance gains
    # ------------------------------------------------------------------
    print("\n📈 9. 预期性能提升")
    print("-" * 60)

    performance_targets = [
        ("分辨率提升", "180×180 → 400×400", "3倍像素数量"),
        ("分割精度", "理论上显著提升", "更细粒度特征表示"),
        ("车道线检测", "Divider/Stop Line", "预期IoU提升20-30%"),
        ("内存效率", "相比Phase 4B降低", "更简单的网络结构"),
        ("训练速度", "4个epoch完成", "快速验证高分辨率效果"),
    ]

    print("Phase 1预期效果:")
    for target, value, note in performance_targets:
        # Fixed: the original printed the bare format specs instead of values.
        print(f"{target:15s} {value:20s} {note:15s}")

    print("\n" + "=" * 90)
    print("🏁 Phase 1高分辨率配置分析完成!简化设计,专注验证高分辨率分割效果!")
    print("=" * 90)
|
|||
|
|
|
|||
|
|
# Script entry point: run the report only when executed directly.
if __name__ == "__main__":
    analyze_highres_config()
|